Jupyter Workflows

Using Epochly effectively in Jupyter notebooks for interactive data science.

Setup

Install Epochly Jupyter Kernel

# Install the Epochly-enabled Jupyter kernel
epochly jupyter install
# With custom name
epochly jupyter install --name epochly-py312 --display-name "Python 3.12 (Epochly)"
# One-command setup (installs kernel and configures extensions)
epochly jupyter setup

Verify Installation

# List all Jupyter kernels
epochly jupyter list
# Or use Jupyter directly
jupyter kernelspec list

Expected output:

Available kernels:
  python3    /usr/local/share/jupyter/kernels/python3
  epochly    /usr/local/share/jupyter/kernels/epochly

Manual Setup in Existing Notebooks

If you prefer not to use a custom kernel, load Epochly in any notebook:

# Cell 1: Load Epochly extension
%load_ext epochly
# Verify it's loaded
%epochly status

Starting Jupyter

With Epochly Kernel

When creating a new notebook, select Python (epochly) from the kernel dropdown.

The kernel automatically:

  • Loads Epochly
  • Configures optimal defaults for notebooks
  • Enables magic commands
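To confirm the kernel actually loaded Epochly, the first cell of a new notebook can check programmatically. A minimal sketch, using the epochly.get_status() call described under Debugging below:

import epochly

# Sanity-check that the Epochly kernel auto-loaded, and report the level
status = epochly.get_status()
assert status['enabled'], "Epochly is not active; run %load_ext epochly"
print(f"Epochly active at level {status['enhancement_level']}")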

With Standard Kernel

Use magic commands to enable Epochly in any Python kernel:

%load_ext epochly

Magic Commands

Epochly provides line and cell magic commands for convenient control.

Line Magic Commands

# Check Epochly status
%epochly status
# Enable Epochly
%epochly on
# Disable Epochly
%epochly off
# Set optimization level
%epochly level 2
%epochly level 3
# Show help
%epochly help
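If you need to switch levels from inside a function rather than at the prompt, the same magics can be driven through IPython's standard API. A sketch; run_line_magic is plain IPython, not Epochly-specific:

from IPython import get_ipython

def set_epochly_level(level):
    """Invoke the %epochly line magic programmatically."""
    get_ipython().run_line_magic('epochly', f'level {level}')

set_epochly_level(2)  # equivalent to: %epochly level 2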

Cell Magic Commands

Apply optimization to entire cells. Each %%epochly block below is a separate notebook cell:

%%epochly
# This entire cell is optimized at the default level
import numpy as np

data = np.random.rand(1_000_000)
result = np.sum(data ** 2)
print(f"Result: {result}")

%%epochly level=2
# This cell uses Level 2 (JIT)
def compute(arr):
    total = 0
    for x in arr:
        total += x ** 2
    return total

arr = np.random.rand(100_000)
result = compute(arr)
print(f"Computed: {result}")

%%epochly level=3
# This cell uses Level 3 (Multicore)
import pandas as pd

df = pd.DataFrame({
    'value': np.random.rand(10_000_000),
    'category': np.random.choice(['A', 'B', 'C'], 10_000_000)
})
result = df.groupby('category')['value'].sum()
print(result)

Interactive Benchmarking

Compare performance across different levels in real-time:

# Cell 1: Define function
import numpy as np

def compute_task(n):
    arr = np.random.rand(n)
    return np.sum(arr ** 2 + np.sin(arr) * np.cos(arr))

# Cell 2: Baseline (no optimization)
%epochly off
%time result = compute_task(5_000_000)

# Cell 3: Level 2 (JIT)
%epochly level 2
%time result = compute_task(5_000_000)

# Cell 4: Level 3 (Multicore)
%epochly level 3
%time result = compute_task(5_000_000)

Automated Comparison

import epochly
import time
import pandas as pd

def benchmark_levels(func, *args):
    """Compare all optimization levels."""
    results = []
    for level in [0, 1, 2, 3]:
        # Configure level
        epochly.set_level(level)
        # Warmup run (lets the JIT compile before measuring)
        func(*args)
        # Measure
        start = time.perf_counter()
        func(*args)
        elapsed = time.perf_counter() - start
        results.append({
            'Level': level,
            'Time (s)': elapsed,
            'Speedup': None,  # calculated below
        })
    # Calculate speedup relative to Level 0
    baseline = results[0]['Time (s)']
    for r in results:
        r['Speedup'] = f"{baseline / r['Time (s)']:.2f}x"
    return pd.DataFrame(results)

# Run benchmark
results_df = benchmark_levels(compute_task, 5_000_000)
results_df
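Single timings are noisy. For steadier numbers you could take the best of several repeats instead of one measurement; a sketch of that variant using the standard timeit module, with compute_task from above:

import timeit
import epochly

def best_of(func, *args, repeats=5):
    """Return the fastest of several timed single runs."""
    return min(timeit.repeat(lambda: func(*args), repeat=repeats, number=1))

epochly.set_level(2)
compute_task(5_000_000)  # warmup
print(f"Best of 5: {best_of(compute_task, 5_000_000):.3f}s")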

Data Science Workflow

Typical notebook structure with Epochly:

# Cell 1: Setup
%load_ext epochly
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
print("Epochly loaded")

%%epochly level=1
# Cell 2: Load Data (Level 1 - I/O bound)
# Note: %%epochly must be the first line of the cell,
# so the cell label comes after the magic.
file_paths = [f'data_{i}.csv' for i in range(10)]
dfs = [pd.read_csv(path) for path in file_paths]
df = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(df)} rows")

%%epochly level=2
# Cell 3: Data Cleaning (Level 2 - CPU bound)
# Remove outliers
df = df[df['value'] > 0]
# Normalize
df['value_normalized'] = (df['value'] - df['value'].mean()) / df['value'].std()
print(f"Cleaned {len(df)} rows")

%%epochly level=2
# Cell 4: Feature Engineering (Level 2 - JIT helps)
# Create polynomial features
df['value_squared'] = df['value'] ** 2
df['value_cubed'] = df['value'] ** 3
# Create interaction features
df['interaction'] = df['value'] * df['other_column']
print(f"Created {len(df.columns)} features")

%%epochly level=3
# Cell 5: Aggregation (Level 3 - Parallel)
# GroupBy aggregation
result = df.groupby('category').agg({
    'value': ['sum', 'mean', 'std'],
    'value_normalized': ['min', 'max']
})
print(result)

# Cell 6: Visualization (no optimization needed)
%epochly off
plt.figure(figsize=(10, 6))
result['value']['mean'].plot(kind='bar')
plt.title('Mean Value by Category')
plt.ylabel('Mean Value')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Visualization Integration

Plot performance improvements:

import epochly
import numpy as np
import matplotlib.pyplot as plt
import time

def benchmark_for_plot(func, sizes):
    """Benchmark across data sizes."""
    results = {level: [] for level in [0, 2, 3]}
    for size in sizes:
        data = np.random.rand(size)
        for level in [0, 2, 3]:
            epochly.set_level(level)
            # Warmup
            func(data)
            # Measure (average of 3 runs)
            times = []
            for _ in range(3):
                start = time.perf_counter()
                func(data)
                times.append(time.perf_counter() - start)
            results[level].append(np.mean(times))
    return results

def compute_workload(data):
    return np.sum(data ** 2 + np.sin(data))

# Run benchmark
sizes = [100_000, 500_000, 1_000_000, 5_000_000]
results = benchmark_for_plot(compute_workload, sizes)

# Plot results
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
for level, times in results.items():
    plt.plot(sizes, times, marker='o', label=f'Level {level}')
plt.xlabel('Data Size')
plt.ylabel('Time (s)')
plt.title('Performance vs Data Size')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
baseline = results[0]
for level, times in results.items():
    if level > 0:
        speedups = [baseline[i] / times[i] for i in range(len(times))]
        plt.plot(sizes, speedups, marker='o', label=f'Level {level}')
plt.xlabel('Data Size')
plt.ylabel('Speedup')
plt.title('Speedup vs Baseline')
plt.legend()
plt.grid(True)
plt.axhline(y=1, color='k', linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()

Memory-Efficient Processing

Process large files in chunks:

%%epochly level=3
import epochly
import pandas as pd

def process_large_csv(file_path, chunksize=100_000):
    """Process a large CSV in memory-efficient chunks."""
    results = []
    with epochly.benchmark_context("processing"):
        for chunk in pd.read_csv(file_path, chunksize=chunksize):
            # Process chunk
            chunk_result = chunk.groupby('category')['value'].sum()
            results.append(chunk_result)
    # Combine per-chunk results
    final_result = pd.concat(results).groupby(level=0).sum()
    return final_result

# Process large file
result = process_large_csv('large_data.csv', chunksize=500_000)
print(result)

Chunk Processing with Progress

%%epochly level=3
import pandas as pd
from tqdm import tqdm

def process_with_progress(file_path, chunksize=100_000):
    """Process with a progress bar."""
    # Count total rows first (one extra pass over the file)
    with open(file_path) as f:
        total_rows = sum(1 for _ in f) - 1  # minus header
    total_chunks = (total_rows // chunksize) + 1
    results = []
    for chunk in tqdm(pd.read_csv(file_path, chunksize=chunksize),
                      total=total_chunks,
                      desc="Processing"):
        chunk_result = chunk['value'].sum()
        results.append(chunk_result)
    return sum(results)

total = process_with_progress('large_data.csv')
print(f"Total: {total}")

Debugging in Notebooks

Get Detailed Status

import epochly
# Get comprehensive status
status = epochly.get_status()
print("Epochly Status:")
print(f" Enabled: {status['enabled']}")
print(f" Level: {status['enhancement_level']}")
print(f" JIT available: {status.get('jit_available', 'unknown')}")
print(f" Workers: {status.get('worker_count', 'N/A')}")
print(f" GPU available: {status.get('gpu_available', False)}")

Get Performance Metrics

import epochly
# Get performance metrics
metrics = epochly.get_metrics()
print("Performance Metrics:")
print(f" Total calls: {metrics.get('total_calls', 0)}")
print(f" Total time: {metrics.get('total_time', 0):.3f}s")
print(f" Average time: {metrics.get('avg_time', 0):.3f}s")
print(f" Cache hits: {metrics.get('cache_hits', 0)}")

Enable Verbose Logging

import os
import epochly
# Enable debug logging
os.environ['EPOCHLY_LOG_LEVEL'] = 'DEBUG'
# Reconfigure
epochly.configure_logging(level='DEBUG', console=True)
# Now run your code and see detailed logs
%epochly level 2
result = compute_task(1_000_000)

Best Practices

1. Set Level at Cell Level

Use cell magic to control optimization per cell:

# ✅ GOOD: Explicit level per cell (each block below is its own cell)

%%epochly level=1
# I/O operations

%%epochly level=2
# Data transformation

%%epochly level=3
# Heavy computation

2. Use Context Managers for Sections

For fine-grained control within cells:

import epochly

# I/O section
with epochly.optimize_context(level=1):
    data = load_large_files()

# Transform section
with epochly.optimize_context(level=2):
    transformed = transform_data(data)

# Aggregate section
with epochly.optimize_context(level=3):
    result = aggregate_results(transformed)
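If this pattern repeats across many cells, a small wrapper keeps the level choice next to the stage it applies to. A hypothetical helper around optimize_context; the stage functions are the same placeholders as above:

import epochly

def staged(level, func, *args, **kwargs):
    """Run a single pipeline stage under a given optimization level."""
    with epochly.optimize_context(level=level):
        return func(*args, **kwargs)

data = staged(1, load_large_files)
transformed = staged(2, transform_data, data)
result = staged(3, aggregate_results, transformed)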

3. Benchmark Before Committing

Always benchmark before using high levels:

# Test different levels ({level} is expanded by IPython before the magic runs)
for level in [0, 2, 3]:
    %epochly level {level}
    %timeit compute_task(1_000_000)

4. Disable for Debugging

Turn off optimization when debugging:

# Disable for troubleshooting
%epochly off
# Debug your code
result = problematic_function(data)
# Re-enable when fixed
%epochly level 2

5. Use Warmup Runs

JIT needs warmup before measuring:

%%epochly level=2
# Warmup runs
for _ in range(3):
    compute_task(10_000)
# Now measure
%timeit compute_task(1_000_000)
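A reusable helper makes the warmup-then-measure pattern harder to forget. A sketch using only the standard library, with compute_task from the benchmarking section:

import time

def warmed_time(func, *args, warmup=3):
    """Run warmup iterations, then return one measured wall time."""
    for _ in range(warmup):
        func(*args)
    start = time.perf_counter()
    func(*args)
    return time.perf_counter() - start

print(f"Warm time: {warmed_time(compute_task, 1_000_000):.3f}s")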

Common Issues

Issue: Kernel Not Found

Problem: "Python (epochly)" kernel doesn't appear in Jupyter

Solution:

# Reinstall kernel
epochly jupyter install --force
# Restart Jupyter
jupyter notebook

Issue: Magic Commands Not Loading

Problem: %epochly commands not recognized

Solution:

# Manually load extension
%load_ext epochly
# If that fails, restart kernel and try:
import epochly
epochly.load_ipython_extension(get_ipython())

Issue: No Speedup Observed

Problem: Optimization doesn't improve performance

Solutions:

  1. Check if the workload is suitable:

%epochly status
# Verify your workload type:
# Level 1: I/O-bound only
# Level 2: Numerical loops
# Level 3: Parallelizable work

  2. Ensure the data is large enough:

# Small data may not benefit
data = np.random.rand(100)        # too small
# Use larger data
data = np.random.rand(1_000_000)  # better

  3. Add warmup runs:

%%epochly level=2
# Warmup
compute_task(10_000)
# Then measure
%timeit compute_task(1_000_000)
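If none of those help, a direct before/after comparison in one cell settles whether Epochly is engaging at all. A sketch built on epochly.set_level from the benchmarking section:

import time
import epochly

def timed(func, *args):
    start = time.perf_counter()
    func(*args)
    return time.perf_counter() - start

epochly.set_level(0)
base = timed(compute_task, 1_000_000)
epochly.set_level(2)
compute_task(1_000_000)  # warmup for the JIT
opt = timed(compute_task, 1_000_000)
print(f"Baseline {base:.3f}s vs Level 2 {opt:.3f}s ({base / opt:.2f}x)")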

Issue: Kernel Crashes

Problem: Kernel crashes with high optimization levels

Solution:

# Reduce worker count
import os
os.environ['EPOCHLY_MAX_WORKERS'] = '4'
# Or use lower level
%epochly level 2

Issue: Out of Memory

Problem: Memory errors during processing

Solution:

# Process in smaller batches
def process_batches(data, batch_size=100_000):
    # process() and combine() are placeholders for your own logic
    results = []
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        result = process(batch)
        results.append(result)
    return combine(results)

# Or reduce workers
import epochly
epochly.configure(max_workers=4)

Complete Notebook Example

# Cell 1: Setup and Configuration
%load_ext epochly
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
print("Notebook initialized")
%epochly status

# Cell 2: Generate Sample Data (no optimization needed)
%epochly off
np.random.seed(42)
n_samples = 10_000_000
df = pd.DataFrame({
    'id': range(n_samples),
    'value': np.random.randn(n_samples),
    'category': np.random.choice(['A', 'B', 'C', 'D'], n_samples)
})
print(f"Generated {len(df):,} rows")
df.head()

%%epochly level=2
# Cell 3: Feature Engineering (Level 2 - JIT helps)
# Create derived features
df['value_squared'] = df['value'] ** 2
df['value_log'] = np.log(np.abs(df['value']) + 1)
df['value_norm'] = (df['value'] - df['value'].mean()) / df['value'].std()
print(f"Features created: {len(df.columns)}")

%%epochly level=3
# Cell 4: Aggregation (Level 3 - Parallel)
# Complex aggregation
summary = df.groupby('category').agg({
    'value': ['count', 'mean', 'std', 'min', 'max'],
    'value_squared': 'sum',
    'value_log': 'mean'
})
summary

# Cell 5: Visualization (no optimization)
%epochly off
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Distribution
df['value'].hist(bins=50, ax=axes[0, 0])
axes[0, 0].set_title('Value Distribution')
# Category counts
summary['value']['count'].plot(kind='bar', ax=axes[0, 1])
axes[0, 1].set_title('Samples per Category')
# Mean by category
summary['value']['mean'].plot(kind='bar', ax=axes[1, 0])
axes[1, 0].set_title('Mean Value by Category')
# Std by category
summary['value']['std'].plot(kind='bar', ax=axes[1, 1])
axes[1, 1].set_title('Std Dev by Category')
plt.tight_layout()
plt.show()