Jupyter Workflows
Using Epochly effectively in Jupyter notebooks for interactive data science.
Setup
Install Epochly Jupyter Kernel
```bash
# Install the Epochly-enabled Jupyter kernel
epochly jupyter install

# With custom name
epochly jupyter install --name epochly-py312 --display-name "Python 3.12 (Epochly)"

# One-command setup (installs kernel and configures extensions)
epochly jupyter setup
```
Verify Installation
```bash
# List all Jupyter kernels
epochly jupyter list

# Or use Jupyter directly
jupyter kernelspec list
```
Expected output:
```
Available kernels:
  python3    /usr/local/share/jupyter/kernels/python3
  epochly    /usr/local/share/jupyter/kernels/epochly
```
Manual Setup in Existing Notebooks
If you prefer not to use a custom kernel, load Epochly in any notebook:
```python
# Cell 1: Load Epochly extension
%load_ext epochly

# Verify it's loaded
%epochly status
```
Starting Jupyter
With Epochly Kernel
When creating a new notebook, select Python (epochly) from the kernel dropdown.
The kernel automatically:
- Loads Epochly
- Configures optimal defaults for notebooks
- Enables magic commands (available from the very first cell, as sketched below)
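Because the kernel handles loading for you, a notebook's first cell can use the magics directly. A minimal sketch of such a first cell, assuming the kernel has loaded the extension as described above:

```python
# First cell on the Python (epochly) kernel: no %load_ext needed
%epochly status
%epochly level 2
```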
With Standard Kernel
Use magic commands to enable Epochly in any Python kernel:
```python
%load_ext epochly
```
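Once loaded, the magics described below behave the same as on the Epochly kernel. For example, a typical first cell on a standard kernel might read:

```python
# Load the extension, then turn Epochly on at a chosen level
%load_ext epochly
%epochly on
%epochly level 2
```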
Magic Commands
Epochly provides line and cell magic commands for convenient control.
Line Magic Commands
```python
# Check Epochly status
%epochly status

# Enable Epochly
%epochly on

# Disable Epochly
%epochly off

# Set optimization level
%epochly level 2
%epochly level 3

# Show help
%epochly help
```
Cell Magic Commands
Apply optimization to entire cells:
```python
%%epochly
# This entire cell is optimized at the default level
import numpy as np

data = np.random.rand(1_000_000)
result = np.sum(data ** 2)
print(f"Result: {result}")
```
```python
%%epochly level=2
# This cell uses Level 2 (JIT)
def compute(arr):
    total = 0
    for x in arr:
        total += x ** 2
    return total

arr = np.random.rand(100_000)
result = compute(arr)
print(f"Computed: {result}")
```
```python
%%epochly level=3
# This cell uses Level 3 (Multicore)
import pandas as pd

df = pd.DataFrame({
    'value': np.random.rand(10_000_000),
    'category': np.random.choice(['A', 'B', 'C'], 10_000_000)
})
result = df.groupby('category')['value'].sum()
print(result)
```
Interactive Benchmarking
Compare performance across different levels in real time:
```python
# Cell 1: Define function
import numpy as np

def compute_task(n):
    arr = np.random.rand(n)
    return np.sum(arr ** 2 + np.sin(arr) * np.cos(arr))
```
```python
# Cell 2: Baseline (no optimization)
%epochly off
%time result = compute_task(5_000_000)
```
```python
# Cell 3: Level 2 (JIT)
%epochly level 2
%time result = compute_task(5_000_000)
```
```python
# Cell 4: Level 3 (Multicore)
%epochly level 3
%time result = compute_task(5_000_000)
```
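`%time` reports a single run and can be noisy; for steadier comparisons, a follow-up cell can use `%timeit`, which averages many runs and also washes out one-off JIT warmup cost:

```python
# Cell 5: More stable measurement with %timeit
%epochly level 3
%timeit compute_task(5_000_000)
```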
Automated Comparison
```python
import epochly
import time
import pandas as pd

def benchmark_levels(func, *args):
    """Compare all optimization levels"""
    results = []
    for level in [0, 1, 2, 3]:
        # Configure level
        epochly.set_level(level)

        # Warmup
        func(*args)

        # Measure
        start = time.perf_counter()
        func(*args)
        elapsed = time.perf_counter() - start

        results.append({
            'Level': level,
            'Time (s)': elapsed,
            'Speedup': None  # Will calculate later
        })

    # Calculate speedup relative to Level 0
    baseline = results[0]['Time (s)']
    for r in results:
        r['Speedup'] = f"{baseline / r['Time (s)']:.2f}x"

    return pd.DataFrame(results)

# Run benchmark
results_df = benchmark_levels(compute_task, 5_000_000)
results_df
```
Data Science Workflow
Typical notebook structure with Epochly:
```python
# Cell 1: Setup
%load_ext epochly
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print("Epochly loaded")
```
```python
%%epochly level=1
# Cell 2: Load Data (Level 1 - I/O bound)
file_paths = [f'data_{i}.csv' for i in range(10)]
dfs = [pd.read_csv(path) for path in file_paths]
df = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(df)} rows")
```
```python
%%epochly level=2
# Cell 3: Data Cleaning (Level 2 - CPU bound)

# Remove outliers
df = df[df['value'] > 0]

# Normalize
df['value_normalized'] = (df['value'] - df['value'].mean()) / df['value'].std()
print(f"Cleaned {len(df)} rows")
```
```python
%%epochly level=2
# Cell 4: Feature Engineering (Level 2 - JIT helps)

# Create polynomial features
df['value_squared'] = df['value'] ** 2
df['value_cubed'] = df['value'] ** 3

# Create interaction features
df['interaction'] = df['value'] * df['other_column']
print(f"Created {len(df.columns)} features")
```
```python
%%epochly level=3
# Cell 5: Aggregation (Level 3 - Parallel)

# GroupBy aggregation
result = df.groupby('category').agg({
    'value': ['sum', 'mean', 'std'],
    'value_normalized': ['min', 'max']
})
print(result)
```
```python
# Cell 6: Visualization (No optimization needed)
%epochly off
plt.figure(figsize=(10, 6))
result['value']['mean'].plot(kind='bar')
plt.title('Mean Value by Category')
plt.ylabel('Mean Value')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
```
Visualization Integration
Plot performance improvements:
```python
import epochly
import numpy as np
import matplotlib.pyplot as plt
import time

def benchmark_for_plot(func, sizes):
    """Benchmark across data sizes"""
    results = {level: [] for level in [0, 2, 3]}
    for size in sizes:
        data = np.random.rand(size)
        for level in [0, 2, 3]:
            epochly.set_level(level)

            # Warmup
            func(data)

            # Measure
            times = []
            for _ in range(3):
                start = time.perf_counter()
                func(data)
                times.append(time.perf_counter() - start)
            results[level].append(np.mean(times))
    return results

def compute_workload(data):
    return np.sum(data ** 2 + np.sin(data))

# Run benchmark
sizes = [100_000, 500_000, 1_000_000, 5_000_000]
results = benchmark_for_plot(compute_workload, sizes)

# Plot results
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
for level, times in results.items():
    plt.plot(sizes, times, marker='o', label=f'Level {level}')
plt.xlabel('Data Size')
plt.ylabel('Time (s)')
plt.title('Performance vs Data Size')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
baseline = results[0]
for level, times in results.items():
    if level > 0:
        speedups = [baseline[i] / times[i] for i in range(len(times))]
        plt.plot(sizes, speedups, marker='o', label=f'Level {level}')
plt.xlabel('Data Size')
plt.ylabel('Speedup')
plt.title('Speedup vs Baseline')
plt.legend()
plt.grid(True)
plt.axhline(y=1, color='k', linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()
```
Memory-Efficient Processing
Process large files in chunks:
```python
%%epochly level=3
import epochly
import pandas as pd

def process_large_csv(file_path, chunksize=100_000):
    """Process large CSV in memory-efficient chunks"""
    results = []
    with epochly.benchmark_context("processing"):
        for chunk in pd.read_csv(file_path, chunksize=chunksize):
            # Process chunk
            chunk_result = chunk.groupby('category')['value'].sum()
            results.append(chunk_result)

    # Combine per-chunk results
    final_result = pd.concat(results).groupby(level=0).sum()
    return final_result

# Process large file
result = process_large_csv('large_data.csv', chunksize=500_000)
print(result)
```
Chunk Processing with Progress
```python
%%epochly level=3
import pandas as pd
from tqdm import tqdm

def process_with_progress(file_path, chunksize=100_000):
    """Process with progress bar"""
    # Count total rows first (excluding the header)
    total_rows = sum(1 for _ in open(file_path)) - 1
    total_chunks = (total_rows // chunksize) + 1

    results = []
    for chunk in tqdm(pd.read_csv(file_path, chunksize=chunksize),
                      total=total_chunks,
                      desc="Processing"):
        chunk_result = chunk['value'].sum()
        results.append(chunk_result)

    return sum(results)

total = process_with_progress('large_data.csv')
print(f"Total: {total}")
```
Debugging in Notebooks
Get Detailed Status
```python
import epochly

# Get comprehensive status
status = epochly.get_status()
print("Epochly Status:")
print(f"  Enabled: {status['enabled']}")
print(f"  Level: {status['enhancement_level']}")
print(f"  JIT available: {status.get('jit_available', 'unknown')}")
print(f"  Workers: {status.get('worker_count', 'N/A')}")
print(f"  GPU available: {status.get('gpu_available', False)}")
```
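The status dictionary can also drive level selection programmatically. A small sketch, using the same keys printed above; treat them as illustrative and check your own `get_status()` output:

```python
import epochly

status = epochly.get_status()
workers = status.get('worker_count') or 0

# Choose a level from what the runtime reports:
# multiple workers -> Level 3 (multicore); JIT present -> Level 2; else Level 1
if isinstance(workers, int) and workers > 1:
    epochly.set_level(3)
elif status.get('jit_available', False):
    epochly.set_level(2)
else:
    epochly.set_level(1)
```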
Get Performance Metrics
```python
import epochly

# Get performance metrics
metrics = epochly.get_metrics()
print("Performance Metrics:")
print(f"  Total calls: {metrics.get('total_calls', 0)}")
print(f"  Total time: {metrics.get('total_time', 0):.3f}s")
print(f"  Average time: {metrics.get('avg_time', 0):.3f}s")
print(f"  Cache hits: {metrics.get('cache_hits', 0)}")
```
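A derived figure that is often more useful than raw counts is the cache hit rate. A small follow-up, reusing the `metrics` dictionary from the cell above and assuming the same keys:

```python
# Cache hit rate = cache hits / total calls (guarding against zero calls)
total_calls = metrics.get('total_calls', 0)
cache_hits = metrics.get('cache_hits', 0)
hit_rate = cache_hits / total_calls if total_calls else 0.0
print(f"  Cache hit rate: {hit_rate:.1%}")
```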
Enable Verbose Logging
```python
import os
import epochly

# Enable debug logging
os.environ['EPOCHLY_LOG_LEVEL'] = 'DEBUG'

# Reconfigure
epochly.configure_logging(level='DEBUG', console=True)

# Now run your code and see detailed logs
%epochly level 2
result = compute_task(1_000_000)
```
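Debug logging is verbose, so turn it back down once you have what you need. A sketch, assuming `configure_logging` accepts standard level names as shown above:

```python
import os
import epochly

# Restore quieter logging after debugging
os.environ['EPOCHLY_LOG_LEVEL'] = 'INFO'
epochly.configure_logging(level='INFO', console=True)
```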
Best Practices
1. Set Level at Cell Level
Use cell magic to control optimization per cell:
```python
# ✅ GOOD: Explicit level per cell
# (each %%epochly line must be the first line of its own cell)

%%epochly level=1
# I/O operations

%%epochly level=2
# Data transformation

%%epochly level=3
# Heavy computation
```
2. Use Context Managers for Sections
For fine-grained control within cells:
```python
import epochly

# I/O section
with epochly.optimize_context(level=1):
    data = load_large_files()

# Transform section
with epochly.optimize_context(level=2):
    transformed = transform_data(data)

# Aggregate section
with epochly.optimize_context(level=3):
    result = aggregate_results(transformed)
```
3. Benchmark Before Committing
Always benchmark before committing to a higher level:
```python
# Test different levels
for level in [0, 2, 3]:
    %epochly level {level}
    %timeit compute_task(1_000_000)
```
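If the `{level}` interpolation does not work in your IPython setup, the same sweep can be done with the programmatic API from the benchmarking section (a sketch, reusing `epochly.set_level` and `compute_task` from above):

```python
import time
import epochly

for level in [0, 2, 3]:
    epochly.set_level(level)
    compute_task(1_000_000)  # Warmup run
    start = time.perf_counter()
    compute_task(1_000_000)
    print(f"Level {level}: {time.perf_counter() - start:.3f}s")
```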
4. Disable for Debugging
Turn off optimization when debugging:
```python
# Disable for troubleshooting
%epochly off

# Debug your code
result = problematic_function(data)

# Re-enable when fixed
%epochly level 2
```
5. Use Warmup Runs
JIT compilation needs warmup runs before you measure:
```python
%%epochly level=2
# Warmup runs
for _ in range(3):
    compute_task(10_000)

# Now measure
%timeit compute_task(1_000_000)
```
Common Issues
Issue: Kernel Not Found
Problem: "Python (epochly)" kernel doesn't appear in Jupyter
Solution:
```bash
# Reinstall kernel
epochly jupyter install --force

# Restart Jupyter
jupyter notebook
```
Issue: Magic Commands Not Loading
Problem: %epochly commands not recognized
Solution:
```python
# Manually load extension
%load_ext epochly

# If that fails, restart the kernel and try:
import epochly
epochly.load_ipython_extension(get_ipython())
```
Issue: No Speedup Observed
Problem: Optimization doesn't improve performance
Solutions:
- Check if workload is suitable:
```python
%epochly status
# Verify your workload type:
#   Level 1: I/O-bound only
#   Level 2: Numerical loops
#   Level 3: Parallelizable work
```
- Ensure data is large enough:
```python
# Small data may not benefit
data = np.random.rand(100)        # Too small

# Use larger data
data = np.random.rand(1_000_000)  # Better
```
- Add warmup runs:
```python
%%epochly level=2
# Warmup
compute_task(10_000)

# Then measure
%timeit compute_task(1_000_000)
```
Issue: Kernel Crashes
Problem: Kernel crashes with high optimization levels
Solution:
```python
# Reduce worker count
import os
os.environ['EPOCHLY_MAX_WORKERS'] = '4'

# Or use a lower level
%epochly level 2
```
Issue: Out of Memory
Problem: Memory errors during processing
Solution:
```python
# Process in smaller batches
def process_batches(data, batch_size=100_000):
    results = []
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        result = process(batch)
        results.append(result)
    return combine(results)

# Or reduce workers
import epochly
epochly.configure(max_workers=4)
```
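To make the batching pattern concrete, here is one hypothetical instantiation in which `process` computes a partial sum of squares and `combine` adds the partial results; the names and workload are illustrative, not part of the Epochly API:

```python
import numpy as np

def process(batch):
    # Per-batch partial result: sum of squares
    return np.sum(batch ** 2)

def combine(results):
    # Merge the partial sums
    return sum(results)

data = np.random.rand(2_000_000)
total = process_batches(data, batch_size=250_000)
print(f"Sum of squares: {total:.2f}")
```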
Complete Notebook Example
```python
# Cell 1: Setup and Configuration
%load_ext epochly
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

print("Notebook initialized")
%epochly status
```
```python
# Cell 2: Generate Sample Data (No optimization needed)
%epochly off
np.random.seed(42)

n_samples = 10_000_000
df = pd.DataFrame({
    'id': range(n_samples),
    'value': np.random.randn(n_samples),
    'category': np.random.choice(['A', 'B', 'C', 'D'], n_samples)
})
print(f"Generated {len(df):,} rows")
df.head()
```
```python
%%epochly level=2
# Cell 3: Feature Engineering (Level 2 - JIT helps)

# Create derived features
df['value_squared'] = df['value'] ** 2
df['value_log'] = np.log(np.abs(df['value']) + 1)
df['value_norm'] = (df['value'] - df['value'].mean()) / df['value'].std()
print(f"Features created: {len(df.columns)}")
```
```python
%%epochly level=3
# Cell 4: Aggregation (Level 3 - Parallel)

# Complex aggregation
summary = df.groupby('category').agg({
    'value': ['count', 'mean', 'std', 'min', 'max'],
    'value_squared': 'sum',
    'value_log': 'mean'
})
summary
```
```python
# Cell 5: Visualization (No optimization)
%epochly off
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Distribution
df['value'].hist(bins=50, ax=axes[0, 0])
axes[0, 0].set_title('Value Distribution')

# Category counts
summary['value']['count'].plot(kind='bar', ax=axes[0, 1])
axes[0, 1].set_title('Samples per Category')

# Mean by category
summary['value']['mean'].plot(kind='bar', ax=axes[1, 0])
axes[1, 0].set_title('Mean Value by Category')

# Std by category
summary['value']['std'].plot(kind='bar', ax=axes[1, 1])
axes[1, 1].set_title('Std Dev by Category')

plt.tight_layout()
plt.show()
```