
scikit-learn Optimization

Optimizing machine learning workflows with Epochly for faster model training and evaluation.

Overview

Epochly optimizes scikit-learn operations by parallelizing preprocessing, training, cross-validation, and hyperparameter search.

Operation              Examples                                          Recommended Level    Speedup
---------------------  ------------------------------------------------  -------------------  -------
Preprocessing          Feature scaling, encoding, custom transformers    Level 2 (JIT)        2-5x
Training               RandomForest, GradientBoosting, ensemble models   Level 3 (Multicore)  4-12x
Cross-Validation       cross_val_score, KFold splitting                  Level 3 (Multicore)  4-12x
Prediction             Batch prediction on large datasets                Level 2 (JIT)        2-4x
Hyperparameter Search  GridSearchCV, RandomizedSearchCV                  Level 3 (Multicore)  5-20x
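
All of these follow the same pattern: decorate the function that does the work and pick a level from the table above. A minimal end-to-end sketch (dataset sizes are illustrative):

import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

@epochly.optimize(level=2)   # JIT level for preprocessing
def preprocess(X):
    return StandardScaler().fit_transform(X)

@epochly.optimize(level=3)   # multicore level for training
def train(X, y):
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    return rf

X, y = make_classification(n_samples=50_000, n_features=20, random_state=42)
X_scaled = preprocess(X)
model = train(X_scaled, y)
print(f"Training accuracy: {model.score(X_scaled, y):.3f}")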

Data Preprocessing

Feature Scaling

import epochly
import numpy as np
from sklearn.preprocessing import StandardScaler

@epochly.optimize(level=2)
def optimized_scaling(X):
    """Optimized feature scaling"""
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, scaler

# Large dataset
X = np.random.rand(1_000_000, 50)
X_scaled, scaler = optimized_scaling(X)
print(f"Scaled features shape: {X_scaled.shape}")

Custom Feature Engineering

import epochly
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class OptimizedFeatureEngineer(BaseEstimator, TransformerMixin):
    """Custom feature engineering with Epochly optimization"""

    @epochly.optimize(level=2)
    def fit(self, X, y=None):
        return self

    @epochly.optimize(level=2)
    def transform(self, X):
        """Create polynomial and interaction features"""
        # Polynomial features
        X_poly = X ** 2
        # Interaction features
        X_interactions = X[:, :5] * X[:, 5:10]
        # Combine all features
        X_engineered = np.hstack([X, X_poly, X_interactions])
        return X_engineered

# Use the optimized transformer
engineer = OptimizedFeatureEngineer()
X = np.random.rand(100_000, 20)
X_engineered = engineer.fit_transform(X)
print(f"Engineered features: {X_engineered.shape}")

Model Training

Parallel Random Forest Training

import epochly
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

@epochly.optimize(level=3)
def train_random_forest(X, y):
    """Train Random Forest with parallel optimization"""
    # Epochly automatically manages n_jobs
    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
    rf.fit(X, y)
    return rf

# Generate dataset
X, y = make_classification(n_samples=100_000, n_features=20, random_state=42)

# Train with optimization
model = train_random_forest(X, y)
print(f"Model score: {model.score(X, y):.3f}")

Gradient Boosting Optimization

import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

@epochly.optimize(level=3)
def train_gradient_boosting(X_train, y_train):
    """Optimized Gradient Boosting training"""
    gb = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    )
    gb.fit(X_train, y_train)
    return gb

# Split data
X, y = make_classification(n_samples=50_000, n_features=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train
model = train_gradient_boosting(X_train, y_train)
print(f"Test score: {model.score(X_test, y_test):.3f}")

Cross-Validation

Parallel Cross-Validation

import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

@epochly.optimize(level=3)
def parallel_cross_validation(X, y):
    """Optimized cross-validation with multiple folds"""
    model = RandomForestClassifier(n_estimators=50)
    # 10-fold cross-validation
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    return scores

X, y = make_classification(n_samples=50_000, n_features=20, random_state=42)
scores = parallel_cross_validation(X, y)
print(f"CV Scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std():.3f})")

Stratified K-Fold with Custom Scoring

import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate

@epochly.optimize(level=3)
def stratified_cv_multi_metric(X, y):
    """Stratified CV with multiple metrics"""
    model = GradientBoostingClassifier(n_estimators=50)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(model, X, y, cv=cv, scoring=scoring)
    return results

X, y = make_classification(n_samples=30_000, n_features=15, random_state=42)
results = stratified_cv_multi_metric(X, y)
for metric, scores in results.items():
    if metric.startswith('test_'):
        print(f"{metric}: {scores.mean():.3f}")

GridSearchCV Optimization

import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

@epochly.optimize(level=3)
def optimized_grid_search(X, y):
    """Parallel hyperparameter grid search"""
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        rf,
        param_grid,
        cv=5,
        scoring='accuracy',
        verbose=1
    )
    grid_search.fit(X, y)
    return grid_search

X, y = make_classification(n_samples=10_000, n_features=20, random_state=42)

# Run grid search
search = optimized_grid_search(X, y)
print(f"Best parameters: {search.best_params_}")
print(f"Best score: {search.best_score_:.3f}")

RandomizedSearchCV

import epochly
from scipy.stats import randint, uniform
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

@epochly.optimize(level=3)
def optimized_random_search(X, y):
    """Randomized hyperparameter search"""
    param_distributions = {
        'n_estimators': randint(50, 200),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    }
    gb = GradientBoostingClassifier(random_state=42)
    random_search = RandomizedSearchCV(
        gb,
        param_distributions,
        n_iter=50,
        cv=5,
        scoring='accuracy',
        random_state=42,
        verbose=1
    )
    random_search.fit(X, y)
    return random_search

X, y = make_classification(n_samples=20_000, n_features=25, random_state=42)
search = optimized_random_search(X, y)
print(f"Best parameters: {search.best_params_}")
print(f"Best score: {search.best_score_:.3f}")

Prediction

Batch Prediction for Large Datasets

import epochly
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

@epochly.optimize(level=2)
def batch_prediction(model, X, batch_size=10_000):
    """Predict on large dataset in batches"""
    predictions = []
    for i in range(0, len(X), batch_size):
        batch = X[i:i+batch_size]
        batch_preds = model.predict(batch)
        predictions.extend(batch_preds)
    return np.array(predictions)

# Train model
X_train, y_train = make_classification(n_samples=10_000, n_features=20)
model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, y_train)

# Predict on large dataset
X_test = np.random.rand(1_000_000, 20)
predictions = batch_prediction(model, X_test, batch_size=50_000)
print(f"Predictions shape: {predictions.shape}")

Pipeline Optimization

Full ML Pipeline

import epochly
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

@epochly.optimize(level=3)
def train_ml_pipeline(X, y):
    """Optimized end-to-end ML pipeline"""
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=15)),
        ('classifier', RandomForestClassifier(n_estimators=100))
    ])
    # Train pipeline
    pipeline.fit(X, y)
    return pipeline

# Generate data
X, y = make_classification(n_samples=50_000, n_features=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train and evaluate
pipeline = train_ml_pipeline(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f"Pipeline test score: {score:.3f}")

Pipeline with Feature Engineering

import epochly
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

@epochly.optimize(level=3)
def complex_pipeline(X, y):
    """Pipeline with feature engineering and selection"""
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('selector', SelectKBest(f_classif, k=20)),
        ('classifier', LogisticRegression(max_iter=1000))
    ])
    pipeline.fit(X, y)
    return pipeline

X, y = make_classification(n_samples=20_000, n_features=15, random_state=42)
pipeline = complex_pipeline(X, y)
print(f"Number of features selected: {pipeline.named_steps['selector'].get_support().sum()}")

Custom Estimator Optimization

Optimized Scaler Example

import epochly
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class OptimizedScaler(BaseEstimator, TransformerMixin):
    """Custom scaler with Epochly optimization"""

    def __init__(self):
        self.mean_ = None
        self.std_ = None

    def fit(self, X, y=None):
        """Compute mean and std with optimization"""
        with epochly.optimize_context(level=2):
            self.mean_ = np.mean(X, axis=0)
            self.std_ = np.std(X, axis=0)
        return self

    def transform(self, X):
        """Transform with optimization"""
        with epochly.optimize_context(level=2):
            X_scaled = (X - self.mean_) / (self.std_ + 1e-8)
        return X_scaled

# Use custom scaler
scaler = OptimizedScaler()
X = np.random.rand(1_000_000, 50)
X_scaled = scaler.fit_transform(X)
print(f"Scaled data shape: {X_scaled.shape}")

Custom Transformer with State

import epochly
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class OptimizedBinning(BaseEstimator, TransformerMixin):
    """Custom binning transformer"""

    def __init__(self, n_bins=10):
        self.n_bins = n_bins
        self.bin_edges_ = None

    def fit(self, X, y=None):
        """Compute bin edges"""
        with epochly.optimize_context(level=2):
            self.bin_edges_ = [
                np.percentile(X[:, i], np.linspace(0, 100, self.n_bins + 1))
                for i in range(X.shape[1])
            ]
        return self

    def transform(self, X):
        """Bin values"""
        with epochly.optimize_context(level=2):
            X_binned = np.zeros_like(X)
            for i in range(X.shape[1]):
                X_binned[:, i] = np.digitize(X[:, i], self.bin_edges_[i])
        return X_binned

binner = OptimizedBinning(n_bins=5)
X = np.random.rand(100_000, 10)
X_binned = binner.fit_transform(X)

Benchmarking ML Operations

Comparing Performance Across Levels

import epochly
import time
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

def benchmark_training_levels():
    """Benchmark training with different optimization levels"""
    X, y = make_classification(n_samples=50_000, n_features=30, random_state=42)
    results = {}

    # Baseline (no optimization)
    start = time.perf_counter()
    rf_baseline = RandomForestClassifier(n_estimators=100)
    rf_baseline.fit(X, y)
    results['baseline'] = time.perf_counter() - start

    # Level 2 (JIT)
    @epochly.optimize(level=2)
    def train_level2(X, y):
        rf = RandomForestClassifier(n_estimators=100)
        rf.fit(X, y)
        return rf

    start = time.perf_counter()
    rf_l2 = train_level2(X, y)
    results['level2'] = time.perf_counter() - start

    # Level 3 (Multicore)
    @epochly.optimize(level=3)
    def train_level3(X, y):
        rf = RandomForestClassifier(n_estimators=100)
        rf.fit(X, y)
        return rf

    start = time.perf_counter()
    rf_l3 = train_level3(X, y)
    results['level3'] = time.perf_counter() - start

    # Print results
    print("Training Time Comparison:")
    print(f"Baseline: {results['baseline']:.2f}s")
    print(f"Level 2:  {results['level2']:.2f}s ({results['baseline']/results['level2']:.2f}x)")
    print(f"Level 3:  {results['level3']:.2f}s ({results['baseline']/results['level3']:.2f}x)")

benchmark_training_levels()

Cross-Validation Benchmark

import epochly
import time
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

def benchmark_cross_validation():
    """Benchmark CV with optimization"""
    X, y = make_classification(n_samples=20_000, n_features=25, random_state=42)
    model = GradientBoostingClassifier(n_estimators=50)

    # Without optimization
    start = time.perf_counter()
    scores_baseline = cross_val_score(model, X, y, cv=10)
    baseline_time = time.perf_counter() - start

    # With Level 3 optimization
    @epochly.optimize(level=3)
    def optimized_cv(model, X, y):
        return cross_val_score(model, X, y, cv=10)

    start = time.perf_counter()
    scores_optimized = optimized_cv(model, X, y)
    optimized_time = time.perf_counter() - start

    print(f"Baseline CV:  {baseline_time:.2f}s")
    print(f"Optimized CV: {optimized_time:.2f}s")
    print(f"Speedup: {baseline_time / optimized_time:.2f}x")

benchmark_cross_validation()

Automatic n_jobs Configuration

Using auto_configure_sklearn()

Epochly can automatically configure n_jobs for scikit-learn estimators:

import epochly
from sklearn.ensemble import RandomForestClassifier

# Enable automatic n_jobs configuration
epochly.auto_configure_sklearn()

# Now scikit-learn estimators automatically use optimal n_jobs
rf = RandomForestClassifier(n_estimators=100)  # internally sets n_jobs=-1 (all cores)

# Explicit n_jobs takes precedence
rf_custom = RandomForestClassifier(n_estimators=100, n_jobs=4)  # uses n_jobs=4 as specified

Configure with Custom Worker Count

import epochly
# Set specific number of workers
epochly.auto_configure_sklearn(n_jobs=8)
# All sklearn estimators will use n_jobs=8 by default

Supported Estimators (40+)

The following scikit-learn estimators support automatic n_jobs configuration:

Ensemble Methods:

  • RandomForestClassifier, RandomForestRegressor
  • ExtraTreesClassifier, ExtraTreesRegressor
  • GradientBoostingClassifier, GradientBoostingRegressor
  • BaggingClassifier, BaggingRegressor
  • VotingClassifier, VotingRegressor

Linear Models:

  • LogisticRegression, LogisticRegressionCV
  • RidgeClassifier, RidgeClassifierCV
  • ElasticNet, ElasticNetCV
  • Lasso, LassoCV

Neighbors:

  • KNeighborsClassifier, KNeighborsRegressor
  • RadiusNeighborsClassifier, RadiusNeighborsRegressor

Model Selection:

  • GridSearchCV, RandomizedSearchCV
  • cross_val_score, cross_validate

And many more...
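
To confirm auto-configuration reached a given estimator, inspect its parameters with get_params(). This sketch assumes, per the comments above, that the configurator injects n_jobs when the estimator is constructed:

import epochly
from sklearn.ensemble import RandomForestClassifier

epochly.auto_configure_sklearn()
rf = RandomForestClassifier(n_estimators=100)
print(rf.get_params()['n_jobs'])   # expected: -1 (all cores) per the defaults above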

Manual Control with SklearnAutoConfigurator

import epochly
# Get the configurator
configurator = epochly.get_sklearn_configurator()
# Check configuration status
status = configurator.get_status()
print(f"Auto-config enabled: {status['enabled']}")
print(f"Default n_jobs: {status['n_jobs']}")
# Temporarily disable
configurator.disable()
# Re-enable with custom settings
configurator.enable(n_jobs=16)
# Reset to defaults
configurator.reset()
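
To guarantee the configurator is restored after a temporary change, wrap the disable/enable pair in try/finally (a usage sketch with the API shown above):

import epochly

configurator = epochly.get_sklearn_configurator()
configurator.disable()
try:
    # run code that should fall back to scikit-learn's own n_jobs defaults
    ...
finally:
    configurator.enable(n_jobs=16)  # restore (the n_jobs value is illustrative)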

Example with Multiple Models

import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Enable auto-configuration
epochly.auto_configure_sklearn()

# All models automatically use optimal n_jobs
rf = RandomForestClassifier(n_estimators=100)
gb = GradientBoostingClassifier(n_estimators=100)

# Grid search also benefits
param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10]}
grid = GridSearchCV(rf, param_grid, cv=5)

# Automatically parallelized across CV folds
X, y = make_classification(n_samples=10_000, n_features=20)
grid.fit(X, y)
print(f"Best score: {grid.best_score_:.3f}")

Best Practices

1. Choose the Right Level

import epochly

# Level 2 for preprocessing and prediction
@epochly.optimize(level=2)
def preprocess_data(X):
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    return scaler.fit_transform(X)

# Level 3 for training and CV
@epochly.optimize(level=3)
def train_model(X, y):
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X, y)
    return rf

2. Use auto_configure_sklearn() for Convenience

import epochly
# Enable once at the start
epochly.auto_configure_sklearn()
# All sklearn operations automatically optimized
# No need to manually set n_jobs

3. Combine with Pipeline for Clean Code

import epochly
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

epochly.auto_configure_sklearn()

@epochly.optimize(level=3)
def train_pipeline(X, y):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=10)),
        ('classifier', RandomForestClassifier(n_estimators=100))
    ])
    pipeline.fit(X, y)
    return pipeline

4. Batch Processing for Large Datasets

import epochly
from sklearn.ensemble import RandomForestClassifier

@epochly.optimize(level=3)
def process_large_dataset(X, y, batch_size=10_000):
    """Process in batches to manage memory"""
    models = []
    for i in range(0, len(X), batch_size):
        X_batch = X[i:i+batch_size]
        y_batch = y[i:i+batch_size]
        model = RandomForestClassifier(n_estimators=50)
        model.fit(X_batch, y_batch)
        models.append(model)
    return models
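
The per-batch models form a simple ensemble. One way to combine them, assuming binary 0/1 labels, is majority voting over their predictions:

import numpy as np

def ensemble_predict(models, X):
    """Majority vote across the per-batch models (binary 0/1 labels assumed)."""
    votes = np.stack([m.predict(X) for m in models])   # shape: (n_models, n_samples)
    # For 0/1 labels, thresholding the mean vote at 0.5 is a majority vote
    return (votes.mean(axis=0) >= 0.5).astype(int)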

5. Monitor Memory Usage

import epochly
from sklearn.ensemble import RandomForestClassifier

# Configure memory limits
epochly.configure(
    enhancement_level=3,
    max_workers=4,             # Reduce workers if memory-constrained
    worker_memory_limit=1024   # MB per worker
)

@epochly.optimize(level=3)
def memory_efficient_training(X, y):
    rf = RandomForestClassifier(
        n_estimators=100,
        max_features='sqrt'  # Reduce memory usage
    )
    rf.fit(X, y)
    return rf