scikit-learn Optimization
Optimizing machine learning workflows with Epochly for faster model training and evaluation.
Overview
Epochly optimizes scikit-learn operations by parallelizing preprocessing, training, cross-validation, and hyperparameter search.
ML Operations and Recommended Levels
| Operation | Examples | Recommended Level | Speedup |
|---|---|---|---|
| Preprocessing | Feature scaling, encoding, custom transformers | Level 2 (JIT) | 2-5x |
| Training | RandomForest, GradientBoosting, ensemble models | Level 3 (Multicore) | 4-12x |
| Cross-Validation | cross_val_score, KFold splitting | Level 3 (Multicore) | 4-12x |
| Prediction | Batch prediction on large datasets | Level 2 (JIT) | 2-4x |
| Hyperparameter Search | GridSearchCV, RandomizedSearchCV | Level 3 (Multicore) | 5-20x |
Data Preprocessing
Feature Scaling
import epochly
import numpy as np
from sklearn.preprocessing import StandardScaler


@epochly.optimize(level=2)
def optimized_scaling(X):
    """Standardize the columns of X.

    Returns the scaled array together with the fitted StandardScaler so the
    same transformation can be reapplied to other data.
    """
    fitted_scaler = StandardScaler()
    scaled = fitted_scaler.fit_transform(X)
    return scaled, fitted_scaler


# Synthetic dataset: one million rows, fifty features
X = np.random.rand(1_000_000, 50)
X_scaled, scaler = optimized_scaling(X)
print(f"Scaled features shape: {X_scaled.shape}")
Custom Feature Engineering
import epochly
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class OptimizedFeatureEngineer(BaseEstimator, TransformerMixin):
    """Custom feature engineering with Epochly optimization.

    Appends the squared features and five interaction features (columns 0-4
    multiplied element-wise by columns 5-9) to the original columns.
    """

    @epochly.optimize(level=2)
    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    @epochly.optimize(level=2)
    def transform(self, X):
        """Create polynomial and interaction features.

        Args:
            X: 2-D NumPy array with at least 10 columns.

        Returns:
            Array of shape (n_samples, 2 * n_features + 5): the original
            columns, their squares, then the five interaction columns.

        Raises:
            ValueError: if X has fewer than 10 columns.
        """
        # Without this check a 6-column input would silently broadcast
        # (n, 5) * (n, 1) and produce meaningless interaction features.
        if X.shape[1] < 10:
            raise ValueError(
                f"X must have at least 10 features for interaction terms, "
                f"got {X.shape[1]}"
            )

        # Polynomial features
        X_poly = X ** 2

        # Interaction features
        X_interactions = X[:, :5] * X[:, 5:10]

        # Combine all features
        return np.hstack([X, X_poly, X_interactions])


# Use the optimized transformer
engineer = OptimizedFeatureEngineer()
X = np.random.rand(100_000, 20)
X_engineered = engineer.fit_transform(X)
print(f"Engineered features: {X_engineered.shape}")
Model Training
Parallel Random Forest Training
import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier


@epochly.optimize(level=3)
def train_random_forest(X, y):
    """Fit a 100-tree, depth-10 random forest on (X, y) and return it."""
    # n_jobs is deliberately not passed: Epochly automatically manages it
    forest_params = {
        "n_estimators": 100,
        "max_depth": 10,
        "random_state": 42,
    }
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X, y)
    return forest


# Build a synthetic classification problem
X, y = make_classification(n_samples=100_000, n_features=20, random_state=42)

# Fit through the optimized entry point
model = train_random_forest(X, y)
print(f"Model score: {model.score(X, y):.3f}")
Gradient Boosting Optimization
import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split


@epochly.optimize(level=3)
def train_gradient_boosting(X_train, y_train):
    """Fit a gradient boosting classifier on the training split.

    Args:
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted GradientBoostingClassifier.
    """
    gb = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
    )
    gb.fit(X_train, y_train)
    return gb


# Split data (seeded so the reported score is reproducible, like the
# dataset and the model above)
X, y = make_classification(n_samples=50_000, n_features=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train
model = train_gradient_boosting(X_train, y_train)
print(f"Test score: {model.score(X_test, y_test):.3f}")
Cross-Validation
Parallel Cross-Validation
import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


@epochly.optimize(level=3)
def parallel_cross_validation(X, y):
    """Run 10-fold cross-validation of a 50-tree random forest.

    Returns:
        Array with one accuracy score per fold.
    """
    model = RandomForestClassifier(n_estimators=50)

    # 10-fold cross-validation
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    return scores


X, y = make_classification(n_samples=50_000, n_features=20, random_state=42)
scores = parallel_cross_validation(X, y)
print(f"CV Scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std():.3f})")
Stratified K-Fold with Custom Scoring
import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate


@epochly.optimize(level=3)
def stratified_cv_multi_metric(X, y):
    """Run stratified 5-fold CV and collect several metrics at once.

    Returns:
        The dict produced by ``cross_validate``; per-fold metric arrays are
        stored under keys prefixed with ``test_``.
    """
    model = GradientBoostingClassifier(n_estimators=50)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(model, X, y, cv=cv, scoring=scoring)
    return results


X, y = make_classification(n_samples=30_000, n_features=15, random_state=42)
results = stratified_cv_multi_metric(X, y)
for metric, scores in results.items():
    # Skip the timing entries; only report the evaluation metrics
    if metric.startswith('test_'):
        print(f"{metric}: {scores.mean():.3f}")
Hyperparameter Search
GridSearchCV Optimization
import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


@epochly.optimize(level=3)
def optimized_grid_search(X, y):
    """Exhaustively search a random forest hyperparameter grid.

    The grid has 3 * 4 * 3 * 3 = 108 combinations, each evaluated with
    5-fold cross-validation.

    Returns:
        The fitted GridSearchCV object.
    """
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        rf,
        param_grid,
        cv=5,
        scoring='accuracy',
        verbose=1,
    )
    grid_search.fit(X, y)
    return grid_search


X, y = make_classification(n_samples=10_000, n_features=20, random_state=42)

# Run grid search
search = optimized_grid_search(X, y)
print(f"Best parameters: {search.best_params_}")
print(f"Best score: {search.best_score_:.3f}")
RandomizedSearchCV
import epochly
from scipy.stats import randint, uniform
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV


@epochly.optimize(level=3)
def optimized_random_search(X, y):
    """Sample 50 gradient boosting configurations and cross-validate each.

    Returns:
        The fitted RandomizedSearchCV object.
    """
    param_distributions = {
        # randint(low, high) samples integers in [low, high)
        'n_estimators': randint(50, 200),
        # uniform(loc, scale) samples from [loc, loc + scale] = [0.01, 0.31]
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10),
    }
    gb = GradientBoostingClassifier(random_state=42)
    random_search = RandomizedSearchCV(
        gb,
        param_distributions,
        n_iter=50,
        cv=5,
        scoring='accuracy',
        random_state=42,
        verbose=1,
    )
    random_search.fit(X, y)
    return random_search


X, y = make_classification(n_samples=20_000, n_features=25, random_state=42)
search = optimized_random_search(X, y)
print(f"Best parameters: {search.best_params_}")
print(f"Best score: {search.best_score_:.3f}")
Prediction
Batch Prediction for Large Datasets
import epochly
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier


@epochly.optimize(level=2)
def batch_prediction(model, X, batch_size=10_000):
    """Predict on a large dataset in fixed-size batches.

    Args:
        model: fitted estimator exposing ``predict``.
        X: feature matrix; sliced along its first axis.
        batch_size: maximum number of rows passed to each ``predict`` call.

    Returns:
        A single array with the predictions for all rows of X, in order.
        An empty array is returned when X has no rows.
    """
    batch_preds = [
        model.predict(X[start:start + batch_size])
        for start in range(0, len(X), batch_size)
    ]
    if not batch_preds:
        # np.concatenate rejects an empty list; keep the empty-input result
        return np.array([])
    # Join whole arrays instead of extending a Python list element by element
    return np.concatenate(batch_preds)


# Train model
X_train, y_train = make_classification(n_samples=10_000, n_features=20)
model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, y_train)

# Predict on large dataset
X_test = np.random.rand(1_000_000, 20)
predictions = batch_prediction(model, X_test, batch_size=50_000)
print(f"Predictions shape: {predictions.shape}")
Pipeline Optimization
Full ML Pipeline
import epochly
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


@epochly.optimize(level=3)
def train_ml_pipeline(X, y):
    """Build and fit a scaler -> PCA -> random forest pipeline.

    Returns:
        The fitted Pipeline.
    """
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=15)),
        ('classifier', RandomForestClassifier(n_estimators=100)),
    ])

    # Train pipeline
    pipeline.fit(X, y)
    return pipeline


# Generate data (the split is seeded like the dataset so the same rows are
# held out on every run)
X, y = make_classification(n_samples=50_000, n_features=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train and evaluate
pipeline = train_ml_pipeline(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f"Pipeline test score: {score:.3f}")
Pipeline with Feature Engineering
import epochly
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler


@epochly.optimize(level=3)
def complex_pipeline(X, y):
    """Fit a pipeline with feature engineering and selection.

    Steps: standardize, expand to degree-2 polynomial features, keep the 20
    best features by ANOVA F-score, then fit a logistic regression.

    Returns:
        The fitted Pipeline.
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        # include_bias=False: the all-ones bias column is constant, which
        # makes f_classif emit "constant feature" warnings, and
        # LogisticRegression already fits its own intercept.
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('selector', SelectKBest(f_classif, k=20)),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])
    pipeline.fit(X, y)
    return pipeline


X, y = make_classification(n_samples=20_000, n_features=15, random_state=42)
pipeline = complex_pipeline(X, y)
print(f"Number of features selected: {pipeline.named_steps['selector'].get_support().sum()}")
Custom Estimator Optimization
Optimized Scaler Example
import epochly
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted


class OptimizedScaler(BaseEstimator, TransformerMixin):
    """Custom scaler with Epochly optimization.

    Attributes set by ``fit`` (scikit-learn convention: fitted attributes end
    with an underscore and are not created in ``__init__``, so an unfitted
    instance can be told apart from a fitted one):
        mean_: per-column mean of the training data.
        std_: per-column standard deviation of the training data.
    """

    def fit(self, X, y=None):
        """Compute the per-column mean and std of X; return self."""
        with epochly.optimize_context(level=2):
            self.mean_ = np.mean(X, axis=0)
            self.std_ = np.std(X, axis=0)
        return self

    def transform(self, X):
        """Center and scale X with the statistics learned in ``fit``.

        Raises:
            sklearn.exceptions.NotFittedError: if called before ``fit``.
        """
        check_is_fitted(self, ("mean_", "std_"))
        with epochly.optimize_context(level=2):
            # The small epsilon avoids division by zero on constant columns
            X_scaled = (X - self.mean_) / (self.std_ + 1e-8)
        return X_scaled


# Use custom scaler
scaler = OptimizedScaler()
X = np.random.rand(1_000_000, 50)
X_scaled = scaler.fit_transform(X)
print(f"Scaled data shape: {X_scaled.shape}")
Custom Transformer with State
import epochly
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted


class OptimizedBinning(BaseEstimator, TransformerMixin):
    """Custom quantile binning transformer.

    Each column is split into ``n_bins`` bins whose edges are evenly spaced
    percentiles of the training data. Bin labels run from 1 to ``n_bins``.
    """

    def __init__(self, n_bins=10):
        # Only hyperparameters are stored here; bin_edges_ is created by fit
        self.n_bins = n_bins

    def fit(self, X, y=None):
        """Compute, per column, the n_bins + 1 percentile edges of X."""
        percentiles = np.linspace(0, 100, self.n_bins + 1)
        with epochly.optimize_context(level=2):
            self.bin_edges_ = [
                np.percentile(X[:, i], percentiles)
                for i in range(X.shape[1])
            ]
        return self

    def transform(self, X):
        """Replace every value by the label (1..n_bins) of its bin.

        Raises:
            sklearn.exceptions.NotFittedError: if called before ``fit``.
        """
        check_is_fitted(self, "bin_edges_")
        with epochly.optimize_context(level=2):
            X_binned = np.zeros_like(X)
            for i in range(X.shape[1]):
                edges = self.bin_edges_[i]
                labels = np.digitize(X[:, i], edges)
                # digitize returns len(edges) for values >= the last edge
                # (e.g. the training maximum) and 0 for values below the
                # first edge; fold both into the outermost real bins so
                # exactly n_bins labels are produced.
                X_binned[:, i] = np.clip(labels, 1, len(edges) - 1)
        return X_binned


binner = OptimizedBinning(n_bins=5)
X = np.random.rand(100_000, 10)
X_binned = binner.fit_transform(X)
Benchmarking ML Operations
Comparing Performance Across Levels
import time

import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier


def _fit_forest(X, y):
    """Fit a 100-tree random forest on (X, y) and return it."""
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X, y)
    return rf


def _elapsed(fn, *args):
    """Return the wall-clock seconds taken by a single call of fn(*args)."""
    start = time.perf_counter()
    fn(*args)
    return time.perf_counter() - start


def benchmark_training_levels():
    """Benchmark training with different optimization levels.

    Times one random forest fit without Epochly, at level 2 and at level 3,
    then prints each duration and its speedup relative to the baseline.
    NOTE(review): each variant is timed once, so any one-time cost on the
    first call of a decorated function is included - confirm whether a
    warm-up call is wanted.
    """
    X, y = make_classification(n_samples=50_000, n_features=30, random_state=42)

    # Level 2 (JIT)
    @epochly.optimize(level=2)
    def train_level2(X, y):
        return _fit_forest(X, y)

    # Level 3 (Multicore)
    @epochly.optimize(level=3)
    def train_level3(X, y):
        return _fit_forest(X, y)

    results = {}
    # Baseline (no optimization)
    results['baseline'] = _elapsed(_fit_forest, X, y)
    results['level2'] = _elapsed(train_level2, X, y)
    results['level3'] = _elapsed(train_level3, X, y)

    # Print results
    print("Training Time Comparison:")
    print(f"Baseline: {results['baseline']:.2f}s")
    print(f"Level 2: {results['level2']:.2f}s ({results['baseline']/results['level2']:.2f}x)")
    print(f"Level 3: {results['level3']:.2f}s ({results['baseline']/results['level3']:.2f}x)")


benchmark_training_levels()
Cross-Validation Benchmark
import time

import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score


def benchmark_cross_validation():
    """Benchmark 10-fold CV with and without Level 3 optimization.

    Prints the duration of both runs and the resulting speedup.
    """
    X, y = make_classification(n_samples=20_000, n_features=25, random_state=42)
    model = GradientBoostingClassifier(n_estimators=50)

    # Without optimization (only the duration matters, scores are discarded)
    start = time.perf_counter()
    cross_val_score(model, X, y, cv=10)
    baseline_time = time.perf_counter() - start

    # With Level 3 optimization
    @epochly.optimize(level=3)
    def optimized_cv(model, X, y):
        return cross_val_score(model, X, y, cv=10)

    start = time.perf_counter()
    optimized_cv(model, X, y)
    optimized_time = time.perf_counter() - start

    print(f"Baseline CV: {baseline_time:.2f}s")
    print(f"Optimized CV: {optimized_time:.2f}s")
    print(f"Speedup: {baseline_time / optimized_time:.2f}x")


benchmark_cross_validation()
Automatic n_jobs Configuration
Using auto_configure_sklearn()
Epochly can automatically configure n_jobs for scikit-learn estimators:
import epochly
from sklearn.ensemble import RandomForestClassifier

# Turn on automatic n_jobs configuration
epochly.auto_configure_sklearn()

# From here on, estimators created without n_jobs get the optimal value;
# internally this sets n_jobs=-1 (all cores)
rf = RandomForestClassifier(n_estimators=100)

# An explicitly passed n_jobs takes precedence: this forest uses n_jobs=4
rf_custom = RandomForestClassifier(n_estimators=100, n_jobs=4)
Configure with Custom Worker Count
import epochly

# Pin the worker count: every sklearn estimator will default to n_jobs=8
epochly.auto_configure_sklearn(n_jobs=8)
Supported Estimators (40+)
The following scikit-learn estimators and model-selection utilities support automatic n_jobs configuration:
Ensemble Methods:
- RandomForestClassifier, RandomForestRegressor
- ExtraTreesClassifier, ExtraTreesRegressor
- GradientBoostingClassifier, GradientBoostingRegressor
- BaggingClassifier, BaggingRegressor
- VotingClassifier, VotingRegressor
Linear Models:
- LogisticRegression, LogisticRegressionCV
- RidgeClassifier, RidgeClassifierCV
- ElasticNet, ElasticNetCV
- Lasso, LassoCV
Neighbors:
- KNeighborsClassifier, KNeighborsRegressor
- RadiusNeighborsClassifier, RadiusNeighborsRegressor
Model Selection:
- GridSearchCV, RandomizedSearchCV
- cross_val_score, cross_validate
And many more...
Manual Control with SklearnAutoConfigurator
import epochly

# Obtain the configurator object
configurator = epochly.get_sklearn_configurator()

# Inspect the current configuration
status = configurator.get_status()
print(f"Auto-config enabled: {status['enabled']}")
print(f"Default n_jobs: {status['n_jobs']}")

# Switch auto-configuration off for a while
configurator.disable()

# Switch it back on with a custom worker count
configurator.enable(n_jobs=16)

# Return to the default settings
configurator.reset()
Example with Multiple Models
import epochly
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Enable auto-configuration
epochly.auto_configure_sklearn()

# All models automatically use optimal n_jobs
rf = RandomForestClassifier(n_estimators=100)
gb = GradientBoostingClassifier(n_estimators=100)

# Grid search also benefits
param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10]}
grid = GridSearchCV(rf, param_grid, cv=5)

# Automatically parallelized across CV folds
X, y = make_classification(n_samples=10_000, n_features=20)
grid.fit(X, y)
print(f"Best score: {grid.best_score_:.3f}")
Best Practices
1. Choose the Right Level
import epochly
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


# Level 2 for preprocessing and prediction
@epochly.optimize(level=2)
def preprocess_data(X):
    """Return X standardized with a freshly fitted StandardScaler."""
    scaler = StandardScaler()
    return scaler.fit_transform(X)


# Level 3 for training and CV
@epochly.optimize(level=3)
def train_model(X, y):
    """Fit a 100-tree random forest on (X, y) and return it."""
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X, y)
    return rf
2. Use auto_configure_sklearn() for Convenience
import epochly

# Call once, at program start
epochly.auto_configure_sklearn()

# After this, all sklearn operations are optimized automatically;
# there is no need to set n_jobs by hand
3. Combine with Pipeline for Clean Code
import epochly
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

epochly.auto_configure_sklearn()


@epochly.optimize(level=3)
def train_pipeline(X, y):
    """Fit a scaler -> PCA(10) -> random forest pipeline and return it."""
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=10)),
        ('classifier', RandomForestClassifier(n_estimators=100)),
    ])
    pipeline.fit(X, y)
    return pipeline
4. Batch Processing for Large Datasets
import epochly
from sklearn.ensemble import RandomForestClassifier


@epochly.optimize(level=3)
def process_large_dataset(X, y, batch_size=10_000):
    """Process in batches to manage memory.

    Trains one independent 50-tree random forest per consecutive slice of
    ``batch_size`` rows.

    Args:
        X: feature matrix, sliced along its first axis.
        y: labels aligned with X.
        batch_size: number of rows per batch; the last batch may be smaller.

    Returns:
        List of fitted RandomForestClassifier objects, one per batch, in
        batch order.
    """
    models = []
    for i in range(0, len(X), batch_size):
        X_batch = X[i:i + batch_size]
        y_batch = y[i:i + batch_size]
        model = RandomForestClassifier(n_estimators=50)
        model.fit(X_batch, y_batch)
        models.append(model)
    return models
5. Monitor Memory Usage
import epochly
from sklearn.ensemble import RandomForestClassifier

# Set memory-related limits before any optimized call
epochly.configure(
    enhancement_level=3,
    max_workers=4,  # lower this when memory is tight
    worker_memory_limit=1024,  # MB per worker
)


@epochly.optimize(level=3)
def memory_efficient_training(X, y):
    """Fit and return a 100-tree random forest using sqrt feature sampling."""
    forest = RandomForestClassifier(
        n_estimators=100,
        max_features='sqrt',  # fewer candidate features per split
    )
    forest.fit(X, y)
    return forest