Monitoring Integration

Comprehensive metrics for monitoring Epochly performance, resource usage, and health in production.

Overview

Epochly provides built-in metrics collection and integration with popular monitoring systems:

  • Built-in Python API: epochly.get_metrics() and epochly.get_status()
  • Prometheus: Native Prometheus exporter with /metrics endpoint
  • StatsD: Push metrics to StatsD-compatible systems
  • DataDog: Direct DogStatsD integration
  • Custom integrations: Flexible API for any monitoring system

Built-in Metrics

Python Metrics API

import epochly
# Get performance metrics
metrics = epochly.get_metrics()
print(f"Total calls: {metrics['total_calls']}")
print(f"Total time: {metrics['total_time_ms']} ms")
print(f"Mean time: {metrics['mean_time_ms']} ms")
print(f"Error count: {metrics['error_count']}")
# Get status information
status = epochly.get_status()
print(f"Enabled: {status['enabled']}")
print(f"Level: {status['enhancement_level']}")
print(f"Mode: {status['mode']}")

Returned Metrics

epochly.get_metrics() returns:

  • total_calls: Total number of optimized function calls
  • total_time_ms: Total execution time in milliseconds
  • mean_time_ms: Average execution time per call
  • error_count: Total errors encountered

epochly.get_status() returns:

  • enabled: Whether Epochly is active
  • enhancement_level: Current level (0-4)
  • mode: Current mode (monitoring, conservative, balanced, aggressive)
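
The fields above can be combined into derived values such as an error rate. A minimal sketch using only the documented keys:

import epochly

metrics = epochly.get_metrics()
status = epochly.get_status()

# Derive an error rate from the documented counters (guard against zero calls)
total_calls = metrics['total_calls']
error_rate = metrics['error_count'] / total_calls if total_calls else 0.0

print(f"Level {status['enhancement_level']} ({status['mode']}), "
      f"error rate: {error_rate:.2%}, mean call time: {metrics['mean_time_ms']:.2f} ms")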

Prometheus Integration

Enable Prometheus Metrics

# Via environment variable
export EPOCHLY_PROMETHEUS_PORT=8080

Or programmatically:

import epochly
epochly.configure(
    prometheus_enabled=True,
    prometheus_port=8080
)

Access Metrics Endpoint

curl http://localhost:8080/metrics

Available Metrics

Metric Name                    Type       Description
epochly_enabled                Gauge      Whether Epochly is enabled (1 = yes, 0 = no)
epochly_level                  Gauge      Current enhancement level (0-4)
epochly_calls_total            Counter    Total number of optimized function calls
epochly_optimization_seconds   Histogram  Time spent in optimization (duration distribution)
epochly_errors_total           Counter    Total errors encountered
epochly_memory_bytes           Gauge      Current memory usage in bytes
epochly_workers_active         Gauge      Number of active worker processes
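
To verify the exporter from application code, fetch the endpoint and look for a specific series. This is a minimal sketch using only the standard library; the port matches the configuration above and the metric name comes from the table:

import urllib.request

# Fetch the Prometheus exposition text from the local exporter
with urllib.request.urlopen("http://localhost:8080/metrics", timeout=5) as resp:
    body = resp.read().decode("utf-8")

# Print the epochly_enabled samples (HELP/TYPE comment lines start with '#')
samples = [
    line for line in body.splitlines()
    if line.startswith("epochly_enabled")
]
print(samples)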

Prometheus Scrape Configuration

scrape_configs:
  - job_name: 'epochly'
    static_configs:
      - targets: ['app:8080']
    scrape_interval: 15s
    scrape_timeout: 10s
    metrics_path: '/metrics'

Grafana Dashboard Suggestions

Panel 1: Request Rate & Latency

# Request rate
rate(epochly_calls_total[5m])
# P95 latency
histogram_quantile(0.95, rate(epochly_optimization_seconds_bucket[5m]))

Panel 2: Enhancement Level Distribution

# Current level by instance
epochly_level

Panel 3: Error Rate

# Error rate per second
rate(epochly_errors_total[5m])

Panel 4: Worker Utilization

# Active workers
epochly_workers_active

Panel 5: Memory Usage

# Memory in MB
epochly_memory_bytes / 1024 / 1024

Custom Metrics Collection

Periodic Collection

import epochly
import time
import threading
def collect_metrics(interval=60):
    """Collect Epochly metrics periodically."""
    while True:
        metrics = epochly.get_metrics()
        status = epochly.get_status()
        # Send to your monitoring system
        print(f"Epochly metrics: {metrics}")
        print(f"Epochly status: {status}")
        time.sleep(interval)

# Start collection in a background thread
collector_thread = threading.Thread(
    target=collect_metrics,
    args=(60,),  # Collect every 60 seconds
    daemon=True
)
collector_thread.start()

FastAPI Health Endpoint

from fastapi import FastAPI
import epochly
app = FastAPI()
@app.get("/metrics/epochly")
def epochly_metrics():
"""Epochly metrics endpoint"""
status = epochly.get_status()
metrics = epochly.get_metrics()
return {
"status": {
"enabled": status.get("enabled", False),
"level": status.get("enhancement_level", 0),
"mode": status.get("mode", "unknown")
},
"metrics": {
"total_calls": metrics.get("total_calls", 0),
"mean_time_ms": metrics.get("mean_time_ms", 0),
"error_count": metrics.get("error_count", 0)
}
}

StatsD Integration

import epochly
from statsd import StatsClient
import time
# Initialize StatsD client
statsd = StatsClient(host='localhost', port=8125, prefix='epochly')
def report_metrics():
    """Send Epochly metrics to StatsD."""
    metrics = epochly.get_metrics()
    status = epochly.get_status()
    # Send gauges
    statsd.gauge('enabled', 1 if status['enabled'] else 0)
    statsd.gauge('level', status['enhancement_level'])
    statsd.gauge('total_calls', metrics['total_calls'])
    statsd.gauge('mean_time_ms', metrics['mean_time_ms'])
    statsd.gauge('error_count', metrics['error_count'])

# Report metrics periodically
while True:
    report_metrics()
    time.sleep(60)
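
The blocking loop above suits a dedicated reporter script; inside an application process you would typically run the reporter in a daemon thread instead, mirroring the periodic-collection pattern shown earlier. A sketch that reuses report_metrics() from the block above:

import threading

def report_forever(interval=60):
    """Call report_metrics() on a fixed interval."""
    while True:
        report_metrics()
        time.sleep(interval)

# daemon=True so the reporter does not block interpreter shutdown
reporter_thread = threading.Thread(target=report_forever, args=(60,), daemon=True)
reporter_thread.start()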

DataDog Integration

import epochly
from datadog import initialize, statsd
import time
# Initialize DataDog
initialize(
    api_key='your-api-key',
    app_key='your-app-key'
)

def report_to_datadog():
    """Send Epochly metrics to DataDog."""
    metrics = epochly.get_metrics()
    status = epochly.get_status()
    # Add tags for filtering
    tags = [
        f"level:{status['enhancement_level']}",
        f"mode:{status['mode']}"
    ]
    # Send metrics with tags
    statsd.gauge('epochly.enabled', 1 if status['enabled'] else 0, tags=tags)
    statsd.gauge('epochly.level', status['enhancement_level'], tags=tags)
    statsd.gauge('epochly.calls', metrics['total_calls'], tags=tags)
    statsd.gauge('epochly.mean_time_ms', metrics['mean_time_ms'], tags=tags)
    statsd.gauge('epochly.errors', metrics['error_count'], tags=tags)

# Report every minute
while True:
    report_to_datadog()
    time.sleep(60)

Alerting

Key Metrics to Alert On

Metric          Threshold        Severity    Action
Error rate      > 1%             High        Investigate logs, consider emergency disable
Mean latency    > 2x baseline    Warning     Check resource usage, consider lowering level
Worker count    = 0              Critical    Level 3 failed to start, check configuration
Memory usage    > 80%            Warning     Reduce workers or shared memory pool size
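
The error-rate and latency thresholds can also be evaluated from application code via the Python metrics API; worker count and memory usage are only exposed through the Prometheus metrics. A minimal sketch, where baseline_mean_ms is a value you measure yourself (see Performance Baselines below):

import epochly

def evaluate_alert_thresholds(baseline_mean_ms):
    """Return a list of threshold violations from the table above."""
    metrics = epochly.get_metrics()
    alerts = []
    total_calls = metrics['total_calls']
    if total_calls and metrics['error_count'] / total_calls > 0.01:
        alerts.append('error rate above 1%')
    if metrics['mean_time_ms'] > 2 * baseline_mean_ms:
        alerts.append('mean latency above 2x baseline')
    return alerts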

Prometheus Alerting Rules

groups:
  - name: epochly_alerts
    rules:
      - alert: EpochlyHighErrorRate
        expr: rate(epochly_errors_total[5m]) / rate(epochly_calls_total[5m]) > 0.01
        for: 5m
        labels:
          severity: critical
          component: epochly
        annotations:
          summary: "Epochly error rate is above 1%"
          description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes"
          runbook: "https://docs.example.com/runbooks/epochly-high-errors"

      - alert: EpochlyDisabled
        expr: epochly_enabled == 0
        for: 1m
        labels:
          severity: warning
          component: epochly
        annotations:
          summary: "Epochly has been disabled"
          description: "Epochly is not active on instance {{ $labels.instance }}"

      - alert: EpochlyHighLatency
        expr: |
          histogram_quantile(0.95,
            rate(epochly_optimization_seconds_bucket[5m])
          ) > 0.1
        for: 10m
        labels:
          severity: warning
          component: epochly
        annotations:
          summary: "Epochly P95 latency is high"
          description: "P95 latency is {{ $value }}s, above the 100ms threshold"

      - alert: EpochlyNoWorkers
        expr: epochly_level >= 3 and epochly_workers_active == 0
        for: 2m
        labels:
          severity: critical
          component: epochly
        annotations:
          summary: "Epochly has no active workers"
          description: "Level 3+ requires workers but none are active"

Health Checks

CLI Health Check

# Check Epochly health
epochly health
# Output:
# Status: Healthy
# Enabled: Yes
# Level: 2
# Mode: conservative
# Error Rate: 0.0%
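
In a deployment smoke test you can shell out to the CLI. The sketch below assumes epochly health exits with a non-zero status when unhealthy; verify that behavior for your version:

import subprocess

result = subprocess.run(["epochly", "health"], capture_output=True, text=True)
print(result.stdout)
if result.returncode != 0:
    raise SystemExit("Epochly health check failed")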

Programmatic Health Check

import epochly
def check_epochly_health():
    """Check whether Epochly is healthy."""
    status = epochly.get_status()
    metrics = epochly.get_metrics()
    # Check if enabled
    if not status.get('enabled', False):
        return {
            'healthy': False,
            'reason': 'Epochly is disabled'
        }
    # Check error rate
    total_calls = metrics.get('total_calls', 0)
    error_count = metrics.get('error_count', 0)
    if total_calls > 0:
        error_rate = error_count / total_calls
        if error_rate > 0.01:  # 1% threshold
            return {
                'healthy': False,
                'reason': f'High error rate: {error_rate:.2%}'
            }
    return {
        'healthy': True,
        'level': status.get('enhancement_level'),
        'mode': status.get('mode')
    }

# Check health
health = check_epochly_health()
print(f"Healthy: {health['healthy']}")

Logging Integration

Add Epochly Context to Logs

import logging
import epochly
class EpochlyMetricsFilter(logging.Filter):
    """Add Epochly metrics to every log record that passes through."""

    def filter(self, record):
        status = epochly.get_status()
        record.epochly_level = status.get('enhancement_level', 0)
        record.epochly_enabled = status.get('enabled', False)
        record.epochly_mode = status.get('mode', 'unknown')
        return True

# Configure the format to include the Epochly fields
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - '
    'epochly_level=%(epochly_level)s epochly_enabled=%(epochly_enabled)s - '
    '%(message)s'
)

# Attach the filter to the handler (not just the root logger) so records
# emitted by child loggers also receive the Epochly fields
handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.addFilter(EpochlyMetricsFilter())

logger = logging.getLogger()
logger.addHandler(handler)

Structured Logging with Epochly Context

import epochly
import json
from datetime import datetime, timezone

def log_with_epochly_context(message, level='INFO'):
    """Log a message as JSON with Epochly context attached."""
    status = epochly.get_status()
    metrics = epochly.get_metrics()
    log_entry = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'level': level,
        'message': message,
        'epochly': {
            'enabled': status.get('enabled', False),
            'level': status.get('enhancement_level', 0),
            'mode': status.get('mode', 'unknown'),
            'total_calls': metrics.get('total_calls', 0),
            'error_count': metrics.get('error_count', 0)
        }
    }
    print(json.dumps(log_entry))

# Usage
log_with_epochly_context('Processing request started')

Example output:

{
  "timestamp": "2024-01-15T10:30:00Z",
  "level": "INFO",
  "message": "Processing request started",
  "epochly": {
    "enabled": true,
    "level": 2,
    "mode": "conservative",
    "total_calls": 12540,
    "error_count": 3
  }
}

Performance Baselines

Establish Baseline

import epochly
import numpy as np
import time
def establish_baseline(func, iterations=100):
    """Establish a performance baseline for a function."""
    times = []
    # Run without Epochly
    with epochly.epochly_disabled_context():
        for _ in range(iterations):
            start = time.perf_counter()
            func()
            end = time.perf_counter()
            times.append(end - start)
    return {
        'mean': np.mean(times),
        'std': np.std(times),
        'p95': np.percentile(times, 95),
        'p99': np.percentile(times, 99)
    }

# Establish the baseline
def my_function():
    return sum(i ** 2 for i in range(10000))

baseline = establish_baseline(my_function)
print(f"Baseline mean: {baseline['mean']:.4f}s")
print(f"Baseline P95: {baseline['p95']:.4f}s")

Check for Performance Regression

import epochly
import time
def check_performance_regression(func, baseline, threshold=1.2):
    """Check whether current performance regressed versus the baseline."""
    times = []
    # Measure with Epochly enabled
    for _ in range(100):
        start = time.perf_counter()
        func()
        end = time.perf_counter()
        times.append(end - start)
    current_mean = sum(times) / len(times)
    if current_mean > baseline['mean'] * threshold:
        return {
            'regression': True,
            'current_mean': current_mean,
            'baseline_mean': baseline['mean'],
            'slowdown': current_mean / baseline['mean']
        }
    return {
        'regression': False,
        'current_mean': current_mean,
        'baseline_mean': baseline['mean'],
        'speedup': baseline['mean'] / current_mean
    }

# Check for regression
result = check_performance_regression(my_function, baseline)
if result['regression']:
    print(f"⚠️ Performance regression detected: {result['slowdown']:.2f}x slower")
else:
    print(f"✓ No significant regression (speedup vs baseline: {result['speedup']:.2f}x)")