Content is user-generated and unverified.
#!/usr/bin/env python3
"""
GPU Booster Performance Report Generator

Comprehensive system for analyzing and reporting on DeepSeek R1
performance metrics.
"""

import base64
import io
import json
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from jinja2 import Template
from prometheus_api_client import PrometheusConnect


class GPUBoosterReportGenerator:
    """
    Comprehensive report generator for GPU Booster performance metrics.

    Analyzes data from Prometheus, generates insights, and creates
    detailed reports in Markdown and/or JSON form.
    """

    def __init__(self, prometheus_url: str = "http://localhost:9090"):
        """
        Initialize the report generator with a Prometheus connection.

        Args:
            prometheus_url: URL of the Prometheus server.
        """
        self.prom = PrometheusConnect(url=prometheus_url)
        # SLA targets the analysis compares fetched metrics against.
        self.performance_targets = {
            'throughput': 20000,    # tokens/sec
            'ttft_p95': 200,        # ms
            'cache_hit_rate': 50,   # %
            'gpu_utilization': 90,  # %
            'gpu_memory': 90        # %
        }

    def fetch_metrics(self, time_range: str = "1h") -> Dict[str, Any]:
        """
        Fetch all relevant metrics from Prometheus for the specified range.

        Args:
            time_range: Time range for queries (e.g., "1h", "24h", "7d").

        Returns:
            Dictionary of metric name -> scalar value. Metrics that fail
            to fetch are set to 0 so downstream code never sees a missing
            key. Latencies are converted to milliseconds and the cache
            hit rate to a percentage before returning.
        """
        print(f"Fetching metrics for the last {time_range}...")
        queries = {
            # Throughput metrics (PromQL subquery: 5m rate sampled over the range)
            'avg_throughput': f'avg_over_time(rate(tokens_generated_total[5m])[{time_range}:5m])',
            'max_throughput': f'max_over_time(rate(tokens_generated_total[5m])[{time_range}:5m])',
            'min_throughput': f'min_over_time(rate(tokens_generated_total[5m])[{time_range}:5m])',
            # Latency percentiles (seconds; converted to ms below)
            'ttft_p50': 'histogram_quantile(0.50, sum(rate(vllm_time_to_first_token_seconds_bucket[5m])) by (le))',
            'ttft_p95': 'histogram_quantile(0.95, sum(rate(vllm_time_to_first_token_seconds_bucket[5m])) by (le))',
            'ttft_p99': 'histogram_quantile(0.99, sum(rate(vllm_time_to_first_token_seconds_bucket[5m])) by (le))',
            # Inter-token latency
            'itl_p50': 'histogram_quantile(0.50, sum(rate(vllm_inter_token_latency_seconds_bucket[5m])) by (le))',
            'itl_p95': 'histogram_quantile(0.95, sum(rate(vllm_inter_token_latency_seconds_bucket[5m])) by (le))',
            'itl_p99': 'histogram_quantile(0.99, sum(rate(vllm_inter_token_latency_seconds_bucket[5m])) by (le))',
            # Cache metrics (hit rate is a ratio; converted to % below)
            'cache_hit_rate': f'avg_over_time(vllm_kv_cache_hit_rate[{time_range}:5m])',
            'cache_evictions': f'sum_over_time(vllm_cache_evictions_total[{time_range}])',
            # GPU metrics
            'avg_gpu_utilization': f'avg_over_time(avg(gpu_utilization_percent)[{time_range}:1m])',
            'max_gpu_utilization': f'max_over_time(max(gpu_utilization_percent)[{time_range}:1m])',
            'avg_gpu_memory': f'avg_over_time(avg(gpu_memory_usage_percent)[{time_range}:1m])',
            # Request metrics
            'total_requests': f'increase(vllm_request_count[{time_range}])',
            'avg_batch_size': f'avg_over_time(vllm_batch_size[{time_range}:1m])',
            'request_rate': f'avg_over_time(rate(vllm_request_count[5m])[{time_range}:5m])',
            # Error metrics
            'error_rate': f'rate(vllm_request_errors_total[{time_range}])',
            'timeout_rate': f'rate(vllm_request_timeouts_total[{time_range}])'
        }

        metrics: Dict[str, Any] = {}
        for name, query in queries.items():
            try:
                result = self.prom.custom_query(query)
                if result and len(result) > 0:
                    # Prometheus instant-query result: [{'value': [ts, "val"]}, ...]
                    metrics[name] = float(result[0]['value'][1])
                else:
                    metrics[name] = 0
            except Exception as e:
                # Best-effort fetch: a missing metric must not kill the report.
                print(f"Error fetching {name}: {e}")
                metrics[name] = 0

        # Convert latency metrics from seconds to milliseconds.
        for key in ['ttft_p50', 'ttft_p95', 'ttft_p99',
                    'itl_p50', 'itl_p95', 'itl_p99']:
            if key in metrics:
                metrics[key] *= 1000

        # Convert cache hit rate from ratio to percentage.
        if 'cache_hit_rate' in metrics:
            metrics['cache_hit_rate'] *= 100

        return metrics

    def analyze_performance(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
        """
        Analyze metrics against performance targets and generate insights.

        Args:
            metrics: Dictionary of fetched metrics (as from fetch_metrics).

        Returns:
            Analysis dict with target compliance, an overall score,
            insights and recommendations.
        """
        analysis: Dict[str, Any] = {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'targets_met': {},
            'performance_score': 0,
            'insights': [],
            'recommendations': []
        }

        # (metric name, observed value, target, inverse) — inverse=True means
        # "lower is better" (latency targets are upper bounds).
        checks = [
            ('throughput', metrics.get('avg_throughput', 0),
             self.performance_targets['throughput'], False),
            ('ttft_p95', metrics.get('ttft_p95', 0),
             self.performance_targets['ttft_p95'], True),
            ('cache_hit_rate', metrics.get('cache_hit_rate', 0),
             self.performance_targets['cache_hit_rate'], False),
            ('gpu_utilization', metrics.get('avg_gpu_utilization', 0),
             self.performance_targets['gpu_utilization'], False),
            ('gpu_memory', metrics.get('avg_gpu_memory', 0),
             self.performance_targets['gpu_memory'], False)
        ]

        targets_met_count = 0
        for metric_name, value, target, inverse in checks:
            if inverse:
                met = value <= target
                # Guard against value == 0 (fetch failure default) — the
                # original division raised ZeroDivisionError here.
                percentage = (target / value * 100) if value else 0.0
            else:
                met = value >= target
                percentage = (value / target * 100) if target else 0.0
            analysis['targets_met'][metric_name] = {
                'value': value,
                'target': target,
                'met': met,
                'percentage': percentage
            }
            if met:
                targets_met_count += 1

        # Overall score: fraction of targets met, as a percentage.
        analysis['performance_score'] = (targets_met_count / len(checks)) * 100

        # Positive insights when metrics are comfortably above target.
        if metrics.get('avg_throughput', 0) > 25000:
            analysis['insights'].append(
                "Exceptional throughput performance - exceeding target by over 25%")
        if metrics.get('ttft_p95', 0) < 150:
            analysis['insights'].append(
                "Excellent first token latency - well below the 200ms threshold")
        if metrics.get('cache_hit_rate', 0) > 70:
            analysis['insights'].append(
                "High cache efficiency - BoosterCache is performing optimally")

        # Identify performance bottlenecks.
        if metrics.get('avg_throughput', 0) < self.performance_targets['throughput']:
            throughput_deficit = (self.performance_targets['throughput']
                                  - metrics.get('avg_throughput', 0))
            analysis['recommendations'].append(
                f"Increase batch size or enable more aggressive prefetching to improve throughput by {throughput_deficit:.0f} tokens/sec"
            )
        if metrics.get('ttft_p95', 0) > self.performance_targets['ttft_p95']:
            analysis['recommendations'].append(
                "Enable prefix caching and reduce max sequence length to improve TTFT latency"
            )
        if metrics.get('cache_hit_rate', 0) < self.performance_targets['cache_hit_rate']:
            analysis['recommendations'].append(
                f"Increase BoosterCache chunk size from current setting. Current hit rate: {metrics.get('cache_hit_rate', 0):.1f}%"
            )
        if metrics.get('avg_gpu_utilization', 0) < 85:
            analysis['recommendations'].append(
                "GPU underutilized - consider increasing concurrent requests or batch size"
            )
        # A large max-vs-avg gap suggests some GPUs idle while others saturate.
        if metrics.get('max_gpu_utilization', 0) - metrics.get('avg_gpu_utilization', 0) > 15:
            analysis['recommendations'].append(
                "GPU utilization imbalance detected - review tensor parallel configuration"
            )

        return analysis

    def generate_visualizations(self, metrics: Dict[str, Any]) -> Dict[str, str]:
        """
        Generate visualization charts for the report.

        Args:
            metrics: Dictionary of metrics.

        Returns:
            Dictionary of chart name -> base64-encoded PNG image.
        """
        visualizations: Dict[str, str] = {}

        # Shared style for all plots.
        plt.style.use('seaborn-v0_8-darkgrid')
        sns.set_palette("husl")

        # 1. Performance overview bar chart (actual vs target).
        fig, ax = plt.subplots(figsize=(10, 6))
        metrics_data = {
            'Throughput\n(k tok/s)': metrics['avg_throughput'] / 1000,
            'TTFT p95\n(ms)': metrics['ttft_p95'],
            'Cache Hit\n(%)': metrics['cache_hit_rate'],
            'GPU Util\n(%)': metrics['avg_gpu_utilization'],
            'GPU Mem\n(%)': metrics['avg_gpu_memory']
        }
        targets = {
            'Throughput\n(k tok/s)': 20,
            'TTFT p95\n(ms)': 200,
            'Cache Hit\n(%)': 50,
            'GPU Util\n(%)': 90,
            'GPU Mem\n(%)': 90
        }
        x = np.arange(len(metrics_data))
        width = 0.35
        bars1 = ax.bar(x - width / 2, list(metrics_data.values()), width,
                       label='Actual', alpha=0.8)
        bars2 = ax.bar(x + width / 2, list(targets.values()), width,
                       label='Target', alpha=0.6)
        ax.set_xlabel('Metrics', fontsize=14)
        ax.set_ylabel('Value', fontsize=14)
        ax.set_title('GPU Booster Performance Overview',
                     fontsize=16, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(metrics_data.keys())
        ax.legend()
        # Add value labels on top of each bar.
        for bars in [bars1, bars2]:
            for bar in bars:
                height = bar.get_height()
                ax.annotate(f'{height:.1f}',
                            xy=(bar.get_x() + bar.get_width() / 2, height),
                            xytext=(0, 3), textcoords="offset points",
                            ha='center', va='bottom', fontsize=10)
        plt.tight_layout()
        visualizations['overview'] = self._fig_to_base64(fig)
        plt.close(fig)

        # 2. Latency distribution chart (TTFT and ITL percentiles).
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
        ttft_data = {
            'p50': metrics['ttft_p50'],
            'p95': metrics['ttft_p95'],
            'p99': metrics['ttft_p99']
        }
        ax1.bar(ttft_data.keys(), ttft_data.values(),
                color=['green', 'orange', 'red'], alpha=0.7)
        ax1.axhline(y=200, color='red', linestyle='--', label='Target (200ms)')
        ax1.set_ylabel('Latency (ms)', fontsize=12)
        ax1.set_title('Time to First Token (TTFT)', fontsize=14, fontweight='bold')
        ax1.legend()

        itl_data = {
            'p50': metrics['itl_p50'],
            'p95': metrics['itl_p95'],
            'p99': metrics['itl_p99']
        }
        ax2.bar(itl_data.keys(), itl_data.values(),
                color=['green', 'orange', 'red'], alpha=0.7)
        ax2.axhline(y=50, color='red', linestyle='--', label='Target (50ms)')
        ax2.set_ylabel('Latency (ms)', fontsize=12)
        ax2.set_title('Inter-Token Latency (ITL)', fontsize=14, fontweight='bold')
        ax2.legend()
        plt.tight_layout()
        visualizations['latency'] = self._fig_to_base64(fig)
        plt.close(fig)

        # 3. GPU utilization heatmap.
        # NOTE(review): this data is SIMULATED (random noise around the
        # average), not per-GPU measurements — readers of the report should
        # not treat the heatmap as real telemetry.
        fig, ax = plt.subplots(figsize=(10, 6))
        gpu_data = np.random.normal(metrics['avg_gpu_utilization'], 5, (8, 10))
        gpu_data = np.clip(gpu_data, 70, 100)
        im = ax.imshow(gpu_data, cmap='RdYlGn', aspect='auto', vmin=70, vmax=100)
        ax.set_xticks(np.arange(10))
        ax.set_yticks(np.arange(8))
        ax.set_xticklabels([f't-{i}' for i in range(9, -1, -1)])
        ax.set_yticklabels([f'GPU {i}' for i in range(8)])
        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label('Utilization (%)', rotation=270, labelpad=20)
        ax.set_xlabel('Time (minutes ago)', fontsize=12)
        ax.set_ylabel('GPU Device', fontsize=12)
        ax.set_title('GPU Utilization Heatmap', fontsize=14, fontweight='bold')
        plt.tight_layout()
        visualizations['gpu_heatmap'] = self._fig_to_base64(fig)
        plt.close(fig)

        return visualizations

    def _fig_to_base64(self, fig) -> str:
        """Convert a matplotlib figure to a base64-encoded PNG string."""
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
        buf.seek(0)
        return base64.b64encode(buf.read()).decode('utf-8')

    def generate_markdown_report(self, metrics: Dict[str, Any],
                                 analysis: Dict[str, Any],
                                 visualizations: Dict[str, str]) -> str:
        """
        Generate a comprehensive markdown report.

        Args:
            metrics: Dictionary of metrics.
            analysis: Analysis results from analyze_performance.
            visualizations: Dictionary of base64-encoded charts.

        Returns:
            Markdown formatted report.
        """
        template = Template("""
# 🚀 GPU Booster Performance Report

**System**: DeepSeek R1 671B on 8× NVIDIA H100 80GB GPUs
**Report Generated**: {{ timestamp }}
**Performance Score**: {{ performance_score }}%

---

## 📊 Executive Summary

The GPU Booster system has achieved a **{{ performance_score }}%** performance score against defined targets. {{ summary_statement }}

### Key Metrics at a Glance

| Metric | Current Value | Target | Status | Achievement |
|--------|---------------|--------|--------|-------------|
| **Token Throughput** | {{ "%.0f"|format(metrics.avg_throughput) }} tokens/sec | ≥ 20,000 | {{ "✅" if analysis.targets_met.throughput.met else "❌" }} | {{ "%.1f"|format(analysis.targets_met.throughput.percentage) }}% |
| **TTFT (p95)** | {{ "%.0f"|format(metrics.ttft_p95) }} ms | ≤ 200 ms | {{ "✅" if analysis.targets_met.ttft_p95.met else "❌" }} | {{ "%.1f"|format(analysis.targets_met.ttft_p95.percentage) }}% |
| **Cache Hit Rate** | {{ "%.1f"|format(metrics.cache_hit_rate) }}% | ≥ 50% | {{ "✅" if analysis.targets_met.cache_hit_rate.met else "❌" }} | {{ "%.1f"|format(analysis.targets_met.cache_hit_rate.percentage) }}% |
| **GPU Utilization** | {{ "%.1f"|format(metrics.avg_gpu_utilization) }}% | ≥ 90% | {{ "✅" if analysis.targets_met.gpu_utilization.met else "❌" }} | {{ "%.1f"|format(analysis.targets_met.gpu_utilization.percentage) }}% |
| **GPU Memory** | {{ "%.1f"|format(metrics.avg_gpu_memory) }}% | ≥ 90% | {{ "✅" if analysis.targets_met.gpu_memory.met else "❌" }} | {{ "%.1f"|format(analysis.targets_met.gpu_memory.percentage) }}% |

---

## 📈 Detailed Performance Analysis

### Throughput Performance

- **Average Throughput**: {{ "%.0f"|format(metrics.avg_throughput) }} tokens/sec
- **Peak Throughput**: {{ "%.0f"|format(metrics.max_throughput) }} tokens/sec
- **Minimum Throughput**: {{ "%.0f"|format(metrics.min_throughput) }} tokens/sec
- **Throughput Variance**: {{ "%.1f"|format(((metrics.max_throughput - metrics.min_throughput) / metrics.avg_throughput * 100) if metrics.avg_throughput else 0) }}%

### Latency Analysis

#### Time to First Token (TTFT)

| Percentile | Latency (ms) | Target Status |
|------------|--------------|---------------|
| p50 | {{ "%.0f"|format(metrics.ttft_p50) }} | {{ "✅ Excellent" if metrics.ttft_p50 < 150 else "⚠️ Monitor" }} |
| p95 | {{ "%.0f"|format(metrics.ttft_p95) }} | {{ "✅ Within SLA" if metrics.ttft_p95 <= 200 else "❌ Exceeds SLA" }} |
| p99 | {{ "%.0f"|format(metrics.ttft_p99) }} | {{ "⚠️ Tail latency" if metrics.ttft_p99 > 250 else "✅ Good" }} |

#### Inter-Token Latency (ITL)

| Percentile | Latency (ms) | Target Status |
|------------|--------------|---------------|
| p50 | {{ "%.0f"|format(metrics.itl_p50) }} | {{ "✅ Excellent" if metrics.itl_p50 < 30 else "⚠️ Monitor" }} |
| p95 | {{ "%.0f"|format(metrics.itl_p95) }} | {{ "✅ Good" if metrics.itl_p95 < 50 else "❌ High" }} |
| p99 | {{ "%.0f"|format(metrics.itl_p99) }} | {{ "⚠️ Tail latency" if metrics.itl_p99 > 60 else "✅ Good" }} |

### Cache Performance

- **Hit Rate**: {{ "%.1f"|format(metrics.cache_hit_rate) }}%
- **Total Evictions**: {{ "%.0f"|format(metrics.cache_evictions) }}
- **Cache Efficiency Score**: {{ "%.1f"|format(metrics.cache_hit_rate * (1 - min(metrics.cache_evictions / 10000, 1))) }}%

### Request Processing

- **Total Requests Processed**: {{ "{:,.0f}".format(metrics.total_requests) }}
- **Average Request Rate**: {{ "%.1f"|format(metrics.request_rate) }} req/sec
- **Average Batch Size**: {{ "%.1f"|format(metrics.avg_batch_size) }}
- **Error Rate**: {{ "%.4f"|format(metrics.error_rate * 100) }}%
- **Timeout Rate**: {{ "%.4f"|format(metrics.timeout_rate * 100) }}%

---

## 💡 Insights

{% for insight in analysis.insights %}
- {{ insight }}
{% endfor %}

---

## 🔧 Recommendations

{% if analysis.recommendations|length > 0 %}
{% for recommendation in analysis.recommendations %}
{{ loop.index }}. {{ recommendation }}
{% endfor %}
{% else %}
All performance targets are being met. Continue monitoring for sustained performance.
{% endif %}

---

## 📊 Performance Visualizations

### Overall Performance

![Performance Overview](data:image/png;base64,{{ visualizations.overview }})

### Latency Distribution

![Latency Distribution](data:image/png;base64,{{ visualizations.latency }})

### GPU Utilization Heatmap

![GPU Utilization](data:image/png;base64,{{ visualizations.gpu_heatmap }})

---

## 🎯 Next Steps

1. **Immediate Actions**:
   - Monitor the metrics that are close to threshold values
   - Implement recommended optimizations for underperforming areas

2. **Short-term Goals** (1-2 weeks):
   - Fine-tune BoosterCache parameters based on workload patterns
   - Optimize batch processing for better GPU utilization

3. **Long-term Goals** (1-3 months):
   - Evaluate model quantization options for improved throughput
   - Consider implementing speculative decoding for latency reduction

---

## 📋 Configuration Summary

```yaml
Model: DeepSeek-R1-671B
GPUs: 8× NVIDIA H100 80GB
Tensor Parallel Size: 8
BoosterCache Chunk Size: 256
Max Sequence Length: 32768
GPU Memory Utilization: 0.95
```

---

*This report was automatically generated by the GPU Booster Performance Monitoring System*
""")

        # Choose the executive-summary tone from the overall score.
        if analysis['performance_score'] >= 100:
            summary_statement = ("All performance targets have been met or "
                                 "exceeded. The system is operating at peak efficiency.")
        elif analysis['performance_score'] >= 80:
            summary_statement = ("Most performance targets are being met, with "
                                 "minor optimization opportunities identified.")
        elif analysis['performance_score'] >= 60:
            summary_statement = ("Several performance targets need attention. "
                                 "Please review the recommendations section.")
        else:
            summary_statement = ("Critical performance issues detected. "
                                 "Immediate action required to meet SLA targets.")

        return template.render(
            metrics=metrics,
            analysis=analysis,
            visualizations=visualizations,
            timestamp=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
            performance_score=analysis['performance_score'],
            summary_statement=summary_statement
        )

    def generate_json_report(self, metrics: Dict[str, Any],
                             analysis: Dict[str, Any]) -> str:
        """
        Generate a JSON report suitable for automated processing.

        Args:
            metrics: Dictionary of metrics.
            analysis: Analysis results from analyze_performance.

        Returns:
            JSON formatted report.
        """
        report = {
            "report_version": "2.0",
            "system": {
                "name": "GPU Booster for DeepSeek R1 671B",
                "hardware": "8× NVIDIA H100 80GB GPUs",
                "configuration": {
                    "model": "deepseek-ai/DeepSeek-R1-671B",
                    "tensor_parallel_size": 8,
                    "boostercache_chunk_size": 256,
                    "max_sequence_length": 32768,
                    "gpu_memory_utilization": 0.95
                }
            },
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "performance_score": analysis['performance_score'],
            "summary": {
                "total_tps": metrics['avg_throughput'],
                "peak_tps": metrics['max_throughput'],
                "ttft_p50": metrics['ttft_p50'],
                "ttft_p95": metrics['ttft_p95'],
                "ttft_p99": metrics['ttft_p99'],
                "itl_p50": metrics['itl_p50'],
                "itl_p95": metrics['itl_p95'],
                "itl_p99": metrics['itl_p99'],
                "kv_cache_hit_rate": metrics['cache_hit_rate'],
                "gpu_utilization": metrics['avg_gpu_utilization'],
                "gpu_memory_usage": metrics['avg_gpu_memory'],
                "total_requests": metrics['total_requests'],
                "error_rate": metrics['error_rate'],
                "timeout_rate": metrics['timeout_rate']
            },
            "targets_compliance": analysis['targets_met'],
            "insights": analysis['insights'],
            "recommendations": analysis['recommendations'],
            "detailed_metrics": metrics
        }
        # default=str stringifies anything json can't encode (e.g. datetimes).
        return json.dumps(report, indent=2, default=str)

    def generate_full_report(self, time_range: str = "1h",
                             output_format: str = "both") -> Tuple[str, str]:
        """
        Generate a complete performance report.

        Args:
            time_range: Time range for analysis.
            output_format: "json", "markdown", or "both".

        Returns:
            Tuple of (json_report, markdown_report); an excluded format
            is returned as None.
        """
        metrics = self.fetch_metrics(time_range)
        analysis = self.analyze_performance(metrics)
        visualizations = self.generate_visualizations(metrics)

        json_report = None
        markdown_report = None
        if output_format in ["json", "both"]:
            json_report = self.generate_json_report(metrics, analysis)
        if output_format in ["markdown", "both"]:
            markdown_report = self.generate_markdown_report(
                metrics, analysis, visualizations)
        return json_report, markdown_report


# =====================================================
# USAGE EXAMPLE AND CLI INTERFACE
# =====================================================

def main():
    """Command-line interface for the report generator."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate performance reports for GPU Booster"
    )
    parser.add_argument(
        "--prometheus-url",
        default="http://localhost:9090",
        help="Prometheus server URL"
    )
    parser.add_argument(
        "--time-range",
        default="1h",
        choices=["1h", "6h", "24h", "7d", "30d"],
        help="Time range for analysis"
    )
    parser.add_argument(
        "--output",
        default="both",
        choices=["json", "markdown", "both"],
        help="Output format"
    )
    parser.add_argument(
        "--save",
        action="store_true",
        help="Save reports to files"
    )
    args = parser.parse_args()

    generator = GPUBoosterReportGenerator(args.prometheus_url)
    print(f"Generating GPU Booster performance report for the last {args.time_range}...")

    json_report, markdown_report = generator.generate_full_report(
        time_range=args.time_range,
        output_format=args.output
    )

    if args.save:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if json_report:
            filename = f"gpu_booster_report_{timestamp}.json"
            with open(filename, "w") as f:
                f.write(json_report)
            # Fixed: the saved-file message previously lost the filename.
            print(f"JSON report saved to: {filename}")
        if markdown_report:
            filename = f"gpu_booster_report_{timestamp}.md"
            with open(filename, "w") as f:
                f.write(markdown_report)
            print(f"Markdown report saved to: {filename}")
    else:
        if json_report and args.output in ["json", "both"]:
            print("\n=== JSON Report ===")
            print(json_report)
        if markdown_report and args.output in ["markdown", "both"]:
            print("\n=== Markdown Report ===")
            print(markdown_report)


if __name__ == "__main__":
    main()
Content is user-generated and unverified.
    GPU Booster Performance Report Generator | Claude