#!/usr/bin/env python3
"""
LLM Benchmark Visualization Tool

Generates key performance visualizations:
- Throughput vs Concurrency
- Time to First Token (TTFT)
- Latency Percentiles
"""

import json
import argparse
import logging
from pathlib import Path
from typing import Callable, Dict, List, Optional

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================================================
# CONFIGURATION
# ============================================================================

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.1)
plt.rcParams['font.size'] = 11

# One colour per distinct input-token setting; reused cyclically when there
# are more settings than colours.
COLORS = ['#2E86AB', '#E63946', '#06A77D', '#F77F00', '#9B59B6', '#E74C3C']


# ============================================================================
# DATA LOADING
# ============================================================================

def load_results(input_file: str) -> Optional[Dict]:
    """
    Load benchmark results from JSON file.

    Parameters:
        input_file: Path to the JSON file containing benchmark results

    Returns:
        The parsed JSON content, or None if the file does not exist or does
        not contain valid JSON (the reason is logged as an error)
    """
    try:
        # Explicit UTF-8 so parsing does not depend on the platform's locale
        # encoding.
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logging.info(f"Loaded results from {input_file}")
        return data
    except FileNotFoundError:
        logging.error(f"File not found: {input_file}")
        return None
    except json.JSONDecodeError as e:
        logging.error(f"Invalid JSON: {e}")
        return None


def extract_data(results: List[Dict]) -> Dict:
    """
    Extract plotting data from benchmark results.

    Each entry of ``results`` contributes exactly one value to every list in
    the returned dictionary, so all lists are index-aligned. Missing metrics
    default to 0.

    Parameters:
        results: List of benchmark result dictionaries

    Returns:
        Dictionary mapping a series name (e.g. 'batch_sizes', 'ttft_mean')
        to the list of values collected from ``results``
    """
    data = {
        'input_tokens': [],
        'actual_input_tokens': [],
        'batch_sizes': [],
        'throughput_tps': [],
        'latency_mean': [],
        'latency_p50': [],
        'latency_p95': [],
        'latency_p99': [],
        'ttft_mean': [],
        'ttft_p50': [],
        'ttft_p90': [],
        'avg_batch_throughput': [],
    }

    for result in results:
        # `or {}` also covers sections that are present but explicitly null,
        # which would otherwise fail on the `.get` calls below.
        config = result.get('config') or {}
        latency = result.get('latency') or {}
        ttft = result.get('ttft') or {}
        throughput = result.get('throughput') or {}
        batch_metrics = result.get('batch_metrics') or {}

        target_input = config.get('input_tokens', 0)

        # Get actual input tokens from config (if available) or use target
        actual_input = config.get('actual_input_tokens')
        if actual_input is None:
            actual_input = target_input

        # Round to nearest 100 for cleaner display
        actual_input_rounded = round(actual_input / 100) * 100

        data['input_tokens'].append(target_input)
        data['actual_input_tokens'].append(actual_input_rounded)
        data['batch_sizes'].append(config.get('batch_size', 0))
        data['throughput_tps'].append(throughput.get('concurrent_total_tps', 0))
        data['latency_mean'].append(latency.get('mean', 0))
        data['latency_p50'].append(latency.get('p50', 0))
        data['latency_p95'].append(latency.get('p95', 0))
        data['latency_p99'].append(latency.get('p99', 0))
        data['ttft_mean'].append(ttft.get('mean', 0))
        data['ttft_p50'].append(ttft.get('p50', 0))
        data['ttft_p90'].append(ttft.get('p90', 0))
        data['avg_batch_throughput'].append(batch_metrics.get('avg_batch_throughput', 0))

    return data


# ============================================================================
# PLOTTING FUNCTIONS
# ============================================================================

def _average_actual_tokens(actual_tokens: List, fallback):
    """Return the rounded mean of ``actual_tokens``, or ``fallback`` if empty."""
    if not actual_tokens:
        return fallback
    return int(round(sum(actual_tokens) / len(actual_tokens)))


def _plot_metric_lines(data: Dict, output_dir: Path, *, series: List[Dict],
                       ylabel: str, title: str, filename: str,
                       legend_kwargs: Dict, y_bottom: Optional[float] = None):
    """
    Draw one or more metrics against batch size, one colour per input size.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        series: One dict per curve drawn for each input-token setting, with
            keys 'metric' (key into ``data``), 'fmt' (matplotlib format
            string), 'label' (``str.format`` template receiving the average
            actual token count) and 'style' (extra ``ax.plot`` kwargs)
        ylabel: Y-axis label
        title: Plot title
        filename: Name of the PNG file written into ``output_dir``
        legend_kwargs: Extra keyword arguments for ``ax.legend``
        y_bottom: If given, lower limit applied to the y-axis
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        metric_keys = [spec['metric'] for spec in series]

        for i, input_tok in enumerate(sorted(set(data['input_tokens']))):
            # Indices of the runs that used this input token count
            rows = [k for k, tok in enumerate(data['input_tokens']) if tok == input_tok]

            # Sort by batch size (metric values only break ties)
            points = sorted(
                [(data['batch_sizes'][k], *[data[key][k] for key in metric_keys])
                 for k in rows]
            )
            batch_sizes_sorted = [point[0] for point in points]

            # Use actual average token count for label
            avg_actual = _average_actual_tokens(
                [data['actual_input_tokens'][k] for k in rows], input_tok
            )
            color = COLORS[i % len(COLORS)]

            for column, spec in enumerate(series, start=1):
                ax.plot(
                    batch_sizes_sorted,
                    [point[column] for point in points],
                    spec['fmt'],
                    color=color,
                    label=spec['label'].format(avg_actual),
                    markeredgecolor='white',
                    **spec['style'],
                )

        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel(ylabel, fontweight='bold', fontsize=13)
        ax.set_title(title, fontweight='bold', fontsize=15, pad=15)
        ax.legend(frameon=True, shadow=True, fancybox=True, **legend_kwargs)
        ax.grid(True, alpha=0.3, linestyle='--')

        # A log axis is only applied when there is more than one batch size
        if len(set(data['batch_sizes'])) > 1:
            ax.set_xscale('log', base=2)

        if y_bottom is not None:
            ax.set_ylim(bottom=y_bottom)

        fig.tight_layout()
        output_path = output_dir / filename
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Release the figure even if plotting or saving failed
        plt.close(fig)


def plot_throughput(data: Dict, output_dir: Path, model_name: str):
    """
    Plot throughput vs batch size as a line chart.

    Writes 'throughput.png' into ``output_dir``.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    _plot_metric_lines(
        data,
        output_dir,
        series=[
            {'metric': 'throughput_tps', 'fmt': 'o-', 'label': '{:,} tokens',
             'style': dict(linewidth=3, markersize=12, markeredgewidth=2, zorder=3)},
        ],
        ylabel='Throughput (tokens/second)',
        title=f'Throughput vs Batch Size - {model_name}',
        filename='throughput.png',
        legend_kwargs=dict(fontsize=11),
        y_bottom=0,
    )


def plot_ttft(data: Dict, output_dir: Path, model_name: str):
    """
    Plot Time to First Token vs batch size as a line chart.

    Draws the mean (solid) and P90 (dashed) TTFT for every input-token
    setting and writes 'ttft.png' into ``output_dir``.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    _plot_metric_lines(
        data,
        output_dir,
        series=[
            {'metric': 'ttft_mean', 'fmt': 'o-', 'label': '{:,} tokens (mean)',
             'style': dict(linewidth=3, markersize=12, markeredgewidth=2, zorder=3)},
            {'metric': 'ttft_p90', 'fmt': 's--', 'label': '{:,} tokens (P90)',
             'style': dict(linewidth=2.5, markersize=9, alpha=0.75,
                           markeredgewidth=1.5, zorder=3)},
        ],
        ylabel='Time to First Token (seconds)',
        title=f'TTFT vs Batch Size - {model_name}',
        filename='ttft.png',
        legend_kwargs=dict(fontsize=10, ncol=2),
    )


def plot_latency_percentiles(data: Dict, output_dir: Path, model_name: str):
    """
    Plot latency percentiles (mean, P50, P95, P99) vs batch size.

    Writes 'latency_percentiles.png' into ``output_dir``.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    # One legend column per input-token setting (at least one column so an
    # empty data set cannot request zero columns).
    legend_columns = max(1, len(set(data['input_tokens'])))

    _plot_metric_lines(
        data,
        output_dir,
        series=[
            {'metric': 'latency_mean', 'fmt': 'o-', 'label': '{:,} (mean)',
             'style': dict(linewidth=2.5, markersize=10, markeredgewidth=1.5, zorder=4)},
            {'metric': 'latency_p50', 'fmt': 's-', 'label': '{:,} (P50)',
             'style': dict(linewidth=2, markersize=8, alpha=0.8,
                           markeredgewidth=1.5, zorder=3)},
            {'metric': 'latency_p95', 'fmt': '^-', 'label': '{:,} (P95)',
             'style': dict(linewidth=1.8, markersize=7, alpha=0.7,
                           markeredgewidth=1.5, zorder=2)},
            {'metric': 'latency_p99', 'fmt': 'v-', 'label': '{:,} (P99)',
             'style': dict(linewidth=1.5, markersize=6, alpha=0.6,
                           markeredgewidth=1.5, zorder=1)},
        ],
        ylabel='Latency (seconds)',
        title=f'Latency Percentiles - {model_name}',
        filename='latency_percentiles.png',
        legend_kwargs=dict(fontsize=9, ncol=legend_columns),
    )


def _plot_heatmap(data: Dict, output_dir: Path, *,
                  cell_value: Callable[[int, float], float],
                  white_text: Callable[[float, float], bool],
                  cmap: str, title: str, colorbar_label: str,
                  value_format: str, filename: str):
    """
    Draw an (input tokens x batch size) heatmap with per-cell annotations.

    Combinations that have no benchmark run stay at 0 and are left without
    a text annotation.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        cell_value: Called as ``cell_value(k, batch_size)`` with the index of
            the matching run in ``data``; returns the value for that cell
        white_text: Called as ``white_text(value, matrix_max)``; returns True
            when the annotation should be white instead of black
        cmap: Matplotlib colormap name
        title: Plot title
        colorbar_label: Label shown next to the colorbar
        value_format: Format spec for the cell annotations (e.g. '.2f')
        filename: Name of the PNG file written into ``output_dir``
    """
    # Get unique values
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index of the first run for every (input tokens, batch size) pair;
    # later duplicates are ignored.
    first_run = {}
    for k, pair in enumerate(zip(data['input_tokens'], data['batch_sizes'])):
        first_run.setdefault(pair, k)

    # Create matrix for heatmap and map actual tokens
    matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # Map target -> actual average

    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            k = first_run.get((input_tok, batch_size))
            if k is None:
                continue
            matrix[i, j] = cell_value(k, batch_size)
            actual_for_this_token.append(data['actual_input_tokens'][k])

        # Average actual tokens for this target
        if actual_for_this_token:
            actual_token_map[input_tok] = _average_actual_tokens(
                actual_for_this_token, input_tok
            )

    # Create heatmap
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        im = ax.imshow(matrix, cmap=cmap, aspect='auto')

        # Set ticks and labels using actual token counts
        ax.set_xticks(np.arange(len(unique_batch_sizes)))
        ax.set_yticks(np.arange(len(unique_input_tokens)))
        ax.set_xticklabels(unique_batch_sizes)
        ax.set_yticklabels(
            [f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens]
        )

        # Labels
        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
        ax.set_title(title, fontweight='bold', fontsize=15, pad=15)

        # Add colorbar
        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label(colorbar_label, fontweight='bold', fontsize=11)

        # Add text annotations; the matrix maximum is the same for every
        # cell, so it is computed once.
        peak = matrix.max() if matrix.size else 0
        for (i, j), value in np.ndenumerate(matrix):
            if value > 0:
                ax.text(j, i, format(value, value_format),
                        ha="center", va="center",
                        color="white" if white_text(value, peak) else "black",
                        fontweight='bold', fontsize=10)

        fig.tight_layout()
        output_path = output_dir / filename
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Release the figure even if plotting or saving failed
        plt.close(fig)


def plot_throughput_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot throughput heatmap showing total throughput across batch sizes and input tokens.

    Writes 'throughput_heatmap.png' into ``output_dir``.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    _plot_heatmap(
        data,
        output_dir,
        cell_value=lambda k, batch_size: data['throughput_tps'][k],
        # White text on the upper half of the value range, black otherwise
        white_text=lambda value, peak: value > peak * 0.5,
        cmap='YlOrRd',
        title=f'Throughput Heatmap (tokens/second) - {model_name}',
        colorbar_label='Throughput (tokens/s)',
        value_format='.0f',
        filename='throughput_heatmap.png',
    )


def plot_ttft_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot TTFT heatmap showing time to first token across batch sizes and input tokens.

    Uses the mean TTFT of each run; a missing or falsy mean is drawn as 0.
    Writes 'ttft_heatmap.png' into ``output_dir``.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    _plot_heatmap(
        data,
        output_dir,
        cell_value=lambda k, batch_size: data['ttft_mean'][k] if data['ttft_mean'][k] else 0,
        # White text on the upper half of the value range, black otherwise
        white_text=lambda value, peak: value > peak * 0.5,
        # Yellow-green-blue colormap for the TTFT values
        cmap='YlGnBu',
        title=f'Time to First Token Heatmap (seconds) - {model_name}',
        colorbar_label='TTFT (seconds)',
        value_format='.2f',
        filename='ttft_heatmap.png',
    )


def plot_efficiency_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot efficiency heatmap showing throughput per request (tokens/s per worker).

    This shows how efficiently each individual request in a batch is processed,
    revealing scaling behavior and resource contention patterns. Each cell is
    the total throughput divided by the batch size (0 when the batch size is
    not positive). Writes 'efficiency_heatmap.png' into ``output_dir``.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    _plot_heatmap(
        data,
        output_dir,
        # Calculate throughput per request (total throughput / batch size)
        cell_value=lambda k, batch_size: (
            data['throughput_tps'][k] / batch_size if batch_size > 0 else 0
        ),
        # White text below 70% of the maximum, black above it
        white_text=lambda value, peak: value < peak * 0.7,
        cmap='viridis',
        title=f'Efficiency Heatmap (tokens/s per request) - {model_name}',
        colorbar_label='Throughput per Request (tokens/s)',
        value_format='.1f',
        filename='efficiency_heatmap.png',
    )


# ============================================================================
# MAIN
# ============================================================================

def main():
    """
    Command-line entry point: load a results file and write all plots.

    Reads the JSON file given by --input, extracts the plotting data from its
    'results' list and writes the six PNG plots into --output_dir (default:
    the directory of the input file). Logs an error and returns early when
    the file cannot be loaded or has no results.
    """
    parser = argparse.ArgumentParser(
        description="Visualize LLM benchmark results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example:
  python visualize_results.py --input results/results_model/benchmark_results.json
        """
    )

    parser.add_argument(
        '--input',
        type=str,
        required=True,
        help="Input JSON file with benchmark results"
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help="Output directory (default: same as input file)"
    )

    args = parser.parse_args()

    # Load results
    results_data = load_results(args.input)
    if not results_data or 'results' not in results_data or not results_data['results']:
        logging.error("No valid results found")
        return

    # Set output directory
    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        output_dir = Path(args.input).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info(f"Output directory: {output_dir}")

    # Extract data
    model_name = results_data.get('model_name', 'Unknown Model')
    data = extract_data(results_data['results'])

    # Generate visualizations
    logging.info("Generating visualizations...")
    plot_throughput(data, output_dir, model_name)
    plot_ttft(data, output_dir, model_name)
    plot_latency_percentiles(data, output_dir, model_name)
    plot_throughput_heatmap(data, output_dir, model_name)
    plot_ttft_heatmap(data, output_dir, model_name)
    plot_efficiency_heatmap(data, output_dir, model_name)

    logging.info(f"\n{'='*60}")
    logging.info("VISUALIZATION COMPLETE!")
    logging.info(f"{'='*60}")
    logging.info(f"Visualizations saved to: {output_dir}")
    logging.info("Generated plots:")
    logging.info(" - throughput.png (line chart)")
    logging.info(" - throughput_heatmap.png (total throughput)")
    logging.info(" - efficiency_heatmap.png (throughput per request)")
    logging.info(" - ttft.png (line chart)")
    logging.info(" - ttft_heatmap.png")
    logging.info(" - latency_percentiles.png")


if __name__ == '__main__':
    main()