mindef-overdracht/llm-throughput-tests-mindef-metadateren/visualize_results.py
2026-06-02 11:46:29 +02:00

584 lines
23 KiB
Python

#!/usr/bin/env python3
"""
LLM Benchmark Visualization Tool
Generates key performance visualizations:
- Throughput vs Concurrency
- Time to First Token (TTFT)
- Latency Percentiles
"""
import argparse
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# ============================================================================
# CONFIGURATION
# ============================================================================
# Module-level side effects: importing this file configures the root logger and
# the global seaborn/matplotlib styling shared by every plot function below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.1)
# Assigned after set_context(); presumably this pins the base font size so the
# seaborn context scaling does not override it - keep this ordering (TODO confirm).
plt.rcParams['font.size'] = 11
# Series colors. The line-chart functions index this list modulo its length, so
# colors repeat once there are more input sizes than entries.
COLORS = ['#2E86AB', '#E63946', '#06A77D', '#F77F00', '#9B59B6', '#E74C3C']
# ============================================================================
# DATA LOADING
# ============================================================================
def load_results(input_file: str) -> Optional[Dict]:
    """
    Load benchmark results from JSON file.

    The file is always decoded as UTF-8 so that loading does not depend on the
    locale of the machine running the script. Every failure is logged and
    reported by returning None instead of raising.

    Parameters:
        input_file: Path to the JSON file containing benchmark results
    Returns:
        The parsed JSON content, or None if the file is missing, cannot be
        read, is not valid UTF-8, or is not valid JSON
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        logging.error(f"File not found: {input_file}")
        return None
    except OSError as e:
        # e.g. the path is a directory or the file is not readable
        logging.error(f"Could not read {input_file}: {e}")
        return None
    except json.JSONDecodeError as e:
        logging.error(f"Invalid JSON: {e}")
        return None
    except UnicodeDecodeError as e:
        logging.error(f"Invalid JSON: {input_file} is not valid UTF-8 ({e})")
        return None
    logging.info(f"Loaded results from {input_file}")
    return data
def extract_data(results: List[Dict]) -> Dict:
    """
    Extract plotting data from benchmark results.

    Every result contributes exactly one entry to each list of the returned
    dictionary, so index k of every list describes the same benchmark run.
    A section or metric that is missing or null (JSON ``null``) is recorded
    as 0, which keeps the lists numeric for the plotting functions.

    Parameters:
        results: List of benchmark result dictionaries
    Returns:
        Dictionary containing extracted data for plotting. Keys:
        'input_tokens' (target size), 'actual_input_tokens' (measured size
        rounded to the nearest 100, falling back to the target),
        'batch_sizes', 'throughput_tps', 'latency_mean', 'latency_p50',
        'latency_p95', 'latency_p99', 'ttft_mean', 'ttft_p50', 'ttft_p90'
        and 'avg_batch_throughput'
    """
    # Output key -> (section of the result dict, metric name inside that section)
    metric_sources = {
        'batch_sizes': ('config', 'batch_size'),
        'throughput_tps': ('throughput', 'concurrent_total_tps'),
        'latency_mean': ('latency', 'mean'),
        'latency_p50': ('latency', 'p50'),
        'latency_p95': ('latency', 'p95'),
        'latency_p99': ('latency', 'p99'),
        'ttft_mean': ('ttft', 'mean'),
        'ttft_p50': ('ttft', 'p50'),
        'ttft_p90': ('ttft', 'p90'),
        'avg_batch_throughput': ('batch_metrics', 'avg_batch_throughput'),
    }
    section_names = {section for section, _ in metric_sources.values()}

    def value_or(mapping, key, default=0):
        """Return mapping[key], or default when the key is missing or null."""
        value = mapping.get(key)
        return default if value is None else value

    data = {'input_tokens': [], 'actual_input_tokens': []}
    data.update({key: [] for key in metric_sources})

    for result in results:
        # A section stored as null is treated like an absent section
        sections = {name: result.get(name) or {} for name in section_names}
        config = sections['config']

        target_input = value_or(config, 'input_tokens')
        # Get actual input tokens from config (if available) or use target
        actual_input = value_or(config, 'actual_input_tokens', target_input)

        data['input_tokens'].append(target_input)
        # Round to nearest 100 for cleaner display
        data['actual_input_tokens'].append(round(actual_input / 100) * 100)
        for out_key, (section, metric) in metric_sources.items():
            data[out_key].append(value_or(sections[section], metric))
    return data
# ============================================================================
# PLOTTING FUNCTIONS
# ============================================================================
def plot_throughput(data: Dict, output_dir: Path, model_name: str):
    """
    Draw total throughput against batch size, one line per target input size.

    The figure is written to ``throughput.png`` inside output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    _, ax = plt.subplots(figsize=(12, 7))

    targets = data['input_tokens']
    for series_idx, target in enumerate(sorted(set(targets))):
        # Positions of the runs that were benchmarked with this target size
        rows = [pos for pos, tok in enumerate(targets) if tok == target]

        # (batch size, throughput) pairs ordered by batch size
        points = sorted(
            (data['batch_sizes'][pos], data['throughput_tps'][pos]) for pos in rows
        )

        # The legend shows the mean measured token count instead of the target
        measured = [data['actual_input_tokens'][pos] for pos in rows]
        if measured:
            label_tokens = int(round(sum(measured) / len(measured)))
        else:
            label_tokens = target

        ax.plot(
            [point[0] for point in points],
            [point[1] for point in points],
            'o-',
            color=COLORS[series_idx % len(COLORS)],
            label=f'{label_tokens:,} tokens',
            linewidth=3,
            markersize=12,
            markeredgewidth=2,
            markeredgecolor='white',
            zorder=3,
        )

    ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
    ax.set_ylabel('Throughput (tokens/second)', fontweight='bold', fontsize=13)
    ax.set_title(f'Throughput vs Batch Size - {model_name}', fontweight='bold', fontsize=15, pad=15)
    ax.legend(fontsize=11, frameon=True, shadow=True, fancybox=True)
    ax.grid(True, alpha=0.3, linestyle='--')

    # A log2 x-axis is only applied when several batch sizes were measured
    several_batch_sizes = len(set(data['batch_sizes'])) > 1
    if several_batch_sizes:
        ax.set_xscale('log', base=2)
    ax.set_ylim(bottom=0)

    plt.tight_layout()
    destination = output_dir / 'throughput.png'
    plt.savefig(destination, dpi=300, bbox_inches='tight', facecolor='white')
    logging.info(f"Saved: {destination}")
    plt.close()
def plot_ttft(data: Dict, output_dir: Path, model_name: str):
    """
    Draw mean and P90 Time to First Token against batch size.

    Each target input size gets one color with two lines: the mean and the
    P90 value. The figure is written to ``ttft.png`` inside output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    _, ax = plt.subplots(figsize=(12, 7))

    targets = data['input_tokens']
    for series_idx, target in enumerate(sorted(set(targets))):
        # Positions of the runs that were benchmarked with this target size
        rows = [pos for pos, tok in enumerate(targets) if tok == target]

        # (batch size, mean TTFT, P90 TTFT) triples ordered by batch size
        points = sorted(
            (data['batch_sizes'][pos], data['ttft_mean'][pos], data['ttft_p90'][pos])
            for pos in rows
        )

        # The legend shows the mean measured token count instead of the target
        measured = [data['actual_input_tokens'][pos] for pos in rows]
        if measured:
            label_tokens = int(round(sum(measured) / len(measured)))
        else:
            label_tokens = target

        x_values = [point[0] for point in points]
        series_color = COLORS[series_idx % len(COLORS)]
        ax.plot(
            x_values,
            [point[1] for point in points],
            'o-',
            color=series_color,
            label=f'{label_tokens:,} tokens (mean)',
            linewidth=3,
            markersize=12,
            markeredgewidth=2,
            markeredgecolor='white',
            zorder=3,
        )
        ax.plot(
            x_values,
            [point[2] for point in points],
            's--',
            color=series_color,
            label=f'{label_tokens:,} tokens (P90)',
            linewidth=2.5,
            markersize=9,
            alpha=0.75,
            markeredgewidth=1.5,
            markeredgecolor='white',
            zorder=3,
        )

    ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
    ax.set_ylabel('Time to First Token (seconds)', fontweight='bold', fontsize=13)
    ax.set_title(f'TTFT vs Batch Size - {model_name}', fontweight='bold', fontsize=15, pad=15)
    ax.legend(fontsize=10, frameon=True, shadow=True, fancybox=True, ncol=2)
    ax.grid(True, alpha=0.3, linestyle='--')

    # A log2 x-axis is only applied when several batch sizes were measured
    several_batch_sizes = len(set(data['batch_sizes'])) > 1
    if several_batch_sizes:
        ax.set_xscale('log', base=2)

    plt.tight_layout()
    destination = output_dir / 'ttft.png'
    plt.savefig(destination, dpi=300, bbox_inches='tight', facecolor='white')
    logging.info(f"Saved: {destination}")
    plt.close()
def plot_latency_percentiles(data: Dict, output_dir: Path, model_name: str):
    """
    Draw mean, P50, P95 and P99 latency against batch size.

    Each target input size gets one color with four lines that differ in
    marker, width and opacity. The figure is written to
    ``latency_percentiles.png`` inside output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    # (data key, legend suffix, line format, series-specific plot arguments),
    # listed in drawing order, which is also the legend order.
    series_specs = (
        ('latency_mean', 'mean', 'o-', {'linewidth': 2.5, 'markersize': 10, 'zorder': 4}),
        ('latency_p50', 'P50', 's-', {'linewidth': 2, 'markersize': 8, 'alpha': 0.8, 'zorder': 3}),
        ('latency_p95', 'P95', '^-', {'linewidth': 1.8, 'markersize': 7, 'alpha': 0.7, 'zorder': 2}),
        ('latency_p99', 'P99', 'v-', {'linewidth': 1.5, 'markersize': 6, 'alpha': 0.6, 'zorder': 1}),
    )
    columns = ('batch_sizes',) + tuple(spec[0] for spec in series_specs)

    _, ax = plt.subplots(figsize=(12, 7))

    targets = data['input_tokens']
    distinct_targets = sorted(set(targets))
    for series_idx, target in enumerate(distinct_targets):
        # Positions of the runs that were benchmarked with this target size
        rows = [pos for pos, tok in enumerate(targets) if tok == target]

        # One tuple per run: (batch size, mean, P50, P95, P99), ordered by batch size
        points = sorted(tuple(data[key][pos] for key in columns) for pos in rows)

        # The legend shows the mean measured token count instead of the target
        measured = [data['actual_input_tokens'][pos] for pos in rows]
        if measured:
            label_tokens = int(round(sum(measured) / len(measured)))
        else:
            label_tokens = target

        x_values = [point[0] for point in points]
        series_color = COLORS[series_idx % len(COLORS)]
        # Plot mean and percentiles
        for column, (_, suffix, line_format, extra) in enumerate(series_specs, start=1):
            ax.plot(
                x_values,
                [point[column] for point in points],
                line_format,
                color=series_color,
                label=f'{label_tokens:,} ({suffix})',
                markeredgewidth=1.5,
                markeredgecolor='white',
                **extra,
            )

    ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
    ax.set_ylabel('Latency (seconds)', fontweight='bold', fontsize=13)
    ax.set_title(f'Latency Percentiles - {model_name}', fontweight='bold', fontsize=15, pad=15)
    ax.legend(fontsize=9, frameon=True, shadow=True, fancybox=True, ncol=len(distinct_targets))
    ax.grid(True, alpha=0.3, linestyle='--')

    # A log2 x-axis is only applied when several batch sizes were measured
    several_batch_sizes = len(set(data['batch_sizes'])) > 1
    if several_batch_sizes:
        ax.set_xscale('log', base=2)

    plt.tight_layout()
    destination = output_dir / 'latency_percentiles.png'
    plt.savefig(destination, dpi=300, bbox_inches='tight', facecolor='white')
    logging.info(f"Saved: {destination}")
    plt.close()
def plot_throughput_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot throughput heatmap showing total throughput across batch sizes and input tokens.

    Rows are the target input token counts (labelled with the averaged
    measured token count) and columns are the batch sizes. Combinations that
    have no run, or whose throughput is 0 or null, stay 0 and are not
    annotated. When several runs share a combination, the first one is used.
    The figure is written to ``throughput_heatmap.png`` inside output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    # Get unique values
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index of the first run for each (input tokens, batch size) combination,
    # built once instead of rescanning every run for every cell.
    first_run = {}
    for k, combo in enumerate(zip(data['input_tokens'], data['batch_sizes'])):
        first_run.setdefault(combo, k)

    # Create matrix for heatmap and map actual tokens
    throughput_matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # Map target -> actual average
    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            k = first_run.get((input_tok, batch_size))
            if k is None:
                continue  # combination was not benchmarked
            # "or 0" keeps a null throughput from breaking the float matrix
            throughput_matrix[i, j] = data['throughput_tps'][k] or 0
            actual_for_this_token.append(data['actual_input_tokens'][k])
        # Average actual tokens for this target
        if actual_for_this_token:
            actual_token_map[input_tok] = int(round(sum(actual_for_this_token) / len(actual_for_this_token)))

    # Create heatmap
    _, ax = plt.subplots(figsize=(12, 8))
    im = ax.imshow(throughput_matrix, cmap='YlOrRd', aspect='auto')

    # Set ticks and labels using actual token counts
    ax.set_xticks(np.arange(len(unique_batch_sizes)))
    ax.set_yticks(np.arange(len(unique_input_tokens)))
    ax.set_xticklabels(unique_batch_sizes)
    ax.set_yticklabels([f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens])

    # Labels
    ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
    ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
    ax.set_title(f'Throughput Heatmap (tokens/second) - {model_name}',
                 fontweight='bold', fontsize=15, pad=15)

    # Add colorbar
    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label('Throughput (tokens/s)', fontweight='bold', fontsize=11)

    # Add text annotations: white text on cells above half of the maximum,
    # black text otherwise. The threshold is the same for every cell.
    white_above = throughput_matrix.max() * 0.5 if throughput_matrix.size else 0
    for (i, j), value in np.ndenumerate(throughput_matrix):
        if value > 0:
            ax.text(j, i, f'{value:.0f}',
                    ha="center", va="center",
                    color="white" if value > white_above else "black",
                    fontweight='bold', fontsize=10)

    plt.tight_layout()
    output_path = output_dir / 'throughput_heatmap.png'
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    logging.info(f"Saved: {output_path}")
    plt.close()
def plot_ttft_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot TTFT heatmap showing time to first token across batch sizes and input tokens.

    Rows are the target input token counts (labelled with the averaged
    measured token count) and columns are the batch sizes. Combinations that
    have no run, or whose mean TTFT is 0 or null, stay 0 and are not
    annotated. When several runs share a combination, the first one is used.
    The figure is written to ``ttft_heatmap.png`` inside output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    # Get unique values
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index of the first run for each (input tokens, batch size) combination,
    # built once instead of rescanning every run for every cell.
    first_run = {}
    for k, combo in enumerate(zip(data['input_tokens'], data['batch_sizes'])):
        first_run.setdefault(combo, k)

    # Create matrix for heatmap and map actual tokens
    ttft_matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # Map target -> actual average
    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            k = first_run.get((input_tok, batch_size))
            if k is None:
                continue  # combination was not benchmarked
            # A missing (null) TTFT is stored as 0 so the matrix stays numeric
            ttft_matrix[i, j] = data['ttft_mean'][k] or 0
            actual_for_this_token.append(data['actual_input_tokens'][k])
        # Average actual tokens for this target
        if actual_for_this_token:
            actual_token_map[input_tok] = int(round(sum(actual_for_this_token) / len(actual_for_this_token)))

    # Create heatmap
    _, ax = plt.subplots(figsize=(12, 8))
    # 'YlGnBu' is used as-is (not reversed): low TTFT maps to the light end and
    # high TTFT to the dark end, which is why large values get white text below.
    im = ax.imshow(ttft_matrix, cmap='YlGnBu', aspect='auto')

    # Set ticks and labels using actual token counts
    ax.set_xticks(np.arange(len(unique_batch_sizes)))
    ax.set_yticks(np.arange(len(unique_input_tokens)))
    ax.set_xticklabels(unique_batch_sizes)
    ax.set_yticklabels([f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens])

    # Labels
    ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
    ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
    ax.set_title(f'Time to First Token Heatmap (seconds) - {model_name}',
                 fontweight='bold', fontsize=15, pad=15)

    # Add colorbar
    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label('TTFT (seconds)', fontweight='bold', fontsize=11)

    # Add text annotations: white text on cells above half of the maximum,
    # black text otherwise. The threshold is the same for every cell.
    white_above = ttft_matrix.max() * 0.5 if ttft_matrix.size else 0
    for (i, j), value in np.ndenumerate(ttft_matrix):
        if value > 0:
            ax.text(j, i, f'{value:.2f}',
                    ha="center", va="center",
                    color="white" if value > white_above else "black",
                    fontweight='bold', fontsize=10)

    plt.tight_layout()
    output_path = output_dir / 'ttft_heatmap.png'
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    logging.info(f"Saved: {output_path}")
    plt.close()
def plot_efficiency_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot efficiency heatmap showing throughput per request (tokens/s per worker).

    Each cell is the total throughput of a run divided by its batch size, i.e.
    the share of the throughput that a single request in the batch receives.
    Rows are the target input token counts (labelled with the averaged
    measured token count) and columns are the batch sizes. Combinations that
    have no run, a batch size of 0, or a throughput of 0 or null stay 0 and
    are not annotated. When several runs share a combination, the first one
    is used. The figure is written to ``efficiency_heatmap.png`` inside
    output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    # Get unique values
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index of the first run for each (input tokens, batch size) combination,
    # built once instead of rescanning every run for every cell.
    first_run = {}
    for k, combo in enumerate(zip(data['input_tokens'], data['batch_sizes'])):
        first_run.setdefault(combo, k)

    # Create matrix for heatmap - calculate throughput per request and map actual tokens
    efficiency_matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # Map target -> actual average
    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            k = first_run.get((input_tok, batch_size))
            if k is None:
                continue  # combination was not benchmarked
            # Calculate throughput per request (total throughput / batch size);
            # "or 0" keeps a null throughput from breaking the division
            total_throughput = data['throughput_tps'][k] or 0
            efficiency_matrix[i, j] = total_throughput / batch_size if batch_size > 0 else 0
            actual_for_this_token.append(data['actual_input_tokens'][k])
        # Average actual tokens for this target
        if actual_for_this_token:
            actual_token_map[input_tok] = int(round(sum(actual_for_this_token) / len(actual_for_this_token)))

    # Create heatmap
    _, ax = plt.subplots(figsize=(12, 8))
    # viridis runs from dark (low values) to bright yellow (high values)
    im = ax.imshow(efficiency_matrix, cmap='viridis', aspect='auto')

    # Set ticks and labels using actual token counts
    ax.set_xticks(np.arange(len(unique_batch_sizes)))
    ax.set_yticks(np.arange(len(unique_input_tokens)))
    ax.set_xticklabels(unique_batch_sizes)
    ax.set_yticklabels([f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens])

    # Labels
    ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
    ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
    ax.set_title(f'Efficiency Heatmap (tokens/s per request) - {model_name}',
                 fontweight='bold', fontsize=15, pad=15)

    # Add colorbar
    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label('Throughput per Request (tokens/s)', fontweight='bold', fontsize=11)

    # Add text annotations: white text on the darker cells (below 70% of the
    # maximum), black text on the brightest ones. The threshold is the same
    # for every cell.
    black_from = efficiency_matrix.max() * 0.7 if efficiency_matrix.size else 0
    for (i, j), value in np.ndenumerate(efficiency_matrix):
        if value > 0:
            ax.text(j, i, f'{value:.1f}',
                    ha="center", va="center",
                    color="white" if value < black_from else "black",
                    fontweight='bold', fontsize=10)

    plt.tight_layout()
    output_path = output_dir / 'efficiency_heatmap.png'
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    logging.info(f"Saved: {output_path}")
    plt.close()
# ============================================================================
# MAIN
# ============================================================================
def main():
    """
    Command-line entry point: load a results file and write all plots.

    Reads ``--input`` (required) and ``--output_dir`` (optional, defaults to
    the directory of the input file). When the file cannot be loaded or holds
    no results, an error is logged and the function returns without plotting.
    """
    parser = argparse.ArgumentParser(
        description="Visualize LLM benchmark results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example:
python visualize_results.py --input results/results_model/benchmark_results.json
"""
    )
    parser.add_argument('--input', type=str, required=True,
                        help="Input JSON file with benchmark results")
    parser.add_argument('--output_dir', type=str, default=None,
                        help="Output directory (default: same as input file)")
    args = parser.parse_args()

    # Load results; bail out when nothing usable was found
    results_data = load_results(args.input)
    if not (results_data and 'results' in results_data and results_data['results']):
        logging.error("No valid results found")
        return

    # Set output directory (next to the input file unless overridden)
    output_dir = Path(args.output_dir) if args.output_dir else Path(args.input).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info(f"Output directory: {output_dir}")

    # Extract data
    model_name = results_data.get('model_name', 'Unknown Model')
    data = extract_data(results_data['results'])

    # Generate visualizations, in a fixed order
    logging.info("Generating visualizations...")
    plotters = (
        plot_throughput,
        plot_ttft,
        plot_latency_percentiles,
        plot_throughput_heatmap,
        plot_ttft_heatmap,
        plot_efficiency_heatmap,
    )
    for draw in plotters:
        draw(data, output_dir, model_name)

    # Closing summary, one log record per line
    summary = (
        f"\n{'='*60}",
        "VISUALIZATION COMPLETE!",
        f"{'='*60}",
        f"Visualizations saved to: {output_dir}",
        "Generated plots:",
        " - throughput.png (line chart)",
        " - throughput_heatmap.png (total throughput)",
        " - efficiency_heatmap.png (throughput per request)",
        " - ttft.png (line chart)",
        " - ttft_heatmap.png",
        " - latency_percentiles.png",
    )
    for message in summary:
        logging.info(message)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()