584 lines
23 KiB
Python
584 lines
23 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
LLM Benchmark Visualization Tool

Generates key performance visualizations:
- Throughput vs Batch Size (line chart and heatmap)
- Time to First Token (TTFT) vs Batch Size (line chart and heatmap)
- Latency Percentiles
- Efficiency heatmap (throughput per request)
|
|
"""
|
|
|
|
import argparse
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
|
|
|
|
# ============================================================================
|
|
# CONFIGURATION
|
|
# ============================================================================
|
|
|
|
# Timestamped, INFO-level logging for every step of the tool.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Global plot styling. The explicit font size is set after the seaborn calls
# so that it is the value left in rcParams.
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.1)
plt.rcParams.update({'font.size': 11})

# Line colours; the line-chart functions cycle through them, one per series.
COLORS = [
    '#2E86AB',
    '#E63946',
    '#06A77D',
    '#F77F00',
    '#9B59B6',
    '#E74C3C',
]
|
|
|
|
# ============================================================================
|
|
# DATA LOADING
|
|
# ============================================================================
|
|
|
|
def load_results(input_file: str) -> Optional[Dict]:
    """
    Load benchmark results from a JSON file.

    The file is always decoded as UTF-8 rather than with the platform's
    default encoding, so non-ASCII content loads the same way everywhere.

    Parameters:
        input_file: Path to the JSON file containing benchmark results

    Returns:
        The parsed JSON content (expected to be a dictionary), or None if the
        file does not exist, is not valid UTF-8, or is not valid JSON. These
        failures are logged rather than raised; other I/O errors propagate.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        logging.error(f"File not found: {input_file}")
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Undecodable bytes are reported like any other malformed input
        # instead of crashing the tool with a traceback.
        logging.error(f"Invalid JSON: {e}")
        return None

    logging.info(f"Loaded results from {input_file}")
    return data
|
|
|
|
|
|
def extract_data(results: List[Dict]) -> Dict:
    """
    Extract plotting data from benchmark results.

    Each result entry contributes one value to every output list, so all
    lists have the same length and index k always refers to the same run.
    Sections or metrics that are absent default to 0. Sections that are
    present but null in the JSON are treated as absent instead of raising.

    Parameters:
        results: List of benchmark result dictionaries

    Returns:
        Dictionary mapping each column name ('input_tokens',
        'actual_input_tokens', 'batch_sizes', 'throughput_tps', 'latency_*',
        'ttft_*', 'avg_batch_throughput') to a list of per-run values.
        'actual_input_tokens' is rounded to the nearest 100 for display.
    """
    # (output column, section of the result entry, key inside that section)
    metric_sources = (
        ('throughput_tps', 'throughput', 'concurrent_total_tps'),
        ('latency_mean', 'latency', 'mean'),
        ('latency_p50', 'latency', 'p50'),
        ('latency_p95', 'latency', 'p95'),
        ('latency_p99', 'latency', 'p99'),
        ('ttft_mean', 'ttft', 'mean'),
        ('ttft_p50', 'ttft', 'p50'),
        ('ttft_p90', 'ttft', 'p90'),
        ('avg_batch_throughput', 'batch_metrics', 'avg_batch_throughput'),
    )

    data = {'input_tokens': [], 'actual_input_tokens': [], 'batch_sizes': []}
    for column, _section, _key in metric_sources:
        data[column] = []

    for result in results:
        # `or {}` also covers a section stored as an explicit null.
        config = result.get('config') or {}

        target_input = config.get('input_tokens') or 0
        # Prefer the measured token count; fall back to the target when the
        # measurement is missing or null.
        actual_input = config.get('actual_input_tokens')
        if actual_input is None:
            actual_input = target_input

        data['input_tokens'].append(target_input)
        # Round to the nearest 100 for cleaner display.
        data['actual_input_tokens'].append(round(actual_input / 100) * 100)
        data['batch_sizes'].append(config.get('batch_size', 0))

        for column, section, key in metric_sources:
            values = result.get(section) or {}
            data[column].append(values.get(key, 0))

    return data
|
|
|
|
|
|
# ============================================================================
|
|
# PLOTTING FUNCTIONS
|
|
# ============================================================================
|
|
|
|
def plot_throughput(data: Dict, output_dir: Path, model_name: str):
    """
    Plot throughput vs batch size as a line chart.

    One line is drawn per distinct target input-token count, with points
    ordered by batch size. The figure is written to `throughput.png` inside
    output_dir and is always closed afterwards, even if plotting or saving
    fails.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        input_tokens = data['input_tokens']

        for i, input_tok in enumerate(sorted(set(input_tokens))):
            # Indices of the runs that used this target input-token count.
            rows = [j for j, tok in enumerate(input_tokens) if tok == input_tok]

            # Order the points by batch size so the line runs left to right.
            points = sorted(
                (data['batch_sizes'][j], data['throughput_tps'][j]) for j in rows
            )
            batch_sizes = [p[0] for p in points]
            throughput = [p[1] for p in points]

            # Label the series with the average measured token count.
            actual_tokens = [data['actual_input_tokens'][j] for j in rows]
            if actual_tokens:
                avg_actual = int(round(sum(actual_tokens) / len(actual_tokens)))
            else:
                avg_actual = input_tok

            ax.plot(
                batch_sizes, throughput, 'o-', color=COLORS[i % len(COLORS)],
                label=f'{avg_actual:,} tokens', linewidth=3, markersize=12,
                markeredgewidth=2, markeredgecolor='white', zorder=3
            )

        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Throughput (tokens/second)', fontweight='bold', fontsize=13)
        ax.set_title(f'Throughput vs Batch Size - {model_name}', fontweight='bold', fontsize=15, pad=15)
        ax.legend(fontsize=11, frameon=True, shadow=True, fancybox=True)
        ax.grid(True, alpha=0.3, linestyle='--')

        # A log2 axis is only applied when there is more than one batch size.
        if len(set(data['batch_sizes'])) > 1:
            ax.set_xscale('log', base=2)
        ax.set_ylim(bottom=0)

        fig.tight_layout()
        output_path = output_dir / 'throughput.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure rather than pyplot's "current" one, and
        # do so even on failure so the figure is not left registered.
        plt.close(fig)
|
|
|
|
|
|
def plot_ttft(data: Dict, output_dir: Path, model_name: str):
    """
    Plot Time to First Token vs batch size as a line chart.

    For every distinct target input-token count two lines are drawn in the
    same colour: the mean TTFT (solid) and the P90 TTFT (dashed). The figure
    is written to `ttft.png` inside output_dir and is always closed
    afterwards, even if plotting or saving fails.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        input_tokens = data['input_tokens']

        for i, input_tok in enumerate(sorted(set(input_tokens))):
            # Indices of the runs that used this target input-token count.
            rows = [j for j, tok in enumerate(input_tokens) if tok == input_tok]

            # Order the points by batch size so the lines run left to right.
            points = sorted(
                (data['batch_sizes'][j], data['ttft_mean'][j], data['ttft_p90'][j])
                for j in rows
            )
            batch_sizes = [p[0] for p in points]
            ttft_mean = [p[1] for p in points]
            ttft_p90 = [p[2] for p in points]

            # Label the series with the average measured token count.
            actual_tokens = [data['actual_input_tokens'][j] for j in rows]
            if actual_tokens:
                avg_actual = int(round(sum(actual_tokens) / len(actual_tokens)))
            else:
                avg_actual = input_tok

            color = COLORS[i % len(COLORS)]
            ax.plot(
                batch_sizes, ttft_mean, 'o-', color=color,
                label=f'{avg_actual:,} tokens (mean)', linewidth=3, markersize=12,
                markeredgewidth=2, markeredgecolor='white', zorder=3
            )
            ax.plot(
                batch_sizes, ttft_p90, 's--', color=color,
                label=f'{avg_actual:,} tokens (P90)', linewidth=2.5, markersize=9,
                alpha=0.75, markeredgewidth=1.5, markeredgecolor='white', zorder=3
            )

        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Time to First Token (seconds)', fontweight='bold', fontsize=13)
        ax.set_title(f'TTFT vs Batch Size - {model_name}', fontweight='bold', fontsize=15, pad=15)
        ax.legend(fontsize=10, frameon=True, shadow=True, fancybox=True, ncol=2)
        ax.grid(True, alpha=0.3, linestyle='--')

        # A log2 axis is only applied when there is more than one batch size.
        if len(set(data['batch_sizes'])) > 1:
            ax.set_xscale('log', base=2)

        fig.tight_layout()
        output_path = output_dir / 'ttft.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure rather than pyplot's "current" one, and
        # do so even on failure so the figure is not left registered.
        plt.close(fig)
|
|
|
|
|
|
def plot_latency_percentiles(data: Dict, output_dir: Path, model_name: str):
    """
    Plot latency percentiles (mean, P50, P95, P99) vs batch size.

    For every distinct target input-token count four lines are drawn in the
    same colour, distinguished by marker shape and decreasing line weight.
    The figure is written to `latency_percentiles.png` inside output_dir and
    is always closed afterwards, even if plotting or saving fails.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    # (data column, legend suffix, line format, per-curve style) in draw order.
    curves = (
        ('latency_mean', 'mean', 'o-', {'linewidth': 2.5, 'markersize': 10, 'zorder': 4}),
        ('latency_p50', 'P50', 's-', {'linewidth': 2, 'markersize': 8, 'alpha': 0.8, 'zorder': 3}),
        ('latency_p95', 'P95', '^-', {'linewidth': 1.8, 'markersize': 7, 'alpha': 0.7, 'zorder': 2}),
        ('latency_p99', 'P99', 'v-', {'linewidth': 1.5, 'markersize': 6, 'alpha': 0.6, 'zorder': 1}),
    )
    columns = ('batch_sizes',) + tuple(curve[0] for curve in curves)

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        input_tokens = data['input_tokens']
        unique_input_tokens = sorted(set(input_tokens))

        for i, input_tok in enumerate(unique_input_tokens):
            # Indices of the runs that used this target input-token count.
            rows = [j for j, tok in enumerate(input_tokens) if tok == input_tok]

            # One tuple per run, batch size first, ordered by batch size so
            # the lines run left to right.
            points = sorted(tuple(data[col][j] for col in columns) for j in rows)
            batch_sizes = [p[0] for p in points]

            # Label the series with the average measured token count.
            actual_tokens = [data['actual_input_tokens'][j] for j in rows]
            if actual_tokens:
                avg_actual = int(round(sum(actual_tokens) / len(actual_tokens)))
            else:
                avg_actual = input_tok

            color = COLORS[i % len(COLORS)]
            for position, (_column, suffix, fmt, style) in enumerate(curves, start=1):
                ax.plot(
                    batch_sizes, [p[position] for p in points], fmt, color=color,
                    label=f'{avg_actual:,} ({suffix})',
                    markeredgewidth=1.5, markeredgecolor='white', **style
                )

        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Latency (seconds)', fontweight='bold', fontsize=13)
        ax.set_title(f'Latency Percentiles - {model_name}', fontweight='bold', fontsize=15, pad=15)
        # One legend column per input-token series; never fewer than one.
        ax.legend(fontsize=9, frameon=True, shadow=True, fancybox=True,
                  ncol=max(1, len(unique_input_tokens)))
        ax.grid(True, alpha=0.3, linestyle='--')

        # A log2 axis is only applied when there is more than one batch size.
        if len(set(data['batch_sizes'])) > 1:
            ax.set_xscale('log', base=2)

        fig.tight_layout()
        output_path = output_dir / 'latency_percentiles.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure rather than pyplot's "current" one, and
        # do so even on failure so the figure is not left registered.
        plt.close(fig)
|
|
|
|
|
|
def plot_throughput_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot throughput heatmap showing total throughput across batch sizes and input tokens.

    Rows are the distinct target input-token counts and columns the distinct
    batch sizes. A combination without a data point keeps the value 0 and is
    left unannotated; when a combination occurs more than once, the first
    occurrence is used. The figure is written to `throughput_heatmap.png`
    inside output_dir and is always closed afterwards.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index of the first run for each (input tokens, batch size) pair, so each
    # cell is filled with one dictionary lookup instead of a scan over all runs.
    first_index = {}
    for k, cell in enumerate(zip(data['input_tokens'], data['batch_sizes'])):
        first_index.setdefault(cell, k)

    throughput_matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # target input tokens -> average actual input tokens

    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            k = first_index.get((input_tok, batch_size))
            if k is None:
                continue
            throughput_matrix[i, j] = data['throughput_tps'][k]
            actual_for_this_token.append(data['actual_input_tokens'][k])
        if actual_for_this_token:
            actual_token_map[input_tok] = int(round(sum(actual_for_this_token) / len(actual_for_this_token)))

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        im = ax.imshow(throughput_matrix, cmap='YlOrRd', aspect='auto')

        # Tick labels: batch sizes on x, average actual token counts on y.
        ax.set_xticks(np.arange(len(unique_batch_sizes)))
        ax.set_yticks(np.arange(len(unique_input_tokens)))
        ax.set_xticklabels(unique_batch_sizes)
        ax.set_yticklabels([f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens])

        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
        ax.set_title(f'Throughput Heatmap (tokens/second) - {model_name}',
                     fontweight='bold', fontsize=15, pad=15)

        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label('Throughput (tokens/s)', fontweight='bold', fontsize=11)

        # Annotate non-empty cells; cells above half the maximum get white
        # text, the rest black.
        white_above = throughput_matrix.max() * 0.5 if throughput_matrix.size else 0
        for i, row in enumerate(throughput_matrix):
            for j, value in enumerate(row):
                if value > 0:
                    ax.text(j, i, f'{value:.0f}',
                            ha="center", va="center",
                            color="white" if value > white_above else "black",
                            fontweight='bold', fontsize=10)

        fig.tight_layout()
        output_path = output_dir / 'throughput_heatmap.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure even when plotting or saving fails.
        plt.close(fig)
|
|
|
|
|
|
def plot_ttft_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot TTFT heatmap showing time to first token across batch sizes and input tokens.

    Rows are the distinct target input-token counts and columns the distinct
    batch sizes. A combination without a data point, or whose mean TTFT is
    missing/zero, keeps the value 0 and is left unannotated; when a
    combination occurs more than once, the first occurrence is used. The
    figure is written to `ttft_heatmap.png` inside output_dir and is always
    closed afterwards.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index of the first run for each (input tokens, batch size) pair, so each
    # cell is filled with one dictionary lookup instead of a scan over all runs.
    first_index = {}
    for k, cell in enumerate(zip(data['input_tokens'], data['batch_sizes'])):
        first_index.setdefault(cell, k)

    ttft_matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # target input tokens -> average actual input tokens

    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            k = first_index.get((input_tok, batch_size))
            if k is None:
                continue
            # A missing (None) TTFT is stored as 0, i.e. "no value".
            ttft_matrix[i, j] = data['ttft_mean'][k] or 0
            actual_for_this_token.append(data['actual_input_tokens'][k])
        if actual_for_this_token:
            actual_token_map[input_tok] = int(round(sum(actual_for_this_token) / len(actual_for_this_token)))

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        # Yellow-green-blue sequential colormap, chosen to set the TTFT
        # heatmap apart from the throughput heatmap.
        im = ax.imshow(ttft_matrix, cmap='YlGnBu', aspect='auto')

        # Tick labels: batch sizes on x, average actual token counts on y.
        ax.set_xticks(np.arange(len(unique_batch_sizes)))
        ax.set_yticks(np.arange(len(unique_input_tokens)))
        ax.set_xticklabels(unique_batch_sizes)
        ax.set_yticklabels([f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens])

        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
        ax.set_title(f'Time to First Token Heatmap (seconds) - {model_name}',
                     fontweight='bold', fontsize=15, pad=15)

        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label('TTFT (seconds)', fontweight='bold', fontsize=11)

        # Annotate non-empty cells; cells above half the maximum get white
        # text, the rest black.
        white_above = ttft_matrix.max() * 0.5 if ttft_matrix.size else 0
        for i, row in enumerate(ttft_matrix):
            for j, value in enumerate(row):
                if value > 0:
                    ax.text(j, i, f'{value:.2f}',
                            ha="center", va="center",
                            color="white" if value > white_above else "black",
                            fontweight='bold', fontsize=10)

        fig.tight_layout()
        output_path = output_dir / 'ttft_heatmap.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure even when plotting or saving fails.
        plt.close(fig)
|
|
|
|
|
|
def plot_efficiency_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot efficiency heatmap showing throughput per request (tokens/s per worker).

    Each cell is the total throughput of a run divided by its batch size,
    which shows how efficiently each individual request in a batch is
    processed. Rows are the distinct target input-token counts and columns
    the distinct batch sizes. A combination without a data point (or with a
    non-positive batch size) keeps the value 0 and is left unannotated; when
    a combination occurs more than once, the first occurrence is used. The
    figure is written to `efficiency_heatmap.png` inside output_dir and is
    always closed afterwards.

    Parameters:
        data: Dictionary containing extracted benchmark data
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked
    """
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index of the first run for each (input tokens, batch size) pair, so each
    # cell is filled with one dictionary lookup instead of a scan over all runs.
    first_index = {}
    for k, cell in enumerate(zip(data['input_tokens'], data['batch_sizes'])):
        first_index.setdefault(cell, k)

    efficiency_matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # target input tokens -> average actual input tokens

    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            k = first_index.get((input_tok, batch_size))
            if k is None:
                continue
            # Throughput per request = total throughput / batch size.
            total_throughput = data['throughput_tps'][k]
            efficiency_matrix[i, j] = total_throughput / batch_size if batch_size > 0 else 0
            actual_for_this_token.append(data['actual_input_tokens'][k])
        if actual_for_this_token:
            actual_token_map[input_tok] = int(round(sum(actual_for_this_token) / len(actual_for_this_token)))

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        im = ax.imshow(efficiency_matrix, cmap='viridis', aspect='auto')

        # Tick labels: batch sizes on x, average actual token counts on y.
        ax.set_xticks(np.arange(len(unique_batch_sizes)))
        ax.set_yticks(np.arange(len(unique_input_tokens)))
        ax.set_xticklabels(unique_batch_sizes)
        ax.set_yticklabels([f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens])

        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
        ax.set_title(f'Efficiency Heatmap (tokens/s per request) - {model_name}',
                     fontweight='bold', fontsize=15, pad=15)

        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label('Throughput per Request (tokens/s)', fontweight='bold', fontsize=11)

        # Annotate non-empty cells. The test is inverted relative to the other
        # heatmaps: cells below 70% of the maximum get white text (dark
        # background), the rest black (light background).
        white_below = efficiency_matrix.max() * 0.7 if efficiency_matrix.size else 0
        for i, row in enumerate(efficiency_matrix):
            for j, value in enumerate(row):
                if value > 0:
                    ax.text(j, i, f'{value:.1f}',
                            ha="center", va="center",
                            color="white" if value < white_below else "black",
                            fontweight='bold', fontsize=10)

        fig.tight_layout()
        output_path = output_dir / 'efficiency_heatmap.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure even when plotting or saving fails.
        plt.close(fig)
|
|
|
|
|
|
# ============================================================================
|
|
# MAIN
|
|
# ============================================================================
|
|
|
|
def main():
    """
    Command-line entry point: load a results file and write all plots.

    Reads the JSON file given by --input, extracts the plotting data from its
    'results' list, and writes the six PNG plots into --output_dir (default:
    the directory containing the input file), creating it if necessary.

    Returns:
        0 on success, 1 if the input could not be loaded or contains no
        results, so that the process exit status reflects the failure.
    """
    parser = argparse.ArgumentParser(
        description="Visualize LLM benchmark results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example:
python visualize_results.py --input results/results_model/benchmark_results.json
"""
    )
    parser.add_argument(
        '--input',
        type=str,
        required=True,
        help="Input JSON file with benchmark results"
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help="Output directory (default: same as input file)"
    )
    args = parser.parse_args()

    # Load results; load_results() has already logged the specific reason
    # when it returns None.
    results_data = load_results(args.input)
    if not results_data or 'results' not in results_data or not results_data['results']:
        logging.error("No valid results found")
        return 1

    # Plots go next to the input file unless a directory was given.
    output_dir = Path(args.output_dir) if args.output_dir else Path(args.input).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info(f"Output directory: {output_dir}")

    model_name = results_data.get('model_name', 'Unknown Model')
    data = extract_data(results_data['results'])

    logging.info("Generating visualizations...")
    for plot in (
        plot_throughput,
        plot_ttft,
        plot_latency_percentiles,
        plot_throughput_heatmap,
        plot_ttft_heatmap,
        plot_efficiency_heatmap,
    ):
        plot(data, output_dir, model_name)

    logging.info(f"\n{'='*60}")
    logging.info("VISUALIZATION COMPLETE!")
    logging.info(f"{'='*60}")
    logging.info(f"Visualizations saved to: {output_dir}")
    logging.info("Generated plots:")
    logging.info(" - throughput.png (line chart)")
    logging.info(" - throughput_heatmap.png (total throughput)")
    logging.info(" - efficiency_heatmap.png (throughput per request)")
    logging.info(" - ttft.png (line chart)")
    logging.info(" - ttft_heatmap.png")
    logging.info(" - latency_percentiles.png")
    return 0


if __name__ == '__main__':
    # Propagate main()'s status code as the process exit status.
    raise SystemExit(main())
|