def load_results(input_file: str) -> Dict:
    """
    Load benchmark results from JSON file.

    The file is read as UTF-8. Read and parse failures are logged instead of
    being raised, so callers only need to check for a None result.

    Parameters:
        input_file: Path to the JSON file containing benchmark results

    Returns:
        The decoded JSON document, or None if the file is missing, cannot be
        read, or does not contain valid JSON
    """
    try:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        logging.error(f"File not found: {input_file}")
        return None
    except OSError as e:
        # Permission problems, a directory passed as a file, etc.
        logging.error(f"Could not read {input_file}: {e}")
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        logging.error(f"Invalid JSON: {e}")
        return None

    logging.info(f"Loaded results from {input_file}")
    return data


# (output series, section of a result record, key inside that section)
_METRIC_SOURCES = (
    ('batch_sizes', 'config', 'batch_size'),
    ('throughput_tps', 'throughput', 'concurrent_total_tps'),
    ('latency_mean', 'latency', 'mean'),
    ('latency_p50', 'latency', 'p50'),
    ('latency_p95', 'latency', 'p95'),
    ('latency_p99', 'latency', 'p99'),
    ('ttft_mean', 'ttft', 'mean'),
    ('ttft_p50', 'ttft', 'p50'),
    ('ttft_p90', 'ttft', 'p90'),
    ('avg_batch_throughput', 'batch_metrics', 'avg_batch_throughput'),
)


def _get_or_default(mapping, key, default):
    """Return mapping[key]; fall back to default if mapping is not a dict or the value is missing/None."""
    if not isinstance(mapping, dict):
        return default
    value = mapping.get(key)
    return default if value is None else value


def extract_data(results: List[Dict]) -> Dict:
    """
    Extract plotting data from benchmark results.

    Every result record contributes one entry to each output list, so the
    lists stay index-aligned. Sections or values that are missing or null
    (JSON ``null`` decodes to None, which ``dict.get`` does not replace with
    its default) are recorded as 0 instead of raising or leaking None.

    Parameters:
        results: List of benchmark result dictionaries

    Returns:
        Dictionary of parallel lists: 'input_tokens', 'actual_input_tokens'
        (rounded to the nearest 100), 'batch_sizes', 'throughput_tps',
        'latency_mean', 'latency_p50', 'latency_p95', 'latency_p99',
        'ttft_mean', 'ttft_p50', 'ttft_p90' and 'avg_batch_throughput'
    """
    data = {'input_tokens': [], 'actual_input_tokens': []}
    data.update((series, []) for series, _, _ in _METRIC_SOURCES)

    for result in results:
        config = _get_or_default(result, 'config', {})

        target_input = _get_or_default(config, 'input_tokens', 0)
        # Prefer the measured token count; fall back to the configured target.
        actual_input = _get_or_default(config, 'actual_input_tokens', target_input)

        data['input_tokens'].append(target_input)
        # Round to nearest 100 for cleaner display
        data['actual_input_tokens'].append(round(actual_input / 100) * 100)

        for series, section, key in _METRIC_SOURCES:
            section_values = _get_or_default(result, section, {})
            data[series].append(_get_or_default(section_values, key, 0))

    return data
def plot_ttft(data: Dict, output_dir: Path, model_name: str):
    """
    Plot Time to First Token vs batch size as a line chart.

    One colour is used per distinct target input-token count. For each of
    them the mean TTFT is drawn as a solid line with round markers and the
    P90 TTFT as a dashed line with square markers. The figure is written to
    'ttft.png' inside output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data; the parallel
            lists 'input_tokens', 'batch_sizes', 'ttft_mean', 'ttft_p90' and
            'actual_input_tokens' are read
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked (shown in the title)
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        # One tuple per benchmark run, so each group is filtered in a single pass.
        rows = list(zip(
            data['input_tokens'],
            data['batch_sizes'],
            data['ttft_mean'],
            data['ttft_p90'],
            data['actual_input_tokens'],
        ))

        for i, input_tok in enumerate(sorted(set(data['input_tokens']))):
            group = [row[1:] for row in rows if row[0] == input_tok]

            # Sort by batch size so each line is drawn left to right
            points = sorted(run[:3] for run in group)
            batch_sizes_sorted = [point[0] for point in points]
            ttft_mean_sorted = [point[1] for point in points]
            ttft_p90_sorted = [point[2] for point in points]

            # Use actual average token count for label
            actual_tokens = [run[3] for run in group]
            if actual_tokens:
                avg_actual = int(round(sum(actual_tokens) / len(actual_tokens)))
            else:
                avg_actual = input_tok

            color = COLORS[i % len(COLORS)]
            ax.plot(
                batch_sizes_sorted, ttft_mean_sorted, 'o-', color=color,
                label=f'{avg_actual:,} tokens (mean)', linewidth=3, markersize=12,
                markeredgewidth=2, markeredgecolor='white', zorder=3
            )
            ax.plot(
                batch_sizes_sorted, ttft_p90_sorted, 's--', color=color,
                label=f'{avg_actual:,} tokens (P90)', linewidth=2.5, markersize=9,
                alpha=0.75, markeredgewidth=1.5, markeredgecolor='white', zorder=3
            )

        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Time to First Token (seconds)', fontweight='bold', fontsize=13)
        ax.set_title(f'TTFT vs Batch Size - {model_name}', fontweight='bold', fontsize=15, pad=15)
        ax.legend(fontsize=10, frameon=True, shadow=True, fancybox=True, ncol=2)
        ax.grid(True, alpha=0.3, linestyle='--')

        # A log axis cannot represent non-positive values, so fall back to the
        # linear axis when any batch size is 0 or negative.
        distinct_batch_sizes = set(data['batch_sizes'])
        if len(distinct_batch_sizes) > 1 and min(distinct_batch_sizes) > 0:
            ax.set_xscale('log', base=2)

        fig.tight_layout()
        output_path = output_dir / 'ttft.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure even if plotting or saving failed, so
        # pyplot does not keep it alive.
        plt.close(fig)
def plot_throughput_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot throughput heatmap showing total throughput across batch sizes and input tokens.

    Rows are the distinct target input-token counts, columns the distinct
    batch sizes. When several runs share the same (input tokens, batch size)
    pair, the first one in the data is used. Cells without a run stay at 0
    and are not annotated. The figure is written to 'throughput_heatmap.png'
    inside output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data; the parallel
            lists 'input_tokens', 'batch_sizes', 'throughput_tps' and
            'actual_input_tokens' are read
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked (shown in the title)
    """
    # Get unique values
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index the runs once instead of rescanning every run for every cell.
    # setdefault keeps the first run seen for each (input tokens, batch size).
    first_run = {}
    for input_tok, batch_size, tps, actual in zip(
        data['input_tokens'],
        data['batch_sizes'],
        data['throughput_tps'],
        data['actual_input_tokens'],
    ):
        first_run.setdefault((input_tok, batch_size), (tps, actual))

    # Create matrix for heatmap and map actual tokens
    throughput_matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # Map target -> actual average

    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            run = first_run.get((input_tok, batch_size))
            if run is None:
                continue
            throughput_matrix[i, j] = run[0]
            actual_for_this_token.append(run[1])
        # Average actual tokens for this target
        if actual_for_this_token:
            actual_token_map[input_tok] = int(
                round(sum(actual_for_this_token) / len(actual_for_this_token))
            )

    # Create heatmap
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        im = ax.imshow(throughput_matrix, cmap='YlOrRd', aspect='auto')

        # Set ticks and labels using actual token counts
        ax.set_xticks(np.arange(len(unique_batch_sizes)))
        ax.set_yticks(np.arange(len(unique_input_tokens)))
        ax.set_xticklabels(unique_batch_sizes)
        ax.set_yticklabels([f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens])

        # Labels
        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
        ax.set_title(f'Throughput Heatmap (tokens/second) - {model_name}',
                     fontweight='bold', fontsize=15, pad=15)

        # Add colorbar
        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label('Throughput (tokens/s)', fontweight='bold', fontsize=11)

        # Add text annotations; cells above half of the peak value get white
        # text, the rest black.
        peak = throughput_matrix.max() if throughput_matrix.size else 0
        for i in range(len(unique_input_tokens)):
            for j in range(len(unique_batch_sizes)):
                value = throughput_matrix[i, j]
                if value > 0:
                    ax.text(j, i, f'{value:.0f}',
                            ha="center", va="center",
                            color="white" if value > peak * 0.5 else "black",
                            fontweight='bold', fontsize=10)

        fig.tight_layout()
        output_path = output_dir / 'throughput_heatmap.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)
def plot_efficiency_heatmap(data: Dict, output_dir: Path, model_name: str):
    """
    Plot efficiency heatmap showing throughput per request (tokens/s per worker).

    Each cell is the total throughput of a run divided by its batch size,
    i.e. how many tokens per second a single request in the batch receives.
    Rows are the distinct target input-token counts, columns the distinct
    batch sizes. When several runs share the same (input tokens, batch size)
    pair, the first one in the data is used. Cells without a run, or with a
    batch size of 0 or less, stay at 0 and are not annotated. The figure is
    written to 'efficiency_heatmap.png' inside output_dir.

    Parameters:
        data: Dictionary containing extracted benchmark data; the parallel
            lists 'input_tokens', 'batch_sizes', 'throughput_tps' and
            'actual_input_tokens' are read
        output_dir: Directory where the plot will be saved
        model_name: Name of the model being benchmarked (shown in the title)
    """
    # Get unique values
    unique_batch_sizes = sorted(set(data['batch_sizes']))
    unique_input_tokens = sorted(set(data['input_tokens']))

    # Index the runs once instead of rescanning every run for every cell.
    # setdefault keeps the first run seen for each (input tokens, batch size).
    first_run = {}
    for input_tok, batch_size, tps, actual in zip(
        data['input_tokens'],
        data['batch_sizes'],
        data['throughput_tps'],
        data['actual_input_tokens'],
    ):
        first_run.setdefault((input_tok, batch_size), (tps, actual))

    # Create matrix for heatmap - calculate throughput per request and map actual tokens
    efficiency_matrix = np.zeros((len(unique_input_tokens), len(unique_batch_sizes)))
    actual_token_map = {}  # Map target -> actual average

    for i, input_tok in enumerate(unique_input_tokens):
        actual_for_this_token = []
        for j, batch_size in enumerate(unique_batch_sizes):
            run = first_run.get((input_tok, batch_size))
            if run is None:
                continue
            total_throughput, actual = run
            # Calculate throughput per request (total throughput / batch size)
            efficiency_matrix[i, j] = total_throughput / batch_size if batch_size > 0 else 0
            actual_for_this_token.append(actual)
        # Average actual tokens for this target
        if actual_for_this_token:
            actual_token_map[input_tok] = int(
                round(sum(actual_for_this_token) / len(actual_for_this_token))
            )

    # Create heatmap
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        im = ax.imshow(efficiency_matrix, cmap='viridis', aspect='auto')

        # Set ticks and labels using actual token counts
        ax.set_xticks(np.arange(len(unique_batch_sizes)))
        ax.set_yticks(np.arange(len(unique_input_tokens)))
        ax.set_xticklabels(unique_batch_sizes)
        ax.set_yticklabels([f'{actual_token_map.get(tok, tok):,}' for tok in unique_input_tokens])

        # Labels
        ax.set_xlabel('Batch Size (simultaneous requests)', fontweight='bold', fontsize=13)
        ax.set_ylabel('Input Tokens', fontweight='bold', fontsize=13)
        ax.set_title(f'Efficiency Heatmap (tokens/s per request) - {model_name}',
                     fontweight='bold', fontsize=15, pad=15)

        # Add colorbar
        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label('Throughput per Request (tokens/s)', fontweight='bold', fontsize=11)

        # Add text annotations. viridis is dark at the low end and bright at
        # the high end, so only cells near the peak get black text.
        peak = efficiency_matrix.max() if efficiency_matrix.size else 0
        for i in range(len(unique_input_tokens)):
            for j in range(len(unique_batch_sizes)):
                value = efficiency_matrix[i, j]
                if value > 0:
                    ax.text(j, i, f'{value:.1f}',
                            ha="center", va="center",
                            color="white" if value < peak * 0.7 else "black",
                            fontweight='bold', fontsize=10)

        fig.tight_layout()
        output_path = output_dir / 'efficiency_heatmap.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        logging.info(f"Saved: {output_path}")
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)
{output_dir}") + + # Extract data + model_name = results_data.get('model_name', 'Unknown Model') + data = extract_data(results_data['results']) + + # Generate visualizations + logging.info("Generating visualizations...") + + plot_throughput(data, output_dir, model_name) + plot_ttft(data, output_dir, model_name) + plot_latency_percentiles(data, output_dir, model_name) + plot_throughput_heatmap(data, output_dir, model_name) + plot_ttft_heatmap(data, output_dir, model_name) + plot_efficiency_heatmap(data, output_dir, model_name) + + logging.info(f"\n{'='*60}") + logging.info("VISUALIZATION COMPLETE!") + logging.info(f"{'='*60}") + logging.info(f"Visualizations saved to: {output_dir}") + logging.info("Generated plots:") + logging.info(" - throughput.png (line chart)") + logging.info(" - throughput_heatmap.png (total throughput)") + logging.info(" - efficiency_heatmap.png (throughput per request)") + logging.info(" - ttft.png (line chart)") + logging.info(" - ttft_heatmap.png") + logging.info(" - latency_percentiles.png") + + +if __name__ == '__main__': + main() diff --git a/ubiops-deployments/README.md b/ubiops-deployments/README.md new file mode 100644 index 0000000..069ae2c --- /dev/null +++ b/ubiops-deployments/README.md @@ -0,0 +1,47 @@ +# ubiops-deployments + +UbiOps export (format spec `v8.0`, exported 2026-06-02) bundling the +deployments behind the MinDef metadata/throughput setup. All deployments are +OpenAI-compatible and run in request format (`supports_request_format: true`) +with `plain` input/output. 
+ +## Layout + +``` +deployments/ +├── deployment-gpt-oss-chat/ # the LLM serving deployment +│ └── deployment_gpt-oss-120b.yaml +├── deployments-embedder/ # embedding model +│ └── deployment_bge-m3/ +└── deployments-proxies/ # OpenAI-compatible proxy deployments + ├── deployment_llm-proxy/ + └── deployment_proxy-gpt-oss-batch-3x/ +``` + +Each deployment folder holds its `deployment_*.yaml` (deployment config) and a +`versions/` folder with one `*.yaml` + `*.zip` per version (the YAML is the +version config, the ZIP is the packaged code). + +## Deployments + +| Deployment | Default version | Purpose | +|---|---|---| +| `gpt-oss-120b` | `v-gpt-120b-tool-calling` | Serves `openai/gpt-oss-120b` via vLLM on a `16gb_8vcpu_rtxpro` GPU instance. | +| `bge-m3` | `v3` | BGE-M3 embedding model. | +| `llm-proxy` | `v11` | OpenAI-compatible proxy routing requests to UbiOps deployments. | +| `proxy-gpt-oss-batch-3x` | `v1` | Proxy fanning batch requests across GPT-OSS instances. | + +## Configuration + +Secrets are exported empty and must be set per environment: + +- `gpt-oss-120b` — `HF_TOKEN` (secret), `MODEL_NAME` (`openai/gpt-oss-120b`). + The serving version also sets `VLLM_USE_V1=1`, `GPU_MEMORY_UTILIZATION=0.90`, + and `MAX_MODEL_LEN=125000`, with a `/health` check on port 8000. +- `llm-proxy` — `UBIOPS_API_TOKEN` (secret). + +## Importing + +Import this directory as a UbiOps project export (e.g. via +`ubiops project_export create`), then fill in the secret environment variables +listed above before sending requests. 
diff --git a/ubiops-deployments/deployments/deployment-gpt-oss-chat/deployment_gpt-oss-120b.yaml b/ubiops-deployments/deployments/deployment-gpt-oss-chat/deployment_gpt-oss-120b.yaml new file mode 100644 index 0000000..d6c0768 --- /dev/null +++ b/ubiops-deployments/deployments/deployment-gpt-oss-chat/deployment_gpt-oss-120b.yaml @@ -0,0 +1,17 @@ +deployment_name: "gpt-oss-120b" +deployment_description: "" +deployment_labels: + type: "openai-compatible" + import: "0a5d8365-be73-4ab7-9933-2fb93468a8de" +default_version: "v-gpt-120b-tool-calling" +supports_request_format: True +input_type: "plain" +input_fields: [] +output_type: "plain" +output_fields: [] +deployment_environment_variables: + - name: "HF_TOKEN" + value: "" + secret: True + - name: "MODEL_NAME" + value: "openai/gpt-oss-120b" \ No newline at end of file diff --git a/ubiops-deployments/deployments/deployment-gpt-oss-chat/info.yaml b/ubiops-deployments/deployments/deployment-gpt-oss-chat/info.yaml new file mode 100644 index 0000000..954e8fd --- /dev/null +++ b/ubiops-deployments/deployments/deployment-gpt-oss-chat/info.yaml @@ -0,0 +1,3 @@ +format_spec: v8.0 +metadata: + export_date: 2026-06-02T07:53:04.101874+00:00 \ No newline at end of file diff --git a/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling-max-12.yaml b/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling-max-12.yaml new file mode 100644 index 0000000..7a1fb79 --- /dev/null +++ b/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling-max-12.yaml @@ -0,0 +1,33 @@ +version_name: "v-gpt-120b-tool-calling-max-12" +version_description: "" +version_labels: + import: "0a5d8365-be73-4ab7-9933-2fb93468a8de" + model-names: "openai/gpt-oss-120b" + openai-compatible: "true" +environment: "python3-12" +instance_type: "16gb_8vcpu_rtxpro" +static_ip: False 
+minimum_instances: 1 +maximum_instances: 1 +maximum_idle_time: 10 +request_retention_mode: "full" +request_retention_time: 2419200 +maximum_queue_size: 100000 +scaling_strategy: "default" +instance_processes: 20 +health_check: + path: "/health" + port: 8000 + timeout: 3 + interval: 5 + failure_threshold: 3 +ports: [] +version_environment_variables: + - name: "VLLM_USE_V1" + value: "1" + - name: "MODEL_NAME" + value: "openai/gpt-oss-120b" + - name: "GPU_MEMORY_UTILIZATION" + value: "0.90" + - name: "MAX_MODEL_LEN" + value: "125000" \ No newline at end of file diff --git a/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling-max-12.zip b/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling-max-12.zip new file mode 100644 index 0000000..a8a4f8c Binary files /dev/null and b/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling-max-12.zip differ diff --git a/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling.yaml b/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling.yaml new file mode 100644 index 0000000..1401ee9 --- /dev/null +++ b/ubiops-deployments/deployments/deployment-gpt-oss-chat/versions/deployment_gpt-oss-120b_version_v-gpt-120b-tool-calling.yaml @@ -0,0 +1,33 @@ +version_name: "v-gpt-120b-tool-calling" +version_description: "" +version_labels: + import: "0a5d8365-be73-4ab7-9933-2fb93468a8de" + model-names: "openai/gpt-oss-120b" + openai-compatible: "true" +environment: "python3-12" +instance_type: "16gb_8vcpu_rtxpro" +static_ip: False +minimum_instances: 1 +maximum_instances: 1 +maximum_idle_time: 10 +request_retention_mode: "full" +request_retention_time: 2419200 +maximum_queue_size: 100000 +scaling_strategy: "default" 
+instance_processes: 20 +health_check: + path: "/health" + port: 8000 + timeout: 3 + interval: 5 + failure_threshold: 3 +ports: [] +version_environment_variables: + - name: "VLLM_USE_V1" + value: "1" + - name: "MODEL_NAME" + value: "openai/gpt-oss-120b" + - name: "GPU_MEMORY_UTILIZATION" + value: "0.90" + - name: "MAX_MODEL_LEN" + value: "125000" \ No newline at end of file