diff --git a/llm-throughput-tests-mindef-metadateren/results/results_QuantTrio_Qwen3_5-35B-A3B-AWQ/ttft_heatmap.png b/llm-throughput-tests-mindef-metadateren/results/results_QuantTrio_Qwen3_5-35B-A3B-AWQ/ttft_heatmap.png new file mode 100644 index 0000000..c0372a3 Binary files /dev/null and b/llm-throughput-tests-mindef-metadateren/results/results_QuantTrio_Qwen3_5-35B-A3B-AWQ/ttft_heatmap.png differ diff --git a/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-2x/benchmark_results.json b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-2x/benchmark_results.json new file mode 100644 index 0000000..09edc96 --- /dev/null +++ b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-2x/benchmark_results.json @@ -0,0 +1,58 @@ +{ + "timestamp": "2026-03-25T17:31:40.541181", + "model_name": "openai-gpt-oss-120b-2x", + "results": [ + { + "config": { + "input_tokens": 50000, + "output_tokens": 1024, + "batch_size": 64, + "num_batches": 1, + "total_requests": 64, + "actual_input_tokens": 40443 + }, + "success_metrics": { + "success_rate": 50.0, + "successful_requests": 32, + "failed_requests": 32 + }, + "latency": { + "mean": 105.486, + "std": 0.648, + "min": 104.136, + "max": 106.488, + "p50": 105.577, + "p95": 106.447, + "p99": 106.484, + "ci_95_lower": 105.262, + "ci_95_upper": 105.711 + }, + "ttft": { + "mean": 105.486, + "std": 0.648, + "p50": 105.577, + "p90": 106.345 + }, + "tokens": { + "total_generated": 32768, + "content_tokens": 32768, + "reasoning_tokens": 0, + "avg_per_request": 1024.0 + }, + "throughput": { + "concurrent_total_tps": 307.59, + "concurrent_content_tps": 307.59, + "requests_per_second": 0.3, + "actual_wall_time": 106.533, + "efficiency_percent": 49.51 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 32.0, + "avg_batch_throughput": 307.59, + "min_batch_throughput": 307.59, + "max_batch_throughput": 307.59 + } + } + ] +} \ No newline at end of file diff --git a/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-2x/config_used.yaml b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-2x/config_used.yaml new file mode 100644 index 0000000..bf29fb1 --- /dev/null +++ b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-2x/config_used.yaml @@ -0,0 +1,20 @@ +endpoint: + url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1 + api_key: + model_name: openai-gpt-oss-120b-2x +benchmark: + input_tokens: + - 50000 + batch_sizes: + - 64 + num_batches: 1 + output_tokens: 1024 + dataset: test_conversations.json + text: null +runtime: + request_timeout: 1800 + delay_between_runs: 5 + log_io: true + wait_for_ready: true + max_init_retries: 10 + init_retry_delay: 30 diff --git a/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-litellm-streamiing/benchmark_results.json b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-litellm-streamiing/benchmark_results.json new file mode 100644 index 0000000..d28ed56 --- /dev/null +++ b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-litellm-streamiing/benchmark_results.json @@ -0,0 +1,58 @@ +{ + "timestamp": "2026-03-12T09:40:09.623487", + "model_name": "openai-gpt-oss-120b", + "results": [ + { + "config": { + "input_tokens": 50000, + "output_tokens": 512, + "batch_size": 64, + "num_batches": 1, + "total_requests": 64, + "actual_input_tokens": 40613 + }, + "success_metrics": { + "success_rate": 26.56, + "successful_requests": 17, + "failed_requests": 47 + }, + "latency": { + "mean": 101.447, + "std": 0.463, + "min": 100.168, + "max": 102.338, + "p50": 101.438, + "p95": 102.129, + "p99": 102.296, + "ci_95_lower": 101.227, + "ci_95_upper": 101.668 + }, + "ttft": { + "mean": 82.918, + "std": 6.278, + "p50": 82.795, + "p90": 86.47 + }, + "tokens": { + "total_generated": 8704, + "content_tokens": 6245, + "reasoning_tokens": 2459, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 84.04, + "concurrent_content_tps": 60.3, + "requests_per_second": 0.16, + "actual_wall_time": 103.567, + "efficiency_percent": 26.02 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 17.0, + "avg_batch_throughput": 84.04, + "min_batch_throughput": 84.04, + "max_batch_throughput": 84.04 + } + } + ] +} \ No newline at end of file diff --git a/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-litellm-streamiing/config_used.yaml b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-litellm-streamiing/config_used.yaml new file mode 100644 index 0000000..0588794 --- /dev/null +++ b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-litellm-streamiing/config_used.yaml @@ -0,0 +1,20 @@ +endpoint: + url: https://256980a6-7a84-4342-8481-7e0b7d838076.services.external.0a71m37v.ubiops.io/v1 + api_key: + model_name: openai-gpt-oss-120b +benchmark: + input_tokens: + - 50000 + batch_sizes: + - 64 + num_batches: 1 + output_tokens: 512 + dataset: test_conversations.json + text: null +runtime: + request_timeout: 1800 + delay_between_runs: 5 + log_io: true + wait_for_ready: true + max_init_retries: 10 + init_retry_delay: 30 diff --git a/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-max-16/benchmark_results.json b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-max-16/benchmark_results.json new file mode 100644 index 0000000..98eaa27 --- /dev/null +++ b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-max-16/benchmark_results.json @@ -0,0 +1,58 @@ +{ + "timestamp": "2026-03-17T09:42:27.751665", + "model_name": "openai-gpt-oss-120b-max-16", + "results": [ + { + "config": { + "input_tokens": 50000, + "output_tokens": 1024, + "batch_size": 16, + "num_batches": 1, + "total_requests": 16, + "actual_input_tokens": 40691 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 16, + "failed_requests": 0 + }, + "latency": { + "mean": 105.895, + "std": 0.51, + "min": 104.893, + "max": 106.744, + "p50": 105.983, + "p95": 106.564, + "p99": 106.708, + "ci_95_lower": 105.645, + "ci_95_upper": 106.144 + }, + "ttft": { + "mean": 73.976, + "std": 2.6, + "p50": 73.007, + "p90": 76.235 + }, + "tokens": { + "total_generated": 16384, + "content_tokens": 14467, + "reasoning_tokens": 1917, + "avg_per_request": 1024.0 + }, + "throughput": { + "concurrent_total_tps": 153.35, + "concurrent_content_tps": 135.41, + "requests_per_second": 0.15, + "actual_wall_time": 106.84, + "efficiency_percent": 99.11 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 16.0, + "avg_batch_throughput": 153.35, + "min_batch_throughput": 153.35, + "max_batch_throughput": 153.35 + } + } + ] +} \ No newline at end of file diff --git a/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-max-16/config_used.yaml b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-max-16/config_used.yaml new file mode 100644 index 0000000..97fb47a --- /dev/null +++ b/llm-throughput-tests-mindef-metadateren/results/results_openai-gpt-oss-120b-max-16/config_used.yaml @@ -0,0 +1,20 @@ +endpoint: + url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1 + api_key: + model_name: openai-gpt-oss-120b-max-16 +benchmark: + input_tokens: + - 50000 + batch_sizes: + - 128 + num_batches: 1 + output_tokens: 1024 + dataset: test_conversations.json + text: null +runtime: + request_timeout: 1800 + delay_between_runs: 5 + log_io: true + wait_for_ready: true + max_init_retries: 10 + init_retry_delay: 30 diff --git a/llm-throughput-tests-mindef-metadateren/results/results_openai_gpt-oss-120b_services/benchmark_results.json b/llm-throughput-tests-mindef-metadateren/results/results_openai_gpt-oss-120b_services/benchmark_results.json new file mode 100644 index 0000000..ee34156 --- /dev/null +++ b/llm-throughput-tests-mindef-metadateren/results/results_openai_gpt-oss-120b_services/benchmark_results.json @@ -0,0 +1,1254 @@ +{ + "timestamp": "2026-01-22T14:46:31.276437", + "model_name": "openai/gpt-oss-120b", + "results": [ + { + "config": { + "input_tokens": 1000, + "output_tokens": 512, + "batch_size": 1, + "num_batches": 1, + "total_requests": 1, + "actual_input_tokens": 1268 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 1, + "failed_requests": 0 + }, + "latency": { + "mean": 3.25, + "std": 0.0, + "min": 3.25, + "max": 3.25, + "p50": 3.25, + "p95": 3.25, + "p99": 3.25, + "ci_95_lower": 3.25, + "ci_95_upper": 3.25 + }, + "ttft": { + "mean": 1.228, + "std": 0.0, + "p50": 1.228, + "p90": 1.228 + }, + "tokens": { + "total_generated": 512, + "content_tokens": 512, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 157.56, + "concurrent_content_tps": 157.56, + "requests_per_second": 0.31, + "actual_wall_time": 3.25, + "efficiency_percent": 100.0 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 1.0, + "avg_batch_throughput": 157.56, + "min_batch_throughput": 157.56, + "max_batch_throughput": 157.56 + } + }, + { + "config": { + "input_tokens": 1000, + "output_tokens": 512, + "batch_size": 8, + "num_batches": 1, + "total_requests": 8, + "actual_input_tokens": 1268 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 8, + "failed_requests": 0 + }, + "latency": { + "mean": 6.513, + "std": 0.018, + "min": 6.467, + "max": 6.524, + "p50": 6.52, + "p95": 6.523, + "p99": 6.524, + "ci_95_lower": 6.501, + "ci_95_upper": 6.525 + }, + "ttft": { + "mean": 3.362, + "std": 1.481, + "p50": 2.592, + "p90": 5.281 + }, + "tokens": { + "total_generated": 4096, + "content_tokens": 4096, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 627.74, + "concurrent_content_tps": 627.74, + "requests_per_second": 1.23, + "actual_wall_time": 6.525, + "efficiency_percent": 99.81 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 8.0, + "avg_batch_throughput": 627.74, + "min_batch_throughput": 627.74, + "max_batch_throughput": 627.74 + } + }, + { + "config": { + "input_tokens": 1000, + "output_tokens": 512, + "batch_size": 16, + "num_batches": 1, + "total_requests": 16, + "actual_input_tokens": 1268 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 16, + "failed_requests": 0 + }, + "latency": { + "mean": 8.828, + "std": 0.013, + "min": 8.798, + "max": 8.844, + "p50": 8.825, + "p95": 8.842, + "p99": 8.844, + "ci_95_lower": 8.821, + "ci_95_upper": 8.834 + }, + "ttft": { + "mean": 3.498, + "std": 1.405, + "p50": 2.913, + "p90": 6.162 + }, + "tokens": { + "total_generated": 8192, + "content_tokens": 8192, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 925.82, + "concurrent_content_tps": 925.82, + "requests_per_second": 1.81, + "actual_wall_time": 8.848, + "efficiency_percent": 99.77 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 16.0, + "avg_batch_throughput": 925.82, + "min_batch_throughput": 925.82, + "max_batch_throughput": 925.82 + } + }, + { + "config": { + "input_tokens": 1000, + "output_tokens": 512, + "batch_size": 24, + "num_batches": 1, + "total_requests": 24, + "actual_input_tokens": 1268 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 24, + "failed_requests": 0 + }, + "latency": { + "mean": 10.38, + "std": 0.031, + "min": 10.299, + "max": 10.414, + "p50": 10.393, + "p95": 10.404, + "p99": 10.412, + "ci_95_lower": 10.367, + "ci_95_upper": 10.392 + }, + "ttft": { + "mean": 4.489, + "std": 2.111, + "p50": 3.683, + "p90": 8.168 + }, + "tokens": { + "total_generated": 12288, + "content_tokens": 12288, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1179.3, + "concurrent_content_tps": 1179.3, + "requests_per_second": 2.3, + "actual_wall_time": 10.42, + "efficiency_percent": 99.62 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 24.0, + "avg_batch_throughput": 1179.3, + "min_batch_throughput": 1179.3, + "max_batch_throughput": 1179.3 + } + }, + { + "config": { + "input_tokens": 1000, + "output_tokens": 512, + "batch_size": 32, + "num_batches": 1, + "total_requests": 32, + "actual_input_tokens": 1268 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 32, + "failed_requests": 0 + }, + "latency": { + "mean": 11.608, + "std": 0.043, + "min": 11.464, + "max": 11.642, + "p50": 11.632, + "p95": 11.64, + "p99": 11.641, + "ci_95_lower": 11.593, + "ci_95_upper": 11.623 + }, + "ttft": { + "mean": 4.908, + "std": 2.125, + "p50": 4.134, + "p90": 8.842 + }, + "tokens": { + "total_generated": 16384, + "content_tokens": 16384, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1405.72, + "concurrent_content_tps": 1405.72, + "requests_per_second": 2.75, + "actual_wall_time": 11.655, + "efficiency_percent": 99.6 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 32.0, + "avg_batch_throughput": 1405.72, + "min_batch_throughput": 1405.72, + "max_batch_throughput": 1405.72 + } + }, + { + "config": { + "input_tokens": 1000, + "output_tokens": 512, + "batch_size": 64, + "num_batches": 1, + "total_requests": 64, + "actual_input_tokens": 1268 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 64, + "failed_requests": 0 + }, + "latency": { + "mean": 15.496, + "std": 0.096, + "min": 15.255, + "max": 15.789, + "p50": 15.516, + "p95": 15.597, + "p99": 15.67, + "ci_95_lower": 15.473, + "ci_95_upper": 15.52 + }, + "ttft": { + "mean": 5.896, + "std": 2.086, + "p50": 5.354, + "p90": 8.428 + }, + "tokens": { + "total_generated": 32768, + "content_tokens": 32768, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 2072.97, + "concurrent_content_tps": 2072.97, + "requests_per_second": 4.05, + "actual_wall_time": 15.807, + "efficiency_percent": 98.03 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 64.0, + "avg_batch_throughput": 2072.97, + "min_batch_throughput": 2072.97, + "max_batch_throughput": 2072.97 + } + }, + { + "config": { + "input_tokens": 2500, + "output_tokens": 512, + "batch_size": 1, + "num_batches": 1, + "total_requests": 1, + "actual_input_tokens": 3053 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 1, + "failed_requests": 0 + }, + "latency": { + "mean": 3.44, + "std": 0.0, + "min": 3.44, + "max": 3.44, + "p50": 3.44, + "p95": 3.44, + "p99": 3.44, + "ci_95_lower": 3.44, + "ci_95_upper": 3.44 + }, + "ttft": { + "mean": 1.375, + "std": 0.0, + "p50": 1.375, + "p90": 1.375 + }, + "tokens": { + "total_generated": 512, + "content_tokens": 512, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 148.84, + "concurrent_content_tps": 148.84, + "requests_per_second": 0.29, + "actual_wall_time": 3.44, + "efficiency_percent": 100.0 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 1.0, + "avg_batch_throughput": 148.84, + "min_batch_throughput": 148.84, + "max_batch_throughput": 148.84 + } + }, + { + "config": { + "input_tokens": 2500, + "output_tokens": 512, + "batch_size": 8, + "num_batches": 1, + "total_requests": 8, + "actual_input_tokens": 3053 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 8, + "failed_requests": 0 + }, + "latency": { + "mean": 6.921, + "std": 0.006, + "min": 6.905, + "max": 6.927, + "p50": 6.922, + "p95": 6.927, + "p99": 6.927, + "ci_95_lower": 6.916, + "ci_95_upper": 6.925 + }, + "ttft": { + "mean": 2.777, + "std": 0.805, + "p50": 2.474, + "p90": 3.941 + }, + "tokens": { + "total_generated": 4096, + "content_tokens": 4096, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 591.17, + "concurrent_content_tps": 591.17, + "requests_per_second": 1.15, + "actual_wall_time": 6.929, + "efficiency_percent": 99.89 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 8.0, + "avg_batch_throughput": 591.17, + "min_batch_throughput": 591.17, + "max_batch_throughput": 591.17 + } + }, + { + "config": { + "input_tokens": 2500, + "output_tokens": 512, + "batch_size": 16, + "num_batches": 1, + "total_requests": 16, + "actual_input_tokens": 3053 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 16, + "failed_requests": 0 + }, + "latency": { + "mean": 9.419, + "std": 0.01, + "min": 9.404, + "max": 9.431, + "p50": 9.425, + "p95": 9.43, + "p99": 9.431, + "ci_95_lower": 9.414, + "ci_95_upper": 9.424 + }, + "ttft": { + "mean": 3.433, + "std": 0.733, + "p50": 3.253, + "p90": 4.342 + }, + "tokens": { + "total_generated": 8192, + "content_tokens": 8192, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 868.43, + "concurrent_content_tps": 868.43, + "requests_per_second": 1.7, + "actual_wall_time": 9.433, + "efficiency_percent": 99.85 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 16.0, + "avg_batch_throughput": 868.43, + "min_batch_throughput": 868.43, + "max_batch_throughput": 868.43 + } + }, + { + "config": { + "input_tokens": 2500, + "output_tokens": 512, + "batch_size": 24, + "num_batches": 1, + "total_requests": 24, + "actual_input_tokens": 3053 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 24, + "failed_requests": 0 + }, + "latency": { + "mean": 11.23, + "std": 0.054, + "min": 10.977, + "max": 11.251, + "p50": 11.245, + "p95": 11.25, + "p99": 11.25, + "ci_95_lower": 11.208, + "ci_95_upper": 11.252 + }, + "ttft": { + "mean": 3.744, + "std": 1.427, + "p50": 3.333, + "p90": 4.496 + }, + "tokens": { + "total_generated": 12288, + "content_tokens": 12288, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1091.4, + "concurrent_content_tps": 1091.4, + "requests_per_second": 2.13, + "actual_wall_time": 11.259, + "efficiency_percent": 99.74 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 24.0, + "avg_batch_throughput": 1091.4, + "min_batch_throughput": 1091.4, + "max_batch_throughput": 1091.4 + } + }, + { + "config": { + "input_tokens": 2500, + "output_tokens": 512, + "batch_size": 32, + "num_batches": 1, + "total_requests": 32, + "actual_input_tokens": 3053 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 32, + "failed_requests": 0 + }, + "latency": { + "mean": 12.53, + "std": 0.038, + "min": 12.424, + "max": 12.571, + "p50": 12.546, + "p95": 12.568, + "p99": 12.57, + "ci_95_lower": 12.517, + "ci_95_upper": 12.544 + }, + "ttft": { + "mean": 4.884, + "std": 1.795, + "p50": 4.274, + "p90": 6.106 + }, + "tokens": { + "total_generated": 16384, + "content_tokens": 16384, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1302.77, + "concurrent_content_tps": 1302.77, + "requests_per_second": 2.54, + "actual_wall_time": 12.576, + "efficiency_percent": 99.63 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 32.0, + "avg_batch_throughput": 1302.77, + "min_batch_throughput": 1302.77, + "max_batch_throughput": 1302.77 + } + }, + { + "config": { + "input_tokens": 2500, + "output_tokens": 512, + "batch_size": 64, + "num_batches": 1, + "total_requests": 64, + "actual_input_tokens": 3053 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 64, + "failed_requests": 0 + }, + "latency": { + "mean": 16.78, + "std": 0.07, + "min": 16.507, + "max": 16.953, + "p50": 16.778, + "p95": 16.934, + "p99": 16.95, + "ci_95_lower": 16.763, + "ci_95_upper": 16.797 + }, + "ttft": { + "mean": 6.451, + "std": 2.606, + "p50": 5.536, + "p90": 10.157 + }, + "tokens": { + "total_generated": 32768, + "content_tokens": 32768, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1931.74, + "concurrent_content_tps": 1931.74, + "requests_per_second": 3.77, + "actual_wall_time": 16.963, + "efficiency_percent": 98.92 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 64.0, + "avg_batch_throughput": 1931.74, + "min_batch_throughput": 1931.74, + "max_batch_throughput": 1931.74 + } + }, + { + "config": { + "input_tokens": 5000, + "output_tokens": 512, + "batch_size": 1, + "num_batches": 1, + "total_requests": 1, + "actual_input_tokens": 6024 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 1, + "failed_requests": 0 + }, + "latency": { + "mean": 3.725, + "std": 0.0, + "min": 3.725, + "max": 3.725, + "p50": 3.725, + "p95": 3.725, + "p99": 3.725, + "ci_95_lower": 3.725, + "ci_95_upper": 3.725 + }, + "ttft": { + "mean": 1.855, + "std": 0.0, + "p50": 1.855, + "p90": 1.855 + }, + "tokens": { + "total_generated": 512, + "content_tokens": 512, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 137.46, + "concurrent_content_tps": 137.46, + "requests_per_second": 0.27, + "actual_wall_time": 3.725, + "efficiency_percent": 100.0 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 1.0, + "avg_batch_throughput": 137.46, + "min_batch_throughput": 137.46, + "max_batch_throughput": 137.46 + } + }, + { + "config": { + "input_tokens": 5000, + "output_tokens": 512, + "batch_size": 8, + "num_batches": 1, + "total_requests": 8, + "actual_input_tokens": 6024 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 8, + "failed_requests": 0 + }, + "latency": { + "mean": 7.418, + "std": 0.058, + "min": 7.269, + "max": 7.448, + "p50": 7.444, + "p95": 7.447, + "p99": 7.448, + "ci_95_lower": 7.378, + "ci_95_upper": 7.458 + }, + "ttft": { + "mean": 3.301, + "std": 1.58, + "p50": 2.914, + "p90": 4.562 + }, + "tokens": { + "total_generated": 4096, + "content_tokens": 4096, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 549.89, + "concurrent_content_tps": 549.89, + "requests_per_second": 1.07, + "actual_wall_time": 7.449, + "efficiency_percent": 99.58 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 8.0, + "avg_batch_throughput": 549.89, + "min_batch_throughput": 549.89, + "max_batch_throughput": 549.89 + } + }, + { + "config": { + "input_tokens": 5000, + "output_tokens": 512, + "batch_size": 16, + "num_batches": 1, + "total_requests": 16, + "actual_input_tokens": 6024 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 16, + "failed_requests": 0 + }, + "latency": { + "mean": 9.992, + "std": 0.024, + "min": 9.937, + "max": 10.019, + "p50": 10.001, + "p95": 10.016, + "p99": 10.019, + "ci_95_lower": 9.98, + "ci_95_upper": 10.003 + }, + "ttft": { + "mean": 3.948, + "std": 1.636, + "p50": 3.491, + "p90": 5.599 + }, + "tokens": { + "total_generated": 8192, + "content_tokens": 8192, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 817.4, + "concurrent_content_tps": 817.4, + "requests_per_second": 1.6, + "actual_wall_time": 10.022, + "efficiency_percent": 99.7 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 16.0, + "avg_batch_throughput": 817.4, + "min_batch_throughput": 817.4, + "max_batch_throughput": 817.4 + } + }, + { + "config": { + "input_tokens": 5000, + "output_tokens": 512, + "batch_size": 24, + "num_batches": 1, + "total_requests": 24, + "actual_input_tokens": 6024 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 24, + "failed_requests": 0 + }, + "latency": { + "mean": 12.189, + "std": 0.038, + "min": 12.013, + "max": 12.21, + "p50": 12.197, + "p95": 12.209, + "p99": 12.21, + "ci_95_lower": 12.174, + "ci_95_upper": 12.204 + }, + "ttft": { + "mean": 4.238, + "std": 1.059, + "p50": 3.938, + "p90": 5.769 + }, + "tokens": { + "total_generated": 12288, + "content_tokens": 12288, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1005.93, + "concurrent_content_tps": 1005.93, + "requests_per_second": 1.96, + "actual_wall_time": 12.216, + "efficiency_percent": 99.78 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 24.0, + "avg_batch_throughput": 1005.93, + "min_batch_throughput": 1005.93, + "max_batch_throughput": 1005.93 + } + }, + { + "config": { + "input_tokens": 5000, + "output_tokens": 512, + "batch_size": 32, + "num_batches": 1, + "total_requests": 32, + "actual_input_tokens": 6024 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 32, + "failed_requests": 0 + }, + "latency": { + "mean": 13.535, + "std": 0.07, + "min": 13.146, + "max": 13.563, + "p50": 13.546, + "p95": 13.56, + "p99": 13.563, + "ci_95_lower": 13.511, + "ci_95_upper": 13.559 + }, + "ttft": { + "mean": 4.996, + "std": 1.647, + "p50": 4.524, + "p90": 6.854 + }, + "tokens": { + "total_generated": 16384, + "content_tokens": 16384, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1207.9, + "concurrent_content_tps": 1207.9, + "requests_per_second": 2.36, + "actual_wall_time": 13.564, + "efficiency_percent": 99.78 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 32.0, + "avg_batch_throughput": 1207.9, + "min_batch_throughput": 1207.9, + "max_batch_throughput": 1207.9 + } + }, + { + "config": { + "input_tokens": 5000, + "output_tokens": 512, + "batch_size": 64, + "num_batches": 1, + "total_requests": 64, + "actual_input_tokens": 6024 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 64, + "failed_requests": 0 + }, + "latency": { + "mean": 18.236, + "std": 0.071, + "min": 17.676, + "max": 18.258, + "p50": 18.245, + "p95": 18.257, + "p99": 18.258, + "ci_95_lower": 18.218, + "ci_95_upper": 18.253 + }, + "ttft": { + "mean": 6.521, + "std": 2.744, + "p50": 5.802, + "p90": 8.623 + }, + "tokens": { + "total_generated": 32768, + "content_tokens": 32768, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1793.73, + "concurrent_content_tps": 1793.73, + "requests_per_second": 3.5, + "actual_wall_time": 18.268, + "efficiency_percent": 99.82 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 64.0, + "avg_batch_throughput": 1793.73, + "min_batch_throughput": 1793.73, + "max_batch_throughput": 1793.73 + } + }, + { + "config": { + "input_tokens": 9000, + "output_tokens": 512, + "batch_size": 1, + "num_batches": 1, + "total_requests": 1, + "actual_input_tokens": 10777 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 1, + "failed_requests": 0 + }, + "latency": { + "mean": 4.17, + "std": 0.0, + "min": 4.17, + "max": 4.17, + "p50": 4.17, + "p95": 4.17, + "p99": 4.17, + "ci_95_lower": 4.17, + "ci_95_upper": 4.17 + }, + "ttft": { + "mean": 1.79, + "std": 0.0, + "p50": 1.79, + "p90": 1.79 + }, + "tokens": { + "total_generated": 512, + "content_tokens": 512, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 122.79, + "concurrent_content_tps": 122.79, + "requests_per_second": 0.24, + "actual_wall_time": 4.17, + "efficiency_percent": 100.0 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 1.0, + "avg_batch_throughput": 122.79, + "min_batch_throughput": 122.79, + "max_batch_throughput": 122.79 + } + }, + { + "config": { + "input_tokens": 9000, + "output_tokens": 512, + "batch_size": 8, + "num_batches": 1, + "total_requests": 8, + "actual_input_tokens": 10777 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 8, + "failed_requests": 0 + }, + "latency": { + "mean": 7.837, + "std": 0.011, + "min": 7.808, + "max": 7.846, + "p50": 7.84, + "p95": 7.845, + "p99": 7.846, + "ci_95_lower": 7.829, + "ci_95_upper": 7.845 + }, + "ttft": { + "mean": 2.73, + "std": 0.413, + "p50": 2.727, + "p90": 3.176 + }, + "tokens": { + "total_generated": 4096, + "content_tokens": 4096, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 521.94, + "concurrent_content_tps": 521.94, + "requests_per_second": 1.02, + "actual_wall_time": 7.848, + "efficiency_percent": 99.86 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 8.0, + "avg_batch_throughput": 521.94, + "min_batch_throughput": 521.94, + "max_batch_throughput": 521.94 + } + }, + { + "config": { + "input_tokens": 9000, + "output_tokens": 512, + "batch_size": 16, + "num_batches": 1, + "total_requests": 16, + "actual_input_tokens": 10777 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 16, + "failed_requests": 0 + }, + "latency": { + "mean": 10.825, + "std": 0.051, + "min": 10.645, + "max": 10.858, + "p50": 10.843, + "p95": 10.856, + "p99": 10.858, + "ci_95_lower": 10.8, + "ci_95_upper": 10.85 + }, + "ttft": { + "mean": 3.809, + "std": 0.481, + "p50": 3.923, + "p90": 4.335 + }, + "tokens": { + "total_generated": 8192, + "content_tokens": 8192, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 754.44, + "concurrent_content_tps": 754.44, + "requests_per_second": 1.47, + "actual_wall_time": 10.858, + "efficiency_percent": 99.69 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 16.0, + "avg_batch_throughput": 754.44, + "min_batch_throughput": 754.44, + "max_batch_throughput": 754.44 + } + }, + { + "config": { + "input_tokens": 9000, + "output_tokens": 512, + "batch_size": 24, + "num_batches": 1, + "total_requests": 24, + "actual_input_tokens": 10777 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 24, + "failed_requests": 0 + }, + "latency": { + "mean": 13.617, + "std": 0.082, + "min": 13.31, + "max": 13.728, + "p50": 13.61, + "p95": 13.726, + "p99": 13.727, + "ci_95_lower": 13.585, + "ci_95_upper": 13.65 + }, + "ttft": { + "mean": 5.393, + "std": 2.261, + "p50": 4.893, + "p90": 8.595 + }, + "tokens": { + "total_generated": 12288, + "content_tokens": 12288, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 895.03, + "concurrent_content_tps": 895.03, + "requests_per_second": 1.75, + "actual_wall_time": 13.729, + "efficiency_percent": 99.18 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 24.0, + "avg_batch_throughput": 895.03, + "min_batch_throughput": 895.03, + "max_batch_throughput": 895.03 + } + }, + { + "config": { + "input_tokens": 9000, + "output_tokens": 512, + "batch_size": 32, + "num_batches": 1, + "total_requests": 32, + "actual_input_tokens": 10777 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 32, + "failed_requests": 0 + }, + "latency": { + "mean": 15.071, + "std": 0.058, + "min": 14.788, + "max": 15.128, + "p50": 15.075, + "p95": 15.121, + "p99": 15.127, + "ci_95_lower": 15.051, + "ci_95_upper": 15.091 + }, + "ttft": { + "mean": 6.012, + "std": 2.11, + "p50": 5.568, + "p90": 9.381 + }, + "tokens": { + "total_generated": 16384, + "content_tokens": 16384, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1082.91, + "concurrent_content_tps": 1082.91, + "requests_per_second": 2.12, + "actual_wall_time": 15.13, + "efficiency_percent": 99.61 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 32.0, + "avg_batch_throughput": 1082.91, + "min_batch_throughput": 1082.91, + "max_batch_throughput": 1082.91 + } + }, + { + "config": { + "input_tokens": 9000, + "output_tokens": 512, + "batch_size": 64, + "num_batches": 1, + "total_requests": 64, + "actual_input_tokens": 10777 + }, + "success_metrics": { + "success_rate": 100.0, + "successful_requests": 64, + "failed_requests": 0 + }, + "latency": { + "mean": 20.583, + "std": 0.133, + "min": 19.9, + "max": 20.765, + "p50": 20.581, + "p95": 20.731, + "p99": 20.762, + "ci_95_lower": 20.55, + "ci_95_upper": 20.615 + }, + "ttft": { + "mean": 7.616, + "std": 2.21, + "p50": 7.112, + "p90": 9.38 + }, + "tokens": { + "total_generated": 32768, + "content_tokens": 32768, + "reasoning_tokens": 0, + "avg_per_request": 512.0 + }, + "throughput": { + "concurrent_total_tps": 1575.94, + "concurrent_content_tps": 1575.94, + "requests_per_second": 3.08, + "actual_wall_time": 20.793, + "efficiency_percent": 98.99 + }, + "batch_metrics": { + "num_batches": 1, + "avg_batch_size": 64.0, + "avg_batch_throughput": 1575.94, + "min_batch_throughput": 1575.94, + "max_batch_throughput": 1575.94 + } + } + ] +} \ No newline at end of file diff --git a/llm-throughput-tests-mindef-metadateren/results/results_openai_gpt-oss-120b_services/config_used.yaml b/llm-throughput-tests-mindef-metadateren/results/results_openai_gpt-oss-120b_services/config_used.yaml new file mode 100644 index 0000000..2a6b8cd --- /dev/null +++ b/llm-throughput-tests-mindef-metadateren/results/results_openai_gpt-oss-120b_services/config_used.yaml @@ -0,0 +1,28 @@ +endpoint: + url: https://b60dd657-9ce2-4ba0-ad45-754b5be29238.services.external.0a71m37v.ubiops.io/v1 + api_key: + model_name: openai/gpt-oss-120b +benchmark: + input_tokens: + - 1000 + - 2500 + - 5000 + - 9000 + batch_sizes: + - 1 + - 8 + - 16 + - 24 + - 32 + - 64 + num_batches: 1 + output_tokens: 512 + dataset: null + text: null +runtime: + request_timeout: 300 + delay_between_runs: 5 + log_io: true + wait_for_ready: true + max_init_retries: 10 + init_retry_delay: 30