# Benchmark configuration: which endpoint to hit, what load to send, and how
# the run itself behaves.

# Target endpoint. Exactly one url / api_key / model_name triple is active;
# the commented-out triples are alternatives kept for quick switching.
endpoint:
  # internal litellm ubiops
  # url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  # api_key:
  # model_name: openai-gpt-oss-120b-max-16

  # url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  # api_key:
  # model_name: openai-gpt-oss-120b

  url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  # NOTE(review): no key is set in this file; presumably it is filled in
  # locally or was redacted -- confirm. Written as an explicit null rather
  # than a bare empty value so the intent is unambiguous.
  api_key: null
  model_name: openai-gpt-oss-120b-2x

  # url: https://b60dd657-9ce2-4ba0-ad45-754b5be29238.services.external.0a71m37v.ubiops.io/v1
  # api_key:
  # model_name: openai/gpt-oss-120b

  # staging litellm
  # url: https://f1dfa3fc-3314-4d49-be06-98bfd3d1f5fd.services.staging.ubiops.dev/v1
  # api_key:
  # model_name: llama-1b

  # staging vllm
  # url: https://dde9ea35-6a02-4242-a3f3-5a7e7e29e7a7.services.staging.ubiops.dev/v1
  # api_key:
  # model_name: meta-llama/Llama-3.2-1B-Instruct

benchmark:
  # Input token counts to test
  input_tokens: [50000]

  # Batch sizes to test (number of simultaneous requests per batch)
  # Each batch sends N requests at the exact same time
  batch_sizes: [64]
  num_batches: 1

  # Maximum output tokens per request
  output_tokens: 1024

  # Optional: Path to conversation dataset JSON file
  # Generate with: python create_test_dataset.py
  # If not provided, uses synthetic prompts
  dataset: test_conversations.json  # or null (not provided)

  # Optional: Custom text to use as input for all requests
  # Uses the same text for every request (ignores input_tokens)
  # Priority: text > dataset > generated prompts
  # Example: "Analyze this document about machine learning..."
  text: null

runtime:
  # Timeout for each request (seconds)
  request_timeout: 1800

  # Delay between benchmark runs (seconds)
  delay_between_runs: 5

  # Enable detailed I/O logging (input prompts + outputs)
  log_io: true

  # Wait for model initialization before starting
  wait_for_ready: true

  # Maximum initialization check attempts
  max_init_retries: 10

  # Delay between initialization checks (seconds)
  init_retry_delay: 30