---
endpoint:
  # Service URL, API key and model name of the endpoint to use.
  # Exactly one url/api_key/model_name group is active; the other groups
  # are commented out.

  # internal litellm ubiops
  # url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  # api_key:
  # model_name: openai-gpt-oss-120b-max-16

  # url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  # api_key:
  # model_name: openai-gpt-oss-120b

  url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  # Left without a value in this file; written as an explicit null so the
  # empty value is clearly intentional (parses the same as a bare `api_key:`).
  # NOTE(review): presumably the key is supplied outside this file - confirm
  # how the consumer obtains it; do not commit a real key here.
  api_key: null
  model_name: openai-gpt-oss-120b-2x

  # url: https://b60dd657-9ce2-4ba0-ad45-754b5be29238.services.external.0a71m37v.ubiops.io/v1
  # api_key:
  # model_name: openai/gpt-oss-120b

  # staging litellm
  # url: https://f1dfa3fc-3314-4d49-be06-98bfd3d1f5fd.services.staging.ubiops.dev/v1
  # api_key:
  # model_name: llama-1b

  # staging vllm
  # url: https://dde9ea35-6a02-4242-a3f3-5a7e7e29e7a7.services.staging.ubiops.dev/v1
  # api_key:
  # model_name: meta-llama/Llama-3.2-1B-Instruct

benchmark:
  # Input token counts to test
  input_tokens: [50000]

  # Batch sizes to test (number of simultaneous requests per batch)
  # Each batch sends N requests at the exact same time
  batch_sizes: [64]

  # NOTE(review): presumably the number of batches sent per batch size -
  # confirm against the benchmark script.
  num_batches: 1

  # Maximum output tokens per request
  output_tokens: 1024

  # Optional: Path to conversation dataset JSON file
  # Generate with: python create_test_dataset.py
  # If not provided, uses synthetic prompts
  dataset: test_conversations.json  # omit or set to null for synthetic prompts

  # Optional: Custom text to use as input for all requests
  # Uses the same text for every request (ignores input_tokens)
  # Priority: text > dataset > generated prompts
  # Example: "Analyze this document about machine learning..."
  text: null

runtime:
  # Timeout for each request (seconds)
  request_timeout: 1800

  # Delay between benchmark runs (seconds)
  delay_between_runs: 5

  # Enable detailed I/O logging (input prompts + outputs)
  log_io: true

  # Wait for model initialization before starting
  wait_for_ready: true

  # Maximum initialization check attempts
  max_init_retries: 10

  # Delay between initialization checks (seconds)
  init_retry_delay: 30