# mindef-overdracht/llm-throughput-tests-mindef-metadateren/benchmark_config.yaml

endpoint:
  # Target OpenAI-compatible endpoint for the benchmark.
  # Only ONE url / api_key / model_name triple may be active at a time
  # (duplicate keys are invalid YAML); the alternatives below are kept
  # commented out so they can be swapped in.
  # api_key is an explicit null here - keep real keys out of version control.
  # NOTE(review): confirm how the benchmark script obtains the key when null.

  # internal litellm ubiops
  # url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  # api_key: null
  # model_name: openai-gpt-oss-120b-max-16

  # url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  # api_key: null
  # model_name: openai-gpt-oss-120b

  # ACTIVE
  url: https://46e73bba-0ed9-4853-b2b0-d4509aaab06b.services.external.0a71m37v.ubiops.io/v1
  api_key: null
  model_name: openai-gpt-oss-120b-2x

  # another ubiops endpoint (different service id)
  # url: https://b60dd657-9ce2-4ba0-ad45-754b5be29238.services.external.0a71m37v.ubiops.io/v1
  # api_key: null
  # model_name: openai/gpt-oss-120b

  # staging litellm
  # url: https://f1dfa3fc-3314-4d49-be06-98bfd3d1f5fd.services.staging.ubiops.dev/v1
  # api_key: null
  # model_name: llama-1b

  # staging vllm
  # url: https://dde9ea35-6a02-4242-a3f3-5a7e7e29e7a7.services.staging.ubiops.dev/v1
  # api_key: null
  # model_name: meta-llama/Llama-3.2-1B-Instruct
benchmark:
  # Prompt sizes to benchmark, expressed in input tokens.
  input_tokens:
    - 50000

  # Batch sizes to benchmark (simultaneous requests per batch).
  # Every batch fires its N requests at exactly the same moment.
  batch_sizes:
    - 64

  # NOTE(review): presumably the number of batches sent per tested
  # configuration - confirm against the benchmark script.
  num_batches: 1

  # Upper bound on generated tokens for a single request.
  output_tokens: 1024

  # Optional: path to a conversation dataset in JSON format.
  # Create one with: python create_test_dataset.py
  # When not provided, synthetic prompts are used instead.
  dataset: test_conversations.json

  # Optional: fixed input text, reused verbatim for every request
  # (input_tokens is ignored when this is set).
  # Precedence: text > dataset > generated prompts
  # Example: "Analyze this document about machine learning..."
  text: null

runtime:
  # --- start-up readiness ---
  # Block until the model reports ready before the first run.
  wait_for_ready: true

  # Upper limit on the number of readiness checks.
  max_init_retries: 10

  # Pause between consecutive readiness checks, in seconds.
  init_retry_delay: 30

  # --- per-request / per-run pacing ---
  # Per-request timeout, in seconds.
  request_timeout: 1800

  # Pause inserted between benchmark runs, in seconds.
  delay_between_runs: 5

  # --- logging ---
  # Turn on detailed I/O logging (input prompts and model outputs).
  log_io: true