# Settings for the model endpoint exercised by the benchmark.
model:
  # Loopback address, port 8000: the server is expected on the local machine.
  base_url: 'http://127.0.0.1:8000'
  # Presumably forwarded as generation parameters; confirm against the client.
  max_tokens: 256
  temperature: 0.1
  # Request timeout; the unit is not stated here (presumably seconds, confirm).
  timeout: 30

# Input data used for the benchmark run.
datasets:
  benchmark_dataset:
    file_path: 'Personal_De-identifier_Benchmark_SFT.jsonl'
    # Only the first 100 examples are used, to keep benchmarking quick.
    sample_size: 100
    # Names of the keys looked up in each record. Presumably one JSON object
    # per line, given the .jsonl extension; confirm against the loader.
    instruction_field: 'instruction'
    input_field: 'input'
    expected_output_field: 'response'

# Metric definitions. Each primary metric has a display name, a description
# and a type identifier.
metrics:
  # Primary metrics for HuggingFace
  pii_detection:
    name: 'PII Detection Rate'
    description: 'Percentage of personal identifiers correctly identified and masked'
    type: 'accuracy'

  completeness:
    name: 'Completeness Score'
    description: 'Percentage of texts where all PII was successfully removed'
    type: 'binary_accuracy'

  semantic_preservation:
    name: 'Semantic Preservation'
    description: 'How well the original meaning is preserved (placeholder-based similarity)'
    type: 'similarity'

  latency:
    name: 'Average Latency'
    description: 'Average response time in milliseconds'
    type: 'latency'

  # Domain-specific performance. Unlike the metrics above, each entry here
  # carries only a display name and a keyword list (no description or type).
  # NOTE(review): how keywords are matched is decided by the consumer and is
  # not visible here. If matching is substring-based, short or generic
  # keywords such as 'hr' or 'case' may over-match; confirm.
  domain_performance:
    medical:
      name: 'Medical Records'
      keywords:
        - 'patient'
        - 'doctor'
        - 'hospital'
        - 'medical'
        - 'diagnosis'
    legal:
      name: 'Legal Documents'
      keywords:
        - 'deponent'
        - 'attorney'
        - 'case'
        - 'court'
        - 'legal'
    hr:
      name: 'HR Records'
      keywords:
        - 'employee'
        - 'salary'
        - 'hr'
        - 'personnel'
        - 'recruitment'
    customer_service:
      name: 'Customer Service'
      keywords:
        - 'customer'
        - 'complaint'
        - 'service'
        - 'support'
        - 'inquiry'
    research:
      name: 'Research Data'
      keywords:
        - 'participant'
        - 'study'
        - 'research'
        - 'consent'
        - 'ethics'

# Destinations and options for the benchmark results.
output:
  # Summary file and detailed (JSON) file, as named by the keys.
  results_file: 'benchmarks.txt'
  detailed_results_file: 'benchmark_results.json'
  # Examples are included; max_examples presumably caps how many (confirm).
  include_examples: true
  max_examples: 10