---
# Benchmark configuration: model endpoint, benchmark dataset, the metrics to
# report, and where results are written.
#
# NOTE(review): this file was reconstructed from a version whose line breaks
# and indentation had been lost. Keys and values are unchanged; the nesting
# shown here is inferred from key names and comment placement -- confirm it
# against the code that loads this file.

# Connection and generation settings for the model under test.
model:
  base_url: "http://127.0.0.1:8000"
  max_tokens: 256
  temperature: 0.1
  # Presumably seconds -- TODO confirm against the client code.
  timeout: 30

# Input data. The *_field keys name the fields read from each dataset record.
datasets:
  benchmark_dataset:
    file_path: "Personal_De-identifier_Benchmark_SFT.jsonl"
    sample_size: 100  # Use first 100 examples for quick benchmarking
    instruction_field: "instruction"
    input_field: "input"
    expected_output_field: "response"

metrics:
  # Primary metrics for HuggingFace
  pii_detection:
    name: "PII Detection Rate"
    description: "Percentage of personal identifiers correctly identified and masked"
    type: "accuracy"
  completeness:
    name: "Completeness Score"
    description: "Percentage of texts where all PII was successfully removed"
    type: "binary_accuracy"
  semantic_preservation:
    name: "Semantic Preservation"
    description: "How well the original meaning is preserved (placeholder-based similarity)"
    type: "similarity"
  latency:
    name: "Average Latency"
    description: "Average response time in milliseconds"
    type: "latency"

  # Domain-specific performance
  # Each domain carries a display name and a keyword list.
  # NOTE(review): placed under `metrics` because its comment parallels the
  # "Primary metrics" comment above; verify it is not meant to be top-level.
  domain_performance:
    medical:
      name: "Medical Records"
      keywords: ["patient", "doctor", "hospital", "medical", "diagnosis"]
    legal:
      name: "Legal Documents"
      keywords: ["deponent", "attorney", "case", "court", "legal"]
    hr:
      name: "HR Records"
      keywords: ["employee", "salary", "hr", "personnel", "recruitment"]
    customer_service:
      name: "Customer Service"
      keywords: ["customer", "complaint", "service", "support", "inquiry"]
    research:
      name: "Research Data"
      keywords: ["participant", "study", "research", "consent", "ethics"]

# Result files and how many examples to include in them.
output:
  results_file: "benchmarks.txt"
  detailed_results_file: "benchmark_results.json"
  include_examples: true
  max_examples: 10