---
# Connection and generation settings for the model under test.
# NOTE(review): this section was rebuilt from a flattened, pipe-delimited
# copy of the file; confirm key depths against the script that loads it.
model:
  # Loopback address, port 8000.
  base_url: "http://127.0.0.1:8000"
  max_tokens: 256
  temperature: 0.1
  # Unit is not stated in this file; presumably seconds -- TODO confirm.
  timeout: 30

# Benchmark input data and the names of the fields read from each record.
# NOTE(review): nesting was rebuilt from a flattened copy of this file;
# confirm key depths against the script that loads it.
datasets:
  benchmark_dataset:
    file_path: "Personal_De-identifier_Benchmark_SFT.jsonl"
    # Presumably the number of records evaluated per run -- TODO confirm.
    sample_size: 100
    # Names of the per-record fields the benchmark looks up.
    instruction_field: "instruction"
    input_field: "input"
    expected_output_field: "response"

# Metric definitions. Each of the first four entries carries a display
# name, a human-readable description, and a type tag.
# NOTE(review): nesting was rebuilt from a flattened copy of this file;
# confirm key depths against the script that loads it.
metrics:
  pii_detection:
    name: "PII Detection Rate"
    description: "Percentage of personal identifiers correctly identified and masked"
    type: "accuracy"

  completeness:
    name: "Completeness Score"
    description: "Percentage of texts where all PII was successfully removed"
    type: "binary_accuracy"

  semantic_preservation:
    name: "Semantic Preservation"
    description: "How well the original meaning is preserved (placeholder-based similarity)"
    type: "similarity"

  latency:
    name: "Average Latency"
    description: "Average response time in milliseconds"
    type: "latency"

  # Per-domain buckets: each entry has a display name and a keyword list.
  # NOTE(review): placed under `metrics` as a best guess; the flattened
  # source does not show whether this was a top-level key -- TODO confirm.
  domain_performance:
    medical:
      name: "Medical Records"
      keywords: ["patient", "doctor", "hospital", "medical", "diagnosis"]
    legal:
      name: "Legal Documents"
      keywords: ["deponent", "attorney", "case", "court", "legal"]
    hr:
      name: "HR Records"
      keywords: ["employee", "salary", "hr", "personnel", "recruitment"]
    customer_service:
      name: "Customer Service"
      keywords: ["customer", "complaint", "service", "support", "inquiry"]
    research:
      name: "Research Data"
      keywords: ["participant", "study", "research", "consent", "ethics"]

# Result file names and example-reporting options.
# NOTE(review): nesting was rebuilt from a flattened copy of this file;
# confirm key depths against the script that loads it.
output:
  results_file: "benchmarks.txt"
  detailed_results_file: "benchmark_results.json"
  include_examples: true
  # Presumably caps how many examples are reported -- TODO confirm.
  max_examples: 10