| use serde::Deserialize; |
| use std::collections::HashMap; |
| use std::io::{self, BufRead, Write}; |
| use std::time::Instant; |
| use unicode_normalization::UnicodeNormalization; |
|
|
| |
|
|
| #[derive(Deserialize)] |
| struct MappingData { |
| province_mapping: HashMap<String, String>, |
| province_names: HashMap<String, ProvinceInfo>, |
| old_province_names: HashMap<String, ProvinceInfo>, |
| ward_mapping: Vec<WardRecord>, |
| } |
|
|
| #[derive(Deserialize, Clone)] |
| struct ProvinceInfo { |
| name: String, |
| short: String, |
| #[allow(dead_code)] |
| code: String, |
| } |
|
|
| #[derive(Deserialize, Clone)] |
| struct WardRecord { |
| #[allow(dead_code)] |
| old_province: String, |
| old_province_key: String, |
| #[allow(dead_code)] |
| old_district: String, |
| old_district_key: String, |
| #[allow(dead_code)] |
| old_ward: String, |
| old_ward_key: String, |
| #[allow(dead_code)] |
| new_province: String, |
| #[allow(dead_code)] |
| new_province_key: String, |
| new_ward: String, |
| #[allow(dead_code)] |
| new_ward_key: String, |
| #[allow(dead_code)] |
| mapping_type: String, |
| #[serde(default)] |
| is_default: bool, |
| } |
|
|
| |
|
|
| struct Index { |
| province_mapping: HashMap<String, String>, |
| province_names: HashMap<String, ProvinceInfo>, |
| province_keywords: HashMap<String, String>, |
| exact: HashMap<(String, String, String), Vec<usize>>, |
| ward_only: HashMap<(String, String), Vec<usize>>, |
| records: Vec<WardRecord>, |
| } |
|
|
| fn build_index(data: MappingData) -> Index { |
| let mut province_keywords: HashMap<String, String> = HashMap::new(); |
|
|
| for (key, info) in &data.old_province_names { |
| province_keywords.insert(normalize_key(&info.name), key.clone()); |
| province_keywords.insert(normalize_key(&info.short), key.clone()); |
| province_keywords.insert(key.clone(), key.clone()); |
| } |
|
|
| let mut exact: HashMap<(String, String, String), Vec<usize>> = HashMap::new(); |
| let mut ward_only: HashMap<(String, String), Vec<usize>> = HashMap::new(); |
|
|
| for (i, rec) in data.ward_mapping.iter().enumerate() { |
| let ek = ( |
| rec.old_province_key.clone(), |
| rec.old_district_key.clone(), |
| rec.old_ward_key.clone(), |
| ); |
| exact.entry(ek).or_default().push(i); |
|
|
| let wk = (rec.old_province_key.clone(), rec.old_ward_key.clone()); |
| ward_only.entry(wk).or_default().push(i); |
| } |
|
|
| Index { |
| province_mapping: data.province_mapping, |
| province_names: data.province_names, |
| province_keywords, |
| exact, |
| ward_only, |
| records: data.ward_mapping, |
| } |
| } |
|
|
| |
|
|
| fn remove_diacritics(text: &str) -> String { |
| let nfkd: String = text.nfkd().collect(); |
| let mut result = String::with_capacity(nfkd.len()); |
| for c in nfkd.chars() { |
| if c == 'Δ' { |
| result.push('d'); |
| } else if c == 'Δ' { |
| result.push('D'); |
| } else if !unicode_normalization::char::is_combining_mark(c) { |
| result.push(c); |
| } |
| } |
| result |
| } |
|
|
| fn normalize_key(text: &str) -> String { |
| let lower = text.to_lowercase(); |
| let lower = lower.trim(); |
| let no_dia = remove_diacritics(lower); |
| no_dia |
| .chars() |
| .filter(|c| c.is_ascii_alphanumeric()) |
| .collect() |
| } |
|
|
| |
/// Administrative-unit abbreviations and the lower-case Vietnamese text each
/// one expands to (every expansion ends with a single space).
///
/// Order matters: `expand_abbreviations` applies the entries top to bottom as
/// plain substring replacements, so longer abbreviations (`"t.p."`, `"tp."`)
/// must come before the shorter ones they contain (`"p."`).
const ABBREVIATIONS: &[(&str, &str)] = &[
    ("t.p.", "thành phố "),
    ("t.p ", "thành phố "),
    ("t.x.", "thị xã "),
    ("t.t.", "thị trấn "),
    ("tp.", "thành phố "),
    ("tp ", "thành phố "),
    ("tx.", "thị xã "),
    ("tt.", "thị trấn "),
    ("p.", "phường "),
    ("q.", "quận "),
    ("h.", "huyện "),
    ("x.", "xã "),
];
|
|
| fn expand_abbreviations(text: &str) -> String { |
| let mut result = text.to_lowercase(); |
| result = result.trim().to_string(); |
| for &(abbr, full) in ABBREVIATIONS { |
| result = result.replace(abbr, full); |
| } |
| result.trim().to_string() |
| } |
|
|
| fn normalize_for_matching(text: &str) -> String { |
| let expanded = expand_abbreviations(text); |
| normalize_key(&expanded) |
| } |
|
|
| |
|
|
/// Lower-case Vietnamese prefixes that mark an address component as a
/// ward-level unit (ward, commune, township).
const WARD_PREFIXES: &[&str] = &["phường", "xã", "thị trấn"];
|
|
/// The components split out of a free-form address.
///
/// A component that could not be determined is left as an empty string.
#[derive(Debug, Default)]
struct AdminUnit {
    /// Last comma-separated component of the address.
    province: String,
    /// Component preceding the province, if any.
    district: String,
    /// Ward-level component, if any.
    ward: String,
    /// Everything before the ward, re-joined with `", "`.
    street: String,
}

impl AdminUnit {
    /// Returns a unit whose components are all empty; equivalent to
    /// `AdminUnit::default()`.
    fn empty() -> Self {
        Self::default()
    }
}
|
|
| fn parse_address(address: &str) -> AdminUnit { |
| let expanded = expand_abbreviations(address); |
| let parts: Vec<&str> = expanded |
| .split(',') |
| .map(|s| s.trim()) |
| .filter(|s| !s.is_empty()) |
| .collect(); |
|
|
| if parts.is_empty() { |
| return AdminUnit::empty(); |
| } |
|
|
| let mut unit = AdminUnit::empty(); |
|
|
| let n = parts.len(); |
| if n >= 1 { |
| unit.province = parts[n - 1].to_string(); |
| } |
| if n >= 2 { |
| unit.district = parts[n - 2].to_string(); |
| } |
| if n >= 3 { |
| unit.ward = parts[n - 3].to_string(); |
| } |
| if n >= 4 { |
| unit.street = parts[..n - 3].join(", "); |
| } |
|
|
| |
| if n == 2 { |
| let lower = parts[0].to_lowercase(); |
| let lower = lower.trim(); |
| for prefix in WARD_PREFIXES { |
| if lower.starts_with(prefix) { |
| unit.ward = unit.district.clone(); |
| unit.district = String::new(); |
| break; |
| } |
| } |
| } |
|
|
| unit |
| } |
|
|
| |
|
|
| fn resolve_province(index: &Index, text: &str) -> Option<String> { |
| let normalized = normalize_for_matching(text); |
| index.province_keywords.get(&normalized).cloned() |
| } |
|
|
| fn find_mapping(index: &Index, prov: &str, dist: &str, ward: &str) -> Vec<usize> { |
| |
| let ek = (prov.to_string(), dist.to_string(), ward.to_string()); |
| if let Some(ids) = index.exact.get(&ek) { |
| if !ids.is_empty() { |
| return ids.clone(); |
| } |
| } |
| |
| let wk = (prov.to_string(), ward.to_string()); |
| if let Some(ids) = index.ward_only.get(&wk) { |
| return ids.clone(); |
| } |
| Vec::new() |
| } |
|
|
| fn select_best(index: &Index, ids: &[usize]) -> Option<usize> { |
| if ids.is_empty() { |
| return None; |
| } |
| if ids.len() == 1 { |
| return Some(ids[0]); |
| } |
| for &id in ids { |
| if index.records[id].is_default { |
| return Some(id); |
| } |
| } |
| Some(ids[0]) |
| } |
|
|
| fn convert_address(index: &Index, address: &str) -> String { |
| let parsed = parse_address(address); |
|
|
| |
| let mut old_prov_key = resolve_province(index, &parsed.province); |
| if old_prov_key.is_none() && !parsed.district.is_empty() { |
| old_prov_key = resolve_province(index, &parsed.district); |
| } |
| let old_prov_key = match old_prov_key { |
| Some(k) => k, |
| None => return String::new(), |
| }; |
|
|
| |
| let new_prov_key = match index.province_mapping.get(&old_prov_key) { |
| Some(k) => k.clone(), |
| None => return String::new(), |
| }; |
| let new_prov_name = index |
| .province_names |
| .get(&new_prov_key) |
| .map(|i| i.name.as_str()) |
| .unwrap_or(""); |
|
|
| |
| if parsed.ward.is_empty() && parsed.district.is_empty() { |
| return new_prov_name.to_string(); |
| } |
|
|
| let old_dist_key = if !parsed.district.is_empty() { |
| normalize_key(&parsed.district) |
| } else { |
| String::new() |
| }; |
| let old_ward_key = if !parsed.ward.is_empty() { |
| normalize_key(&parsed.ward) |
| } else { |
| String::new() |
| }; |
|
|
| let mut ids = find_mapping(index, &old_prov_key, &old_dist_key, &old_ward_key); |
|
|
| |
| if ids.is_empty() && !parsed.ward.is_empty() { |
| let alt_ward = if !parsed.district.is_empty() { |
| normalize_key(&parsed.district) |
| } else { |
| String::new() |
| }; |
| if !alt_ward.is_empty() { |
| ids = find_mapping(index, &old_prov_key, "", &alt_ward); |
| } |
| } |
|
|
| if ids.is_empty() { |
| |
| let mut parts: Vec<&str> = Vec::new(); |
| if !parsed.street.is_empty() { |
| parts.push(&parsed.street); |
| } |
| parts.push(new_prov_name); |
| return parts.join(", "); |
| } |
|
|
| let best_id = select_best(index, &ids).unwrap(); |
| let rec = &index.records[best_id]; |
|
|
| let mut parts: Vec<&str> = Vec::new(); |
| if !parsed.street.is_empty() { |
| parts.push(&parsed.street); |
| } |
| if !rec.new_ward.is_empty() { |
| parts.push(&rec.new_ward); |
| } |
| parts.push(new_prov_name); |
| parts.join(", ") |
| } |
|
|
| |
|
|
| fn load_index(data_path: &str) -> Index { |
| let data_bytes = std::fs::read(data_path).expect("Failed to read mapping.json"); |
| let data: MappingData = serde_json::from_slice(&data_bytes).expect("Failed to parse JSON"); |
| build_index(data) |
| } |
|
|
| fn main() { |
| let args: Vec<String> = std::env::args().collect(); |
| if args.len() < 2 { |
| eprintln!("Usage: address-converter <convert|bench> [address]"); |
| std::process::exit(1); |
| } |
|
|
| let data_path = std::env::var("MAPPING_JSON") |
| .unwrap_or_else(|_| { |
| |
| let exe = std::env::current_exe().unwrap(); |
| let project_root = exe |
| .parent().unwrap() |
| .parent().unwrap() |
| .parent().unwrap(); |
| project_root |
| .parent().unwrap() |
| .join("data") |
| .join("mapping.json") |
| .to_string_lossy() |
| .to_string() |
| }); |
|
|
| match args[1].as_str() { |
| "convert" => { |
| if args.len() < 3 { |
| eprintln!("Usage: address-converter convert <address>"); |
| std::process::exit(1); |
| } |
| let index = load_index(&data_path); |
| let result = convert_address(&index, &args[2]); |
| println!("{}", result); |
| } |
| "bench" => { |
| |
| let index = load_index(&data_path); |
| let stdin = io::stdin(); |
| let addresses: Vec<String> = stdin |
| .lock() |
| .lines() |
| .map(|l| l.expect("Failed to read line")) |
| .collect(); |
|
|
| let n = addresses.len(); |
| let start = Instant::now(); |
| let mut results: Vec<String> = Vec::with_capacity(n); |
| for addr in &addresses { |
| results.push(convert_address(&index, addr)); |
| } |
| let elapsed = start.elapsed(); |
|
|
| |
| let stdout = io::stdout(); |
| let mut out = stdout.lock(); |
| for r in &results { |
| writeln!(out, "{}", r).unwrap(); |
| } |
|
|
| |
| eprintln!( |
| "BENCH: {} addresses in {:.6} s ({:.3} us/addr)", |
| n, |
| elapsed.as_secs_f64(), |
| elapsed.as_secs_f64() * 1_000_000.0 / n.max(1) as f64 |
| ); |
| } |
| _ => { |
| eprintln!("Unknown command: {}. Use 'convert' or 'bench'.", args[1]); |
| std::process::exit(1); |
| } |
| } |
| } |
|
|