| import argparse |
| import json |
| from pathlib import Path |
| from typing import Any |
|
|
| from openai import OpenAI |
|
|
|
|
# Prompt template file. main() reads it once and build_prompt() fills in its
# <MEDICAL_TEXT>, <SOURCE_LANGUAGE> and <TARGET_LANGUAGE> placeholders.
PROMPT_PATH = Path("/home/mshahidul/readctrl/prompts/translation_prompt.txt")
# JSON file with API credentials; main() reads the value under its "openai" key.
# NOTE(review): both paths are absolute and machine-specific, so the script
# only runs unchanged on a host that has these files.
API_KEY_PATH = Path("/home/mshahidul/api_new.json")
|
|
|
|
def parse_csv_list(raw: str) -> list[str]:
    """Split a comma-separated string into its trimmed, non-empty entries.

    Args:
        raw: Text such as ``"a.txt, b.txt"``. A falsy value (empty string or
            ``None``) yields an empty list.

    Returns:
        The entries in their original order, with surrounding whitespace
        removed and blank entries dropped.
    """
    entries: list[str] = []
    if raw:
        for piece in raw.split(","):
            trimmed = piece.strip()
            if trimmed:
                entries.append(trimmed)
    return entries
|
|
|
|
def parse_indices(raw: str) -> list[int]:
    """Parse a comma-separated string of integers.

    Args:
        raw: Text such as ``"36,40,44"``; it is tokenized by
            ``parse_csv_list``, so blank entries are ignored.

    Returns:
        The parsed integers in input order.

    Raises:
        ValueError: If any entry is not a valid integer. The original
            conversion error is chained as the cause.
    """

    def _as_int(token: str) -> int:
        try:
            return int(token)
        except ValueError as exc:
            raise ValueError(f"Invalid index '{token}'. Indices must be integers.") from exc

    return [_as_int(token) for token in parse_csv_list(raw)]
|
|
|
|
def load_json(path: Path) -> Any:
    """Read the UTF-8 file at *path* and return its decoded JSON content.

    Args:
        path: Location of the JSON file.

    Returns:
        Whatever Python value the JSON document decodes to.
    """
    document = path.read_text(encoding="utf-8")
    return json.loads(document)
|
|
|
|
def save_json(path: Path, data: Any) -> None:
    """Serialize *data* as pretty-printed UTF-8 JSON at *path*.

    The document is first written to a sibling ``<name>.tmp`` file and then
    moved over *path*. Opening *path* directly in ``"w"`` mode would truncate
    it before serialization starts, so a non-serializable value or an
    interrupt mid-write would destroy the previous content. That matters
    because this script overwrites its input file in place by default.

    Args:
        path: Destination file; replaced only after a complete write.
        data: JSON-serializable value to store.

    Raises:
        TypeError: If *data* contains a value ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_path = path.with_name(f"{path.name}.tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        # Swap the finished file into place; the old content stays intact
        # until this point.
        tmp_path.replace(path)
    except BaseException:
        # Do not leave a partial temp file behind, even on Ctrl-C.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
|
|
def build_prompt(
    prompt_template: str,
    medical_text: str,
    source_language: str,
    target_language: str,
) -> str:
    """Fill the placeholders of *prompt_template* in a single pass.

    Every occurrence of ``<MEDICAL_TEXT>``, ``<SOURCE_LANGUAGE>`` and
    ``<TARGET_LANGUAGE>`` in the template is replaced. Substitution happens
    in one scan of the template, so placeholder-like text inside an inserted
    value (for example a medical note that literally contains
    ``<TARGET_LANGUAGE>``) is left untouched rather than being rewritten by
    a later replacement.

    Args:
        prompt_template: Template text containing the placeholders.
        medical_text: Text substituted for ``<MEDICAL_TEXT>``.
        source_language: Text substituted for ``<SOURCE_LANGUAGE>``.
        target_language: Text substituted for ``<TARGET_LANGUAGE>``.

    Returns:
        The template with all placeholders substituted.
    """
    import re

    substitutions = {
        "<MEDICAL_TEXT>": medical_text,
        "<SOURCE_LANGUAGE>": source_language,
        "<TARGET_LANGUAGE>": target_language,
    }
    placeholder_re = re.compile("|".join(re.escape(key) for key in substitutions))
    # A callable replacement inserts values verbatim: backslashes in the
    # medical text are not interpreted as regex escapes.
    return placeholder_re.sub(lambda match: substitutions[match.group(0)], prompt_template)
|
|
|
|
def translate_text(
    client: OpenAI,
    prompt: str,
    model: str = "gpt-5",
) -> str | None:
    """Request a translation from the chat-completions API and return its text.

    The request asks for a JSON object, and the value stored under the
    ``"translated_medical_note"`` key of the reply is returned.

    Args:
        client: OpenAI client used to issue the request.
        prompt: Fully rendered user prompt.
        model: Model name passed to the API.

    Returns:
        The translated text, or ``None`` when the request fails, the reply
        is empty or not valid JSON, or the expected key does not hold a
        string. Failures are reported on stdout with a ``[WARN]`` prefix
        and never raised, so one bad record cannot abort a batch.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that outputs only valid JSON.",
                },
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content
    except Exception as exc:  # deliberate best-effort boundary around the API call
        print(f"[WARN] API/parsing error: {exc}")
        return None

    # The message content can be absent; handle that explicitly rather than
    # relying on an AttributeError from calling .strip() on None.
    if not isinstance(content, str) or not content.strip():
        print("[WARN] API/parsing error: empty response content")
        return None

    cleaned = content.strip()
    # Remove a wrapping Markdown code fence only at the edges. Deleting every
    # "```" in the reply would also corrupt backticks inside the translation.
    if cleaned.startswith("```"):
        opener = "```json" if cleaned.startswith("```json") else "```"
        cleaned = cleaned[len(opener):].removesuffix("```").strip()

    try:
        parsed = json.loads(cleaned)
    except ValueError as exc:
        print(f"[WARN] API/parsing error: {exc}")
        return None

    translated = parsed.get("translated_medical_note") if isinstance(parsed, dict) else None
    # Honour the declared return type: a nested object, list or number under
    # the key is treated as a failed translation, not stored as the text.
    if not isinstance(translated, str):
        print("[WARN] API/parsing error: 'translated_medical_note' missing or not a string")
        return None
    return translated
|
|
|
|
def get_target_positions(
    data: list[dict[str, Any]],
    target_indices: set[int],
    target_ids: set[str],
) -> list[int]:
    """Return the list positions of every record matching a selector.

    A record is selected when any of the following holds:

    * its position in *data* is in *target_indices*;
    * its ``"index"`` field is an ``int`` contained in *target_indices*;
    * its ``"id"`` field is not ``None`` and its string form is in
      *target_ids*.

    Args:
        data: Records to scan.
        target_indices: Wanted list positions and/or ``"index"`` values.
        target_ids: Wanted ``"id"`` values, compared as strings.

    Returns:
        Matching positions in ascending order, without duplicates.
    """
    matched: list[int] = []
    # enumerate() yields positions in ascending order and each position is
    # appended at most once, so the result is already sorted and unique.
    for position, record in enumerate(data):
        record_index = record.get("index")
        record_id = record.get("id")
        by_position = position in target_indices
        by_index_field = isinstance(record_index, int) and record_index in target_indices
        by_id = record_id is not None and str(record_id) in target_ids
        if by_position or by_index_field or by_id:
            matched.append(position)
    return matched
|
|
|
|
# Indices used only when the caller supplies neither --indices nor --ids.
_DEFAULT_INDICES = "36,40,44,48"


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the retranslation script."""
    parser = argparse.ArgumentParser(
        description=(
            "Retranslate selected records' fulltext using gpt-5, "
            "selected by array index/item index and/or id."
        )
    )
    parser.add_argument(
        "--input",
        required=True,
        help="Path to JSON file (list of records).",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Optional output path. Defaults to in-place overwrite of --input.",
    )
    parser.add_argument(
        "--indices",
        default=None,
        help=(
            "Comma-separated list of indices (e.g., 36,40,44,48). "
            f"Defaults to {_DEFAULT_INDICES} when neither --indices nor --ids is given."
        ),
    )
    parser.add_argument(
        "--ids",
        default="",
        help='Comma-separated list of ids (e.g., "a.txt,b.txt").',
    )
    parser.add_argument(
        "--source-language",
        default="English",
        help="Source language name for prompt replacement.",
    )
    parser.add_argument(
        "--target-language",
        default="Bengali",
        help="Target language name for prompt replacement.",
    )
    parser.add_argument(
        "--model",
        default="gpt-5",
        help="OpenAI model name (default: gpt-5).",
    )
    parser.add_argument(
        "--save-every",
        type=int,
        default=1,
        help="Incremental save frequency in processed items (default: 1).",
    )
    return parser


def main() -> None:
    """Retranslate the ``fulltext`` of selected records and save the result.

    Steps: parse the command line, load the input JSON list, resolve the
    selected records from ``--indices`` / ``--ids``, translate each record's
    ``fulltext`` and store it under ``translated_fulltext``, saving
    incrementally and once more at the end.

    Raises:
        ValueError: If the input JSON is not a list, an index is not an
            integer, or no selector is provided.
    """
    args = _build_arg_parser().parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output) if args.output else input_path

    data = load_json(input_path)
    if not isinstance(data, list):
        raise ValueError("Input JSON must be a list of records.")

    ids = set(parse_csv_list(args.ids))
    raw_indices = args.indices
    if raw_indices is None:
        # Fall back to the built-in indices only when no selector was given.
        # A non-empty default on --indices itself would silently add those
        # records to every --ids-only run and overwrite their translations.
        raw_indices = "" if ids else _DEFAULT_INDICES
    indices = set(parse_indices(raw_indices))
    if not indices and not ids:
        raise ValueError("Provide at least one selector: --indices and/or --ids.")

    target_positions = get_target_positions(data, indices, ids)
    if not target_positions:
        print("No matching records found for provided indices/ids.")
        return

    # Load the prompt and credentials only once there is work to do, so a
    # run that matches nothing does not need either file.
    prompt_template = PROMPT_PATH.read_text(encoding="utf-8")
    api_keys = load_json(API_KEY_PATH)
    openai_api_key = api_keys["openai"]
    client = OpenAI(api_key=openai_api_key)

    print(f"Matched {len(target_positions)} record(s): {target_positions}")
    processed = 0
    # Clamp so that zero or negative values behave like "save after every item".
    save_every = max(args.save_every, 1)

    for pos in target_positions:
        item = data[pos]
        fulltext = item.get("fulltext")
        if not isinstance(fulltext, str) or not fulltext.strip():
            print(f"[SKIP] pos={pos} id={item.get('id')} has empty fulltext.")
            continue

        prompt = build_prompt(
            prompt_template=prompt_template,
            medical_text=fulltext,
            source_language=args.source_language,
            target_language=args.target_language,
        )
        translated = translate_text(client=client, prompt=prompt, model=args.model)
        if translated is None:
            print(f"[WARN] pos={pos} id={item.get('id')} translation failed.")
            continue

        item["translated_fulltext"] = translated
        processed += 1
        print(f"[OK] pos={pos} id={item.get('id')} translated_fulltext updated.")

        if processed % save_every == 0:
            save_json(output_path, data)
            print(f"[SAVE] Incremental save after {processed} item(s) -> {output_path}")

    # Final save covers items processed since the last incremental save and
    # ensures the output file exists even when nothing was updated.
    save_json(output_path, data)
    print(f"Done. Total updated records: {processed}. Saved to: {output_path}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|