readCtrl_lambda / code /translation /retranslate_fulltext_by_index_or_id.py
mshahidul
Initial commit of readCtrl code without large models
030876e
import argparse
import json
from pathlib import Path
from typing import Any
from openai import OpenAI
# Prompt template file. main() reads it as UTF-8 text and hands it to
# build_prompt() as the template. NOTE: absolute, machine-specific path.
PROMPT_PATH = Path("/home/mshahidul/readctrl/prompts/translation_prompt.txt")
# JSON file with API credentials; main() reads its "openai" entry to build the
# OpenAI client. NOTE: absolute, machine-specific path.
API_KEY_PATH = Path("/home/mshahidul/api_new.json")
def parse_csv_list(raw: str) -> list[str]:
    """Split a comma-separated string into trimmed, non-empty tokens.

    A falsy ``raw`` (empty string or ``None``) yields an empty list. Entries
    that are empty once surrounding whitespace is removed are dropped.
    """
    # map() trims each piece; filter(None, ...) discards the empty ones.
    return list(filter(None, map(str.strip, raw.split(",")))) if raw else []
def parse_indices(raw: str) -> list[int]:
    """Convert a comma-separated string of integers into a list of ints.

    Tokenisation is delegated to ``parse_csv_list``, so blank entries are
    ignored and an empty input gives an empty list.

    Raises:
        ValueError: if any token cannot be converted by ``int()``; the
            original conversion error is chained as the cause.
    """
    indices: list[int] = []
    for token in parse_csv_list(raw):
        try:
            value = int(token)
        except ValueError as exc:
            raise ValueError(
                f"Invalid index '{token}'. Indices must be integers."
            ) from exc
        indices.append(value)
    return indices
def load_json(path: Path) -> Any:
    """Return the decoded JSON content of the UTF-8 text file at ``path``."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
def save_json(path: Path, data: Any) -> None:
    """Write ``data`` to ``path`` as indented UTF-8 JSON, atomically.

    The JSON is first written to a sibling ``<name>.tmp`` file and then moved
    over ``path``. This matters because ``main`` may save repeatedly onto the
    same file it loaded its records from: writing in place would truncate that
    file if serialization failed or the process was interrupted mid-write.

    Args:
        path: Destination file; its parent directory must be writable.
        data: Any JSON-serializable object.

    Raises:
        TypeError: if ``data`` contains a value ``json`` cannot serialize.
        OSError: if the temporary file cannot be written or moved.
    """
    tmp_path = path.with_name(f"{path.name}.tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        # Path.replace overwrites an existing destination in a single rename.
        tmp_path.replace(path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a partial temp file
        # behind, and leave the previous content of ``path`` untouched.
        tmp_path.unlink(missing_ok=True)
        raise
def build_prompt(
    prompt_template: str,
    medical_text: str,
    source_language: str,
    target_language: str,
) -> str:
    """Fill the translation prompt template with the given values.

    Every occurrence of ``<MEDICAL_TEXT>``, ``<SOURCE_LANGUAGE>`` and
    ``<TARGET_LANGUAGE>`` in ``prompt_template`` is replaced by the
    corresponding argument. Substitution happens in a single pass over the
    template, so placeholder-like text that appears inside an inserted value
    (for example a medical note that literally contains
    ``<TARGET_LANGUAGE>``) is left untouched instead of being replaced again.

    Returns:
        The template with all three placeholders substituted.
    """
    import re  # local import keeps this block self-contained

    substitutions = {
        "<MEDICAL_TEXT>": medical_text,
        "<SOURCE_LANGUAGE>": source_language,
        "<TARGET_LANGUAGE>": target_language,
    }
    pattern = "|".join(re.escape(token) for token in substitutions)
    # A callable replacement avoids backslash-escape processing of the values.
    return re.sub(
        pattern,
        lambda match: substitutions[match.group(0)],
        prompt_template,
    )
def translate_text(
    client: OpenAI,
    prompt: str,
    model: str = "gpt-5",
) -> str | None:
    """Send ``prompt`` to the chat completions API and extract the translation.

    The request asks for a JSON object response. The reply is expected to be
    a JSON object whose ``translated_medical_note`` entry holds the translated
    text.

    Args:
        client: OpenAI client used to issue the request.
        prompt: Fully rendered user prompt.
        model: Model name passed to the API.

    Returns:
        The translated text, or ``None`` when the request fails, the reply is
        not a JSON object, or ``translated_medical_note`` is missing, not a
        string, or blank. Every failure except a non-object JSON reply also
        prints a ``[WARN]`` line.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that outputs only valid JSON.",
                },
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content
    except Exception as exc:
        # Deliberately broad: any API/transport failure is reported and turned
        # into a per-record failure so the caller can continue with the rest.
        print(f"[WARN] API/parsing error: {exc}")
        return None

    if not isinstance(content, str):
        print("[WARN] API/parsing error: response contained no text content.")
        return None

    cleaned = content.strip()
    # Only strip a Markdown fence that wraps the whole reply; backticks inside
    # the translated text itself must be preserved.
    if cleaned.startswith("```"):
        cleaned = cleaned.removeprefix("```json").removeprefix("```")
        cleaned = cleaned.removesuffix("```").strip()

    try:
        parsed = json.loads(cleaned)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        print(f"[WARN] API/parsing error: {exc}")
        return None

    if not isinstance(parsed, dict):
        return None

    translated = parsed.get("translated_medical_note")
    if not isinstance(translated, str) or not translated.strip():
        print(
            "[WARN] API/parsing error: 'translated_medical_note' is missing "
            "or not a non-empty string."
        )
        return None
    return translated
def get_target_positions(
    data: list[dict[str, Any]],
    target_indices: set[int],
    target_ids: set[str],
) -> list[int]:
    """Return the ascending array positions of the records selected.

    A record at position ``p`` is selected when any of these holds:

    * ``p`` itself is in ``target_indices``;
    * the record's ``"index"`` value is an ``int`` in ``target_indices``;
    * the record's ``"id"`` value is not ``None`` and its ``str()`` form is in
      ``target_ids``.
    """
    matched: list[int] = []
    # enumerate() yields positions in increasing order, each exactly once, so
    # the result is already sorted and free of duplicates.
    for position, record in enumerate(data):
        record_index = record.get("index")
        record_id = record.get("id")
        by_index = position in target_indices or (
            isinstance(record_index, int) and record_index in target_indices
        )
        by_id = record_id is not None and str(record_id) in target_ids
        if by_index or by_id:
            matched.append(position)
    return matched
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the retranslation script."""
    parser = argparse.ArgumentParser(
        description=(
            "Retranslate selected records' fulltext using gpt-5, "
            "selected by array index/item index and/or id."
        )
    )
    parser.add_argument(
        "--input",
        required=True,
        help="Path to JSON file (list of records).",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Optional output path. Defaults to in-place overwrite of --input.",
    )
    # NOTE: because of this non-empty default, passing only --ids still selects
    # these indices as well; pass --indices "" to select by id alone.
    parser.add_argument(
        "--indices",
        default="36,40,44,48",
        help="Comma-separated list of indices (e.g., 36,40,44,48).",
    )
    parser.add_argument(
        "--ids",
        default="",
        help='Comma-separated list of ids (e.g., "a.txt,b.txt").',
    )
    parser.add_argument(
        "--source-language",
        default="English",
        help="Source language name for prompt replacement.",
    )
    parser.add_argument(
        "--target-language",
        default="Bengali",
        help="Target language name for prompt replacement.",
    )
    parser.add_argument(
        "--model",
        default="gpt-5",
        help="OpenAI model name (default: gpt-5).",
    )
    parser.add_argument(
        "--save-every",
        type=int,
        default=1,
        help="Incremental save frequency in processed items (default: 1).",
    )
    return parser


def main() -> None:
    """Retranslate the ``fulltext`` of selected records in a JSON list file.

    Loads the list at ``--input``, selects records via ``--indices`` and/or
    ``--ids``, translates each selected record's ``fulltext`` and stores the
    result under ``translated_fulltext``. The data is saved to ``--output``
    (or back onto ``--input``) every ``--save-every`` updated records and once
    more at the end. Records with empty ``fulltext`` are skipped; records
    whose translation fails are left unchanged and counted in a final warning.

    Raises:
        ValueError: if the input JSON is not a list, an index is not an
            integer, or no selector is provided.
    """
    args = _build_arg_parser().parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output) if args.output else input_path

    data = load_json(input_path)
    if not isinstance(data, list):
        raise ValueError("Input JSON must be a list of records.")

    indices = set(parse_indices(args.indices))
    ids = set(parse_csv_list(args.ids))
    if not indices and not ids:
        raise ValueError("Provide at least one selector: --indices and/or --ids.")

    # Resolve the selection before touching the prompt file or credentials so
    # a run with nothing to do exits cleanly without needing either.
    target_positions = get_target_positions(data, indices, ids)
    if not target_positions:
        print("No matching records found for provided indices/ids.")
        return
    print(f"Matched {len(target_positions)} record(s): {target_positions}")

    prompt_template = PROMPT_PATH.read_text(encoding="utf-8")
    api_keys = load_json(API_KEY_PATH)
    client = OpenAI(api_key=api_keys["openai"])

    # Clamp so a zero or negative --save-every cannot break the modulo below.
    save_every = max(args.save_every, 1)
    processed = 0
    failed = 0
    for pos in target_positions:
        item = data[pos]
        fulltext = item.get("fulltext")
        if not isinstance(fulltext, str) or not fulltext.strip():
            print(f"[SKIP] pos={pos} id={item.get('id')} has empty fulltext.")
            continue

        prompt = build_prompt(
            prompt_template=prompt_template,
            medical_text=fulltext,
            source_language=args.source_language,
            target_language=args.target_language,
        )
        translated = translate_text(client=client, prompt=prompt, model=args.model)
        if translated is None:
            failed += 1
            print(f"[WARN] pos={pos} id={item.get('id')} translation failed.")
            continue

        item["translated_fulltext"] = translated
        processed += 1
        print(f"[OK] pos={pos} id={item.get('id')} translated_fulltext updated.")
        if processed % save_every == 0:
            save_json(output_path, data)
            print(f"[SAVE] Incremental save after {processed} item(s) -> {output_path}")

    # Final save also guarantees the output file exists when nothing changed.
    save_json(output_path, data)
    print(f"Done. Total updated records: {processed}. Saved to: {output_path}")
    if failed:
        print(f"[WARN] {failed} record(s) failed to translate and were left unchanged.")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()