#!/usr/bin/env python3 """Convert the raw Tatoeba sentence dump into a lean parquet cache.""" from __future__ import annotations import argparse from pathlib import Path from tatoeba import TATOEBA_PARQUET_PATH, build_tatoeba_text_parquet def build_arg_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--input-path", type=Path, default=Path(__file__).with_name("sentences.csv"), help="Path to the raw Tatoeba TSV dump.", ) parser.add_argument( "--output-path", type=Path, default=TATOEBA_PARQUET_PATH, help="Where to write the lean parquet cache.", ) return parser def main() -> None: args = build_arg_parser().parse_args() build_tatoeba_text_parquet(args.input_path, args.output_path) if __name__ == "__main__": main()