| |
| """Convert the raw Tatoeba sentence dump into a lean parquet cache.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| from pathlib import Path |
|
|
| from tatoeba import TATOEBA_PARQUET_PATH, build_tatoeba_text_parquet |
|
|
|
|
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this conversion script.

    Two options are registered, both converted to ``Path`` objects:

    * ``--input-path``: the raw Tatoeba dump; defaults to a file named
      ``sentences.csv`` located next to this script.
    * ``--output-path``: where the parquet cache is written; defaults to
      ``TATOEBA_PARQUET_PATH``.

    Returns:
        The configured ``argparse.ArgumentParser``; the module docstring is
        used as its description.
    """
    default_input = Path(__file__).with_name("sentences.csv")
    cli = argparse.ArgumentParser(description=__doc__)

    # (flag, default value, help text) for each path-valued option, in the
    # order they should appear in ``--help``.
    path_options = (
        ("--input-path", default_input, "Path to the raw Tatoeba TSV dump."),
        (
            "--output-path",
            TATOEBA_PARQUET_PATH,
            "Where to write the lean parquet cache.",
        ),
    )
    for flag, default_value, help_text in path_options:
        cli.add_argument(flag, type=Path, default=default_value, help=help_text)

    return cli
|
|
|
|
def main(argv: list[str] | None = None) -> None:
    """Parse the command-line options and build the parquet cache.

    Args:
        argv: Argument strings to parse instead of the process command line.
            When ``None`` (the default) ``argparse`` reads ``sys.argv[1:]``,
            so calling ``main()`` with no arguments behaves exactly as
            before. Passing an explicit list lets other code (or tests)
            drive the script without touching ``sys.argv``.
    """
    args = build_arg_parser().parse_args(argv)
    # Hand the resolved paths to the converter, which does the actual work.
    build_tatoeba_text_parquet(args.input_path, args.output_path)
|
|
|
|
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|
|