language-extractor-demo / convert_tatoeba_sentences.py
DerivedFunction1's picture
add
1d100ed
raw
history blame contribute delete
893 Bytes
#!/usr/bin/env python3
"""Convert the raw Tatoeba sentence dump into a lean parquet cache."""
from __future__ import annotations
import argparse
from pathlib import Path
from tatoeba import TATOEBA_PARQUET_PATH, build_tatoeba_text_parquet
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script.

    Two options are registered, and both values are parsed as
    ``pathlib.Path`` objects:

    * ``--input-path`` defaults to a file named ``sentences.csv`` located
      next to this script.
    * ``--output-path`` defaults to ``TATOEBA_PARQUET_PATH``.

    Returns:
        The configured ``argparse.ArgumentParser``; the module docstring is
        used as its description.
    """
    # Sibling of this script file, independent of the current working directory.
    default_input = Path(__file__).with_name("sentences.csv")

    # (flag, default value, help text) for every path-valued option.
    path_options = (
        ("--input-path", default_input, "Path to the raw Tatoeba TSV dump."),
        (
            "--output-path",
            TATOEBA_PARQUET_PATH,
            "Where to write the lean parquet cache.",
        ),
    )

    cli = argparse.ArgumentParser(description=__doc__)
    for flag, default_value, help_text in path_options:
        cli.add_argument(flag, type=Path, default=default_value, help=help_text)
    return cli
def main() -> None:
    """Run the conversion using paths taken from the command line.

    Parses ``--input-path`` and ``--output-path`` with the parser from
    ``build_arg_parser`` and passes both values, in that order, to
    ``build_tatoeba_text_parquet``.
    """
    cli = build_arg_parser()
    options = cli.parse_args()
    source_path = options.input_path
    target_path = options.output_path
    build_tatoeba_text_parquet(source_path, target_path)
# Run the conversion only when this file is executed as a script, so that
# importing the module (e.g. to reuse build_arg_parser) has no side effects.
if __name__ == "__main__":
    main()