Spaces:

polyglot-tagger
/

language-extractor-demo

Running

language-extractor-demo / convert_tatoeba_sentences.py

add

1d100ed 5 days ago

893 Bytes

	#!/usr/bin/env python3
	"""Convert the raw Tatoeba sentence dump into a lean parquet cache."""

	from __future__ import annotations

	import argparse
	from pathlib import Path

	from tatoeba import TATOEBA_PARQUET_PATH, build_tatoeba_text_parquet


	def build_arg_parser() -> argparse.ArgumentParser:
	    """Create the command-line parser for this conversion script.

	    The parser uses the module docstring as its description and registers
	    two optional arguments, both declared with ``type=Path``:

	    * ``--input-path`` defaults to ``sentences.csv`` next to this file.
	    * ``--output-path`` defaults to ``TATOEBA_PARQUET_PATH``.

	    Returns:
	        The configured ``argparse.ArgumentParser``.
	    """
	    cli = argparse.ArgumentParser(description=__doc__)
	    # (flag, default value, help text) for each option, in registration order.
	    option_specs = (
	        (
	            "--input-path",
	            Path(__file__).with_name("sentences.csv"),
	            "Path to the raw Tatoeba TSV dump.",
	        ),
	        (
	            "--output-path",
	            TATOEBA_PARQUET_PATH,
	            "Where to write the lean parquet cache.",
	        ),
	    )
	    for flag, fallback, description in option_specs:
	        cli.add_argument(flag, type=Path, default=fallback, help=description)
	    return cli


	def main(argv: list[str] | None = None) -> None:
	    """Parse command-line options and build the parquet cache.

	    Args:
	        argv: Argument strings to parse in place of the process command
	            line. ``None`` (the default) lets argparse read ``sys.argv[1:]``,
	            so calling ``main()`` with no arguments behaves as before.
	    """
	    # Forwarding argv lets callers and tests drive the CLI without patching
	    # sys.argv.
	    args = build_arg_parser().parse_args(argv)
	    build_tatoeba_text_parquet(args.input_path, args.output_path)


	# Run the conversion only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()