| """ |
| OPUS (http://opus.nlpl.eu/) is a great collection of different parallel datasets for more than 400 languages. |
| On the website, you can download parallel datasets for many languages in different formats. I found that |
| the format "Bottom-left triangle: download plain text files (MOSES/GIZA++)" requires minimal |
| overhead for post-processing to get it into a suitable format for this library. |
| |
| You can use the OPUS dataset to create multilingual sentence embeddings. This script contains code to download |
| OPUS datasets for the desired languages and to create training files in the right format. |
| |
| 1) First, you need to install OpusTools (https://github.com/Helsinki-NLP/OpusTools/tree/master/opustools_pkg): |
| pip install opustools |
| |
| 2) Once you have OpusTools installed, you can download data in the right format via: |
| mkdir parallel-sentences |
| opus_read -d [CORPUS] -s [SRC_LANG] -t [TRG_LANG] --write parallel-sentences/[FILENAME].tsv.gz -wm moses -dl opus -p raw |
| |
| For example: |
| mkdir parallel-sentences |
| opus_read -d JW300 -s en -t de --write parallel-sentences/JW300-en-de.tsv.gz -wm moses -dl opus -p raw |
| |
| This downloads the JW300 Corpus (http://opus.nlpl.eu/JW300.php) for English (en) and German (de) and writes the output to |
| parallel-sentences/JW300-en-de.tsv.gz |
| |
| |
| #################### |
| |
| This Python code automates the download and creation of the parallel-sentence files. |
| |
| |
| """ |
| from opustools import OpusRead |
| import os |
|
|
|
|
# Download configuration: one output file is produced for every
# (corpus, source language, target language) combination listed here.
corpora = ['JW300']
source_languages = ['en']
target_languages = ['de', 'es', 'it', 'fr', 'ar', 'tr']

# output_folder receives the finished <corpus>-<src>-<trg>.tsv.gz files;
# opus_download_folder is passed to OpusRead as its download_dir.
output_folder = 'parallel-sentences'
opus_download_folder = './opus'

os.makedirs(output_folder, exist_ok=True)

for corpus in corpora:
    for src_lang in source_languages:
        for trg_lang in target_languages:
            output_filename = os.path.join(output_folder, "{}-{}-{}.tsv.gz".format(corpus, src_lang, trg_lang))

            # An existing file is treated as finished, so re-running the
            # script only fetches the language pairs that are still missing.
            if os.path.exists(output_filename):
                continue

            print("Create:", output_filename)
            completed = False
            try:
                read = OpusRead(
                    directory=corpus,
                    source=src_lang,
                    target=trg_lang,
                    write=[output_filename],
                    download_dir=opus_download_folder,
                    preprocess='raw',
                    write_mode='moses',
                    suppress_prompts=True,
                )
                read.printPairs()
                completed = True
            except Exception as exc:
                # Catch Exception rather than using a bare except: one failing
                # language pair must not stop the remaining ones, but Ctrl-C
                # (KeyboardInterrupt) and SystemExit still abort the script.
                print("An error occured during the creation of", output_filename)
                print("Reason: {}: {}".format(type(exc).__name__, exc))
            finally:
                # The file did not exist before this attempt, so anything found
                # here after a failure or interruption is a partial result.
                # Remove it; otherwise the exists() check above would skip this
                # language pair on every later run.
                if not completed and os.path.exists(output_filename):
                    try:
                        os.remove(output_filename)
                    except OSError as cleanup_error:
                        print("Could not remove incomplete file", output_filename, "-", cleanup_error)