#!/bin/bash
#
# Translate a file with a fairseq checkpoint. The pipeline normalizes the
# input, applies SentencePiece, adds language tags, decodes with
# fairseq-interactive and post-processes the decoder log into the output file.
#
# Usage:
#   <this script> <infname> <outfname> <src_lang> <tgt_lang> <ckpt_dir>
#
#   infname   input file to translate
#   outfname  final output file; intermediate files are written next to it as
#             <outfname>._norm, <outfname>.norm, <outfname>._bpe,
#             <outfname>.bpe and <outfname>.log (they are not removed)
#   src_lang  source language tag (matched by substring, e.g. "Latn")
#   tgt_lang  target language tag
#   ckpt_dir  checkpoint directory; this script reads vocab/model.SRC,
#             vocab/model.TGT, final_bin and model/checkpoint_best.pt from it
#
# The helpers (normalize_punctuation.sh, scripts/*.py) and the fairseq user
# dir (model_configs) are referenced by relative path, so run this script from
# the directory that contains them.
#
# Environment:
#   PYTHON  interpreter used for the helper scripts (default: python3)

# Abort on the first failing step instead of feeding a broken intermediate
# file into the next stage.
set -euo pipefail

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print "false" when the language tag ($1) contains one of the script names
# Arab, Olck, Mtei or Latn, and "true" otherwise. The result is handed to the
# pre-/post-processing scripts as their transliteration flag.
transliterate_flag() {
  case "$1" in
    *Arab* | *Olck* | *Mtei* | *Latn*) printf 'false\n' ;;
    *) printf 'true\n' ;;
  esac
}

if (( $# < 5 )); then
  printf 'Usage: %s <infname> <outfname> <src_lang> <tgt_lang> <ckpt_dir>\n' \
    "${0##*/}" >&2
  exit 2
fi

date

infname=$1
outfname=$2
src_lang=$3
tgt_lang=$4
ckpt_dir=$5

[[ -r "$infname" ]] || die "cannot read input file: $infname"
[[ -d "$ckpt_dir" ]] || die "checkpoint directory not found: $ckpt_dir"

# One interpreter for every helper script.
python_bin=${PYTHON:-python3}

src_transliterate=$(transliterate_flag "$src_lang")
tgt_transliterate=$(transliterate_flag "$tgt_lang")

# Side names passed to fairseq (-s/-t); they are also the suffixes of the
# SentencePiece model files under $ckpt_dir/vocab.
readonly SRC_PREFIX='SRC'
readonly TGT_PREFIX='TGT'

echo "Normalizing punctuations"
bash normalize_punctuation.sh "$src_lang" < "$infname" > "$outfname._norm"

echo "Adding do not translate tags"
"$python_bin" scripts/normalize_regex_inference.py \
  "$outfname._norm" "$outfname.norm"
# Move the tagged text back to ._norm so the next step can reuse the .norm
# name for its own output.
mv -f -- "$outfname.norm" "$outfname._norm"

echo "Applying normalization and script conversion"
# The script's stdout is captured as the input size and forwarded to the
# post-processing step below.
# NOTE(review): the trailing literal "false" is a fixed flag whose meaning is
# defined by preprocess_translate.py - confirm there before changing it.
input_size=$("$python_bin" scripts/preprocess_translate.py \
  "$outfname._norm" "$outfname.norm" "$src_lang" "$src_transliterate" false)
echo "Number of sentences in input: $input_size"

echo "Applying sentence piece"
spm_encode --model "$ckpt_dir/vocab/model.$SRC_PREFIX" \
  --output_format=piece \
  < "$outfname.norm" \
  > "$outfname._bpe"

echo "Adding language tags"
"$python_bin" scripts/add_tags_translate.py \
  "$outfname._bpe" "$outfname.bpe" "$src_lang" "$tgt_lang"

echo "Decoding"
# stdout and stderr both go to the log, which is what the post-processing
# step parses; on failure point the user at that log since nothing is shown
# on the terminal.
if ! fairseq-interactive "$ckpt_dir/final_bin" \
  -s "$SRC_PREFIX" -t "$TGT_PREFIX" \
  --distributed-world-size 1 --fp16 \
  --path "$ckpt_dir/model/checkpoint_best.pt" \
  --task translation \
  --user-dir model_configs \
  --skip-invalid-size-inputs-valid-test \
  --batch-size 128 --buffer-size 2500 --beam 5 \
  --input "$outfname.bpe" > "$outfname.log" 2>&1; then
  die "fairseq-interactive failed; see $outfname.log"
fi

echo "Extracting translations, script conversion and detokenization"
"$python_bin" scripts/postprocess_translate.py \
  "$outfname.log" "$outfname" "$input_size" "$tgt_lang" "$tgt_transliterate" \
  "$ckpt_dir/vocab/model.$TGT_PREFIX"

echo "Translation completed"