#!/usr/bin/env bash
#
# Preprocess and binarize parallel data for joint translation training with
# fairseq.
#
# Stages, in order:
#   1. For every language-pair directory "<src>-<tgt>" under the train data
#      directory, preprocess the train and dev splits (punctuation
#      normalization, scripts/normalize_regex.py,
#      scripts/preprocess_translate.py) into <exp_dir>/norm/<src>-<tgt>/.
#   2. scripts/concat_joint_data.py  <exp_dir>/norm -> <exp_dir>/data
#   3. apply_sentence_piece.sh       <exp_dir>/data -> <exp_dir>/bpe
#   4. scripts/add_joint_tags_translate.py (output expected in <exp_dir>/final)
#   5. fairseq-preprocess            <exp_dir>/final -> <exp_dir>/final_bin
#
# Usage:
#   bash <this script> <exp_dir> [vocab_dir] [train_data_dir] [devtest_data_dir]
#
# Arguments:
#   exp_dir           experiment directory (required)
#   vocab_dir         default: <exp_dir>/vocab
#   train_data_dir    default: <exp_dir>/train
#   devtest_data_dir  default: <exp_dir>/devtest/all
#
# NOTE: scripts/*.py and apply_sentence_piece.sh are resolved relative to the
# current working directory, while normalize_punctuation.sh is resolved
# relative to this script's own directory. Run the script from the directory
# that contains those helpers.

# Abort on the first failing step instead of binarizing partial data.
set -euo pipefail

# Directory containing this script; used to locate normalize_punctuation.sh.
root=$(dirname -- "$0")

# "true" when GNU Parallel is available on PATH, "false" otherwise.
parallel_installed=false

#######################################
# Print a message to stderr and exit with status 1.
# Arguments: $@ - message
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Print the usage line to stderr and exit with status 2.
#######################################
usage() {
  printf 'Usage: %s <exp_dir> [vocab_dir] [train_data_dir] [devtest_data_dir]\n' \
    "${0##*/}" >&2
  exit 2
}

#######################################
# Print the transliterate flag handed to scripts/preprocess_translate.py:
# "false" when the language code contains Arab, Olck, Mtei or Latn,
# "true" otherwise.
# Arguments: $1 - language code
# Outputs:   "true" or "false" on stdout
#######################################
transliterate_flag() {
  case "$1" in
    *Arab* | *Olck* | *Mtei* | *Latn*) printf 'false\n' ;;
    *) printf 'true\n' ;;
  esac
}

#######################################
# Run normalize_punctuation.sh over one file, through GNU Parallel when it
# is installed (--keep-order preserves the input line order).
# Globals:   root, parallel_installed (read)
# Arguments: $1 - language code
#            $2 - input file
#            $3 - output file
#######################################
normalize_punctuation() {
  local lang=$1 infile=$2 outfile=$3

  if [[ "$parallel_installed" == true ]]; then
    parallel --pipe --keep-order bash "$root/normalize_punctuation.sh" "$lang" \
      < "$infile" > "$outfile"
  else
    bash "$root/normalize_punctuation.sh" "$lang" < "$infile" > "$outfile"
  fi
}

#######################################
# Preprocess one split ("train" or "dev") of one language pair.
# Reads  <in_dir>/<split>.<lang> and writes, next to each other in
# <norm_dir>: <split>.<lang>._norm, <split>.<lang>.norm and <split>.<lang>.
# Arguments: $1 - split name
#            $2 - directory holding the raw files of the pair
#            $3 - output directory for the pair
#            $4 - source language code
#            $5 - target language code
#            $6 - source transliterate flag ("true"/"false")
#            $7 - target transliterate flag ("true"/"false")
# Outputs:   progress messages and the sentence count on stdout
#######################################
preprocess_split() {
  local split=$1 in_dir=$2 norm_dir=$3 src_lang=$4 tgt_lang=$5
  local src_transliterate=$6 tgt_transliterate=$7

  local in_src=$in_dir/$split.$src_lang
  local in_tgt=$in_dir/$split.$tgt_lang
  local out_src=$norm_dir/$split.$src_lang
  local out_tgt=$norm_dir/$split.$tgt_lang
  local input_size

  echo "Normalizing punctuations for $split"
  normalize_punctuation "$src_lang" "$in_src" "$out_src._norm"
  normalize_punctuation "$tgt_lang" "$in_tgt" "$out_tgt._norm"

  echo "Applying do not translate tags for $split"
  python3 scripts/normalize_regex.py \
    "$out_src._norm" "$out_tgt._norm" "$out_src.norm" "$out_tgt.norm"

  echo "Applying normalization and script conversion for $split"
  # The script prints a count on stdout. Only the target-side count is
  # reported, so the source-side output is discarded.
  # NOTE(review): the trailing false/true argument presumably marks the
  # source vs. target side -- confirm against preprocess_translate.py.
  python3 scripts/preprocess_translate.py \
    "$out_src.norm" "$out_src" "$src_lang" "$src_transliterate" false > /dev/null
  input_size=$(python3 scripts/preprocess_translate.py \
    "$out_tgt.norm" "$out_tgt" "$tgt_lang" "$tgt_transliterate" true)
  echo "Number of sentences in $split: $input_size"
}

#######################################
# Entry point: runs all stages described in the file header.
# Globals:   parallel_installed (written)
# Arguments: see "Usage" in the file header
#######################################
main() {
  local exp_dir=${1:-}
  # An empty exp_dir would turn the paths below into /data, /final_bin, ...
  [[ -n "$exp_dir" ]] || usage

  # Accepted to keep the positional interface stable; this script does not
  # reference it anywhere else.
  # shellcheck disable=SC2034
  local vocab_dir=${2:-"$exp_dir/vocab"}
  local train_data_dir=${3:-"$exp_dir/train"}
  local devtest_data_dir=${4:-"$exp_dir/devtest/all"}

  date
  echo "Running experiment ${exp_dir}"

  [[ -d "$train_data_dir" ]] || die "train data directory not found: $train_data_dir"

  local processed_dir=$exp_dir/data
  local out_data_dir=$exp_dir/final_bin
  mkdir -p "$processed_dir" "$out_data_dir"

  if command -v parallel > /dev/null 2>&1; then
    echo "GNU Parallel is installed. Version information:"
    parallel --version
    parallel_installed=true
  fi

  # Collect the language-pair directories (glob results are already sorted).
  local -a pair_dirs=()
  shopt -s nullglob
  pair_dirs=("$train_data_dir"/*/)
  shopt -u nullglob
  ((${#pair_dirs[@]} > 0)) || die "no language-pair directories in $train_data_dir"

  local pair_dir pair src_lang tgt_lang norm_dir
  local src_transliterate tgt_transliterate
  for pair_dir in "${pair_dirs[@]}"; do
    pair=${pair_dir%/}
    pair=${pair##*/}
    if [[ "$pair" != *-* ]]; then
      printf 'Skipping %s: not a <src>-<tgt> directory\n' "$pair" >&2
      continue
    fi
    # First two dash-separated fields are the source and target languages.
    IFS=- read -r src_lang tgt_lang _ <<< "$pair"
    echo "$src_lang - $tgt_lang"

    norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
    mkdir -p "$norm_dir"

    src_transliterate=$(transliterate_flag "$src_lang")
    tgt_transliterate=$(transliterate_flag "$tgt_lang")

    preprocess_split train "$train_data_dir/$src_lang-$tgt_lang" "$norm_dir" \
      "$src_lang" "$tgt_lang" "$src_transliterate" "$tgt_transliterate"
    preprocess_split dev "$devtest_data_dir/$src_lang-$tgt_lang" "$norm_dir" \
      "$src_lang" "$tgt_lang" "$src_transliterate" "$tgt_transliterate"
  done

  local -a splits=(train dev)
  local split

  # Merge the per-pair files of each split into <exp_dir>/data.
  for split in "${splits[@]}"; do
    python3 scripts/concat_joint_data.py "$exp_dir/norm" "$exp_dir/data" "$split"
  done

  mkdir -p "$exp_dir/bpe"
  for split in "${splits[@]}"; do
    echo "Applying sentence piece for $split"
    bash apply_sentence_piece.sh "$exp_dir" "$exp_dir/data" "$exp_dir/bpe" \
      SRC TGT "$split" "$parallel_installed"
  done

  mkdir -p "$exp_dir/final"
  echo "Adding language tags"
  for split in "${splits[@]}"; do
    python3 scripts/add_joint_tags_translate.py "$exp_dir" "$split"
  done

  echo "Binarizing data"

  # One fairseq worker per CPU reported by Python.
  local num_workers
  num_workers=$(python3 -c "import multiprocessing; print(multiprocessing.cpu_count())")

  local data_dir=$exp_dir/final

  # Start from a clean destination; ${var:?} refuses to expand to an empty path.
  rm -rf -- "${out_data_dir:?}"

  fairseq-preprocess \
    --source-lang SRC --target-lang TGT \
    --trainpref "$data_dir/train" \
    --validpref "$data_dir/dev" \
    --destdir "$out_data_dir" \
    --workers "$num_workers" \
    --thresholdtgt 5
}

main "$@"