#!/usr/bin/env bash
#
# print-vocabulary: write one vocabulary file per zipped model.
#
# Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>
#
# Every *.zip file found (recursively) under MODEL_DIR is expected to hold a
# top-level directory named after the archive (foo.zip -> foo/). From that
# directory the script extracts the first of these members that exists:
#
#   <name>/graph/Gr.fst, <name>/Gr.fst, <name>/words.txt, <name>/graph/words.txt
#
# and writes the sorted, de-duplicated vocabulary to VOCAB_DIR/<name>.txt:
#   * words.txt -> first space-separated field of every line
#   * Gr.fst    -> third tab-separated column of `fstprint` output
#
# Models whose vocabulary file already exists and is non-empty are skipped.
# A model whose vocabulary cannot be produced is reported on stderr and the
# run continues with the next archive (best effort).
#
# Requires: unzip, and fstprint on PATH for models that only ship Gr.fst.

set -euo pipefail

# Scratch directory; created in main() and removed by the EXIT trap.
temp_dir=''

# Print the usage line to stderr and abort with status 1.
usage() {
  echo 'Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>' >&2
  exit 1
}

# EXIT trap: remove the scratch directory (if it was created).
finish() {
  if [[ -n "${temp_dir}" ]]; then
    rm -rf -- "${temp_dir}"
  fi
}

#######################################
# Extract the first available vocabulary source from a model archive.
# Arguments:
#   $1 - path to the model .zip
#   $2 - model name (archive base name without .zip)
#   $3 - existing directory to extract into
# Returns:
#   0 as soon as one candidate member was extracted, 1 if none could be.
#######################################
extract_vocab_source() {
  local zip_file="$1"
  local model_name="$2"
  local dest_dir="$3"
  local member

  # Candidates are tried in this order; only the first hit is extracted.
  for member in \
    "${model_name}/graph/Gr.fst" \
    "${model_name}/Gr.fst" \
    "${model_name}/words.txt" \
    "${model_name}/graph/words.txt"; do
    # -j drops the archive's directory components so the file lands
    # directly in dest_dir. unzip fails when the member is absent.
    if unzip -j "${zip_file}" "${member}" -d "${dest_dir}"; then
      return 0
    fi
  done
  return 1
}

#######################################
# Produce a sorted, unique vocabulary from extracted model files.
# Arguments:
#   $1 - directory holding words.txt and/or Gr.fst
#   $2 - file to write the vocabulary to
# Returns:
#   0 on success; non-zero if no source file exists or a pipeline stage
#   (e.g. fstprint) fails.
#######################################
write_vocabulary() {
  local source_dir="$1"
  local out_file="$2"

  if [[ -f "${source_dir}/words.txt" ]]; then
    cut -d' ' -f1 < "${source_dir}/words.txt" | sort -u > "${out_file}"
  elif [[ -f "${source_dir}/Gr.fst" ]]; then
    fstprint "${source_dir}/Gr.fst" | cut -f3 | sort -u > "${out_file}"
  else
    return 1
  fi
}

main() {
  if [[ -z "${1:-}" || -z "${2:-}" ]]; then
    usage
  fi

  local model_dir="$1"
  local vocab_dir="$2"

  mkdir -p -- "${vocab_dir}"

  temp_dir="$(mktemp -d)"
  trap finish EXIT

  # NUL-delimited so archive paths containing whitespace or newlines survive.
  # The loop runs in a pipeline subshell; nothing it sets is needed afterwards.
  # With pipefail, a failing find makes the script exit non-zero.
  find "${model_dir}" -name '*.zip' -type f -print0 |
    while IFS= read -r -d '' zip_file; do
      local model_name vocab_file extract_dir staged_file

      model_name="$(basename -- "${zip_file}" .zip)"
      vocab_file="${vocab_dir}/${model_name}.txt"

      if [[ -s "${vocab_file}" ]]; then
        echo "Skipping ${model_name} (${vocab_file})"
        continue
      fi

      extract_dir="${temp_dir}/${model_name}"
      mkdir -p -- "${extract_dir}"

      # Deliberately best effort: a model with none of the candidate members
      # is reported by the check below instead of aborting the whole run.
      extract_vocab_source "${zip_file}" "${model_name}" "${extract_dir}" || true

      # Stage the result and move it into place only on success, so a failed
      # run never leaves a partial vocabulary that later runs would skip.
      staged_file="${extract_dir}/.vocabulary.tmp"
      if write_vocabulary "${extract_dir}" "${staged_file}"; then
        mv -- "${staged_file}" "${vocab_file}"
      else
        echo "ERROR: can't get vocabulary for ${model_name}" >&2
      fi

      # Extracted files are not needed any more; free the space right away.
      rm -rf -- "${extract_dir}"
    done
}

main "$@"