rhasspy
/

vosk-models

Model card Files Files and versions

vosk-models / _script /print-vocabulary.sh

Michael Hansen

Add vocab

87f72f6 over 2 years ago

history blame contribute delete

1.53 kB

	#!/usr/bin/env bash
	set -e

	# Print out the vocabulary for all zipped models in a directory.
	# For each <name>.zip found under MODEL_DIR the vocabulary is written to
	# VOCAB_DIR/<name>.txt, taken from the model's words.txt or Gr.fst.
	# Assumes fstprint is in PATH and ngramfst.so is in LD_LIBRARY_PATH.
	#
	# Arguments:
	#   $1 - MODEL_DIR: location searched for *.zip model archives
	#   $2 - VOCAB_DIR: output directory (created if missing)
	# Exit status: 1 on missing arguments or a nonexistent MODEL_DIR.

	# Both arguments are required; usage is a diagnostic, so it goes to stderr.
	if [ -z "$1" ] || [ -z "$2" ]; then
		echo 'Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>' >&2
		exit 1
	fi

	model_dir="$1"
	vocab_dir="$2"

	# Fail early: otherwise find only prints an error and the script would
	# still finish with status 0 without producing anything.
	if [ ! -e "${model_dir}" ]; then
		echo "ERROR: ${model_dir} does not exist" >&2
		exit 1
	fi

	mkdir -p "${vocab_dir}"

	# Scratch directory used while the script runs.
	temp_dir=$(mktemp -d)

	# finish: delete the scratch directory and everything inside it.
	# Registered as the EXIT trap so it runs when the script exits.
	finish() {
		rm -rf "${temp_dir}"
	}
	trap finish EXIT

	# For every *.zip under ${model_dir}, write the sorted, de-duplicated
	# vocabulary of that model to ${vocab_dir}/<model name>.txt.
	# Paths are passed NUL-delimited and read with IFS= so that whitespace or
	# newlines in file names survive intact.
	find "${model_dir}" -name '*.zip' -type f -print0 |
		while IFS= read -r -d '' zip_file; do
			model_name="$(basename "${zip_file}" .zip)"
			vocab_file="${vocab_dir}/${model_name}.txt"

			# A non-empty vocabulary file means this model was already handled.
			if [ -s "${vocab_file}" ]; then
				echo "Skipping ${model_name} (${vocab_file})"
				continue
			fi

			# Per-model scratch directory. It has its own name so the script's
			# model_dir argument is not overwritten; :? aborts if temp_dir is
			# unset, so the rm -rf below can never target a path under /.
			extract_dir="${temp_dir:?}/${model_name}"
			mkdir -p "${extract_dir}"

			# Try the known archive layouts in order and stop at the first
			# member that unzip manages to extract (-j drops the directories).
			# A model with none of them is reported in the else branch below.
			for member in graph/Gr.fst Gr.fst words.txt graph/words.txt; do
				if unzip -j "${zip_file}" "${model_name}/${member}" -d "${extract_dir}"; then
					break
				fi
			done

			if [ -f "${extract_dir}/words.txt" ]; then
				# First space-separated field of each words.txt line.
				cut -d' ' -f1 < "${extract_dir}/words.txt" | sort -u > "${vocab_file}"
			elif [ -f "${extract_dir}/Gr.fst" ]; then
				# Third tab-separated field of fstprint's text output.
				fstprint "${extract_dir}/Gr.fst" | cut -f3 | sort -u > "${vocab_file}"
			else
				echo "ERROR: can't get vocabulary for ${model_name}" >&2
			fi

			# Release the extracted files now instead of letting every model
			# accumulate in the scratch directory until the script exits.
			rm -rf "${extract_dir}"
		done