| """ |
| Test script to verify model downloading and loading from sema-utils repository |
| """ |
|
|
| import os |
| import sys |
| from huggingface_hub import hf_hub_download, snapshot_download |
| import ctranslate2 |
| import sentencepiece as spm |
| import fasttext |
|
|
def test_model_download():
    """Download the three model artifacts from the sematech/sema-utils repo.

    Returns:
        On success, a tuple ``(smp_path, ft_path, ct_model_full_path)`` of
        local paths; on any failure, ``False``.  Callers rely on the tuple
        being truthy and ``False`` being falsy.
    """

    REPO_ID = "sematech/sema-utils"
    MODELS_DIR = "test_models"

    print("π§ͺ Testing model download from sematech/sema-utils...")

    # Local cache directory for everything we download.
    os.makedirs(MODELS_DIR, exist_ok=True)

    try:
        # 1) SentencePiece tokenizer model (single file).
        print("\n1οΈβ£ Testing SentencePiece model download...")
        smp_path = hf_hub_download(
            repo_id=REPO_ID,
            filename="spm.model",
            local_dir=MODELS_DIR
        )
        print(f"β SentencePiece model downloaded to: {smp_path}")

        # 2) fastText language-identification model (single file).
        print("\n2οΈβ£ Testing language detection model download...")
        ft_path = hf_hub_download(
            repo_id=REPO_ID,
            filename="lid218e.bin",
            local_dir=MODELS_DIR
        )
        print(f"β Language detection model downloaded to: {ft_path}")

        # 3) CTranslate2 translation model: a whole directory, so use
        #    snapshot_download with an allow_patterns filter.
        print("\n3οΈβ£ Testing translation model download...")
        ct_model_path = snapshot_download(
            repo_id=REPO_ID,
            allow_patterns="translation_models/sematrans-3.3B/*",
            local_dir=MODELS_DIR
        )
        print(f"β Translation model downloaded to: {ct_model_path}")

        # snapshot_download returns the snapshot root; the model itself
        # lives in this subdirectory.
        ct_model_full_path = os.path.join(MODELS_DIR, "translation_models", "sematrans-3.3B")
        print(f"\nπ Translation model directory: {ct_model_full_path}")

        if os.path.exists(ct_model_full_path):
            files = os.listdir(ct_model_full_path)
            print(f"π Files in translation model directory: {files}")
        else:
            print("β Translation model directory not found!")
            return False

        return smp_path, ft_path, ct_model_full_path

    except Exception as e:
        # Broad catch is deliberate in a smoke-test script: any failure
        # (network, auth, disk) is reported and turned into a falsy result.
        print(f"β Error during download: {e}")
        return False
|
|
def test_model_loading(smp_path, ft_path, ct_model_path):
    """Load the three downloaded models from disk and smoke-test them.

    Args:
        smp_path: Path to the SentencePiece ``spm.model`` file.
        ft_path: Path to the fastText ``lid218e.bin`` language-ID model.
        ct_model_path: Path to the CTranslate2 model directory.

    Returns:
        On success, a tuple ``(lang_model, sp_model, translator)``;
        on any failure, ``False`` (callers rely on truthiness).
    """

    print("\nπ Testing model loading...")

    try:
        # Silence fastText's load-time warning output.
        fasttext.FastText.eprint = lambda x: None

        # 1) fastText language-identification model.
        print("\n1οΈβ£ Testing language detection model loading...")
        lang_model = fasttext.load_model(ft_path)
        print("β Language detection model loaded successfully")

        # Smoke-test: detect the language of a short Swahili phrase.
        # predict() returns (labels, probs); labels carry a '__label__' prefix.
        test_text = "Habari ya asubuhi"
        predictions = lang_model.predict(test_text, k=1)
        detected_lang = predictions[0][0].replace('__label__', '')
        print(f"π Detected language for '{test_text}': {detected_lang}")

        # 2) SentencePiece tokenizer.
        print("\n2οΈβ£ Testing SentencePiece model loading...")
        sp_model = spm.SentencePieceProcessor()
        sp_model.load(smp_path)
        print("β SentencePiece model loaded successfully")

        # Smoke-test: tokenize the same phrase into subword pieces.
        tokens = sp_model.encode(test_text, out_type=str)
        print(f"π€ Tokenized '{test_text}': {tokens}")

        # 3) CTranslate2 translator; CPU keeps the test portable.
        print("\n3οΈβ£ Testing translation model loading...")
        translator = ctranslate2.Translator(ct_model_path, device="cpu")
        print("β Translation model loaded successfully")

        return lang_model, sp_model, translator

    except Exception as e:
        # Any load failure is reported and turned into a falsy result.
        print(f"β Error during model loading: {e}")
        return False
|
|
def test_translation(lang_model, sp_model, translator):
    """Run the full detect -> tokenize -> translate -> detokenize pipeline.

    Args:
        lang_model: Loaded fastText language-ID model.
        sp_model: Loaded SentencePiece processor.
        translator: Loaded CTranslate2 translator.

    Returns:
        True on success, False on any failure.
    """

    print("\nπ Testing complete translation pipeline...")

    test_text = "Habari ya asubuhi, ulimwengu"
    target_lang = "eng_Latn"  # NLLB-style language code

    try:
        # Detect the source language; newlines are flattened because
        # fastText predict() rejects multi-line input.
        predictions = lang_model.predict(test_text.replace('\n', ' '), k=1)
        source_lang = predictions[0][0].replace('__label__', '')
        print(f"π Detected source language: {source_lang}")

        # Tokenize, then wrap each sentence as: <src_lang> tokens... </s>
        # (the NLLB/CTranslate2 input convention).
        source_sents = [test_text.strip()]
        source_sents_subworded = sp_model.encode(source_sents, out_type=str)
        source_sents_subworded = [[source_lang] + sent + ["</s>"] for sent in source_sents_subworded]
        print(f"π€ Tokenized input: {source_sents_subworded[0][:10]}...")

        # Force decoding to start with the target-language token.
        target_prefix = [[target_lang]]
        translations = translator.translate_batch(
            source_sents_subworded,
            batch_type="tokens",
            max_batch_size=2048,
            beam_size=1,
            target_prefix=target_prefix,
        )

        # Detokenize, then strip the leading target-language token that
        # was forced in via target_prefix.
        translations = [translation[0]['tokens'] for translation in translations]
        translations_desubword = sp_model.decode(translations)
        translated_text = translations_desubword[0][len(target_lang):]

        print(f"\nπ Translation successful!")
        print(f"π Original: {test_text}")
        print(f"π Source language: {source_lang}")
        print(f"π― Target language: {target_lang}")
        print(f"β¨ Translation: {translated_text}")

        return True

    except Exception as e:
        # Any pipeline failure is reported and turned into False.
        print(f"β Error during translation: {e}")
        return False
|
|
def cleanup_test_files():
    """Delete the temporary test_models directory, if present."""
    import shutil

    target_dir = "test_models"
    # Nothing to do when the directory was never created.
    if not os.path.exists(target_dir):
        return
    print(f"\nπ§Ή Cleaning up test directory: {target_dir}")
    shutil.rmtree(target_dir)
    print("β Cleanup complete")
|
|
if __name__ == "__main__":
    print("π Starting Sema Utils Model Test\n")

    # Step 1: download all artifacts; a falsy result signals failure.
    download_result = test_model_download()
    if not download_result:
        print("β Model download test failed!")
        sys.exit(1)

    smp_path, ft_path, ct_model_path = download_result

    # Step 2: load the downloaded models from disk.
    loading_result = test_model_loading(smp_path, ft_path, ct_model_path)
    if not loading_result:
        print("β Model loading test failed!")
        sys.exit(1)

    lang_model, sp_model, translator = loading_result

    # Step 3: run an end-to-end translation through all three models.
    translation_result = test_translation(lang_model, sp_model, translator)
    if not translation_result:
        print("β Translation test failed!")
        sys.exit(1)

    print("\nπ All tests passed! Your sema-utils repository is working correctly.")

    # Optionally remove the downloaded artifacts (interactive prompt).
    response = input("\nπ§Ή Do you want to clean up test files? (y/n): ")
    if response.lower() in ['y', 'yes']:
        cleanup_test_files()

    print("\nβ Test complete!")
|
|