Not able to use the model

#12
by Rajdgt - opened

I am trying to use the model ai4bharat/indictrans2-en-indic-dist-200M, but it is not recognising the language.
Can someone help me use it for simple English text-to-text translation?

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hugging Face model id of the distilled 200M en→indic IndicTrans2 checkpoint.
MODEL_NAME = "ai4bharat/indictrans2-en-indic-dist-200M"
# Local directory used to cache the downloaded model files.
CACHE_DIR = "./model_cache"

--- The Test Case ---

# FIX: IndicTrans2 does not understand tags like "2en"/"2hi" — that is why the
# model reports an unrecognised language. It expects FLORES-style,
# script-qualified language codes such as "eng_Latn" and "hin_Deva".
source_text = "This is a test from the command line"
source_lang = "eng_Latn"
target_lang = "hin_Deva"

--- THE CORRECT AND FINAL INPUT FORMAT ---

The model requires both source and target tags prefixed to the text.

# FIX: IndicTrans2 takes the plain FLORES tags as a "src tgt sentence" prefix —
# no angle brackets. (The officially supported flow attaches these tags via
# IndicProcessor.preprocess_batch; see the model card example.)
input_text = f"{source_lang} {target_lang} {source_text}"

# NOTE(review): `tokenizer` is never created in this snippet — it must be
# loaded with AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# before this line can run.
inputs = tokenizer(input_text, return_tensors="pt")
print("--- Tokenization successful. ---")

# NOTE: No forced_bos_token_id is needed because the target is now specified in the input.
print("\n--- Step 4: Generating translation from model ---")
# Beam search with 5 beams, returning a single hypothesis of at most 256 tokens.
# NOTE(review): `model` and `inputs` are not defined anywhere in this snippet —
# the model must be loaded and the text tokenized before this call can run.
generated_tokens = model.generate(
    **inputs,
    num_return_sequences=1,
    num_beams=5,
    max_length=256
)

It is failing and reporting that the language is invalid. I may be missing a small step, since the model may have been updated recently.

I used the NLLB model and it is working. But I am not sure why ai4bharat/indictrans2-en-indic-dist-200M is not working. Please check.

The issue is occurring because the correct language tags are not being passed, and the necessary pre-processing steps expected by the IndicTrans2 models are missing.

Please refer to the HF README for a complete example script that demonstrates how to properly set language tags and apply the required pre-processing steps for inference.

I have also faced the same language-tag issue with ai4bharat/indictrans2-en-indic-dist-200M, while the NLLB model works fine.

Please check and kindly share any reference to set the language tags properly.

import os
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor

Set device

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

Source and target languages in ISO format

SRC_LANG, TGT_LANG = "eng_Latn", "hin_Deva"

Model

MODEL_NAME = "ai4bharat/indictrans2-en-indic-1B"

Load tokenizer and model

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        # FIX: `torch_dtype` is the keyword `from_pretrained` accepts; a bare
        # `dtype=` kwarg raises a TypeError on most transformers releases.
        torch_dtype=torch.float16,
        # NOTE(review): flash_attention_2 requires the flash-attn package and a
        # supported GPU — drop this argument if it is not installed.
        attn_implementation="flash_attention_2",
    ).to(DEVICE)
except Exception as e:
    # Fail fast with a clear message when download/initialization fails.
    print(f"Error loading model: {e}")
    exit(1)

IndicProcessor for preprocessing and postprocessing

ip = IndicProcessor(inference=True)

Preprocessing function to normalize lines

def normalize_for_ai4bharat(line: str) -> str:
    """Lowercase *line*, strip markup and special characters, collapse whitespace.

    Returns "" for input that is empty after normalization.
    """
    line = line.strip().lower()                    # trim ends and lowercase
    line = re.sub(r'<.*?>', '', line)              # drop placeholders like <b>...</b>
    line = re.sub(r'[^a-z0-9\s.,;?!]', '', line)   # keep only basic ASCII text/punctuation
    line = re.sub(r'\s+', ' ', line)               # collapse runs of whitespace
    return line

Translate a single line safely

def translate_line(line: str) -> str:
    """Translate one English line to the target language.

    Returns the translated string, "" for empty/filtered input, or None when an
    unexpected error occurs (so the caller can log the line and continue).
    Relies on module-level `tokenizer`, `model`, `ip`, `SRC_LANG`, `TGT_LANG`,
    and `DEVICE`.
    """
    try:
        normalized = normalize_for_ai4bharat(line)
        if not normalized:
            return ""

        # IndicProcessor attaches the language tags and normalizes the text.
        batch = ip.preprocess_batch([normalized], src_lang=SRC_LANG, tgt_lang=TGT_LANG)
        if not batch or all(b is None for b in batch):
            return ""

        # Tokenize input
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translation (beam search, single hypothesis, <=256 tokens)
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode output
        decoded = tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        # Postprocess back into the target script
        translations = ip.postprocess_batch(decoded, lang=TGT_LANG)
        return translations[0] if translations else ""

    except Exception as e:
        print(f"Error processing line: {line}\n{e}")
        return None  # Return None for logging purposes

Process file line by line with crash-safe and skipped lines logging

def process_file(input_path: str, output_path: str, log_path: str):
    """Translate *input_path* line by line into *output_path*, crash-safely.

    Lines already present in the output file are skipped so an interrupted run
    can be resumed. Lines that fail to translate are appended to *log_path*
    instead of the output file.
    """
    if not os.path.exists(input_path):
        print(f"Input file not found: {input_path}")
        return

    # Read already translated lines to allow crash recovery
    translated_lines = []
    if os.path.exists(output_path):
        with open(output_path, 'r', encoding='utf-8') as f:
            translated_lines = [line.rstrip("\n") for line in f]

    with open(input_path, 'r', encoding='utf-8') as fin, \
         open(output_path, 'a', encoding='utf-8') as fout, \
         open(log_path, 'a', encoding='utf-8') as logf:

        for idx, line in enumerate(fin, start=1):
            # Skip lines already translated
            if idx <= len(translated_lines):
                continue

            translated = translate_line(line)
            if translated is None or translated.strip() == "":
                logf.write(f"[Line {idx}] {line.rstrip()}\n")
                print(f"[Line {idx}] Skipped / could not translate.")
            else:
                fout.write(translated + "\n")
                print(f"[Line {idx}] Translated successfully.")

    print("Processing complete. Skipped lines logged in:", log_path)

Main

# FIX: the guard must test the `__name__` dunder against "__main__";
# `if name == "main":` raises NameError (the underscores were lost in paste).
if __name__ == "__main__":
    input_file = "output_format.txt"
    output_file = "Output_hindi.txt"
    log_file = "Skipped_lines.log"
    process_file(input_file, output_file, log_file)

This also shows an error and is unable to perform the translation.

Sign up or log in to comment