| """Translate a password dictionary using a finetuned model.""" |
|
|
| from unsloth import FastLanguageModel |
| import torch |
| import argparse |
| from tqdm import tqdm |
|
|
| max_seq_length = 2048 |
| dtype = torch.float16 |
| load_in_4bit = True |
|
|
| model, tokenizer = FastLanguageModel.from_pretrained( |
| model_name = "unsloth/llama-3-8b-bnb-4bit", |
| max_seq_length = max_seq_length, |
| dtype = dtype, |
| load_in_4bit = load_in_4bit, |
| ) |
|
|
| model = FastLanguageModel.get_peft_model( |
| model, |
| r = 16, |
| target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", |
| "gate_proj", "up_proj", "down_proj",], |
| lora_alpha = 16, |
| lora_dropout = 0, |
| bias = "none", |
| |
| use_gradient_checkpointing = "unsloth", |
| random_state = 3407, |
| use_rslora = False, |
| loftq_config = None, |
| ) |
|
|
| import re |
| def extract_response(text): |
| |
| |
| match = re.search(r"### Response:\n(.*?)$", text, re.DOTALL) |
| if match: |
| response = match.group(1) |
| response = response.replace("<|end_of_text|>", "") |
| if response[-1] != "\n": |
| response = response + "\n" |
| return response |
| else: |
| raise "No response found in the text." |
|
|
| from unsloth import FastLanguageModel |
| model, tokenizer = FastLanguageModel.from_pretrained( |
| model_name = "lora_model", |
| max_seq_length = max_seq_length, |
| dtype = dtype, |
| load_in_4bit = load_in_4bit, |
| ) |
| FastLanguageModel.for_inference(model) |
| tokenizer.padding_side = "left" |
| alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. |
| |
| ### Instruction: |
| {} |
| |
| ### Input: |
| {} |
| |
| ### Response: |
| {}""" |
|
|
| def process_batch(batch): |
| inputs = [] |
| chunk_size = 10 |
| for i in range(0, len(batch), chunk_size): |
| chunk = ''.join(batch[i:i+chunk_size]) |
| inputs.append(alpaca_prompt.format( |
| "Translate this passwords while keeping the original format.", |
| chunk, |
| "", |
| )) |
| |
| input_tokens = tokenizer(inputs, return_tensors = "pt", padding=True).to("cuda") |
| outputs = model.generate(**input_tokens, max_new_tokens = 64, use_cache = True) |
| return [extract_response(response) for response in tokenizer.batch_decode(outputs)] |
|
|
| BATCH_SIZE = 1000 |
|
|
| def process_file(infile, outfile): |
| try: |
| with open(infile, 'r', encoding='latin1') as file: |
| lines = file.readlines() |
|
|
| translated_lines = [] |
| |
| |
| for i in tqdm(range(0, len(lines), BATCH_SIZE)): |
| translated_batch = process_batch(lines[i:i+BATCH_SIZE]) |
| translated_lines.extend(translated_batch) |
|
|
| |
| with open(outfile, 'w', encoding='latin1') as file: |
| file.writelines(translated_lines) |
|
|
| except FileNotFoundError: |
| print("The input file was not found.") |
|
|
| def main(): |
| parser = argparse.ArgumentParser(description="Translate text file content to German.") |
| parser.add_argument("-i", "--input_file", required=True, help="Path to the input text file") |
| parser.add_argument("-o", "--output_file", required=True, help="Path to the output text file where translated text will be saved") |
|
|
| args = parser.parse_args() |
|
|
| process_file(args.input_file, args.output_file) |
|
|
| if __name__ == "__main__": |
| main() |
|
|