| """Code to augment the translated/untranslated passwords and create a dataset for the password translation task.""" |
|
|
| import pandas as pd |
| import random |
|
|
# Number of instruction rows (one sampled batch of password pairs each) to generate.
N_SAMPLES = 10_000
|
|
def mutate_password_pair(pair):
    """Randomly apply the same surface mutations to both passwords of a pair.

    Four mutations are attempted in a fixed order, each gated by its own
    independent ``random.random() < 0.2`` draw:

    1. ``str.capitalize`` on both strings (first character upper-cased, the
       remaining characters lower-cased).
    2. Append one shared random digit (0-9) to both strings.
    3. Append one shared random symbol to both strings.
    4. Leetspeak: the first letter of the priority table below that occurs in
       ``pair[0]`` has its first occurrence replaced by a digit in BOTH
       strings. The letter is looked up in ``pair[0]`` only, so ``pair[1]``
       is left as it is when it does not contain that letter.

    Args:
        pair: Two-item sequence ``(source_password, target_password)``.

    Returns:
        The (possibly mutated) pair; the input object itself is returned when
        no mutation fires.
    """
    mutation_chance = 0.2
    symbols = ('!', '@', '#', '$', '%', '&', '*')
    # Ordered by priority: the first letter present in pair[0] wins.
    leet_table = (
        ("e", "3"), ("E", "3"),
        ("i", "1"), ("I", "1"),
        ("o", "0"), ("O", "0"),
        ("a", "4"), ("A", "4"),
        ("t", "7"), ("T", "7"),
    )

    if random.random() < mutation_chance:
        pair = (pair[0].capitalize(), pair[1].capitalize())

    if random.random() < mutation_chance:
        digit_suffix = str(random.randint(0, 9))
        pair = (pair[0] + digit_suffix, pair[1] + digit_suffix)

    if random.random() < mutation_chance:
        symbol_suffix = random.choice(symbols)
        pair = (pair[0] + symbol_suffix, pair[1] + symbol_suffix)

    # Leetspeak step: bail out early when the draw fails or nothing matches.
    if random.random() >= mutation_chance:
        return pair

    substitution = next(
        ((letter, digit) for letter, digit in leet_table if letter in pair[0]),
        None,
    )
    if substitution is None:
        return pair

    letter, digit = substitution
    return (pair[0].replace(letter, digit, 1), pair[1].replace(letter, digit, 1))
|
|
def _read_stripped_lines(path, encoding):
    """Return every line of the text file at *path*, stripped of surrounding whitespace."""
    with open(path, 'r', encoding=encoding) as file:
        return [line.strip() for line in file]


def create_dataframes():
    """Build the instruction dataset for the password translation task.

    Reads ``original_train.txt`` / ``translated_train.txt`` (paired line by
    line) and ``untranslated.txt`` from the current working directory. For
    each of the ``N_SAMPLES`` rows it draws 8 translated pairs and 2
    untranslated passwords (which map to themselves), shuffles the 10 pairs,
    runs each through ``mutate_password_pair`` and joins the inputs and the
    outputs with newlines.

    Returns:
        pandas.DataFrame with columns ``instruction``, ``input``, ``output``
        and one row per generated sample.

    Raises:
        FileNotFoundError: if one of the three input files is missing.
        ValueError: raised by pandas / ``random.sample`` when the two training
            files differ in line count or there are fewer than 8 translated
            pairs or 2 untranslated passwords to sample from.
    """
    original = _read_stripped_lines('original_train.txt', 'latin1')
    translated = _read_stripped_lines('translated_train.txt', 'utf-8')
    untranslated_list = _read_stripped_lines('untranslated.txt', 'latin1')

    # Line i of the original file is paired with line i of the translated file.
    df_translated = pd.DataFrame({
        'original': original,
        'translated': translated,
    })

    # Rows are accumulated in a plain list and turned into a DataFrame once at
    # the end: appending to a DataFrame inside the loop copies the whole frame
    # on every iteration and relied on the private ``DataFrame._append``.
    rows = []
    for _ in range(N_SAMPLES):
        sampled_translated = df_translated.sample(8)
        original_samples = sampled_translated['original'].tolist()
        translated_samples = sampled_translated['translated'].tolist()

        untranslated_samples = random.sample(untranslated_list, 2)

        # Untranslated passwords appear unchanged on both sides.
        total_input = original_samples + untranslated_samples
        total_output = translated_samples + untranslated_samples

        # Shuffle as pairs so each input stays aligned with its output.
        combined_list = list(zip(total_input, total_output))
        random.shuffle(combined_list)
        combined_list = [mutate_password_pair(pair) for pair in combined_list]
        shuffled_input, shuffled_output = zip(*combined_list)

        rows.append({
            # Kept verbatim: this text is part of the emitted dataset.
            'instruction': 'Translate this passwords while keeping the original format.',
            'input': "\n".join(shuffled_input),
            'output': "\n".join(shuffled_output),
        })

    return pd.DataFrame(rows, columns=['instruction', 'input', 'output'])
|
|
| |
# Generate the dataset when the module runs and keep it in a module-level name.
df_instructions = create_dataframes()

# Print the first five rows as a quick sanity check.
print(df_instructions.head(n=5))

# Persist the dataset; the DataFrame index is not written to the file.
df_instructions.to_csv(path_or_buf='password_translation_instructions.csv', index=False)
|
|