| import random |
| import json |
| from pathlib import Path |
| from argparse import ArgumentParser |
|
|
|
|
# System prompt placed in the "system" message of every converted sample.
# The previous literal was mojibake (UTF-8 bytes of this Japanese sentence
# decoded with a Greek single-byte codec); it is restored to the intended text.
# English gloss: "Below is an instruction that describes a task. Write a
# response that appropriately satisfies the request."
NO_INPUT_PROMPT: str = "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。"
|
|
|
|
def _to_chat_sample(raw_sample: dict) -> dict:
    """Convert one raw record into the chat-style record written to JSONL.

    The record must provide the keys "ID", "text" and "output"; a missing key
    raises KeyError.
    """
    return {
        "ID": raw_sample["ID"],
        "messages": [
            {"role": "system", "content": NO_INPUT_PROMPT},
            {"role": "user", "content": raw_sample["text"]},
            {"role": "assistant", "content": raw_sample["output"]},
        ],
    }


def _load_chat_samples(json_path: Path) -> list[dict]:
    """Read a JSON file holding a sequence of records and convert each one."""
    with json_path.open(mode="r", encoding="utf-8") as f:
        return [_to_chat_sample(raw_sample) for raw_sample in json.load(f)]


def _write_shuffled_jsonl(samples: list[dict], output_path: Path, seed: int = 42) -> None:
    """Shuffle *samples* in place with a fixed seed and write one JSON object per line."""
    # A dedicated generator seeded with `seed` yields the same permutation as
    # `random.seed(seed); random.shuffle(...)` without touching the global RNG.
    random.Random(seed).shuffle(samples)
    # Create the destination directory on demand so a fresh --output-dir works.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        f.writelines(json.dumps(sample, ensure_ascii=False) + "\n" for sample in samples)


def main():
    """Convert the ichikara and AnswerCarefully JSON files into shuffled chat JSONL.

    Command-line arguments (all required):
        --ichikara-dir: directory containing the ichikara-instruction JSON files.
        --answer-carefully-dir: directory containing
            AnswerCarefullyVersion002_Dev.json.
        --output-dir: directory receiving ichikara.jsonl and
            answer_carefully.jsonl; it is created if it does not exist.

    Every input record becomes {"ID", "messages"} with a system / user /
    assistant message triple. Each dataset is shuffled with seed 42 before
    being written.

    Raises:
        FileNotFoundError: if an expected input file is missing.
        KeyError: if a record lacks "ID", "text" or "output".
    """
    parser = ArgumentParser()
    parser.add_argument("--ichikara-dir", type=str, required=True)
    parser.add_argument("--answer-carefully-dir", type=str, required=True)
    parser.add_argument("--output-dir", type=str, required=True)
    args = parser.parse_args()

    ichikara_dir = Path(args.ichikara_dir)
    answer_carefully_dir = Path(args.answer_carefully_dir)
    output_dir = Path(args.output_dir)

    # ichikara: several source files are merged into a single output file.
    ichikara_filenames: list[str] = [
        "ichikara-instruction-003-001-1",
        "ichikara-instruction-003-003-1",
    ]
    saved_ichikara_samples: list[dict] = []
    for ichikara_filename in ichikara_filenames:
        ichikara_filepath = ichikara_dir / f"{ichikara_filename}.json"
        print(ichikara_filepath)
        saved_ichikara_samples.extend(_load_chat_samples(ichikara_filepath))
    _write_shuffled_jsonl(saved_ichikara_samples, output_dir / "ichikara.jsonl")

    # AnswerCarefully: a single source file, same record layout.
    answer_carefully_filepath = answer_carefully_dir / "AnswerCarefullyVersion002_Dev.json"
    saved_answer_carefully_samples = _load_chat_samples(answer_carefully_filepath)
    _write_shuffled_jsonl(saved_answer_carefully_samples, output_dir / "answer_carefully.jsonl")
|
|
|
|
|
|
|
|
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|