| import cv2 |
| import numpy as np |
| import IPython |
| import os |
|
|
| import openai |
| import pandas as pd |
| import json |
| import subprocess |
|
|
|
|
| |
def format_prompt(task_name):
    """Build the fine-tuning prompt text for `task_name`.

    Reads the instruction template from misc/finetune_instructions_prompt.txt
    and substitutes the task name for every TASK_NAME_TEMPLATE placeholder.

    Args:
        task_name: task identifier inserted into the template.

    Returns:
        The formatted prompt string, terminated by the "\n\n###\n\n"
        separator expected by the OpenAI fine-tuning data format.
    """
    # Context manager closes the handle promptly (the original leaked it
    # to the garbage collector).
    with open('misc/finetune_instructions_prompt.txt') as f:
        instruction_text = f.read()
    instruction_text = instruction_text.replace("TASK_NAME_TEMPLATE", task_name)
    prompt_text = "\n Instructions: " + instruction_text + "\n\n###\n\n"
    return prompt_text
|
|
def format_completion(task_name, descriptions, code):
    """Return the fine-tuning completion text for a single task.

    The completion pairs the task description with its Python
    implementation and ends with the end-of-text marker used by the
    OpenAI fine-tuning format.
    """
    description_part = f" \nDescriptions: \n ```{task_name}: {descriptions} \n\n###\n\n"
    implementation_part = "Implementation: \n ```python\n" + code + "<|endoftext|>"
    return description_part + implementation_part
|
|
| |
| |
data_path = 'prompts/data'


def load_offline_memory():
    """Get the current task descriptions, assets, and code.

    Loads the base task/asset/code JSON files from `data_path`, then
    merges in the generated task descriptions and appends any generated
    code filenames not already present.

    Returns:
        tuple: (tasks dict, assets dict, list of task code filenames).
    """
    def _load_json(name):
        # Context manager closes each handle promptly (the original left
        # six open handles to the garbage collector).
        with open(os.path.join(data_path, name)) as f:
            return json.load(f)

    base_tasks = _load_json("base_tasks.json")
    base_assets = _load_json("base_assets.json")
    base_task_codes = _load_json("base_task_codes.json")

    # Merge generated task descriptions into the base set.
    base_tasks.update(_load_json("generated_tasks.json"))

    # NOTE(review): the original computed a path for generated_assets.json
    # but never merged it into base_assets; that behavior is preserved
    # here — confirm whether the merge was intentionally omitted.

    # Append generated code filenames, skipping duplicates.
    for task_file in _load_json("generated_task_codes.json"):
        if task_file not in base_task_codes:
            base_task_codes.append(task_file)

    return base_tasks, base_assets, base_task_codes
|
|
|
|
# ---------------------------------------------------------------------------
# Build the fine-tuning dataset: one (prompt, completion) row per task,
# then estimate cost and run the OpenAI data-preparation tool.
# ---------------------------------------------------------------------------
code_buffer = {}
base_tasks, base_assets, base_task_codes = load_offline_memory()
TOTAL_DATASET_TOKENS = 0

added_tasks = []
df = pd.DataFrame()
for task_file in base_task_codes:
    # "stack_blocks.py" -> "stack-blocks"
    task_name = task_file[:-3].replace("_", "-")
    if task_name in added_tasks:
        continue

    if task_name not in base_tasks:
        print(f"{task_name} missing")
        continue

    added_tasks.append(task_name)
    task_description = base_tasks[task_name]

    # Look for the implementation among the base tasks first, then among
    # the generated tasks.
    if os.path.exists("cliport/tasks/" + task_file):
        with open("cliport/tasks/" + task_file) as f:
            task_code = f.read()
    elif os.path.exists("cliport/generated_tasks/" + task_file):
        with open("cliport/generated_tasks/" + task_file) as f:
            task_code = f.read()
    else:
        # Bug fix: previously a missing code file silently reused the
        # previous iteration's task_code (or raised NameError on the
        # first iteration), pairing a task with the wrong implementation.
        print(f"{task_name} code file not found, skipping")
        continue

    prompt = format_prompt(task_name)
    completion = format_completion(task_name, task_description, task_code)

    # Rough token estimate: ~4 characters per token.
    TOTAL_DATASET_TOKENS += len(prompt) / 4
    TOTAL_DATASET_TOKENS += len(completion) / 4
    new_row = pd.DataFrame([{'prompt': prompt,
                             'completion': completion}])
    df = pd.concat([df, new_row], axis=0, ignore_index=True)

df.to_csv("misc/finetune_data.csv", index=False)
print("======================================")
print("estimate number of tokens:", TOTAL_DATASET_TOKENS)
print("estimate price for davinci:", TOTAL_DATASET_TOKENS / 1000 * 0.03)
print("total number of instructions:", len(df))
print("======================================")

# Convert the CSV into the JSONL format expected by the fine-tune API.
subprocess.run('openai tools fine_tunes.prepare_data --file misc/finetune_data.csv --quiet'.split())

print("now you can run \n openai api fine_tunes.create --training_file output/finetune_data_prepared.jsonl --model davinci --suffix 'GenSim'")
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
|
|
| |
| |
| |
| |