Model Card: Qwen2.5-1.5B fine-tuned for CSV question answering with pandas
This model is released under the same license as the Qwen2.5-1.5B base model. It was fine-tuned on a pandas-code dataset for the task of natural-language question answering over CSV files.
Base model: Qwen/Qwen2.5-1.5B
How to use this model for the CSV question answering task:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, StoppingCriteria
import torch
import re
import traceback
import pandas as pd
# Load the fine-tuned tokenizer and model from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("chelvan/Qwen2_5_Pandas_SFT_Finetune_CSV_QA")
model = AutoModelForCausalLM.from_pretrained(
    "chelvan/Qwen2_5_Pandas_SFT_Finetune_CSV_QA",
    torch_dtype=torch.bfloat16,  # half-precision weights to reduce GPU memory
    device_map="cuda",
)
# Disable the KV cache on the model config.
model.config.use_cache = False
class CustomStopTokenSequenceCriterion(StoppingCriteria):
    """Stop generation once the output ends with a given token-id sequence.

    Args:
        stop_token_ids: list of token ids (as produced by ``tokenizer.encode``)
            that, when they appear at the tail of the generated sequence,
            signal decoding should stop.
    """

    def __init__(self, stop_token_ids):
        super().__init__()
        self.stop_token_ids = stop_token_ids

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        sequence = input_ids[0]
        tail_len = len(self.stop_token_ids)
        # Too few generated tokens to contain the stop sequence yet.
        if len(sequence) < tail_len:
            return False
        # Compare the tail of the generation against the stop ids,
        # building the reference tensor on the same device.
        expected = torch.tensor(self.stop_token_ids, device=input_ids.device)
        return torch.equal(sequence[-tail_len:], expected)
# The model was trained to terminate its answers with this marker.
stop_token = "### End"
stop_token_id = tokenizer.encode(stop_token, add_special_tokens=False)
print(stop_token_id)
stop_criteria = CustomStopTokenSequenceCriterion(stop_token_ids=stop_token_id)
# max_length bounds prompt + completion tokens combined.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1024)
def generate_prompt(task, header_columns):
    """Build the instruction-style prompt the fine-tuned model expects.

    Args:
        task: natural-language question about the CSV data.
        header_columns: string showing the column names with sample values.

    Returns:
        The formatted prompt, ending with the ``### Response:`` marker
        after which the model is expected to write code.
    """
    return (
        "Below is an instruction that describes a task. Write a Python function "
        "using Pandas to accomplish the task described below.\n"
        "### Instruction:\n"
        f"{task}\n"
        "header columns with sample data:\n"
        f"{header_columns}\n"
        "### Response:\n"
    )
def format_dataframe(df):
    """Render the first five rows of *df* as a Python dict-literal string.

    Produces ``data = {\\n 'col': [v1, v2, ...],\\n ...}`` — the sample-data
    snippet that gets embedded into the model prompt.
    """
    preview = df.head(5).to_dict(orient='list')
    entries = "".join(
        f" '{column}': {samples},\n" for column, samples in preview.items()
    )
    return "data = {\n" + entries + "}"
def generate_code(prompt):
    """Generate pandas code for *prompt* and extract it from the model output.

    Returns the contents of the first ``` fenced code block if one is
    present, otherwise the raw completion truncated at the "### End" marker.
    """
    result = pipe(prompt, stopping_criteria=[stop_criteria])
    generated_text = result[0]['generated_text']
    # Drop the stop marker and anything the model emitted after it.
    res = generated_text.split("### End")[0]
    # Accept both ``` and ```python fences so the language tag is not
    # mistaken for code.
    match = re.search(r"```(?:python)?(.*?)```", res, re.DOTALL)
    if match:
        return match.group(1).strip()
    # Fall back to the truncated text (not the full generation) so the
    # "### End" marker never leaks into code that will be exec'd.
    return res
def filter_dataframe(df, query):
    """Answer *query* about *df* by generating and executing pandas code.

    Args:
        df: the DataFrame to query.
        query: natural-language question about the data.

    Returns:
        ``(code, result)`` on success, or ``(code + traceback, "Error")``
        when generation or execution fails.
    """
    python_code = ""
    try:
        header_column = format_dataframe(df)
        prompt = generate_prompt(query, header_column)
        python_code = generate_code(prompt)
        # SECURITY: exec() runs arbitrary model-generated code with full
        # interpreter privileges — only use with trusted models, or sandbox.
        # Execute into a fresh copy of the module globals so the generated
        # code can reference imports (pd, etc.) without polluting them, and
        # so a stale `process` from a previous call can never be reused.
        namespace = dict(globals())
        exec(python_code, namespace)
        process = namespace.get("process")
        if not callable(process):
            raise ValueError("generated code did not define a process(df) function")
        results = process(df)
        return python_code, results
    except Exception as e:
        full_traceback = traceback.format_exc()
        error_message = f"An error occurred: {e}\n{full_traceback}"
        final_msg = f"{python_code}\n======Error======\n{error_message}"
        return final_msg, "Error"
if __name__ == '__main__':
    # Demo: answer a single question against a local CSV file.
    df = pd.read_csv("sales_orders.csv")
    code, res = filter_dataframe(df, 'How many unique customers are there')
    print(code)
    print(res)
- Downloads last month
- 2