Model Card for Model ID

Same license as the Qwen2.5 1.5B model. This model was specifically fine-tuned on a Pandas code dataset for the CSV natural-language question answering task.

The base model is Qwen/Qwen2.5-1.5B.

How to use this model for the CSV question answering task:

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, StoppingCriteria
import torch
import re
import traceback
import pandas as pd

# Load the fine-tuned tokenizer and model weights from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("chelvan/Qwen2_5_Pandas_SFT_Finetune_CSV_QA")

model = AutoModelForCausalLM.from_pretrained(
    "chelvan/Qwen2_5_Pandas_SFT_Finetune_CSV_QA",
    torch_dtype=torch.bfloat16,  # load weights in bfloat16 to halve memory use
    device_map="cuda"  # place the whole model on the GPU
)

# NOTE(review): disabling the KV cache slows down autoregressive generation;
# presumably a leftover from training config — confirm it is intended here.
model.config.use_cache = False


class CustomStopTokenSequenceCriterion(StoppingCriteria):
    """Stop generation once the sequence ends with a given token-id sequence.

    Used to halt the model when it emits the "### End" marker that the
    fine-tuning format terminates responses with.
    """

    def __init__(self, stop_token_ids):
        """
        Args:
            stop_token_ids: list of token ids that, when produced as the
                tail of the generated sequence, should stop generation.
        """
        super().__init__()
        # Keep a plain list of ints so the per-step check below can compare
        # on the host without allocating a new tensor every call.
        self.stop_token_ids = list(stop_token_ids)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        """Return True when the tail of `input_ids[0]` equals the stop sequence.

        Note: only the first sequence in the batch is inspected, matching the
        single-prompt pipeline usage in this file.
        """
        stop_len = len(self.stop_token_ids)
        # Guard clause: not enough tokens generated yet to contain the marker.
        if len(input_ids[0]) < stop_len:
            return False
        # Compare as plain Python ints — avoids building a torch.tensor on the
        # model's device at every generation step, unlike torch.equal().
        return input_ids[0][-stop_len:].tolist() == self.stop_token_ids


# Marker string the fine-tuned model emits at the end of each response.
stop_token = "### End"

# Token-id sequence for the marker (no special tokens added).
stop_token_id = tokenizer.encode(stop_token, add_special_tokens=False)
print(stop_token_id)
stop_criteria = CustomStopTokenSequenceCriterion(stop_token_ids=stop_token_id)

# Text-generation pipeline; max_length bounds prompt + completion combined.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1024)


def generate_prompt(task, header_columns):
    """Build the instruction-format prompt the model was fine-tuned on.

    Args:
        task: natural-language question about the CSV.
        header_columns: string preview of the dataframe's columns and
            sample data (see format_dataframe).

    Returns:
        The full prompt string, ending with the "### Response:" header.
    """
    prompt_lines = [
        "Below is an instruction that describes a task. Write a Python function using Pandas to accomplish the task described below.",
        "",
        "### Instruction:",
        f"{task}",
        "",
        "header columns with sample data:",
        f"{header_columns}",
        "",
        "### Response:",
        "",
    ]
    return "\n".join(prompt_lines)


def format_dataframe(df):
    """Render the first 5 rows of *df* as a Python dict-literal string.

    The output looks like ``data = {\n    'col': [v1, v2, ...],\n}`` and is
    embedded into the prompt so the model sees column names plus sample data.

    Args:
        df: pandas DataFrame to preview.

    Returns:
        The dict-literal preview string.
    """
    preview = df.head(5).to_dict(orient='list')
    # Join once instead of accumulating with += in a loop (quadratic in
    # the worst case and less idiomatic).
    body = "".join(f"    '{key}': {values},\n" for key, values in preview.items())
    return "data = {\n" + body + "}"


def generate_code(prompt):
    """Generate Pandas code for *prompt* and extract it from the model output.

    Runs the module-level `pipe` with the custom stop criterion, truncates
    the output at the "### End" marker, and pulls the code out of the first
    ``` fenced block if one is present.

    Args:
        prompt: full prompt string built by generate_prompt().

    Returns:
        The extracted Python code (fenced-block contents, stripped), or the
        truncated generation when no fenced block is found.
    """
    result = pipe(prompt, stopping_criteria=[stop_criteria])
    generated_text = result[0]['generated_text']
    # Drop anything after the stop marker; the stopping criterion fires only
    # after the marker's tokens have already been emitted.
    truncated = generated_text.split("### End")[0]

    match = re.search(r"```(.*?)```", truncated, re.DOTALL)
    if match:
        return match.group(1).strip()
    # BUG FIX: fall back to the *truncated* text, not the full generation,
    # so content after "### End" never leaks into the returned code.
    return truncated


def filter_dataframe(df: pd.DataFrame, query: str):
    """Answer a natural-language *query* about *df* via generated Pandas code.

    Builds a prompt from the dataframe preview, asks the model for a
    `process(df)` function, executes the generated code, and calls it.

    Returns:
        (python_code, results) on success, or
        (code + traceback text, "Error") if generation or execution fails.

    SECURITY NOTE: exec() runs model-generated code with full interpreter
    privileges — only use this with trusted models/inputs, ideally sandboxed.
    """
    python_code = ""

    try:
        header_column = format_dataframe(df)
        prompt = generate_prompt(query, header_column)
        python_code = generate_code(prompt)

        # Prepend a `global process` declaration so the exec'd function body
        # binds `process` in module globals, where the call below resolves it.
        python_code = "global process\n" + python_code
        exec(python_code)
        results = process(df)

        return python_code, results

    except Exception as e:
        # Broad catch is deliberate: generated code can raise anything, and
        # the caller receives the failure as data rather than an exception.
        full_traceback = traceback.format_exc()

        error_message = f"An error occurred: {e}\n{full_traceback}"

        final_msg = f"{python_code}\n======Error======\n{error_message}"
        return final_msg, "Error"


if __name__ == '__main__':
    # Example: load a CSV and ask a natural-language question about it.
    df = pd.read_csv("sales_orders.csv")
    code, res = filter_dataframe(df, 'How many unique customers are there')
    print(code)  # the generated (and executed) Pandas code
    print(res)   # the answer returned by process(df), or "Error"
Downloads last month
2
Safetensors
Model size
2B params
Tensor type
F16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support