| import pandas as pd |
| import torch |
def preparing_data(text: str, domain: int) -> pd.DataFrame:
    """Wrap one user text and its domain id in a two-row DataFrame.

    The frame always starts with a fixed placeholder row
    (``'hello world'`` with domain ``0``); the caller's values follow
    as the second row.

    Args:
        text: Input text from the user.
        domain: Output domain from the domain identification pipeline.

    Returns:
        A DataFrame with the columns ``text`` and ``domain`` holding the
        placeholder row followed by the supplied row.
    """
    # Row 0 is the hard-coded placeholder, row 1 is the caller's input.
    text_column = ['hello world', text]
    domain_column = [0, domain]

    return pd.DataFrame({'text': text_column, 'domain': domain_column})
|
|
|
|
def loading_data(tokenizer, df: pd.DataFrame):
    """Tokenize every row of ``df`` and collect the results as tensors.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text)`` for each text.
            Its result is indexed with ``"token_id"`` and ``"mask"``, and
            both entries are passed to ``torch.cat`` along dim 0, so they
            must be tensors whose shapes agree across rows (presumably
            ``(1, seq_len)`` each -- confirm against the tokenizer).
        df: DataFrame with a ``text`` column and a ``domain`` column.

    Returns:
        A tuple ``(input_ids, input_masks, input_domains)``: the
        concatenated ``"token_id"`` tensors, the concatenated ``"mask"``
        tensors, and a tensor built from the ``domain`` column, all in
        row order.
    """
    # Read the columns by position. Indexing a Series with ``series[i]``
    # is label-based, so the previous ``range(len(df))`` loop failed (or
    # picked the wrong rows) whenever ``df`` did not carry the default
    # 0..n-1 index, e.g. after filtering or shuffling.
    texts = df["text"].tolist()
    # Keep NumPy scalars (what a per-row Series lookup yields) so that
    # ``torch.tensor`` sees the same element types as before.
    domain_list = list(df["domain"].to_numpy())

    encodings = [tokenizer(text) for text in texts]

    input_ids = torch.cat([enc["token_id"] for enc in encodings], dim=0)
    input_masks = torch.cat([enc["mask"] for enc in encodings], dim=0)
    input_domains = torch.tensor(domain_list)

    return input_ids, input_masks, input_domains
|
|
|
|