import os
from datasets import load_dataset, concatenate_datasets, load_from_disk
from .config import Config
|
|
class DataProcessor:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
|
|
    def load_clap_data(self):
        """
        Load the Chinese (zh) portion of the clapAI/MultiLingualSentiment dataset.
        """
        print("Loading clapAI/MultiLingualSentiment (zh)...")
        try:
            ds = load_dataset("clapAI/MultiLingualSentiment", "zh", split="train", trust_remote_code=True)
        except Exception:
            # Fall back to the full dataset and filter down to Chinese rows
            print("Warning: Could not load 'zh' specific config, attempting to load generic...")
            ds = load_dataset("clapAI/MultiLingualSentiment", split="train", trust_remote_code=True)
            ds = ds.filter(lambda x: x['language'] == 'zh')
        return ds
|
|
    def load_medical_data(self):
        """
        Load the OpenModels/Chinese-Herbal-Medicine-Sentiment vertical-domain dataset.
        """
        print("Loading OpenModels/Chinese-Herbal-Medicine-Sentiment...")
        ds = load_dataset("OpenModels/Chinese-Herbal-Medicine-Sentiment", split="train", trust_remote_code=True)
        return ds
|
|
    def clean_data(self, examples):
        """
        Data-cleaning filter: return False for rows that should be dropped.
        """
        text = examples['text']

        # Drop the platform's default placeholder review
        # ("此用户未填写评价内容" = "this user did not write a review")
        if "此用户未填写评价内容" in text:
            return False

        # Drop texts too short to carry any sentiment signal
        if len(text.strip()) < 2:
            return False

        return True
|
|
    def unify_labels(self, example):
        """
        Unify labels to: 0 (Negative), 1 (Neutral), 2 (Positive)
        """
        label = example['label']

        # String labels arrive in several spellings across the source datasets;
        # map each family of spellings onto the unified integer scheme.
        if isinstance(label, str):
            label = label.lower()
            if label in ['negative', 'neg', '0']:
                return {'labels': 0}
            elif label in ['neutral', 'neu', '1']:
                return {'labels': 1}
            elif label in ['positive', 'pos', '2']:
                return {'labels': 2}

        # Integer labels are assumed to already follow the 0/1/2 convention
        return {'labels': int(label)}
|
|
    def tokenize_function(self, examples):
        return self.tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=Config.MAX_LENGTH
        )
|
|
    def get_processed_dataset(self, cache_dir=None, num_proc=1):
        if cache_dir is None:
            cache_dir = Config.DATA_DIR

        # Reuse the cached processed dataset if one exists
        processed_path = os.path.join(cache_dir, "processed_dataset")
        if os.path.exists(processed_path):
            print(f"Loading processed dataset from {processed_path}...")
            return load_from_disk(processed_path)

        ds_clap = self.load_clap_data()
        ds_med = self.load_medical_data()

        # Normalize the medical dataset's column names to match the clap dataset
        if 'review_text' in ds_med.column_names:
            ds_med = ds_med.rename_column('review_text', 'text')
        if 'sentiment_label' in ds_med.column_names:
            ds_med = ds_med.rename_column('sentiment_label', 'label')

        print("Cleaning datasets...")
        ds_med = ds_med.filter(self.clean_data, num_proc=num_proc)
        ds_clap = ds_clap.filter(self.clean_data, num_proc=num_proc)

        # Keep only the shared columns so the two datasets can be concatenated
        common_cols = ['text', 'label']
        ds_clap = ds_clap.select_columns(common_cols)
        ds_med = ds_med.select_columns(common_cols)

        combined_ds = concatenate_datasets([ds_clap, ds_med])

        # Map heterogeneous labels onto the unified 0/1/2 scheme
        combined_ds = combined_ds.map(self.unify_labels, remove_columns=['label'])

        tokenized_ds = combined_ds.map(
            self.tokenize_function,
            batched=True,
            num_proc=num_proc,
            remove_columns=['text']
        )

        split_ds = tokenized_ds.train_test_split(test_size=0.1)

        # Persist the result so subsequent runs hit the cache check above
        split_ds.save_to_disk(processed_path)

        return split_ds
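

if __name__ == "__main__":
    # Usage sketch, not part of the pipeline itself. Run as a module
    # (e.g. `python -m <package>.<this_module>`) so the relative import of
    # Config resolves. The "bert-base-chinese" checkpoint is an illustrative
    # assumption; substitute whichever model the project actually targets.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    processor = DataProcessor(tokenizer)
    dataset = processor.get_processed_dataset()
    print(dataset)                     # DatasetDict with 'train'/'test' splits
    print(dataset["train"][0].keys())  # input_ids, attention_mask, labels, ...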
|
|