# File size: 1,929 Bytes | 0ee3210
import json
import os
from pathlib import Path
from datasets import load_dataset
# Directory, located next to this script, where posts.json is read and written.
DATA_DIR = Path(__file__).parent.joinpath("data")
# (class id, ground-truth labels, moderation action) for every dataset class.
# Class mapping in this dataset:
#   0 - hate speech
#   1 - offensive language (harassment/general toxicity)
#   2 - neither (clean)
_CLASS_PLAN = (
    (0, ["hate_speech"], "remove"),
    (1, ["harassment"], "remove"),
    (2, ["clean"], "approve"),
)


def _content_id(text, class_id):
    """Return a stable, content-derived identifier for a tweet.

    The dataset's ``count`` column is the number of annotators who coded the
    tweet (see the dataset card), not a row id, so it cannot be used to tell
    posts apart. Hashing the text gives the same id on every run, which also
    lets ``main`` recognise posts that were already imported.
    """
    # Local import: hashlib is not part of the file's top-level imports.
    import hashlib

    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:12]
    return f"real_post_{class_id}_{digest}"


def _build_post(row, labels, action):
    """Convert one dataset row into a post record for posts.json."""
    return {
        "content_id": _content_id(row["tweet"], row["class"]),
        "content_type": "text_post",
        "text": row["tweet"],
        "author_region": "GLOBAL",
        "author_history": [],
        "language": "en",
        # Copy so posts never share one mutable label list.
        "ground_truth_labels": list(labels),
        "ground_truth_action": action,
        "difficulty": "extreme",
    }


def _sample_class(ds, class_id, samples_per_class, seed):
    """Return up to ``samples_per_class`` shuffled rows of the given class."""
    subset = ds.filter(lambda row: row["class"] == class_id).shuffle(seed=seed)
    # Clamp so a class with fewer rows than requested does not raise.
    take = max(0, min(samples_per_class, len(subset)))
    return subset.select(range(take))


def _load_existing_posts(posts_path):
    """Return the list stored in ``posts_path``, or ``[]`` if it is absent.

    Raises:
        TypeError: if the file holds valid JSON that is not a list.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    if not posts_path.exists():
        return []
    with open(posts_path, "r", encoding="utf-8") as f:
        loaded = json.load(f)
    if not isinstance(loaded, list):
        raise TypeError(
            f"{posts_path} must contain a JSON list, got {type(loaded).__name__}"
        )
    return loaded


def main(*, samples_per_class=10, seed=42):
    """Sample tweets from the dataset and merge them into data/posts.json.

    For each of the three dataset classes, up to ``samples_per_class`` rows
    are drawn after a shuffle with ``seed`` and converted into post records.
    Records whose ``content_id`` is already present in posts.json are skipped,
    so running the script repeatedly does not duplicate entries.

    Args:
        samples_per_class: Maximum number of rows to take from each class.
        seed: Seed passed to the dataset shuffle.
    """
    print("Loading Cornell's Hate Speech & Offensive Language dataset...")
    ds = load_dataset("hate_speech_offensive", split="train")

    # Load existing posts
    posts_path = DATA_DIR / "posts.json"
    existing_posts = _load_existing_posts(posts_path)
    seen_ids = {
        post.get("content_id") for post in existing_posts if isinstance(post, dict)
    }

    new_posts = []
    for class_id, labels, action in _CLASS_PLAN:
        for row in _sample_class(ds, class_id, samples_per_class, seed):
            post = _build_post(row, labels, action)
            if post["content_id"] in seen_ids:
                continue
            seen_ids.add(post["content_id"])
            new_posts.append(post)

    # Append new posts. Write to a temporary sibling first and then swap it
    # in, so a failure while serialising cannot truncate the existing file.
    posts_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = posts_path.with_name(posts_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(existing_posts + new_posts, f, indent=2, ensure_ascii=False)
    tmp_path.replace(posts_path)

    print(f"Successfully integrated {len(new_posts)} real-world X/Twitter posts into posts.json.")
# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|