semran1 committed on
Commit
611e31d
·
verified ·
1 Parent(s): d062854

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. books.py +144 -0
  2. filter_en.bin +3 -0
  3. filter_fineweb.sh +8 -0
books.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import re
4
+ import os
5
+ import unicodedata
6
+ from typing import List
7
+ from multiprocessing import Pool
8
+
9
+ import fasttext
10
+ import pandas as pd
11
+ from tqdm import tqdm
12
+
13
+ # Only use the Kyutai Dactory English FastText model
14
+ FASTTEXT_MODEL_PATH = "filter_en.bin"
15
+ # Minimum probability threshold for the '__label__books' class
16
+ THRESHOLD = 0.3
17
+
18
+
19
def parse_args():
    """Define and parse the command-line interface for the filtering job."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-path", type=str, required=True,
                     help="Directory or file path containing input data.")
    cli.add_argument("--save-path", type=str, required=True,
                     help="Root directory to save filtered results.")
    cli.add_argument("--content-key", type=str, required=True,
                     help="JSON key for the review or text content.")
    cli.add_argument("--processes-num", type=int, default=64,
                     help="Number of parallel worker processes.")
    cli.add_argument("--write-batch-size", type=int, default=100,
                     help="Batch size for writing to output file.")
    cli.add_argument("--inplace", action="store_true",
                     help="Skip processing files that already exist.")
    return cli.parse_args()
34
+
35
+
36
def fasttext_preprocess_func(content: str) -> str:
    """Normalize *content* into the lowercased single-line form FastText expects."""
    # Collapse runs of 3+ newlines to a single paragraph break, then lowercase.
    text = re.sub(r'\n{3,}', '\n\n', content).lower()
    # NFKD-decompose and drop combining marks (category 'Mn') to strip accents.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Escape control whitespace so the whole sample stays on one physical line.
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r'), ('\t', '\\t')):
        text = text.replace(raw, escaped)
    # Squash repeated spaces and trim the ends.
    return re.sub(r' +', ' ', text).strip()
47
+
48
+
49
def fasttext_infer(norm_content: str, model: fasttext.FastText):
    """Return ('__label__books', probability) if predicted, else (None, 0.0).

    Asks the model for its top-10 predictions and scans them for the
    '__label__books' class; only the first match is reported.
    """
    labels, probs = model.predict(norm_content, k=10)
    hits = [(lbl, float(p)) for lbl, p in zip(labels, probs) if lbl == '__label__books']
    return hits[0] if hits else (None, 0.0)
56
+
57
+
58
def load_data(file_path: str, content_key: str) -> List[str]:
    """Load raw text content from supported files.

    Supports JSON-lines (.jsonl / .json, one JSON object per line) and
    Parquet files. Rows missing *content_key* or holding falsy values
    are skipped; kept values are coerced to str.

    Args:
        file_path: Path to the input file.
        content_key: JSON key / Parquet column holding the text.

    Returns:
        List of non-empty content strings.

    Raises:
        ValueError: If the file extension is not supported.
    """
    samples: List[str] = []
    if file_path.endswith(('.jsonl', '.json')):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # Fix: skip blank lines instead of crashing on json.loads("").
                if not line:
                    continue
                data = json.loads(line)
                if content_key in data and data[content_key]:
                    samples.append(str(data[content_key]))
    elif file_path.endswith('.parquet'):
        df = pd.read_parquet(file_path)
        for val in df.get(content_key, []):
            if pd.notna(val) and val:
                samples.append(str(val))
    else:
        raise ValueError(f"Unsupported file type: {file_path}")
    return samples
75
+
76
+
77
def process_file(
        file_path: str,
        save_path: str,
        item: int,
        content_key: str,
        inplace: bool,
        write_batch_size: int) -> None:
    """Process one file: filter by '__label__books' score > THRESHOLD.

    Kept records are appended to ``<save_path>/<base>_filtered.jsonl`` as
    JSON lines of the form ``{"content": ..., "books_score": ...}``.

    Args:
        file_path: Input .jsonl/.json/.parquet file.
        save_path: Directory for the filtered output file.
        item: Worker/file index, used only in progress messages.
        content_key: JSON key / column holding the text.
        inplace: If True, skip files whose output already exists.
        write_batch_size: Number of kept records to buffer before flushing.
    """
    file_name = os.path.basename(file_path)
    base_name, _ = os.path.splitext(file_name)
    output_file = os.path.join(save_path, f"{base_name}_filtered.jsonl")

    # Fix: evaluate the skip/overwrite conditions BEFORE loading the FastText
    # model and the input data, so --inplace actually avoids the expensive work.
    if inplace and os.path.exists(output_file):
        print(f"Skipping existing file: {output_file}")
        return
    if os.path.exists(output_file):
        os.remove(output_file)

    # Each worker process loads its own copy of the model (cannot be shared
    # across processes).
    fasttext_model = fasttext.load_model(FASTTEXT_MODEL_PATH)
    contents = load_data(file_path, content_key)

    def _flush(batch: List[dict]) -> None:
        # Append one JSON object per line to the output file.
        with open(output_file, 'a', encoding='utf-8') as out_f:
            out_f.write("\n".join(json.dumps(x, ensure_ascii=False) for x in batch) + "\n")

    print(f"ID {item}: Processing {file_path} ({len(contents)} records) -> {output_file}")
    buffer: List[dict] = []
    for content in tqdm(contents, desc=f"File {item}"):
        norm = fasttext_preprocess_func(content)
        label, score = fasttext_infer(norm, fasttext_model)
        # Keep only if the predicted label is '__label__books' and probability above threshold
        if label == '__label__books' and score > THRESHOLD:
            buffer.append({
                'content': content,
                'books_score': score
            })
        if len(buffer) >= write_batch_size:
            _flush(buffer)
            buffer.clear()
    # Write remaining
    if buffer:
        _flush(buffer)
117
+
118
+
119
def main():
    """Entry point: collect input files and fan processing out over a pool."""
    args = parse_args()
    os.makedirs(args.save_path, exist_ok=True)

    # Collect input paths. Fix: when given a directory, keep only formats
    # load_data understands — previously every entry (subdirectories, stray
    # files) was passed through and crashed a worker with ValueError.
    # sorted() makes the file-to-worker-ID assignment deterministic.
    supported = ('.jsonl', '.json', '.parquet')
    if os.path.isdir(args.data_path):
        paths = [
            os.path.join(args.data_path, fname)
            for fname in sorted(os.listdir(args.data_path))
            if fname.endswith(supported)
        ]
    else:
        paths = [args.data_path]

    print("=" * 80)
    print(f"Running with FastText model: {FASTTEXT_MODEL_PATH}")
    print(f"Processing {len(paths)} files, threshold={THRESHOLD} for '__label__books'.")
    print("=" * 80)

    with Pool(processes=args.processes_num) as pool:
        pool.starmap(
            process_file,
            [(p, args.save_path, i, args.content_key, args.inplace, args.write_batch_size)
             for i, p in enumerate(paths)]
        )
    print("All done.")
141
+
142
+
143
# Run the CLI entry point only when executed as a script (not on import).
if __name__ == "__main__":
    main()
filter_en.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72802388092696c0fa4febdc2745cc953dd760ff335b724f50537c0a9b1b7811
3
+ size 921431161
filter_fineweb.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Download the raw dataset, then filter it for book-like content with books.py.
huggingface-cli download skymizer/fineweb-edu-dedup-45B --local-dir ./data_raw

# Fix: the original script ended with a dangling "\" after the last argument,
# which makes the shell wait for (or swallow) a continuation line.
python books.py \
    --data-path ./data_raw \
    --save-path ./data_proc \
    --content-key text \
    --processes-num 64 \
    --write-batch-size 100