Justin-lee commited on
Commit
9f2a23b
·
verified ·
1 Parent(s): 8e46316

Upload enterprise_llm_train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. enterprise_llm_train.py +506 -1
enterprise_llm_train.py CHANGED
@@ -3,5 +3,510 @@
3
  ไผๆฅญๅคšไปปๅ‹™ LLM ่จ“็ทด่…ณๆœฌ
4
  ๅŸบๅบงๆจกๅž‹: Qwen/Qwen2.5-7B-Instruct + QLoRA 4-bit
5
  ๅ››ๅคง่ƒฝๅŠ›: ๅฎขๆœFAQ | ๆ–‡ไปถๅ•็ญ” | ๅทฅๅ–ฎๅˆ†้กž | ่ณ‡่จŠๆŠฝๅ–
 
 
 
 
 
 
 
 
 
6
  """
7
- # See full script in training job
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ไผๆฅญๅคšไปปๅ‹™ LLM ่จ“็ทด่…ณๆœฌ
4
  ๅŸบๅบงๆจกๅž‹: Qwen/Qwen2.5-7B-Instruct + QLoRA 4-bit
5
  ๅ››ๅคง่ƒฝๅŠ›: ๅฎขๆœFAQ | ๆ–‡ไปถๅ•็ญ” | ๅทฅๅ–ฎๅˆ†้กž | ่ณ‡่จŠๆŠฝๅ–
6
+
7
+ ่ณ‡ๆ–™ไพ†ๆบ:
8
+ - YeungNLP/firefly-train-1.1M (NER/ๅˆ†้กž/ๆ‘˜่ฆ/QA)
9
+ - hfl/cmrc2018 (ไธญๆ–‡้–ฑ่ฎ€็†่งฃ)
10
+ - clue/clue [tnews] (15้กžๆ–ฐ่žๅˆ†้กž)
11
+ - BelleGroup/train_1M_CN (้€š็”จๆŒ‡ไปค)
12
+
13
+ ่จ“็ทดๆ–นๆณ•: QLoRA SFT (NF4 + double quant, LoRA on all-linear)
14
+ ๅƒ่€ƒ: Qwen2 Technical Report (2407.10671), QLoRA Paper (2305.14314)
15
  """
16
+
17
+ import os
18
+ import json
19
+ import random
20
+ import torch
21
+ import numpy as np
22
+ from datasets import load_dataset, Dataset, concatenate_datasets
23
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
24
+ from peft import LoraConfig, prepare_model_for_kbit_training
25
+ from trl import SFTTrainer, SFTConfig
26
+
27
# --- Reproducibility: seed every RNG the pipeline touches ---
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Run configuration ---
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"                  # base model to fine-tune
OUTPUT_DIR = "./qwen25-7b-enterprise-zh"               # local checkpoint directory
HUB_MODEL_ID = "Justin-lee/Qwen2.5-7B-Enterprise-ZH"   # Hub destination repo
MAX_SEQ_LENGTH = 2048                                  # max tokens per training sequence

# --- System prompt per task family (keys: faq / doc_qa / classify / ner / general) ---
SYSTEM_PROMPTS = {
    "faq": "ไฝ ๆ˜ฏไธ€ๅ€‹ๅฐˆๆฅญ็š„ไผๆฅญๅฎขๆœๅŠฉๆ‰‹ใ€‚่ซ‹ๆ นๆ“š็”จๆˆถ็š„ๅ•้กŒ๏ผŒๆไพ›ๆบ–็ขบใ€็ฐกๆฝ”ใ€ๆœ‰็ฆฎ่ฒŒ็š„ๅ›ž็ญ”ใ€‚ๅฆ‚ๆžœไธ็ขบๅฎš็ญ”ๆกˆ๏ผŒ่ซ‹่ช ๅฏฆๅ‘Š็Ÿฅใ€‚",
    "doc_qa": "ไฝ ๆ˜ฏไธ€ๅ€‹ๆ–‡ไปถๅˆ†ๆžๅŠฉๆ‰‹ใ€‚่ซ‹ไป”็ดฐ้–ฑ่ฎ€ๆไพ›็š„ๆ–‡ไปถๅ…งๅฎน๏ผŒๅƒ…ๆ นๆ“šๆ–‡ไปถไธญ็š„่ณ‡่จŠๅ›ž็ญ”ๅ•้กŒใ€‚็ญ”ๆกˆๅฟ…้ ˆไพ†่‡ชๆ–‡ไปถ๏ผŒไธ่ฆ็ทจ้€ ๅ…งๅฎนใ€‚",
    "classify": "ไฝ ๆ˜ฏไธ€ๅ€‹ๅทฅๅ–ฎๅˆ†้กž่ˆ‡ๅˆ†ๆตๅŠฉๆ‰‹ใ€‚่ซ‹ๆ นๆ“š็”จๆˆถๆ่ฟฐ็š„ๅ•้กŒ๏ผŒๅฐ‡ๅ…ถๅˆ†้กžๅˆฐๆœ€ๅˆ้ฉ็š„่™•็†้กžๅˆฅ๏ผŒไธฆ็ฐก่ฟฐๅˆ†้กž็†็”ฑใ€‚",
    "ner": "ไฝ ๆ˜ฏไธ€ๅ€‹่ณ‡่จŠๆŠฝๅ–ๅŠฉๆ‰‹ใ€‚่ซ‹ๅพžๆ–‡ๆœฌไธญๆบ–็ขบๆŠฝๅ–ๆŒ‡ๅฎš้กžๅž‹็š„ๅฏฆ้ซ”่ณ‡่จŠ๏ผˆๅฆ‚ๆ—ฅๆœŸใ€้‡‘้กใ€ๅœฐๅ€ใ€ๅง“ๅใ€ๆขไปถ็ญ‰๏ผ‰๏ผŒไปฅ็ตๆง‹ๅŒ–ๆ ผๅผ่ผธๅ‡บใ€‚",
    "general": "ไฝ ๆ˜ฏไธ€ๅ€‹ๆ™บ่ƒฝๅŠฉๆ‰‹๏ผŒ่ซ‹ๆ นๆ“š็”จๆˆถ็š„ๆŒ‡ไปคๅฎŒๆˆไปปๅ‹™ใ€‚",
}

# --- TNEWS: label id (0..14) -> Chinese category name ---
TNEWS_LABELS = dict(enumerate([
    "ๆ•…ไบ‹", "ๆ–‡ๅŒ–", "ๅจ›ๆจ‚", "้ซ”่‚ฒ", "่ฒก็ถ“",
    "ๆˆฟ็”ข", "ๆฑฝ่ปŠ", "ๆ•™่‚ฒ", "็ง‘ๆŠ€", "่ปไบ‹",
    "ๆ—…้Š", "ๅœ‹้š›", "่‚ก็ฅจ", "่พฒๆฅญ", "้›ป็ซถ",
]))

# --- Firefly `kind` values routed to each task bucket ---
# (verified from actual dataset kinds)
FIREFLY_IE_KINDS = {"NER", "KeywordRecognition", "SentimentAnalyze"}
FIREFLY_QA_KINDS = {"MRC", "Cot", "TextMatching"}
FIREFLY_FAQ_KINDS = {"OpenQA", "ProductDesc", "Dictionary"}
FIREFLY_CLASSIFY_KINDS = {"ClassicalChinese", "NLI", "TextCorrection"}
61
+
62
def format_messages(system: str, user: str, assistant: str) -> dict:
    """Build one training example in ChatML ``messages`` form.

    An empty/falsy *system* string omits the system turn entirely;
    the user and assistant turns are always present.
    """
    conversation = [{"role": "system", "content": system}] if system else []
    conversation.append({"role": "user", "content": user})
    conversation.append({"role": "assistant", "content": assistant})
    return {"messages": conversation}
70
+
71
+
72
def load_firefly_data(max_per_task: int = 5000):
    """Stream YeungNLP/firefly-train-1.1M and bucket rows by task family.

    Rows are routed to the IE / doc-QA / FAQ buckets according to their
    ``kind`` field; each bucket is capped at *max_per_task* examples.
    Returns a ``Dataset`` of ChatML examples, or ``None`` if nothing
    was collected.
    """
    print("๐Ÿ“ฆ Loading Firefly-1.1M...")
    stream = load_dataset("YeungNLP/firefly-train-1.1M", split="train", streaming=True)

    # (kind-set, bucket name, system prompt) routing table.
    routes = (
        (FIREFLY_IE_KINDS, "ie", SYSTEM_PROMPTS["ner"]),
        (FIREFLY_QA_KINDS, "qa", SYSTEM_PROMPTS["doc_qa"]),
        (FIREFLY_FAQ_KINDS, "faq", SYSTEM_PROMPTS["faq"]),
    )
    buckets = {"ie": [], "qa": [], "faq": []}

    for row in stream:
        prompt_in = row["input"].strip()
        prompt_out = row["target"].strip()

        # Drop empty rows and trivially short targets.
        if not prompt_in or not prompt_out or len(prompt_out) < 5:
            continue

        for kinds, bucket, sys_prompt in routes:
            if row["kind"] in kinds and len(buckets[bucket]) < max_per_task:
                buckets[bucket].append(format_messages(sys_prompt, prompt_in, prompt_out))
                break

        # Stop streaming once every bucket is full.
        if all(len(items) >= max_per_task for items in buckets.values()):
            break

    counts = {name: len(items) for name, items in buckets.items()}
    print(f" โœ… Firefly โ€” IE: {counts['ie']}, QA: {counts['qa']}, FAQ: {counts['faq']}")
    merged = buckets["ie"] + buckets["qa"] + buckets["faq"]
    return Dataset.from_list(merged) if merged else None
104
+
105
+
106
def load_cmrc_data(max_samples: int = 5000):
    """Convert hfl/cmrc2018 (Chinese reading comprehension) into
    document-QA chat examples, capped at *max_samples*.

    Returns a ``Dataset`` or ``None`` when no examples were produced.
    """
    print("๐Ÿ“ฆ Loading CMRC2018...")
    corpus = load_dataset("hfl/cmrc2018", split="train")

    examples = []
    for record in corpus:
        gold_answers = record["answers"]["text"]
        # Rows without a gold answer span are unusable -- skip them.
        if not gold_answers:
            continue

        passage = record["context"].strip()
        query = record["question"].strip()
        prompt = f"่ซ‹ๆ นๆ“šไปฅไธ‹ๆ–‡ไปถๅ›ž็ญ”ๅ•้กŒใ€‚\n\nใ€ๆ–‡ไปถๅ…งๅฎนใ€‘\n{passage}\n\nใ€ๅ•้กŒใ€‘\n{query}"
        examples.append(format_messages(SYSTEM_PROMPTS["doc_qa"], prompt, gold_answers[0].strip()))

        if len(examples) >= max_samples:
            break

    print(f" โœ… CMRC2018 โ€” {len(examples)} ๆขๆ–‡ไปถๅ•็ญ”")
    return Dataset.from_list(examples) if examples else None
128
+
129
+
130
def load_tnews_data(max_samples: int = 10000):
    """Convert CLUE TNEWS rows into 15-way news-classification chat examples.

    Returns a ``Dataset`` or ``None`` when no examples were produced.
    """
    print("๐Ÿ“ฆ Loading TNEWS...")
    corpus = load_dataset("clue/clue", "tnews", split="train")

    # The label menu is loop-invariant; build it once.
    label_menu = ', '.join(TNEWS_LABELS.values())

    samples = []
    for record in corpus:
        label_id = record["label"]
        # Only the 15 known label ids (0..14) are usable.
        if not 0 <= label_id <= 14:
            continue

        text = record["sentence"].strip()
        label_name = TNEWS_LABELS.get(label_id, "ๅ…ถไป–")
        user_msg = f"่ซ‹ๅฐ‡ไปฅไธ‹ๆ–‡ๆœฌๅˆ†้กžๅˆฐๆœ€ๅˆ้ฉ็š„้กžๅˆฅใ€‚\nๅฏ้ธ้กžๅˆฅ๏ผš{label_menu}\n\nๆ–‡ๆœฌ๏ผš{text}\n\n่ซ‹็›ดๆŽฅ่ผธๅ‡บ้กžๅˆฅๅ็จฑๅ’Œๅˆ†้กž็†็”ฑใ€‚"
        assistant_msg = f"้กžๅˆฅ๏ผš{label_name}\n็†็”ฑ๏ผšๆ นๆ“šๆ–‡ๆœฌๅ…งๅฎน๏ผŒ่ฉฒๆ–‡ๆœฌไธป่ฆ่จŽ่ซ–็š„ๆ˜ฏ{label_name}็›ธ้—œ็š„่ฉฑ้กŒใ€‚"
        samples.append(format_messages(SYSTEM_PROMPTS["classify"], user_msg, assistant_msg))

        if len(samples) >= max_samples:
            break

    print(f" โœ… TNEWS โ€” {len(samples)} ๆขๅˆ†้กžๆจฃๆœฌ")
    return Dataset.from_list(samples) if samples else None
152
+
153
+
154
def load_belle_data(max_samples: int = 10000):
    """Stream BelleGroup/train_1M_CN as general instruction / FAQ examples.

    Alternates the FAQ and general system prompts across accepted rows so
    both task styles share the same instruction distribution.  Returns a
    ``Dataset`` of ChatML examples, or ``None`` if nothing was collected.
    """
    print("๐Ÿ“ฆ Loading BELLE-1M...")
    ds = load_dataset("BelleGroup/train_1M_CN", split="train", streaming=True)

    data = []
    for row in ds:
        # BUGFIX: fields may be present but null in streamed rows;
        # `or ""` keeps .strip() from raising AttributeError on None.
        instruction = (row.get("instruction") or "").strip()
        inp = (row.get("input") or "").strip()
        output = (row.get("output") or "").strip()

        # Skip empty prompts and trivially short outputs.
        if not instruction or not output or len(output) < 10:
            continue

        user_msg = f"{instruction}\n{inp}" if inp else instruction

        # Alternate between FAQ and general system prompts.
        sys_prompt = SYSTEM_PROMPTS["faq"] if len(data) % 2 == 0 else SYSTEM_PROMPTS["general"]
        data.append(format_messages(sys_prompt, user_msg, output))

        if len(data) >= max_samples:
            break

    print(f" โœ… BELLE โ€” {len(data)} ๆข้€š็”จๆŒ‡ไปค")
    return Dataset.from_list(data) if data else None
183
+
184
+
185
def create_synthetic_ie_examples():
    """Build hand-written information-extraction training examples.

    Covers dates, amounts, addresses, names, mixed entities, contract
    terms, support-ticket fields, and invoice data, each paired with a
    structured extraction answer.
    """
    print("๐Ÿ“ฆ Creating synthetic IE examples...")

    # NOTE(review): one output string below contains "๏ฟฝ๏ฟฝ" (U+FFFD
    # replacement characters) from an earlier encoding mishap; the original
    # characters were lost -- restore them from the source data if possible.
    examples = [
        # Date extraction
        {
            "input": "่ซ‹ๅพžไปฅไธ‹ๆ–‡ๆœฌไธญๆŠฝๅ–ๆ‰€ๆœ‰ๆ—ฅๆœŸ่ณ‡่จŠ๏ผš\n\nใ€Œๅผตๅ…ˆ็”Ÿๆ–ผ2024ๅนด3ๆœˆ15ๆ—ฅ็ฐฝ่จ‚ไบ†่ณผๆˆฟๅˆๅŒ๏ผŒ็ด„ๅฎšๅœจ2024ๅนด6ๆœˆ30ๆ—ฅๅ‰ๅฎŒๆˆ้Žๆˆถๆ‰‹็บŒ๏ผŒ้ฆ–ไป˜ๆฌพ้œ€ๅœจ2024ๅนด4ๆœˆ1ๆ—ฅๅ‰ๆ”ฏไป˜ใ€‚ใ€",
            "output": "ๆŠฝๅ–็ตๆžœ๏ผš\n- ๆ—ฅๆœŸ1๏ผš2024ๅนด3ๆœˆ15ๆ—ฅ๏ผˆๅˆๅŒ็ฐฝ่จ‚ๆ—ฅ๏ผ‰\n- ๆ—ฅๆœŸ2๏ผš2024ๅนด6ๆœˆ30ๆ—ฅ๏ผˆ้Žๆˆถๆˆชๆญขๆ—ฅ๏ผ‰\n- ๆ—ฅๆœŸ3๏ผš2024ๅนด4ๆœˆ1ๆ—ฅ๏ผˆ้ฆ–ไป˜ๆฌพๆ”ฏไป˜ๆˆชๆญขๆ—ฅ๏ผ‰"
        },
        # Amount extraction
        {
            "input": "่ซ‹ๅพžไปฅไธ‹ๆ–‡ๆœฌไธญๆŠฝๅ–ๆ‰€ๆœ‰้‡‘้ก่ณ‡่จŠ๏ผš\n\nใ€Œๆœฌๆฌกไบคๆ˜“็ธฝ้‡‘้ก็‚บไบบๆฐ‘ๅนฃ358่ฌๅ…ƒๆ•ด๏ผŒๅ…ถไธญ้ฆ–ไป˜ๆฌพ107.4่ฌๅ…ƒ๏ผŒ้Š€่กŒ่ฒธๆฌพ250.6่ฌๅ…ƒ๏ผŒๆœˆไพ›็ด„12,800ๅ…ƒ๏ผŒ่ฒธๆฌพๆœŸ้™30ๅนดใ€‚ใ€",
            "output": "ๆŠฝๅ–็ตๆžœ๏ผš\n- ็ธฝ้‡‘้ก๏ผš358่ฌๅ…ƒ\n- ้ฆ–ไป˜ๆฌพ๏ผš107.4่ฌๅ…ƒ\n- ้Š€่กŒ่ฒธๆฌพ๏ผš250.6่ฌๅ…ƒ\n- ๆœˆไพ›๏ผš12,800ๅ…ƒ"
        },
        # Address extraction
        {
            "input": "่ซ‹ๅพžไปฅไธ‹ๆ–‡ๆœฌไธญๆŠฝๅ–ๆ‰€ๆœ‰ๅœฐๅ€่ณ‡่จŠ๏ผš\n\nใ€Œ็™ผ่ฒจๅœฐๅ€๏ผšไธŠๆตทๅธ‚ๆตฆๆฑๆ–ฐๅ€ๅผตๆฑŸ้ซ˜็ง‘ๆŠ€ๅœ’ๅ€็ขงๆณข่ทฏ690่™Ÿใ€‚ๆ”ถ่ฒจๅœฐๅ€๏ผšๅŒ—ไบฌๅธ‚ๆœ้™ฝๅ€ๅปบๅœ‹้–€ๅค–ๅคง่ก—1่™Ÿๅœ‹่ฒฟๅคงๅปˆAๅบง2305ๅฎคใ€‚้€€่ฒจๅœฐๅ€๏ผšๅปฃๆฑ็œๆทฑๅœณๅธ‚ๅ—ๅฑฑๅ€็ง‘ๆŠ€ๅœ’ๅ—ๅ€W1-BๆฃŸ3ๆจ“ใ€‚ใ€",
            "output": "ๆŠฝๅ–็ตๆžœ๏ผš\n- ็™ผ่ฒจๅœฐๅ€๏ผšไธŠๆตทๅธ‚ๆตฆๆฑๆ–ฐๅ€ๅผตๆฑŸ้ซ˜็ง‘ๆŠ€ๅœ’ๅ€็ขงๆณข่ทฏ690่™Ÿ\n- ๆ”ถ่ฒจๅœฐๅ€๏ผšๅŒ—ไบฌๅธ‚ๆœ้™ฝๅ€ๅปบๅœ‹้–€ๅค–ๅคง่ก—1่™Ÿๅœ‹่ฒฟๅคงๅปˆAๅบง2305ๅฎค\n- ้€€่ฒจๅœฐๅ€๏ผšๅปฃๆฑ็œๆทฑๅœณๅธ‚ๅ—ๅฑฑๅ€็ง‘ๆŠ€ๅœ’ๅ—ๅ€W1-BๆฃŸ3ๆจ“"
        },
        # Name extraction
        {
            "input": "่ซ‹ๅพžไปฅไธ‹ๆ–‡ๆœฌไธญๆŠฝๅ–ๆ‰€ๆœ‰ไบบๅ๏ผš\n\nใ€Œ่ˆ‡ๆœƒไบบๅ“กๅŒ…ๆ‹ฌ๏ผš้ …็›ฎ็ถ“็†็Ž‹ๅปบๅœ‹ใ€ๆŠ€่ก“็ธฝ็›ฃๆŽ่Šณใ€ๅธ‚ๅ ด้ƒจ็š„้™ณๅฟ—ๆ˜Žๅ’Œๅผตๅฐ็ด…๏ผŒไปฅๅŠๅค–้ƒจ้กงๅ•Dr. Michael Chenใ€‚ๆœƒ่ญฐ็”ฑๅ‰ฏ็ธฝ่ฃ่ถ™ๅœ‹ๅผทไธปๆŒใ€‚ใ€",
            "output": "ๆŠฝๅ–็ตๆžœ๏ผš\n- ็Ž‹ๅปบๅœ‹๏ผˆ้ …็›ฎ็ถ“็†๏ผ‰\n- ๆŽ่Šณ๏ผˆๆŠ€่ก“็ธฝ็›ฃ๏ผ‰\n- ้™ณๅฟ—ๆ˜Ž๏ผˆๅธ‚ๅ ด้ƒจ๏ผ‰\n- ๅผตๅฐ็ด…๏ผˆๅธ‚ๅ ด้ƒจ๏ผ‰\n- Dr. Michael Chen๏ผˆๅค–้ƒจ้กงๅ•๏ผ‰\n- ่ถ™ๅœ‹ๅผท๏ผˆๅ‰ฏ็ธฝ่ฃ๏ผ‰"
        },
        # Mixed-entity extraction
        {
            "input": "่ซ‹ๅพžไปฅไธ‹ๆ–‡ๆœฌไธญๆŠฝๅ–ๆ‰€ๆœ‰้—œ้ตๅฏฆ้ซ”๏ผˆไบบๅใ€ๆ—ฅๆœŸใ€้‡‘้กใ€ๅœฐๅ€๏ผ‰๏ผš\n\nใ€Œ่ฒทๆ–นๆž—็พŽ็Žฒๅฅณๅฃซๆ–ผ2024ๅนด1ๆœˆ10ๆ—ฅๅœจๅฐๅŒ—ๅธ‚ไฟก็พฉๅ€ๆพไป่ทฏ100่™Ÿ็š„ไธๅ‹•็”ขไปฒไป‹่™•๏ผŒไปฅๆ–ฐๅฐๅนฃ2,580่ฌๅ…ƒ่ณผๅ…ฅไธ€ๆˆถไฝๅฎ…ใ€‚่ณฃๆ–น้™ณๅคงๆ˜Žๅ…ˆ็”ŸๅŒๆ„ๅœจ2024ๅนด2ๆœˆ28ๆ—ฅๅ‰ๅฎŒๆˆไบคๅฑ‹๏ผŒ้•็ด„้‡‘็‚บ็ธฝๅƒน็š„10%ๅณ258่ฌๅ…ƒใ€‚ใ€",
            "output": "ๆŠฝๅ–็ตๆžœ๏ผš\n\nใ€ไบบๅใ€‘\n- ๆž—็พŽ็Žฒ๏ผˆ่ฒทๆ–น๏ผ‰\n- ้™ณๅคงๆ˜Ž๏ผˆ่ณฃๆ–น๏ผ‰\n\nใ€ๆ—ฅๆœŸใ€‘\n- 2024ๅนด1ๆœˆ10ๆ—ฅ๏ผˆ่ณผ่ฒทๆ—ฅ๏ผ‰\n- 2024ๅนด2ๆœˆ28ๆ—ฅ๏ผˆไบคๅฑ‹ๆˆช๏ฟฝ๏ฟฝๆ—ฅ๏ผ‰\n\nใ€้‡‘้กใ€‘\n- 2,580่ฌๅ…ƒ๏ผˆ่ณผ่ฒท็ธฝๅƒน๏ผ‰\n- 258่ฌๅ…ƒ๏ผˆ้•็ด„้‡‘๏ผŒ็ธฝๅƒน10%๏ผ‰\n\nใ€ๅœฐๅ€ใ€‘\n- ๅฐๅŒ—ๅธ‚ไฟก็พฉๅ€ๆพไป่ทฏ100่™Ÿ๏ผˆไปฒไป‹่™•ๅœฐๅ€๏ผ‰"
        },
        # Contract-term extraction
        {
            "input": "่ซ‹ๆŠฝๅ–ไปฅไธ‹ๅˆ็ด„ๆขๆฌพไธญ็š„้—œ้ตๆขไปถ๏ผš\n\nใ€Œ็”ฒๆ–นๆ‡‰ๅœจๆ”ถๅˆฐไน™ๆ–นไบคไป˜็š„้ฉ—ๆ”ถๅˆๆ ผๅ ฑๅ‘ŠๅพŒ15ๅ€‹ๅทฅไฝœๆ—ฅๅ…ง๏ผŒๆ”ฏไป˜ๅˆๅŒ็ธฝ้ก็š„70%ๅณไบบๆฐ‘ๅนฃ84่ฌๅ…ƒใ€‚ๅ‰ฉ้ค˜30%ๅณ36่ฌๅ…ƒไฝœ็‚บ่ณชไฟ้‡‘๏ผŒๅœจ่ณชไฟๆœŸ๏ผˆ่‡ช้ฉ—ๆ”ถๅˆๆ ผไน‹ๆ—ฅ่ตท12ๅ€‹ๆœˆ๏ผ‰ๆปฟๅพŒ30ๅ€‹ๅทฅไฝœๆ—ฅๅ…ง็„กๆฏ้€€้‚„ใ€‚ๅฆ‚็”ฒๆ–น้€พๆœŸไป˜ๆฌพ๏ผŒๆฏๆ—ฅๆŒ‰ๆœชไป˜้‡‘้ก็š„0.05%ๆ”ฏไป˜้•็ด„้‡‘ใ€‚ใ€",
            "output": "ๆŠฝๅ–็ตๆžœ๏ผš\n\nใ€ไป˜ๆฌพๆขไปถใ€‘\n- ้ฆ–ๆœŸ๏ผšๅˆๅŒ็ธฝ้ก70%๏ผˆ84่ฌๅ…ƒ๏ผ‰๏ผŒ้ฉ—ๆ”ถๅˆๆ ผๅพŒ15ๅ€‹ๅทฅไฝœๆ—ฅๅ…งๆ”ฏไป˜\n- ่ณชไฟ้‡‘๏ผšๅˆๅŒ็ธฝ้ก30%๏ผˆ36่ฌๅ…ƒ๏ผ‰๏ผŒ่ณชไฟๆœŸๆปฟๅพŒ30ๅ€‹ๅทฅไฝœๆ—ฅๅ…ง้€€้‚„\n- ๅˆๅŒ็ธฝ้ก๏ผš120่ฌๅ…ƒ\n\nใ€ๆ™‚้™ๆขไปถใ€‘\n- ้ฆ–ๆœŸไป˜ๆฌพๆœŸ้™๏ผš้ฉ—ๆ”ถๅˆๆ ผๅพŒ15ๅ€‹ๅทฅไฝœๆ—ฅ\n- ่ณชไฟๆœŸ๏ผš้ฉ—ๆ”ถๅˆๆ ผไน‹ๆ—ฅ่ตท12ๅ€‹ๆœˆ\n- ่ณชไฟ้‡‘้€€้‚„ๆœŸ้™๏ผš่ณชไฟๆœŸๆปฟๅพŒ30ๅ€‹ๅทฅไฝœๆ—ฅ\n\nใ€้•็ด„ๆขไปถใ€‘\n- ้€พๆœŸไป˜ๆฌพ้•็ด„้‡‘๏ผšๆฏๆ—ฅๆŒ‰ๆœชไป˜้‡‘้ก็š„0.05%"
        },
        # Support-ticket extraction
        {
            "input": "่ซ‹ๅพžไปฅไธ‹ๅฎขๆœๅฐ่ฉฑไธญๆŠฝๅ–้—œ้ต่ณ‡่จŠ๏ผš\n\nใ€Œๅฎขๆˆถ๏ผšไฝ ๅฅฝ๏ผŒๆˆ‘ๆ˜ฏ่จ‚ๅ–ฎ่™Ÿ ORD-2024031578 ็š„่ฒทๅฎถๅผตๅฐๆ˜Ž๏ผŒๆˆ‘3ๆœˆ18่™Ÿๅœจไฝ ๅ€‘ๅฎ˜็ถฒ่ฒทไบ†ไธ€ๅฐๅž‹่™Ÿ็‚บ XR-500 ็š„็ฉบๆฐฃๆธ…ๆทจๆฉŸ๏ผŒ่Šฑไบ†3,299ๅ…ƒ๏ผŒๆ”ถ่ฒจๅœฐๅ€ๆ˜ฏๆญๅทžๅธ‚่ฅฟๆน–ๅ€ๆ–‡ไธ‰่ทฏ489่™Ÿใ€‚ไฝ†ๆ”ถๅˆฐ็š„ๅ•†ๅ“ๅค–ๆฎผๆœ‰ๆ˜Ž้กฏๅˆฎ็—•๏ผŒๆˆ‘่ฆๆฑ‚ๆ›่ฒจๆˆ–้€€ๆฌพใ€‚ๆˆ‘็š„่ฏ็นซ้›ป่ฉฑๆ˜ฏ 138-0571-8899ใ€‚ใ€",
            "output": "ๆŠฝๅ–็ตๆžœ๏ผš\n\nใ€ๅฎขๆˆถ่ณ‡่จŠใ€‘\n- ๅง“ๅ๏ผšๅผตๅฐๆ˜Ž\n- ้›ป่ฉฑ๏ผš138-0571-8899\n- ๅœฐๅ€๏ผšๆญๅทžๅธ‚่ฅฟๆน–ๅ€ๆ–‡ไธ‰่ทฏ489่™Ÿ\n\nใ€่จ‚ๅ–ฎ่ณ‡่จŠใ€‘\n- ่จ‚ๅ–ฎ่™Ÿ๏ผšORD-2024031578\n- ไธ‹ๅ–ฎๆ—ฅๆœŸ๏ผš3ๆœˆ18ๆ—ฅ\n- ่ณผ่ฒทๆธ ้“๏ผšๅฎ˜็ถฒ\n- ๅ•†ๅ“ๅž‹่™Ÿ๏ผšXR-500๏ผˆ็ฉบๆฐฃๆธ…ๆทจๆฉŸ๏ผ‰\n- ้‡‘้ก๏ผš3,299ๅ…ƒ\n\nใ€ๅ•้กŒๆ่ฟฐใ€‘\n- ๅ•้กŒ้กžๅž‹๏ผšๅ•†ๅ“ๅ“่ณชๅ•้กŒ๏ผˆๅค–ๆฎผๅˆฎ็—•๏ผ‰\n- ๅฎขๆˆถ่จดๆฑ‚๏ผšๆ›่ฒจๆˆ–้€€ๆฌพ"
        },
        # Invoice / receipt extraction
        {
            "input": "่ซ‹ๅพžไปฅไธ‹็™ผ็ฅจ่ณ‡่จŠไธญๆŠฝๅ–็ตๆง‹ๅŒ–ๆ•ธๆ“š๏ผš\n\nใ€Œๅขžๅ€ผ็จ…ๅฐˆ็”จ็™ผ็ฅจ ็™ผ็ฅจไปฃ็ขผ๏ผš3100224130 ็™ผ็ฅจ่™Ÿ็ขผ๏ผš08956723 ้–‹็ฅจๆ—ฅๆœŸ๏ผš2024ๅนด03ๆœˆ20ๆ—ฅ ้Šทๅ”ฎๆ–น๏ผšไธŠๆตท็ง‘ๅ‰ตๆ•ธๆ“šๆŠ€่ก“ๆœ‰้™ๅ…ฌๅธ ็ตฑไธ€็คพๆœƒไฟก็”จไปฃ็ขผ๏ผš91310115MA1K4XYZ89 ๅœฐๅ€๏ผšไธŠๆตทๅธ‚ๆตฆๆฑๆ–ฐๅ€ไธ–็ด€ๅคง้“1000่™Ÿ ้›ป่ฉฑ๏ผš021-58889999 ่ณผ่ฒทๆ–น๏ผšๅŒ—ไบฌๆ™บๆ…ง้›ฒ็ซฏ็ง‘ๆŠ€ๆœ‰้™ๅ…ฌๅธ ้‡‘้กๅˆ่จˆ๏ผˆๅคงๅฏซ๏ผ‰๏ผšๅฃนๆ‹พ่ฒณ่ฌๅไปŸ่‚†ไฝฐไผๆ‹พ้™ธๅ…ƒๆ•ด ้‡‘้กๅˆ่จˆ๏ผˆๅฐๅฏซ๏ผ‰๏ผšยฅ123,456.00 ็จ…็އ๏ผš13% ็จ…้ก๏ผšยฅ16,049.28 ๅƒน็จ…ๅˆ่จˆ๏ผšยฅ139,505.28ใ€",
            "output": "ๆŠฝๅ–็ตๆžœ๏ผš\n\nใ€็™ผ็ฅจๅŸบๆœฌ่ณ‡่จŠใ€‘\n- ้กžๅž‹๏ผšๅขžๅ€ผ็จ…ๅฐˆ็”จ็™ผ็ฅจ\n- ็™ผ็ฅจไปฃ็ขผ๏ผš3100224130\n- ็™ผ็ฅจ่™Ÿ็ขผ๏ผš08956723\n- ้–‹็ฅจๆ—ฅๆœŸ๏ผš2024ๅนด03ๆœˆ20ๆ—ฅ\n\nใ€้Šทๅ”ฎๆ–นใ€‘\n- ๅ็จฑ๏ผšไธŠๆตท็ง‘ๅ‰ตๆ•ธๆ“šๆŠ€่ก“ๆœ‰้™ๅ…ฌๅธ\n- ไฟก็”จไปฃ็ขผ๏ผš91310115MA1K4XYZ89\n- ๅœฐๅ€๏ผšไธŠๆตทๅธ‚ๆตฆๆฑๆ–ฐๅ€ไธ–็ด€ๅคง้“1000่™Ÿ\n- ้›ป่ฉฑ๏ผš021-58889999\n\nใ€่ณผ่ฒทๆ–นใ€‘\n- ๅ็จฑ๏ผšๅŒ—ไบฌๆ™บๆ…ง้›ฒ็ซฏ็ง‘ๆŠ€ๆœ‰้™ๅ…ฌๅธ\n\nใ€้‡‘้ก่ณ‡่จŠใ€‘\n- ้‡‘้ก๏ผˆไธๅซ็จ…๏ผ‰๏ผšยฅ123,456.00\n- ็จ…็އ๏ผš13%\n- ็จ…้ก๏ผšยฅ16,049.28\n- ๅƒน็จ…ๅˆ่จˆ๏ผšยฅ139,505.28"
        },
    ]

    # Wrap every example in the NER system prompt.
    dataset_rows = [
        format_messages(SYSTEM_PROMPTS["ner"], ex["input"], ex["output"])
        for ex in examples
    ]

    print(f" โœ… Synthetic IE โ€” {len(dataset_rows)} ๆข")
    return Dataset.from_list(dataset_rows)
239
+
240
+
241
def create_synthetic_faq_examples():
    """Build hand-written enterprise customer-service FAQ examples.

    Covers order status, returns, refunds, warranty, address changes,
    password reset, invoicing, and product comparison.
    """
    print("๐Ÿ“ฆ Creating synthetic FAQ examples...")

    # NOTE(review): the refund answer below contains "๏ฟฝ๏ฟฝ๏ฟฝ" (U+FFFD
    # replacement characters) from an earlier encoding mishap; the original
    # characters were lost -- restore them from the source data if possible.
    examples = [
        # Orders
        {"q": "ๆˆ‘็š„่จ‚ๅ–ฎไป€้บผๆ™‚ๅ€™่ƒฝๅˆฐ๏ผŸ่จ‚ๅ–ฎ่™Ÿๆ˜ฏ ORD-20240315ใ€‚", "a": "ๆ‚จๅฅฝ๏ผ่ฎ“ๆˆ‘็‚บๆ‚จๆŸฅ่ฉข่จ‚ๅ–ฎ ORD-20240315 ็š„็‰ฉๆต็‹€ๆ…‹ใ€‚ไธ€่ˆฌๆƒ…ๆณไธ‹๏ผš\n- ๆจ™ๆบ–้…้€๏ผš3-5ๅ€‹ๅทฅไฝœๆ—ฅ\n- ๅŠ ๆ€ฅ้…้€๏ผš1-2ๅ€‹ๅทฅไฝœๆ—ฅ\n\nๆ‚จๅฏไปฅๅœจใ€Œๆˆ‘็š„่จ‚ๅ–ฎใ€้ ้ขๆŸฅ็œ‹ๅณๆ™‚็‰ฉๆต่ฟฝ่นค่ณ‡่จŠใ€‚ๅฆ‚ๆžœ่ถ…้Ž้ ่จˆ้€้”ๆ™‚้–“ไปๆœชๆ”ถๅˆฐ๏ผŒ่ซ‹่ฏ็นซๆˆ‘ๅ€‘๏ผŒๆˆ‘ๅ€‘ๆœƒ็‚บๆ‚จๅ„ชๅ…ˆ่™•็†ใ€‚"},
        {"q": "ๆˆ‘ๆƒณ้€€่ฒจ๏ผŒๅทฒ็ถ“ๆ”ถๅˆฐๅ•†ๅ“3ๅคฉไบ†ใ€‚", "a": "ๆฒ’ๅ•้กŒ๏ผๆˆ‘ๅ€‘็š„้€€่ฒจๆ”ฟ็ญ–ๅฆ‚ไธ‹๏ผš\n\nโœ… **ๆ”ฏๆŒ้€€่ฒจ็š„ๆขไปถ๏ผš**\n- ็ฐฝๆ”ถๅพŒ7ๅคฉๅ…ง\n- ๅ•†ๅ“ๆœชไฝฟ็”จใ€ๆœชๆ‹†ๅฐ\n- ไฟ็•™ๅŽŸๅŒ…่ฃๅ’Œ้…ไปถ\n\n**้€€่ฒจๆต็จ‹๏ผš**\n1. ้€ฒๅ…ฅใ€Œๆˆ‘็š„่จ‚ๅ–ฎใ€โ†’ ้ธๆ“‡่ฆ้€€่ฒจ็š„่จ‚ๅ–ฎ\n2. ้ปžๆ“Šใ€Œ็”ณ่ซ‹้€€่ฒจใ€\n3. ้ธๆ“‡้€€่ฒจๅŽŸๅ› ไธฆไธŠๅ‚ณ็…ง็‰‡\n4. ็ญ‰ๅพ…ๅฏฉๆ ธ๏ผˆ้€šๅธธ1ๅ€‹ๅทฅไฝœๆ—ฅๅ…ง๏ผ‰\n5. ๅฏฉๆ ธ้€š้ŽๅพŒๅฏ„ๅ›žๅ•†ๅ“\n6. ๆˆ‘ๅ€‘ๆ”ถๅˆฐๅ•†ๅ“ๅพŒ3-5ๅ€‹ๅทฅไฝœๆ—ฅๅ…ง้€€ๆฌพ\n\nๆ‚จๆ”ถๅˆฐๆ‰3ๅคฉ๏ผŒๅฎŒๅ…จๅœจ้€€่ฒจๆ™‚ๆ•ˆๅ…งใ€‚้œ€่ฆๆˆ‘ๅนซๆ‚จ็™ผ่ตท้€€่ฒจ็”ณ่ซ‹ๅ—Ž๏ผŸ"},
        {"q": "้€€ๆฌพๅคšไน…่ƒฝๅˆฐๅธณ๏ผŸ", "a": "้€€ๆฌพๅˆฐๅธณๆ™‚้–“ๅ–ๆฑบๆ–ผๆ‚จ็š„ไป˜ๆฌพๆ–นๅผ๏ผš\n\n| ไป˜ๆฌพๆ–นๅผ | ้ ่จˆๅˆฐๅธณๆ™‚้–“ |\n|---------|------------|\n| ไฟก็”จๅก | 3-7ๅ€‹ๅทฅไฝœๆ—ฅ |\n| ้Š€่กŒ่ฝ‰ๅธณ | 1-3ๅ€‹ๅทฅไฝœๆ—ฅ |\n| ็ฌฌไธ‰ๆ–นๆ”ฏไป˜๏ผˆๆ”ฏไป˜ๅฏถ/ๅพฎไฟก๏ผ‰ | ๅณๆ™‚-24ๅฐๆ™‚ |\n| ็ฆฎๅ“ๅก/้ค˜้ก | ๅณๆ™‚้€€ๅ›ž |\n\n้€€ๆฌพไธ€็ถ“่™•็†๏ผŒ๏ฟฝ๏ฟฝ๏ฟฝๅ€‘ๆœƒ็™ผ้€้€š็Ÿฅ้ƒตไปถ็ตฆๆ‚จใ€‚ๅฆ‚ๆžœ่ถ…้ŽไธŠ่ฟฐๆ™‚้–“ไปๆœชๆ”ถๅˆฐ๏ผŒ่ซ‹่ฏ็นซ้Š€่กŒ็ขบ่ชใ€‚"},
        {"q": "ไฝ ๅ€‘็š„็”ขๅ“ๆœ‰ไฟๅ›บๅ—Ž๏ผŸ", "a": "ๆœ‰็š„๏ผๆˆ‘ๅ€‘ๆ‰€ๆœ‰็”ขๅ“ๅ‡ไบซๆœ‰ไฟๅ›บๆœๅ‹™๏ผš\n\n๐Ÿ”ง **ๆจ™ๆบ–ไฟๅ›บ๏ผš**\n- ้›ปๅญ็”ขๅ“๏ผš1ๅนด\n- ๅฎถ้›ป็”ขๅ“๏ผš2ๅนด\n- ้…ไปถ่€—ๆ๏ผš6ๅ€‹ๆœˆ\n\n๐Ÿ“‹ **ไฟๅ›บๆถต่“‹๏ผš**\n- ่ฃฝ้€ ็ผบ้™ท\n- ้žไบบ็‚บๆๅฃž็š„ๅŠŸ่ƒฝๆ•…้šœ\n- ๆๆ–™ๅ“่ณชๅ•้กŒ\n\nโŒ **ไธๆถต่“‹๏ผš**\n- ไบบ็‚บๆๅฃž๏ผˆๆ‘”่ฝใ€้€ฒๆฐด็ญ‰๏ผ‰\n- ๆœช็ถ“ๆŽˆๆฌŠ็š„ๆ‹†่ฃๆˆ–ๆ”น่ฃ\n- ๆญฃๅธธ็ฃจๆๆถˆ่€—\n\nๅ ฑไฟฎๆ–นๅผ๏ผšๆ’ฅๆ‰“ๅฎขๆœ้›ป่ฉฑ 400-888-9999 ๆˆ–ๅœจๅฎ˜็ถฒๆไบค็ถญไฟฎ็”ณ่ซ‹ใ€‚"},
        {"q": "ๆ€Ž้บผไฟฎๆ”นๆ”ถ่ฒจๅœฐๅ€๏ผŸ่จ‚ๅ–ฎๅทฒ็ถ“ไป˜ๆฌพไบ†ใ€‚", "a": "ๅฏไปฅไฟฎๆ”น๏ผไฝ†้œ€่ฆๆณจๆ„่จ‚ๅ–ฎ็‹€ๆ…‹๏ผš\n\nโœ… **ๅฏไฟฎๆ”น๏ผš** ่จ‚ๅ–ฎ็‹€ๆ…‹็‚บใ€Œๅพ…็™ผ่ฒจใ€\nโ†’ ๅœจใ€Œๆˆ‘็š„่จ‚ๅ–ฎใ€ไธญ็›ดๆŽฅไฟฎๆ”นๅœฐๅ€\n\nโš ๏ธ **้œ€่ฏ็นซๅฎขๆœ๏ผš** ่จ‚ๅ–ฎ็‹€ๆ…‹็‚บใ€Œๅทฒ็™ผ่ฒจใ€\nโ†’ ๅฆ‚ๆžœๅฟซ้ž้‚„ๅœจๅˆ†ๆ€ไธญๅฟƒ๏ผŒๆœ‰ๅฏ่ƒฝๅฏไปฅๆ””ๆˆชไฟฎๆ”น\nโ†’ ่ซ‹ๆไพ›๏ผš่จ‚ๅ–ฎ่™Ÿ + ๆ–ฐๅœฐๅ€ + ๆ”ถไปถไบบ้›ป่ฉฑ\n\nโŒ **็„กๆณ•ไฟฎๆ”น๏ผš** ่จ‚ๅ–ฎๅทฒๅœจๆดพ้€ไธญ\nโ†’ ๅปบ่ญฐ่ฏ็นซๅฟซ้žๅ“กๅ”ๅ•†้€ๅˆฐๆ–ฐๅœฐๅ€๏ผŒๆˆ–ๆ‹’ๆ”ถๅพŒ้‡ๆ–ฐไธ‹ๅ–ฎ\n\n่ซ‹ๅ‘Š่จดๆˆ‘ๆ‚จ็š„่จ‚ๅ–ฎ่™Ÿ๏ผŒๆˆ‘ไพ†ๅนซๆ‚จๆŸฅ่ฉข็›ฎๅ‰็‹€ๆ…‹ใ€‚"},
        # Accounts
        {"q": "ๅฟ˜่จ˜ๅฏ†็ขผๆ€Ž้บผ่พฆ๏ผŸ", "a": "้‡่จญๅฏ†็ขผๅพˆ็ฐกๅ–ฎ๏ผš\n\n**ๆ–นๆณ•ไธ€๏ผš็ทšไธŠ้‡่จญ**\n1. ้ปžๆ“Š็™ปๅ…ฅ้ ้ข็š„ใ€Œๅฟ˜่จ˜ๅฏ†็ขผใ€\n2. ่ผธๅ…ฅ่จปๅ†Šๆ™‚็š„ๆ‰‹ๆฉŸ่™Ÿ็ขผๆˆ–้ƒต็ฎฑ\n3. ๆŽฅๆ”ถ้ฉ—่ญ‰็ขผ\n4. ่จญๅฎšๆ–ฐๅฏ†็ขผ๏ผˆ่‡ณๅฐ‘8ไฝ๏ผŒ้œ€ๅŒ…ๅซๅญ—ๆฏๅ’Œๆ•ธๅญ—๏ผ‰\n\n**ๆ–นๆณ•ไบŒ๏ผšๅฎขๆœๅ”ๅŠฉ**\nๅฆ‚ๆžœๆ‚จ็š„ๆ‰‹ๆฉŸ่™Ÿ็ขผ/้ƒต็ฎฑๅทฒๆ›ดๆ›๏ผš\n1. ๆบ–ๅ‚™่บซๅˆ†่ญ‰ๆ˜Žๆ–‡ไปถ\n2. ๆ’ฅๆ‰“ๅฎขๆœ้›ป่ฉฑ 400-888-9999\n3. ๆˆ‘ๅ€‘้ฉ—่ญ‰่บซๅˆ†ๅพŒ็‚บๆ‚จ้‡่จญ\n\n๐Ÿ’ก ๅปบ่ญฐ่จญๅฎšๅพŒ้–‹ๅ•Ÿ้›™้‡้ฉ—่ญ‰๏ผŒๆ›ดๅฎ‰ๅ…จ๏ผ"},
        {"q": "ๆ€Ž้บผ้–‹็™ผ็ฅจ๏ผŸ", "a": "้–‹็ซ‹็™ผ็ฅจๆœ‰ไปฅไธ‹ๆ–นๅผ๏ผš\n\n**้›ปๅญ็™ผ็ฅจ๏ผˆๆŽจ่–ฆ๏ผ‰๏ผš**\n1. ้€ฒๅ…ฅใ€Œๆˆ‘็š„่จ‚ๅ–ฎใ€โ†’ ้ธๆ“‡ๅทฒๅฎŒๆˆ็š„่จ‚ๅ–ฎ\n2. ้ปžๆ“Šใ€Œ็”ณ่ซ‹็™ผ็ฅจใ€\n3. ้ธๆ“‡็™ผ็ฅจ้กžๅž‹๏ผˆๅ€‹ไบบ/ๅ…ฌๅธ๏ผ‰\n4. ๅกซๅฏซๆŠฌ้ ญๅ’Œ็ตฑไธ€็ทจ่™Ÿ\n5. ้›ปๅญ็™ผ็ฅจๆœƒๅœจ1-3ๅ€‹ๅทฅไฝœๆ—ฅๅ…ง็™ผ้€ๅˆฐๆ‚จ็š„ไฟก็ฎฑ\n\n**็ด™่ณช็™ผ็ฅจ๏ผš**\n- ้œ€้กๅค–็”ณ่ซ‹๏ผŒ้ƒตๅฏ„ๆ™‚้–“็ด„5-7ๅ€‹ๅทฅไฝœๆ—ฅ\n- ไธ‹ๅ–ฎๆ™‚ๅœจๅ‚™่จปๆฌ„่จปๆ˜Ž้œ€่ฆ็ด™่ณช็™ผ็ฅจ\n\n๐Ÿ“ ๆณจๆ„๏ผš่จ‚ๅ–ฎๅฎŒๆˆๅพŒ30ๅคฉๅ…งๅฏ็”ณ่ซ‹็™ผ็ฅจใ€‚"},
        # Products
        {"q": "XR-500 ๅ’Œ XR-700 ๆœ‰ไป€้บผๅทฎๅˆฅ๏ผŸ่ฉฒ่ฒทๅ“ชๅ€‹๏ผŸ", "a": "ไปฅไธ‹ๆ˜ฏๅ…ฉๆฌพ็”ขๅ“็š„่ฉณ็ดฐๆฏ”่ผƒ๏ผš\n\n| ่ฆๆ ผ | XR-500 | XR-700 |\n|------|--------|--------|\n| ้ฉ็”จ้ข็ฉ | 20-30ๅช | 35-50ๅช |\n| CADRๅ€ผ | 500 mยณ/h | 700 mยณ/h |\n| ๅ™ช้Ÿณ | 30-52 dB | 28-48 dB |\n| ๆฟพ็ถฒๅฃฝๅ‘ฝ | 6ๅ€‹ๆœˆ | 12ๅ€‹ๆœˆ |\n| ๆ™บ่ƒฝๅŠŸ่ƒฝ | Wi-FiๆŽงๅˆถ | Wi-Fi + AIๆ„Ÿ็Ÿฅ |\n| ๅƒนๆ ผ | ยฅ3,299 | ยฅ5,499 |\n\n**่ณผ่ฒทๅปบ่ญฐ๏ผš**\n- ๐Ÿ  ไธ€่ˆฌๅฎถๅบญ๏ผˆ30ๅชไปฅๅ…ง๏ผ‰โ†’ XR-500 ๆ€งๅƒนๆฏ”ๆ›ด้ซ˜\n- ๐Ÿข ๅคง็ฉบ้–“/ๅฐๅ™ช้Ÿณๆ•ๆ„Ÿ โ†’ XR-700 ๆ›ดๅฎ‰้œใ€ๆ›ดๅผทๆ•ˆ\n\nๅ…ฉๆฌพ้ƒฝๆ”ฏๆŒ7ๅคฉ็„ก็†็”ฑ้€€่ฒจ๏ผŒๅฏไปฅๆ”พๅฟƒ่ณผ่ฒท่ฉฆ็”จใ€‚"},
    ]

    # Wrap every Q/A pair in the FAQ system prompt.
    data = [
        format_messages(SYSTEM_PROMPTS["faq"], ex["q"], ex["a"])
        for ex in examples
    ]

    print(f" โœ… Synthetic FAQ โ€” {len(data)} ๆข")
    return Dataset.from_list(data)
265
+
266
+
267
def create_synthetic_ticket_examples():
    """Build hand-written ticket classification / routing examples.

    Each example maps a customer message to one of eight handling
    departments plus a short justification and a priority hint.
    """
    print("๐Ÿ“ฆ Creating synthetic ticket classification examples...")

    # Department name -> short description of what it handles.
    TICKET_CATEGORIES = {
        "ๅ”ฎๅพŒๆœๅ‹™": "ๅ•†ๅ“้€€ๆ›่ฒจใ€็ถญไฟฎใ€ไฟๅ›บๅ•้กŒ",
        "็‰ฉๆต้…้€": "็‰ฉๆตๆŸฅ่ฉขใ€ๅปถ้ฒใ€ไธŸไปถใ€ๅœฐๅ€ไฟฎๆ”น",
        "ๅธณ่™Ÿๅ•้กŒ": "็™ปๅ…ฅใ€ๅฏ†็ขผใ€ๅฎ‰ๅ…จใ€ๅ€‹ไบบ่ณ‡ๆ–™",
        "ไป˜ๆฌพ่ฒกๅ‹™": "ไป˜ๆฌพๅคฑๆ•—ใ€้€€ๆฌพใ€็™ผ็ฅจใ€ๅธณๅ–ฎ",
        "็”ขๅ“่ซฎ่ฉข": "็”ขๅ“่ฆๆ ผใ€้ธ่ณผๅปบ่ญฐใ€ๅบซๅญ˜ๆŸฅ่ฉข",
        "ๆŠ•่จดๅปบ่ญฐ": "ๆœๅ‹™ๆ…‹ๅบฆใ€ๅ“่ณชๆŠ•่จดใ€ๆ”นๅ–„ๅปบ่ญฐ",
        "ๆŠ€่ก“ๆ”ฏๆด": "็”ขๅ“ไฝฟ็”จๅ•้กŒใ€ๆ•…้šœๆŽ’้™คใ€่ปŸ้ซ”ๆ›ดๆ–ฐ",
        "ๅˆไฝœๆดฝ่ซ‡": "ๅ•†ๅ‹™ๅˆไฝœใ€ๆ‰น้‡ๆŽก่ณผใ€ไปฃ็†ๅŠ ็›Ÿ",
    }

    # (customer message, gold department, justification)
    examples = [
        ("ๆˆ‘ไธŠๅ€‹ๆœˆ่ฒท็š„ๆด—่กฃๆฉŸๆผๆฐดไบ†๏ผŒ้‚„ๅœจไฟๅ›บๆœŸๅ…ง๏ผŒๆ€Ž้บผๅ ฑไฟฎ๏ผŸ", "ๅ”ฎๅพŒๆœๅ‹™", "ๅฎขๆˆถๅๆ˜ ็”ขๅ“ๅœจไฟๅ›บๆœŸๅ…งๅ‡บ็พๆ•…้šœ๏ผˆๆด—่กฃๆฉŸๆผๆฐด๏ผ‰๏ผŒๅฑฌๆ–ผ็ถญไฟฎไฟๅ›บ็ฏ„็–‡ใ€‚"),
        ("ๅฟซ้žๅทฒ็ถ“5ๅคฉไบ†้‚„ๆฒ’ๅˆฐ๏ผŒ็‰ฉๆต่ณ‡่จŠ3ๅคฉๆฒ’ๆ›ดๆ–ฐใ€‚", "็‰ฉๆต้…้€", "ๅฎขๆˆถๅๆ˜ ็‰ฉๆต่ถ…ๆ™‚ไธ”่ฟฝ่นค่ณ‡่จŠๅœๆปฏ๏ผŒๅฑฌๆ–ผ็‰ฉๆต็•ฐๅธธๅ•้กŒใ€‚"),
        ("ๆˆ‘ไธ€็›ดๆ”ถๅˆฐ็™ปๅ…ฅ็•ฐๅธธ็š„้€š็Ÿฅ๏ผŒไฝ†ๆˆ‘ๆฒ’ๆœ‰ๅœจๅ…ถไป–ๅœฐๆ–น็™ปๅ…ฅ้Žใ€‚", "ๅธณ่™Ÿๅ•้กŒ", "ๅฎขๆˆถๅธณ่™Ÿๅฏ่ƒฝๅญ˜ๅœจๅฎ‰ๅ…จ้ขจ้šช๏ผˆ็–‘ไผผ่ขซ็›œ๏ผ‰๏ผŒ้œ€่ฆๅฎ‰ๅ…จๅœ˜้šŠ่™•็†ใ€‚"),
        ("ไธŠๆฌก้€€่ฒจ็š„้€€ๆฌพไธ€็›ดๆฒ’ๆ”ถๅˆฐ๏ผŒๅทฒ็ถ“่ถ…้Ž7ๅคฉไบ†ใ€‚", "ไป˜ๆฌพ่ฒกๅ‹™", "้€€ๆฌพ้€พๆœŸๆœชๅˆฐๅธณ๏ผŒๅฑฌๆ–ผ่ฒกๅ‹™้€€ๆฌพๅ•้กŒใ€‚"),
        ("ๆƒณๅ•ไธ€ไธ‹ไฝ ๅ€‘็š„ๆ™บ่ƒฝๆ‰‹้Œถๆ”ฏไธๆ”ฏๆŒๆธธๆณณๆ™‚ไฝฟ็”จ๏ผŸ้˜ฒๆฐด็ญ‰็ดšๆ˜ฏๅคšๅฐ‘๏ผŸ", "็”ขๅ“่ซฎ่ฉข", "ๅฎขๆˆถ่ฉขๅ•็”ขๅ“่ฆๆ ผ๏ผˆ้˜ฒๆฐด็ญ‰็ดš๏ผ‰๏ผŒๅฑฌๆ–ผๅ”ฎๅ‰่ซฎ่ฉขใ€‚"),
        ("ไฝ ๅ€‘็š„ๅฎขๆœๆ…‹ๅบฆๅคชๅทฎไบ†๏ผไธŠๆฌกๆ‰“้›ป่ฉฑ้Žไพ†่ขซๆŽ›ไบ†ไธ‰ๆฌก๏ผ", "ๆŠ•่จดๅปบ่ญฐ", "ๅฎขๆˆถๆŠ•่จดๅฎขๆœๆœๅ‹™ๆ…‹ๅบฆ๏ผŒๅฑฌๆ–ผๆœๅ‹™ๅ“่ณชๆŠ•่จด๏ผŒ้œ€่ฆๅ„ชๅ…ˆ่™•็†ใ€‚"),
        ("ๆ–ฐ่ฒท็š„ๅนณๆฟ้›ป่…ฆ้€ฃไธไธŠWiFi๏ผŒ่ฉฆไบ†้‡้–‹ๆฉŸ้‚„ๆ˜ฏไธ่กŒใ€‚", "ๆŠ€่ก“ๆ”ฏๆด", "็”ขๅ“ๆŠ€่ก“ๅ•้กŒ๏ผˆWiFi้€ฃ็ทšๆ•…้šœ๏ผ‰๏ผŒ้œ€่ฆๆŠ€่ก“ไบบๅ“กๅ”ๅŠฉๆŽ’ๆŸฅใ€‚"),
        ("ๆˆ‘ๅ€‘ๅ…ฌๅธๆƒณๆŽก่ณผ200ๅฐไฝ ๅ€‘็š„็ฉบๆฐฃๆธ…ๆทจๆฉŸ๏ผŒๆœ‰ๅœ˜่ณผๅƒนๅ—Ž๏ผŸ", "ๅˆไฝœๆดฝ่ซ‡", "ไผๆฅญๅฎขๆˆถ็š„ๆ‰น้‡ๆŽก่ณผ้œ€ๆฑ‚๏ผŒ้œ€่ฝ‰่‡ณๅ•†ๅ‹™้ƒจ้–€ใ€‚"),
        ("ๆˆ‘ไป˜ๆฌพ็š„ๆ™‚ๅ€™ไธ€็›ด้กฏ็คบไป˜ๆฌพๅคฑๆ•—๏ผŒ้ค˜้กๆ˜ฏๅค ็š„ใ€‚", "ไป˜ๆฌพ่ฒกๅ‹™", "ไป˜ๆฌพ็•ฐๅธธๅ•้กŒ๏ผŒๅฏ่ƒฝๆถ‰ๅŠๆ”ฏไป˜้€š้“ๆˆ–็ณป็ตฑๅ•้กŒใ€‚"),
        ("APPๆ›ดๆ–ฐไน‹ๅพŒไธ€็›ด้–ƒ้€€๏ผŒๆ‰‹ๆฉŸๆ˜ฏiPhone 15ใ€‚", "ๆŠ€่ก“ๆ”ฏๆด", "่ปŸ้ซ”็›ธๅฎนๆ€งๅ•้กŒ๏ผˆAPP้–ƒ้€€๏ผ‰๏ผŒ้œ€่ฆๆŠ€่ก“ๆŽ’ๆŸฅใ€‚"),
        ("ๆˆ‘ๆƒณๅ–ๆถˆ่จ‚ๅ–ฎ๏ผŒๅ•†ๅ“้‚„ๆฒ’็™ผ่ฒจใ€‚", "ๅ”ฎๅพŒๆœๅ‹™", "ๅฎขๆˆถ่ฆๆฑ‚ๅ–ๆถˆๆœช็™ผ่ฒจ่จ‚ๅ–ฎ๏ผŒๅฑฌๆ–ผๅ”ฎๅพŒ่™•็†ใ€‚"),
        ("ไฝ ๅ€‘ๆœ‰ๆฒ’ๆœ‰ๅœจๆ‹›ๅœฐๅ€็ถ“้Šทๅ•†๏ผŸ", "ๅˆไฝœๆดฝ่ซ‡", "ไปฃ็†ๅŠ ็›Ÿ่ซฎ่ฉข๏ผŒ้œ€่ฝ‰่‡ณๆธ ้“ๆ‹“ๅฑ•้ƒจ้–€ใ€‚"),
    ]

    # FIX: the department menu is loop-invariant -- build it once instead of
    # re-concatenating it per example; also drops the unused `cat_desc` local
    # that the original computed and never read.
    menu = "".join(f"- {cat}๏ผš{desc}\n" for cat, desc in TICKET_CATEGORIES.items())

    data = []
    for text, category, reason in examples:
        user_msg = (
            f"่ซ‹ๅฐ‡ไปฅไธ‹ๅฎขๆˆถ่จŠๆฏๅˆ†้กžๅˆฐๅˆ้ฉ็š„่™•็†้ƒจ้–€ใ€‚\n\nๅฏ้ธ้ƒจ้–€๏ผš\n"
            f"{menu}"
            f"\nๅฎขๆˆถ่จŠๆฏ๏ผš{text}\n\n่ซ‹่ผธๅ‡บๅˆ†้กž็ตๆžœๅ’Œ็†็”ฑใ€‚"
        )
        # Complaints and account-security tickets are flagged high priority.
        priority = '้ซ˜' if category in ['ๆŠ•่จดๅปบ่ญฐ', 'ๅธณ่™Ÿๅ•้กŒ'] else 'ไธญ'
        assistant_msg = f"ๅˆ†้กž็ตๆžœ๏ผš{category}\n\n็†็”ฑ๏ผš{reason}\n\nๅปบ่ญฐ่™•็†ๅ„ชๅ…ˆ็ดš๏ผš{priority}"
        data.append(format_messages(SYSTEM_PROMPTS["classify"], user_msg, assistant_msg))

    print(f" โœ… Synthetic Tickets โ€” {len(data)} ๆข")
    return Dataset.from_list(data)
309
+
310
+
311
def build_dataset():
    """Assemble, concatenate and shuffle the combined multi-task SFT dataset."""
    print("\n" + "="*60)
    print("๐Ÿ”จ Building multi-task training dataset")
    print("="*60 + "\n")

    # Remote loaders may return None when nothing was collected; the
    # synthetic builders always return a (non-empty) Dataset.
    candidates = (
        load_firefly_data(max_per_task=5000),   # Firefly: IE + QA + FAQ (15K total)
        load_cmrc_data(max_samples=10000),      # CMRC2018: document QA
        load_tnews_data(max_samples=10000),     # TNEWS: classification
        load_belle_data(max_samples=10000),     # BELLE: general FAQ + instructions
        create_synthetic_ie_examples(),         # hand-written IE examples
        create_synthetic_faq_examples(),        # hand-written enterprise FAQ
        create_synthetic_ticket_examples(),     # hand-written ticket routing
    )
    parts = [ds for ds in candidates if ds]

    # Merge everything and shuffle deterministically.
    combined = concatenate_datasets(parts).shuffle(seed=SEED)

    print(f"\n๐Ÿ“Š Total training examples: {len(combined)}")
    print(f" Sample messages format: {combined[0]['messages'][:1]}")

    return combined
359
+
360
+
361
def main():
    """End-to-end QLoRA SFT run: build data, load model, train, push to Hub."""
    print("๐Ÿš€ Enterprise Multi-Task LLM Training")
    print(f" Model: {MODEL_ID}")
    print(f" Output: {HUB_MODEL_ID}")
    print(f" Max Seq Length: {MAX_SEQ_LENGTH}")
    print()

    # 1. Assemble the multi-task dataset.
    train_dataset = build_dataset()

    # 2. Optional experiment tracking -- a failure here must not kill the run.
    try:
        import trackio
        trackio.init(
            project="enterprise-llm",
            name="qwen25-7b-multitask-sft",
            config={
                "model": MODEL_ID,
                "method": "QLoRA-SFT",
                "tasks": "FAQ,DocQA,Classification,IE",
                "dataset_size": len(train_dataset),
                "max_seq_length": MAX_SEQ_LENGTH,
            }
        )
        print("๐Ÿ“Š Trackio monitoring initialized")
    except Exception as exc:
        print(f"โš ๏ธ Trackio init failed (non-fatal): {exc}")

    # 3. Tokenizer: fall back to EOS as pad token, pad on the right.
    print("\n๐Ÿ“ฆ Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 4. QLoRA quantization recipe: 4-bit NF4 with double quantization.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    print("๐Ÿ“ฆ Loading model with 4-bit quantization...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    model = prepare_model_for_kbit_training(model)
    print(f" Model loaded: {model.dtype}, device: {model.device}")

    # 5. LoRA adapters on every linear layer.
    adapter_config = LoraConfig(
        r=64,
        lora_alpha=128,
        target_modules="all-linear",
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # 6. SFT hyperparameters.
    sft_args = SFTConfig(
        output_dir=OUTPUT_DIR,
        hub_model_id=HUB_MODEL_ID,
        push_to_hub=True,
        # Optimization
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,  # effective batch = 16
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        weight_decay=0.01,
        max_grad_norm=1.0,
        # Sequence handling
        max_length=MAX_SEQ_LENGTH,
        packing=False,
        # Memory optimization
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        optim="paged_adamw_8bit",
        bf16=True,
        # Logging
        logging_steps=10,
        logging_first_step=True,
        logging_strategy="steps",
        disable_tqdm=True,
        report_to="none",
        # Checkpointing
        save_strategy="steps",
        save_steps=500,
        save_total_limit=3,
        # Misc
        dataloader_num_workers=4,
        seed=SEED,
        remove_unused_columns=True,
    )

    print("\n๐Ÿ‹๏ธ Initializing SFTTrainer...")
    trainer = SFTTrainer(
        model=model,
        args=sft_args,
        train_dataset=train_dataset,
        peft_config=adapter_config,
        processing_class=tokenizer,
    )

    # Report the adapter footprint relative to the full model.
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f" Trainable: {trainable_params:,} / {total_params:,} ({100*trainable_params/total_params:.2f}%)")

    print("\n๐Ÿš€ Starting training...")
    train_result = trainer.train()

    print("\n๐Ÿ’พ Saving model...")
    trainer.save_model()

    # Persist the training metrics alongside the checkpoint.
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)

    print("\n๐Ÿ“ค Pushing to Hub...")
    trainer.push_to_hub(commit_message="Multi-task enterprise LLM: FAQ + DocQA + Classification + IE")

    # The tokenizer ships alongside the adapter weights.
    tokenizer.push_to_hub(HUB_MODEL_ID)

    print("\n" + "="*60)
    print("โœ… Training complete!")
    print(f" Model: https://huggingface.co/{HUB_MODEL_ID}")
    print(f" Metrics: {json.dumps(metrics, indent=2)}")
    print("="*60)


if __name__ == "__main__":
    main()