| |
| """ |
| SFT Data Preparation v2 for Multilingual 3B GPT |
| |
| Data sources: |
| 1. HebrewGPT SFT v3 — 27K Hebrew instruction samples from our prior work (S3) |
| 2. HebrewGPT individual datasets — alpaca_hebrew, chat, dolly, QA, summarization, etc. (S3) |
| 3. Aya Dataset — human-annotated instructions (en, ar, fa) |
| 4. arbml/alpaca_arabic — 52K Arabic alpaca-style instructions |
| 5. FreedomIntelligence/alpaca-gpt4-arabic — 50K Arabic GPT-4 instructions |
| 6. tatsu-lab/alpaca — 52K English instructions |
| 7. databricks/databricks-dolly-15k — diverse English instructions |
| |
| Output: tokenized binary data for SFT training. |
| """ |
|
|
| import os, sys, json, argparse, random |
| from collections import defaultdict |
| sys.stdout.reconfigure(line_buffering=True) |
|
|
# Placeholders for the heavy third-party modules; ensure_imports() rebinds
# them to the real modules the first time a loader needs them.
datasets_mod = None
spm = None
np = None


def ensure_imports():
    """Import datasets, sentencepiece and numpy on first use.

    Binds the module-level names ``datasets_mod``, ``spm`` and ``np`` to the
    imported modules. Once ``datasets_mod`` is set, later calls return
    immediately without importing anything.
    """
    global datasets_mod, spm, np
    if datasets_mod is not None:
        return

    # Import everything first and bind afterwards, so a failed import leaves
    # all three placeholders untouched.
    import datasets
    import sentencepiece
    import numpy

    datasets_mod, spm, np = datasets, sentencepiece, numpy
|
|
| |
# Role markers and the separator that closes each turn in the flattened
# prompt text used for SFT.
USER_PREFIX = "### User:\n"
ASSISTANT_PREFIX = "### Assistant:\n"
TURN_END = "\n\n"


def format_instruction(instruction, response, input_text=None):
    """Render one instruction/response pair as a single training string.

    Args:
        instruction: The user instruction.
        response: The assistant answer.
        input_text: Optional extra context. When it contains non-whitespace
            characters it is appended (unstripped) to the instruction after
            a blank line; otherwise it is ignored.

    Returns:
        ``USER_PREFIX + user text + TURN_END + ASSISTANT_PREFIX + response
        + TURN_END``.
    """
    has_context = bool(input_text) and bool(input_text.strip())
    prompt = f"{instruction}\n\n{input_text}" if has_context else instruction

    user_turn = f"{USER_PREFIX}{prompt}{TURN_END}"
    assistant_turn = f"{ASSISTANT_PREFIX}{response}{TURN_END}"
    return user_turn + assistant_turn
|
|
|
|
def load_aya_multilingual(max_per_lang=5000):
    """Load English, Arabic and Farsi samples from the Aya dataset.

    Rows are selected by their ``language_code`` field: ``eng`` maps to
    ``en``; ``arb``, ``ary``, ``arz``, ``ars`` and ``apc`` all map to ``ar``;
    ``pes`` maps to ``fa``. Rows with any other code are dropped. Each
    language's rows are shuffled with the module-level ``random`` state and
    at most ``max_per_lang`` of them are kept.

    Args:
        max_per_lang: Upper bound on samples kept per target language.

    Returns:
        A list of dicts with keys ``text``, ``lang`` and ``source``
        (always ``'aya'``).
    """
    ensure_imports()
    print("Loading Aya Dataset (using language_code field)...")

    # Aya language_code -> target language tag used by this pipeline.
    lang_of = {'eng': 'en'}
    lang_of.update(dict.fromkeys(('arb', 'ary', 'arz', 'ars', 'apc'), 'ar'))
    lang_of['pes'] = 'fa'

    dataset = datasets_mod.load_dataset("CohereForAI/aya_dataset", split="train")

    # Group the wanted rows by target language, in order of first appearance.
    grouped = {}
    for row in dataset:
        target = lang_of.get(row['language_code'])
        if not target:
            continue
        grouped.setdefault(target, []).append(row)

    collected = []
    for lang, rows in grouped.items():
        random.shuffle(rows)
        picked = rows[:max_per_lang]
        collected.extend(
            {
                'text': format_instruction(row['inputs'], row['targets']),
                'lang': lang,
                'source': 'aya',
            }
            for row in picked
        )
        print(f" Aya [{lang}]: {len(picked)} samples (from {len(rows)} available)")

    return collected
|
|
|
|
def load_arabic_alpaca(max_samples=5000):
    """Load a random subset of arbml/alpaca_arabic as Arabic samples.

    Row indices are shuffled with the module-level ``random`` state and the
    first ``max_samples`` are visited. Rows whose stripped ``instruction`` or
    ``output`` is empty are counted as skipped, so fewer than ``max_samples``
    samples may be returned.

    Args:
        max_samples: Number of rows to draw before filtering.

    Returns:
        A list of dicts with keys ``text``, ``lang`` (``'ar'``) and
        ``source`` (``'alpaca_arabic'``); an empty list if anything raises.
    """
    ensure_imports()
    print("Loading arbml/alpaca_arabic...")

    try:
        dataset = datasets_mod.load_dataset("arbml/alpaca_arabic", split="train")
        order = list(range(len(dataset)))
        random.shuffle(order)

        kept = []
        n_empty = 0
        for idx in order[:max_samples]:
            row = dataset[idx]
            instruction, output, extra = (
                row.get(field, '').strip()
                for field in ('instruction', 'output', 'input')
            )
            if instruction and output:
                kept.append({
                    'text': format_instruction(instruction, output, extra),
                    'lang': 'ar',
                    'source': 'alpaca_arabic',
                })
            else:
                n_empty += 1
        print(f" alpaca_arabic: {len(kept)} samples (skipped {n_empty} empty)")
        return kept
    except Exception as e:
        # Best-effort source: report the failure and continue without it.
        print(f" Warning: Could not load alpaca_arabic: {e}")
        return []
|
|
|
|
def load_arabic_gpt4(max_samples=5000):
    """Load FreedomIntelligence/alpaca-gpt4-arabic — GPT-4 generated Arabic.

    Row indices are shuffled with the module-level ``random`` state and the
    first ``max_samples`` conversations are visited. Within a conversation,
    each ``human`` turn is paired with the next ``gpt`` turn; a pair is kept
    only when both stripped values are non-empty. A ``gpt`` turn always
    consumes the pending ``human`` turn, so an empty reply is never replaced
    by a later, unrelated one.

    Args:
        max_samples: Number of conversations to draw, and also the cap on
            the number of samples returned (a conversation may contribute
            more than one pair).

    Returns:
        A list of at most ``max_samples`` dicts with keys ``text``, ``lang``
        (``'ar'``) and ``source`` (``'alpaca_gpt4_arabic'``); an empty list
        if the dataset cannot be loaded.
    """
    ensure_imports()
    print("Loading FreedomIntelligence/alpaca-gpt4-arabic...")

    try:
        ds = datasets_mod.load_dataset("FreedomIntelligence/alpaca-gpt4-arabic", split="train")
        indices = list(range(len(ds)))
        random.shuffle(indices)
        indices = indices[:max_samples]

        samples = []
        skipped = 0
        for i in indices:
            convs = ds[i].get('conversations') or []
            n_before = len(samples)

            human = None
            for c in convs:
                # .get() with fallbacks: one turn with a missing or None
                # field must not abort the whole dataset via the broad
                # except below.
                role = c.get('from')
                value = (c.get('value') or '').strip()
                if role == 'human':
                    human = value
                elif role == 'gpt' and human:
                    if value:
                        samples.append({
                            'text': format_instruction(human, value),
                            'lang': 'ar',
                            'source': 'alpaca_gpt4_arabic',
                        })
                    human = None

            # Count every conversation that produced no usable pair, not
            # only those with fewer than two turns.
            if len(samples) == n_before:
                skipped += 1

        # Truncate before logging so the reported count matches what is
        # actually returned.
        samples = samples[:max_samples]
        print(f" alpaca-gpt4-arabic: {len(samples)} samples (skipped {skipped} empty)")
        return samples
    except Exception as e:
        # Best-effort source: report the failure and continue without it.
        print(f" Warning: Could not load alpaca-gpt4-arabic: {e}")
        return []
|
|
|
|
def load_english_alpaca(max_samples=5000):
    """Load a random subset of tatsu-lab/alpaca as English samples.

    Row indices are shuffled with the module-level ``random`` state and the
    first ``max_samples`` are visited; rows whose stripped ``instruction``
    or ``output`` is empty are dropped silently.

    Args:
        max_samples: Number of rows to draw before filtering.

    Returns:
        A list of dicts with keys ``text``, ``lang`` (``'en'``) and
        ``source`` (``'alpaca_en'``); an empty list if anything raises.
    """
    ensure_imports()
    print("Loading tatsu-lab/alpaca (English)...")

    try:
        data = datasets_mod.load_dataset("tatsu-lab/alpaca", split="train")
        shuffled = list(range(len(data)))
        random.shuffle(shuffled)

        records = []
        for position in shuffled[:max_samples]:
            entry = data[position]
            prompt = entry.get('instruction', '').strip()
            answer = entry.get('output', '').strip()
            context = entry.get('input', '').strip()
            if prompt and answer:
                records.append(dict(
                    text=format_instruction(prompt, answer, context),
                    lang='en',
                    source='alpaca_en',
                ))
        print(f" alpaca_en: {len(records)} samples")
        return records
    except Exception as e:
        # Best-effort source: report the failure and continue without it.
        print(f" Warning: Could not load alpaca: {e}")
        return []
|
|
|
|
def _str_field(record, key):
    """Return ``record[key]`` stripped when it is a string, otherwise ``''``."""
    value = record.get(key) if isinstance(record, dict) else None
    return value.strip() if isinstance(value, str) else ''


def load_hebrew_sft(data_dir, max_samples=10000):
    """Load Hebrew instruction data from JSONL files (HebrewGPT project).

    Every ``*.jsonl`` file directly inside ``data_dir`` is read line by line
    as UTF-8. Two record layouts are recognised:

    * ``{"messages": [...]}`` — the ``content`` of the first two messages is
      taken as the user and assistant text respectively.
    * ``{"instruction": ..., "input": ..., "output" | "response": ...}``.

    Records with an empty instruction/user text or empty answer are dropped.
    Lines that are not valid JSON objects are skipped and reported per file.

    Args:
        data_dir: Directory containing the JSONL files.
        max_samples: Cap on the number of returned samples; a falsy value
            disables the cap.

    Returns:
        A shuffled list of dicts with keys ``text``, ``lang`` (``'he'``) and
        ``source`` (``'hebrew_<file stem>'``).
    """
    import json as _json
    print(f"Loading Hebrew SFT data from {data_dir}...")

    all_samples = []

    # os.listdir() order is filesystem-dependent; sorting keeps the seeded
    # shuffle below reproducible across machines.
    for fname in sorted(os.listdir(data_dir)):
        if not fname.endswith('.jsonl'):
            continue
        filepath = os.path.join(data_dir, fname)
        source = f'hebrew_{fname.replace(".jsonl", "")}'
        count = 0
        malformed = 0
        # Explicit UTF-8: the locale default encoding may not decode Hebrew.
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    d = _json.loads(line)
                except ValueError:  # json.JSONDecodeError subclasses ValueError
                    malformed += 1
                    continue
                if not isinstance(d, dict):
                    malformed += 1
                    continue

                if 'messages' in d:
                    msgs = d['messages']
                    if not isinstance(msgs, list) or len(msgs) < 2:
                        continue
                    user_text = _str_field(msgs[0], 'content')
                    answer = _str_field(msgs[1], 'content')
                    extra = None
                elif 'instruction' in d:
                    user_text = _str_field(d, 'instruction')
                    extra = _str_field(d, 'input')
                    answer = _str_field(d, 'output') or _str_field(d, 'response')
                else:
                    continue

                if user_text and answer:
                    all_samples.append({
                        'text': format_instruction(user_text, answer, extra),
                        'lang': 'he',
                        'source': source,
                    })
                    count += 1

        if count > 0:
            print(f" {fname}: {count} samples")
        if malformed:
            print(f" Warning: {fname}: skipped {malformed} malformed lines")

    # Remember the pre-cap size so the summary reports it correctly.
    available = len(all_samples)
    random.shuffle(all_samples)
    if max_samples and available > max_samples:
        all_samples = all_samples[:max_samples]

    print(f" Total Hebrew: {len(all_samples)} samples (capped from {available} if needed)")
    return all_samples
|
|
|
|
def load_dolly(max_samples=3000):
    """Load a random subset of databricks/databricks-dolly-15k (English).

    Row indices are shuffled with the module-level ``random`` state and the
    first ``max_samples`` are visited. The row's ``context`` field is passed
    as the optional input text; rows whose stripped ``instruction`` or
    ``response`` is empty are dropped silently.

    Args:
        max_samples: Number of rows to draw before filtering.

    Returns:
        A list of dicts with keys ``text``, ``lang`` (``'en'``) and
        ``source`` (``'dolly'``); an empty list if anything raises.
    """
    ensure_imports()
    print("Loading databricks/databricks-dolly-15k (English)...")

    try:
        dolly = datasets_mod.load_dataset("databricks/databricks-dolly-15k", split="train")
        row_ids = list(range(len(dolly)))
        random.shuffle(row_ids)
        row_ids = row_ids[:max_samples]

        out = []
        for row_id in row_ids:
            row = dolly[row_id]
            question = row.get('instruction', '').strip()
            reply = row.get('response', '').strip()
            background = row.get('context', '').strip()
            if not (question and reply):
                continue
            sample = {
                'text': format_instruction(question, reply, background),
                'lang': 'en',
                'source': 'dolly',
            }
            out.append(sample)
        print(f" dolly: {len(out)} samples")
        return out
    except Exception as e:
        # Best-effort source: report the failure and continue without it.
        print(f" Warning: Could not load dolly: {e}")
        return []
|
|
|
|
def tokenize_and_save(samples, tokenizer_path, output_dir, val_ratio=0.05):
    """Tokenize samples and save as binary files.

    Shuffles ``samples`` in place, splits off a validation set, prints
    dataset statistics, and writes into ``output_dir``:

    * ``train_sft.bin`` / ``val_sft.bin`` — flat uint16 token ids, each
      sample followed by the tokenizer's EOS id;
    * ``sft_metadata.json`` — sample/token counts and provenance notes.

    Args:
        samples: List of dicts with keys ``text``, ``lang`` and ``source``.
        tokenizer_path: Path to the SentencePiece model file.
        output_dir: Directory to write into (created if missing).
        val_ratio: Fraction of samples used for validation. At least 100
            samples go to validation, but never more than exist.

    Raises:
        ValueError: If the tokenizer's ids cannot be stored as uint16
            (more than 65536 pieces) or it defines no EOS id.
    """
    ensure_imports()

    sp = spm.SentencePieceProcessor(tokenizer_path)

    # The .bin files are uint16; refuse tokenizers whose ids would not fit,
    # instead of wrapping around or failing deep inside numpy.
    id_limit = int(np.iinfo(np.uint16).max) + 1
    vocab_size = sp.get_piece_size()
    if vocab_size > id_limit:
        raise ValueError(
            f"Tokenizer has {vocab_size} pieces; ids do not fit in uint16 (max {id_limit})"
        )
    eos_id = sp.eos_id()
    if eos_id < 0:
        raise ValueError("Tokenizer defines no EOS id; cannot terminate samples")

    os.makedirs(output_dir, exist_ok=True)

    random.shuffle(samples)

    # Clamp so the reported validation size is the real one on tiny datasets.
    n_val = min(max(int(len(samples) * val_ratio), 100), len(samples))
    val_samples = samples[:n_val]
    train_samples = samples[n_val:]
    if samples and not train_samples:
        print(f" Warning: only {len(samples)} samples; all of them went to validation, train split is empty")

    source_counts = defaultdict(int)
    lang_counts = defaultdict(int)
    for s in samples:
        source_counts[s['source']] += 1
        lang_counts[s['lang']] += 1

    print(f"\n{'='*60}")
    print("DATASET VALIDATION")
    print(f"{'='*60}")
    print(f"Total samples: {len(samples)} ({len(train_samples)} train, {n_val} val)")
    print("\nBy source:")
    for src, cnt in sorted(source_counts.items(), key=lambda x: -x[1]):
        print(f" {src}: {cnt} ({cnt*100/len(samples):.1f}%)")
    print("\nBy language:")
    for lang, cnt in sorted(lang_counts.items(), key=lambda x: -x[1]):
        print(f" {lang}: {cnt} ({cnt*100/len(samples):.1f}%)")

    print("\n--- Sample validation ---")
    empty_count = 0
    short_count = 0
    for s in samples:
        text = s['text']
        if not text.strip():
            empty_count += 1
        elif len(text) < 20:
            short_count += 1
    print(f" Empty samples: {empty_count}")
    print(f" Very short (<20 chars): {short_count}")

    print("\n--- Random samples per language ---")
    by_lang = defaultdict(list)
    for s in samples:
        by_lang[s['lang']].append(s)
    for lang in sorted(by_lang.keys()):
        s = random.choice(by_lang[lang])
        text = s['text'][:200].replace('\n', '\\n')
        print(f"\n [{lang}] ({s['source']}): {text}...")

    print("\n--- Tokenization ---")
    total_tokens = 0
    # Per-language token totals are gathered in the same pass that builds
    # the output arrays, so every sample is tokenized only once.
    lang_tokens = defaultdict(int)
    for split_name, split_data in [('train', train_samples), ('val', val_samples)]:
        all_ids = []
        for s in split_data:
            ids = sp.encode(s['text'])
            ids.append(eos_id)
            all_ids.extend(ids)
            lang_tokens[s['lang']] += len(ids)

        arr = np.array(all_ids, dtype=np.uint16)
        filepath = os.path.join(output_dir, f'{split_name}_sft.bin')
        arr.tofile(filepath)
        total_tokens += len(arr)
        print(f" {split_name}: {len(arr):,} tokens → {filepath}")

    print("\n--- Token budget per language ---")
    for lang in sorted(by_lang.keys()):
        n_tokens = lang_tokens[lang]
        print(f" {lang}: {n_tokens:,} tokens ({n_tokens*100/total_tokens:.1f}%)")

    metadata = {
        'total_samples': len(samples),
        'train_samples': len(train_samples),
        'val_samples': n_val,
        'total_tokens': total_tokens,
        'source_counts': dict(source_counts),
        'lang_counts': dict(lang_counts),
        'format': 'USER_PREFIX + instruction + ASSISTANT_PREFIX + response',
        'tokenizer': os.path.basename(tokenizer_path),
        'data_sources': [
            'CohereForAI/aya_dataset (en, ar dialects, fa)',
            'arbml/alpaca_arabic',
            'FreedomIntelligence/alpaca-gpt4-arabic',
            'tatsu-lab/alpaca (en)',
            'databricks/databricks-dolly-15k (en)',
        ],
        'notes': 'Hebrew data from HebrewGPT project (S3). Arabic from Aya + alpaca. Farsi from Aya. English from Aya + alpaca + dolly.',
    }
    # ensure_ascii=False can emit non-ASCII text, so pin the file encoding.
    with open(os.path.join(output_dir, 'sft_metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    print(f"\nMetadata saved to {output_dir}/sft_metadata.json")

    print(f"\n{'='*60}")
    print("✅ SFT DATA PREPARATION COMPLETE")
    print(f"Total: {len(samples)} samples, {total_tokens:,} tokens")
    print(f"Languages: {dict(lang_counts)}")
    if 'he' not in lang_counts:
        print("⚠️ No Hebrew instruction data — Hebrew relies on cross-lingual transfer")
    print(f"{'='*60}")
|
|
|
|
def main():
    """Command-line entry point: gather all sources, then tokenize and save.

    Seeds ``random`` from ``--seed``, loads the Hebrew files (when
    ``--hebrew-dir`` exists) followed by the Aya, Arabic alpaca, Arabic
    GPT-4, English alpaca and Dolly loaders in that order, and hands the
    combined list to ``tokenize_and_save``. Exits with status 1 when no
    samples were collected.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--tokenizer', required=True)
    cli.add_argument('--output', default='/tmp/sft_data_v2')
    # Per-source sample limits, registered in the original order.
    for flag, limit in (
        ('--aya-per-lang', 5000),
        ('--arabic-alpaca', 5000),
        ('--arabic-gpt4', 5000),
        ('--english-alpaca', 5000),
        ('--dolly', 3000),
    ):
        cli.add_argument(flag, type=int, default=limit)
    cli.add_argument('--hebrew-dir', default='/tmp/hebrew_sft', help='Dir with Hebrew JSONL files from S3')
    cli.add_argument('--hebrew-max', type=int, default=10000)
    cli.add_argument('--seed', type=int, default=42)
    opts = cli.parse_args()

    random.seed(opts.seed)

    print("Preparing multilingual SFT data v2")
    print(f"Output: {opts.output}\n")

    collected = []

    if not os.path.isdir(opts.hebrew_dir):
        print(f"⚠️ Hebrew dir not found: {opts.hebrew_dir}")
    else:
        collected += load_hebrew_sft(opts.hebrew_dir, opts.hebrew_max)

    # Loader order matters: every loader draws from the shared random state.
    remote_sources = (
        (load_aya_multilingual, opts.aya_per_lang),
        (load_arabic_alpaca, opts.arabic_alpaca),
        (load_arabic_gpt4, opts.arabic_gpt4),
        (load_english_alpaca, opts.english_alpaca),
        (load_dolly, opts.dolly),
    )
    for loader, limit in remote_sources:
        collected += loader(limit)

    if not collected:
        print("ERROR: No samples collected!")
        sys.exit(1)

    tokenize_and_save(collected, opts.tokenizer, opts.output)


if __name__ == '__main__':
    main()
|
|