ronnengmail committed on
Commit
6aac7da
·
verified ·
1 Parent(s): bffe8cd

Upload sft_data/sft_metadata.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. sft_data/sft_metadata.json +40 -0
sft_data/sft_metadata.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_samples": 38512,
3
+ "train_samples": 36587,
4
+ "val_samples": 1925,
5
+ "total_tokens": 6329257,
6
+ "source_counts": {
7
+ "alpaca_arabic": 4991,
8
+ "aya": 10522,
9
+ "alpaca_gpt4_arabic": 5000,
10
+ "hebrew_sentiment_instruction": 3963,
11
+ "hebrew_heq_instruction": 1217,
12
+ "dolly": 3000,
13
+ "hebrew_sft_v3_combined": 1484,
14
+ "hebrew_hesum_instruction": 565,
15
+ "hebrew_translation_instruction": 2171,
16
+ "alpaca_en": 4999,
17
+ "hebrew_alpaca_hebrew": 261,
18
+ "hebrew_hebnli_instruction": 136,
19
+ "hebrew_dolly_hebrew": 123,
20
+ "hebrew_chat_hebrew": 56,
21
+ "hebrew_winograd_instruction": 14,
22
+ "hebrew_hebnli_extra_instruction": 10
23
+ },
24
+ "lang_counts": {
25
+ "ar": 14991,
26
+ "fa": 1578,
27
+ "he": 10000,
28
+ "en": 11943
29
+ },
30
+ "format": "USER_PREFIX + instruction + ASSISTANT_PREFIX + response",
31
+ "tokenizer": "multilingual_32k.model",
32
+ "data_sources": [
33
+ "CohereForAI/aya_dataset (en, ar dialects, fa)",
34
+ "arbml/alpaca_arabic",
35
+ "FreedomIntelligence/alpaca-gpt4-arabic",
36
+ "tatsu-lab/alpaca (en)",
37
+ "databricks/databricks-dolly-15k (en)"
38
+ ],
39
+ "notes": "Hebrew data from HebrewGPT project (S3). Arabic from Aya + alpaca. Farsi from Aya. English from Aya + alpaca + dolly."
40
+ }