Huayang committed on
Commit
313e0d2
·
verified ·
1 Parent(s): 5cafcd4

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. -/english.jsonl +0 -0
  2. -/number_reading.py +56 -0
  3. -/play.ipynb +25 -0
-/english.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
-/number_reading.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import datasets
3
+
4
+ _CITATION = """\
5
+ # (Optional) Add your citation here
6
+ """
7
+
8
+ _DESCRIPTION = """\
9
+ Number Reading
10
+ """
11
+ LANGUAGES = [
12
+ "english"
13
+ ]
14
class NumberReadingConfig(datasets.BuilderConfig):
    """Builder config whose name doubles as the task/language identifier.

    The ``task_name`` (e.g. ``"english"``) is used both as the config's
    public name for ``load_dataset`` and kept as an attribute for later use.
    """

    def __init__(self, task_name, **kwargs):
        # The config name shown to users is the task name itself.
        super().__init__(name=task_name, **kwargs)
        # Retained so builder code can read it without going through `name`.
        self.task_name = task_name
18
+
19
+
20
class NumberReading(datasets.GeneratorBasedBuilder):
    """Dataset builder for the Number Reading task.

    One config is generated per entry in ``LANGUAGES``; each config reads a
    single ``<language>.jsonl`` file and exposes it as a TEST split.
    """

    BUILDER_CONFIGS = [
        NumberReadingConfig(task_name=lang) for lang in LANGUAGES
    ]

    def _info(self):
        """Return the dataset metadata (features, description, citation)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "golden_reading": datasets.Sequence(datasets.Value("string")),
                "prompt": datasets.Value("string"),
                "number": datasets.Value("string"),
                "language": datasets.Value("string"),
                # add more fields depending on your JSONL schema
            }),
            supervised_keys=None,
            # FIX: original URL contained literal backticks and a leftover
            # name from another project ("nlgraph").
            homepage="https://huggingface.co/datasets/huayangli/number_reading",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Locate the per-language JSONL file and declare the TEST split.

        FIX: the original hard-coded an absolute path on the author's
        machine (/home/huayang_sakana_ai/...), which breaks everywhere
        else. Prefer the user-supplied ``data_dir``; fall back to the
        directory containing this script, where the JSONL files are
        uploaded alongside it.
        """
        task_dir = self.config.data_dir or os.path.dirname(
            os.path.abspath(__file__)
        )
        lang = self.config.name
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": os.path.join(task_dir, f"{lang}.jsonl")},
            )
        ]

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs from one JSON-Lines file.

        Each line is one JSON object; the internal ``training`` field is
        stripped because it is not part of the declared features.
        """
        import json

        with open(filepath, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                data = json.loads(line)
                # FIX: original `data.pop("training")` raised KeyError on
                # any record lacking the key; drop it only if present.
                data.pop("training", None)
                yield idx, data
-/play.ipynb ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "7f691db1",
7
+ "metadata": {
8
+ "vscode": {
9
+ "languageId": "plaintext"
10
+ }
11
+ },
12
+ "outputs": [],
13
+ "source": [
14
+ "from datasets import load_dataset"
15
+ ]
16
+ }
17
+ ],
18
+ "metadata": {
19
+ "language_info": {
20
+ "name": "python"
21
+ }
22
+ },
23
+ "nbformat": 4,
24
+ "nbformat_minor": 5
25
+ }