Huayang committed on
Upload folder using huggingface_hub
Browse files- -/english.jsonl +0 -0
- -/number_reading.py +56 -0
- -/play.ipynb +25 -0
-/english.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
-/number_reading.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import datasets
|
| 3 |
+
|
| 4 |
+
_CITATION = """\
|
| 5 |
+
# (Optional) Add your citation here
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
_DESCRIPTION = """\
|
| 9 |
+
Number Reading
|
| 10 |
+
"""
|
| 11 |
+
LANGUAGES = [
|
| 12 |
+
"english"
|
| 13 |
+
]
|
| 14 |
+
class NumberReadingConfig(datasets.BuilderConfig):
    """Builder configuration identified by a task name.

    The task name is forwarded to ``datasets.BuilderConfig`` as the config
    ``name`` and is also stored on the instance as ``task_name``.
    """

    def __init__(self, task_name, **kwargs):
        """Create the config.

        Args:
            task_name: Used both as the config name and as ``task_name``.
            **kwargs: Passed through unchanged to ``datasets.BuilderConfig``.
        """
        super(NumberReadingConfig, self).__init__(name=task_name, **kwargs)
        self.task_name = task_name
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class NumberReading(datasets.GeneratorBasedBuilder):
    """Dataset builder for number-reading examples stored as JSON Lines.

    One builder config is created per entry of ``LANGUAGES``; each config
    exposes a single TEST split read from ``<config name>.jsonl``.
    """

    BUILDER_CONFIGS = [
        NumberReadingConfig(task_name=lang) for lang in LANGUAGES
    ]

    def _info(self):
        """Return the dataset metadata and the feature schema."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "golden_reading": datasets.Sequence(datasets.Value("string")),
                "prompt": datasets.Value("string"),
                "number": datasets.Value("string"),
                "language": datasets.Value("string"),
                # add more fields depending on your JSONL schema
            }),
            supervised_keys=None,
            # Stray markdown backticks removed from the URL.
            # NOTE(review): confirm "nlgraph" is the intended repository name.
            homepage="https://huggingface.co/datasets/huayangli/nlgraph",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the single TEST split for the selected language.

        The data file is ``<config name>.jsonl``. When the config carries a
        ``data_dir`` the file is read from that directory; otherwise the
        relative file name is handed to ``dl_manager.download`` so it is
        resolved next to this script instead of via a machine-specific
        absolute path.
        """
        filename = f"{self.config.name}.jsonl"
        data_dir = self.config.data_dir
        if data_dir:
            filepath = os.path.join(data_dir, filename)
        else:
            # Relative names are resolved by the download manager against
            # the loading script's base path (see the `datasets` docs).
            filepath = dl_manager.download(filename)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": filepath},
            )
        ]

    def _generate_examples(self, filepath):
        """Yield ``(index, record)`` pairs from a JSON Lines file.

        Args:
            filepath: Path of the ``.jsonl`` file to read (UTF-8).

        Yields:
            The zero-based line index and the decoded record with its
            ``"training"`` key removed when present.
        """
        import json

        with open(filepath, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                data = json.loads(line)
                # "training" is not part of the declared features; records
                # lacking the key must not raise KeyError.
                data.pop("training", None)
                yield idx, data
|
-/play.ipynb
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "7f691db1",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"vscode": {
|
| 9 |
+
"languageId": "plaintext"
|
| 10 |
+
}
|
| 11 |
+
},
|
| 12 |
+
"outputs": [],
|
| 13 |
+
"source": [
|
| 14 |
+
"from datasets import load_dataset"
|
| 15 |
+
]
|
| 16 |
+
}
|
| 17 |
+
],
|
| 18 |
+
"metadata": {
|
| 19 |
+
"language_info": {
|
| 20 |
+
"name": "python"
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
"nbformat": 4,
|
| 24 |
+
"nbformat_minor": 5
|
| 25 |
+
}
|