EMBO
/

bio-lm

+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Template adapted from: https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py
+"""Loading script for the biolang dataset for language modeling in biology."""
+from __future__ import absolute_import, division, print_function
+import json
+import datasets
+class BioLang(datasets.GeneratorBasedBuilder):
+    """BioLang: a dataset to train language models in biology."""
+    _CITATION = """\
+    @Unpublished{
+        huggingface: dataset,
+        title = {biolang},
+        authors={Thomas Lemberger, EMBO},
+        year={2021}
+    }
+    """
+    _DESCRIPTION = """\
+    This dataset is based on abstracts from the open access section of EuropePubMed Central to train language models in the domain of biology.
+    """
+    _HOMEPAGE = "https://europepmc.org/downloads/openaccess"
+    _LICENSE = "CC BY 4.0"
+    _URLS = {
+        "biolang": "https://huggingface.co/datasets/EMBO/biolang/resolve/main/oapmc_abstracts_figs.zip",
+    }
+    VERSION = datasets.Version("0.0.1")
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(name="SEQ2SEQ", version="0.0.1", description="Control dataset with no masking for seq2seq task."),
+        datasets.BuilderConfig(name="MLM", version="0.0.1", description="Dataset for general masked language model."),
+        datasets.BuilderConfig(name="DET", version="0.0.1", description="Dataset for part-of-speech (determinant) masked language model."),
+        datasets.BuilderConfig(name="VERB", version="0.0.1", description="Dataset for part-of-speech (verbs) masked language model."),
+        datasets.BuilderConfig(name="SMALL", version="0.0.1", description="Dataset for part-of-speech (determinants, conjunctions, prepositions, pronouns) masked language model."),
+        datasets.BuilderConfig(name="NOUN", version="0.0.1", description="Dataset for part-of-speech (nouns) masked language model."),
+    ]
+    DEFAULT_CONFIG_NAME = "MLM"  # It's not mandatory to have a default configuration. Just use one if it make sense.
+    def _info(self):
+        if self.config.name == "MLM":
+            features = datasets.Features({
+                "input_ids": datasets.Sequence(feature=datasets.Value("int32")),
+                "special_tokens_mask": datasets.Sequence(feature=datasets.Value("int8")),
+            })
+        elif self.config.name in ["DET", "VERB", "SMALL", "NOUN", "NULL"]:
+            features = datasets.Features({
+                "input_ids": datasets.Sequence(feature=datasets.Value("int32")),
+                "tag_mask": datasets.Sequence(feature=datasets.Value("int8")),
+            })
+        elif self.config.name == "SEQ2SEQ":
+            features = datasets.Features({
+                "input_ids": datasets.Sequence(feature=datasets.Value("int32")),
+                "labels": datasets.Sequence(feature=datasets.Value("int32"))
+            })
+        return datasets.DatasetInfo(
+            description=self._DESCRIPTION,
+            features=features,  # Here we define them above because they are different between the two configurations
+            supervised_keys=('input_ids', 'pos_mask'),
+            homepage=self._HOMEPAGE,
+            license=self._LICENSE,
+            citation=self._CITATION,
+        )
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        if self.config.data_dir:
+            data_dir = self.config.data_dir
+        else:
+            url = self._URLS["biolang"]
+            data_dir = dl_manager.download_and_extract(url)
+            data_dir += "/oapmc_abstracts_figs"
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_dir + "/train.jsonl",
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": data_dir + "/test.jsonl",
+                    "split": "test"
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": data_dir + "/eval.jsonl",
+                    "split": "eval",
+                },
+            ),
+        ]
+    def _generate_examples(self, filepath, split):
+        """ Yields examples. """
+        with open(filepath, encoding="utf-8") as f:
+            for id_, row in enumerate(f):
+                data = json.loads(row)
+                if self.config.name == "MLM":
+                    yield id_, {
+                        "input_ids": data["input_ids"],
+                        "special_tokens_mask": data['special_tokens_mask']
+                    }
+                # else Part of Speech tags based on
+                # Universal POS tags https://universaldependencies.org/u/pos/
+                elif self.config.name == "DET":
+                    pos_mask = [0] * len(data['input_ids'])
+                    for idx, label in enumerate(data['label_ids']):
+                        if label == 'DET':
+                            pos_mask[idx] = 1
+                    yield id_, {
+                        "input_ids": data['input_ids'],
+                        "tag_mask": pos_mask,
+                    }
+                elif self.config.name == "VERB":
+                    pos_mask = [0] * len(data['input_ids'])
+                    for idx, label in enumerate(data['label_ids']):
+                        if label == 'VERB':
+                            pos_mask[idx] = 1
+                    yield id_, {
+                        "input_ids": data['input_ids'],
+                        "tag_mask": pos_mask,
+                    }
+                elif self.config.name == "SMALL":
+                    pos_mask = [0] * len(data['input_ids'])
+                    for idx, label in enumerate(data['label_ids']):
+                        if label in ['DET', 'CCONJ', 'SCONJ', 'ADP', 'PRON']:
+                            pos_mask[idx] = 1
+                    yield id_, {
+                        "input_ids": data['input_ids'],
+                        "tag_mask": pos_mask,
+                    }
+                elif self.config.name == "NOUN":
+                    pos_mask = [0] * len(data['input_ids'])
+                    for idx, label in enumerate(data['label_ids']):
+                        if label in ['NOUN']:
+                            pos_mask[idx] = 1
+                    yield id_, {
+                        "input_ids": data['input_ids'],
+                        "tag_mask": pos_mask,
+                    }
+                elif self.config.name == "SEQ2SEQ":
+                    "Seq2seq training needs the input_ids as labels, no masking"
+                    pos_mask = [0] * len(data['input_ids'])
+                    yield id_, {
+                        "input_ids": data['input_ids'],
+                        "labels": data['input_ids']
+                    }