| """ |
Classes to read/write DISRPT-like files
+ analysis of sentence splitters against the "gold" sentences (stanza/spacy,
  ersatz, etc.)

DISRPT is a discourse analysis shared task with (as of 2023):
- discourse segmentation information, in a conll-like format
- discourse connective information (also conll-like)
- discourse relation pairs, in a specific format

Data are separated by corpus and language, with conventional names
of the form language.framework.corpusname,
eg fra.sdrt.annodis
| |
TODO:
- [ok] refactor how sentences are stored, with a dictionary: "conllu" / "tok" / "split"
? refactor creation of corpus/documents to allow for updates (or load tok+conllu at once)
- [ok] the italian luna corpus has different meta tags, with an extra level: newdoc_id/newturn_id/newutterance_id
- [ok] check behaviour on languages without pretrained models / what candidates?
   - nl, pt, it -> en?
   - thai -> multilingual
- test different candidate sets for splitting locations:
   - [done] all -> too underspecified and too slow
   - [ok] en on all but zho+thai
   - [done] en instead of multilingual?
     bad scores on zho
- [ok] fix bad characters: BOM, replacement char, etc.
   special char for apostrophe, cf
   data_clean/eng.dep.scidtb/eng.dep.scidtb_train.tok / newdoc_id = P16-1030 char problem for the possessive
   ��antagonist��
| |
problem with basque: "Osasun-zientzietako Ikertzaileen II ." nb of tokens ...
  Iru�eko etc
- problem with turkish: tur.pdtb.tdb/tur.pdtb.tdb_train: BOM? '\ufeff' -> 'Makale'
  + extra blank token in train (785)?
| 774 olduğunu _ _ _ _ _ _ _ _ |
| 775 söylüyor _ _ _ _ _ _ _ _ |
| 776 : _ _ _ _ _ _ _ _ |
| 777 Türkiye _ _ _ _ _ _ _ _ |
| 778 demokrasi _ _ _ _ _ _ _ _ |
| 779 istiyor _ _ _ _ _ _ _ _ |
| 780 ÖDPGenel _ _ _ _ _ _ _ _ |
| 781 Başkanı _ _ _ _ _ _ _ _ |
| 782 Ufuk _ _ _ _ _ _ _ _ |
| 783 Uras'tan _ _ _ _ _ _ _ _ |
| 784 : _ _ _ _ _ _ _ _ |
| 785 _ _ _ _ _ _ _ _ |
| 786 Türkiye _ _ _ _ _ _ _ _ |
| 787 , _ _ _ _ _ _ _ _ |
| 788 AİHM'de _ _ |
- problem with zh
  zh: ?是 is this "?" listed in ersatz?
  ??hosto2
  sctb 3.巴斯克
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| |
| |
- specific preprocessing:
  annodis/gum: titles
  gum/rrt: biblio / articles
  scidtb?
- different sentence splitters
  - [ok] ersatz
  - trankit
  - [abandoned] stanza: FIXME: lots of errors made by stanza, eg splits within words (might be due to bad input tokenization)
- [done] write doc in disrpt format (after transformation for instance)
- [done] eval of sentence beginnings (precision)
- [done] (done in split_sentence script) eval wrt nb of conll sentences ~= sentence recall
- eval sentence lengths (max)
- [moot] clean main script: arguments/argparse -> separate script
- [done] method for sentence splitting (for tok)
- [done] iterate over all docs in a corpus
- [done] choose language automatically according to corpus name
- ? method for sentence re-splitting for conllu? needs a way of indexing tokens for later re-eval? or the eval script does not care?
| |
| |
candidate sets for splitting:

- multilingual (default) is as described in the ersatz paper == [EOS punctuation][!number]
- en requires a space following the punctuation
- all: a space between any two characters
- a custom set can be written using the determiner.Split() base class
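
For instance, "end.Next" contains a candidate site under multilingual (EOS
punctuation followed by a non-number) but not under en (no space after the
period); under all, every whitespace position is a candidate, which is why it
is so slow.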
| |
| |
| |
| """ |
import sys, os
import dataclasses
from itertools import chain
from collections import Counter
from copy import copy, deepcopy
from tqdm import tqdm

from wtpsplit import SaT

# Optional splitter backends, guarded so the module loads without them installed.
# NOTE: the ersatz import paths below are assumptions and may need adjusting to
# the installed version of the package.
try:
    from ersatz.split import split
    from ersatz.utils import ersatz_languages
except ImportError:
    split, ersatz_languages = None, set()

try:
    import stanza
    from stanza.pipeline.core import DownloadMethod
except ImportError:
    stanza = None

BOM = '\ufeff'        # byte-order mark, found at the start of some files
REPL_CHAR = "\ufffd"  # unicode replacement character, from bad decoding
|
|
| test_doc_seg = """# newdoc id = geop_3_space |
| 1 La le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det _ BeginSeg=Yes |
| 2 Space space PROPN _ _ 0 root _ _ |
| 3 Launcher Launcher PROPN _ _ 2 flat:name _ _ |
| 4 Initiative initiative PROPN _ _ 2 flat:name _ _ |
| 5 . . PUNCT _ _ 2 punct _ _ |
| |
| 1 Le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 2 det _ BeginSeg=Yes |
| 2 programme programme NOUN _ Gender=Masc|Number=Sing 10 nsubj _ _ |
| 3 de de ADP _ _ 4 case _ _ |
| 4 Space space PROPN _ _ 2 nmod _ _ |
| 5 Launcher Launcher PROPN _ _ 4 flat:name _ _ |
| 6 Initiative initiative PROPN _ _ 4 flat:name _ _ |
| 7 ( ( PUNCT _ _ 8 punct _ BeginSeg=Yes |
| 8 SLI SLI PROPN _ _ 4 appos _ _ |
| 9 ) ) PUNCT _ _ 8 punct _ _ |
| 10 vise viser VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ BeginSeg=Yes |
| 11 à à ADP _ _ 12 mark _ _ |
| 12 développer développer VERB _ VerbForm=Inf 10 ccomp _ _ |
| 13 un un DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 14 det _ _ |
| 14 système système NOUN _ Gender=Masc|Number=Sing 12 obj _ _ |
| 15 de de ADP _ _ 16 case _ _ |
| 16 lanceur lanceur NOUN _ Gender=Masc|Number=Sing 14 nmod _ _ |
| 17 réutilisable réutilisable ADJ _ Gender=Masc|Number=Sing 16 amod _ _ |
| 18 entièrement entièrement ADV _ _ 19 advmod _ _ |
| 19 inédit inédit ADJ _ Gender=Masc|Number=Sing 14 amod _ _ |
| 20 . . PUNCT _ _ 10 punct _ _ |
| |
| # newdoc id = ling_fuchs_section2 |
| 1 Théorie théorie PROPN _ _ 0 root _ BeginSeg=Yes |
| 2 psychomécanique psychomécanique ADJ _ Gender=Masc|Number=Sing 1 amod _ _ |
| 3 et et CCONJ _ _ 4 cc _ _ |
| 4 cognition cognition NOUN _ Gender=Fem|Number=Sing 1 conj _ _ |
| 5 . . PUNCT _ _ 1 punct _ _ |
| """ |
|
|
| |
Token = dataclasses.make_dataclass("Token",
                                   "id form lemma pos xpos morph head_id dep_type extra label".split(),
                                   namespace={'__repr__': lambda self: self.form,
                                              'format': lambda self: "\t".join(map(str,dataclasses.astuple(self))),
                                              }
                                   )
|
|
|
|
| class Sentence: |
|
|
| def __init__(self,token_list,meta): |
| self.toks = token_list |
| self.meta = meta |
| |
| self.label_start = ["Seg=B-conn", "Seg=B-seg"] |
| self.label_end = ["Seg=I-conn", "Seg=O"] |
|
|
| def __iter__(self): |
| return iter(self.toks) |
| |
| def __len__(self): |
| return len(self.toks) |
|
|
    def display(self,segment=False):
        """render the sentence; if segment is True, mark EDU beginnings with '|'"""
        if segment:
            output = [f"{'|' if token.label=='Seg=B-seg' else ''}{token.form}" for token in self]
            return " ".join(output)+"|"
        else:
            return self.meta["text"]
|
|
    def __contains__(self,word):
        """membership test on token forms, so that `word in sentence` works"""
        for token in self.toks:
            if token.form == word:
                return True
        return False
|
|
| def __repr__(self): |
| return self.display() |
|
|
| def format(self): |
| meta = f"# sent_id = {self.meta['sent_id']}\n" + f"# text = {self.meta['text']}\n" |
| output = "\n".join([t.format() for t in self.toks]) |
| return meta+output |
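
# For segmentation corpora, Sentence.display(segment=True) marks each token
# labelled Seg=B-seg with a leading pipe and closes the sentence with one, eg
# "|Le programme de Space Launcher Initiative ( |SLI ) |vise à développer ...|"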
|
|
| |
| |
TRANKIT_LANG_MAP = {
    "de": "german",
    "en": "english",
    "gum": "english-gum",
    "fr": "french",
    "it": "italian",
    "sp": "spanish",
    "es": "spanish",
    "eu": "basque",
    "zh": "chinese",
    "ru": "russian",
    "tr": "turkish",
    "pt": "portuguese",
    "fa": "persian",
    "nl": "dutch",
}
|
|
# corpus names use 3-letter language codes; for these, the first two letters
# differ from the ISO 639-1 code expected by the splitters
lg_map = {"sp":"es",
          "po":"pt",
          "tu":"tr"}
|
|
|
|
def get_language(lang,model):
    """map a corpus-name language prefix to the language name expected by the splitter model"""
    lang = lang[:2]
| if lang in lg_map: |
| lang = lg_map[lang] |
| if model=="ersatz": |
| if lang not in ersatz_languages: |
| lang = "default-multilingual" |
| if model=="trankit": |
| lang = TRANKIT_LANG_MAP.get(lang,"auto") |
| return lang |
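
# eg get_language("tur", "ersatz") -> "tr" if a turkish ersatz model is
# available, else "default-multilingual";
#    get_language("eng", "trankit") -> "english"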
|
|
|
# sentence-final punctuation across scripts
ending_punc = {
    '\u0964',  # devanagari danda
    '\u061F',  # arabic question mark
    '\u002E',  # full stop
    '\u3002',  # ideographic full stop
    '\u0021',  # exclamation mark
    '\u06D4',  # arabic full stop
    '\u17D4',  # khmer sign khan
    '\u003F',  # question mark
    '\uFF61',  # halfwidth ideographic full stop
    '\uFF0E',  # fullwidth full stop
    '\u2026',  # horizontal ellipsis
}

# quotes and brackets that may close a sentence after an ending mark
closing_punc = {
    '\u3011',  # right black lenticular bracket
    '\u00BB',  # right-pointing double angle quotation mark
    '\u201D',  # right double quotation mark
    '\u300F',  # right white corner bracket
    '\u2018',  # left single quotation mark
    '\u0022',  # quotation mark
    '\u300D',  # right corner bracket
    '\u201C',  # left double quotation mark
    '\u0027',  # apostrophe
    '\u2019',  # right single quotation mark
    '\u0029',  # right parenthesis
}

# bullet/list-item markers
list_set = {
    '\u30fb',  # katakana middle dot
    '\uFF65',  # halfwidth katakana middle dot
    '\u002a',  # asterisk
    '\u002d',  # hyphen-minus
    '\u4e00',  # cjk ideograph "one" (一), used in chinese enumerations
}


class Document:
| _hard_punct = {"default":{".",";","?","!"}| ending_punc, |
| "zh": {"。","?"} |
| } |
|
|
| def __init__(self,sentence_list,meta,src="conllu"): |
| self.sentences = {src:sentence_list} |
| self.meta = meta |
|
|
    def __repr__(self):
        # prefer the conllu version when both are loaded
        for src in ("conllu", "tok"):
            if src in self.sentences:
                return "\n".join(map(repr, self.sentences[src]))
        sys.exit("Unknown type of file: " + str(self.sentences.keys()))
|
|
| |
| def get_sentences(self,src="tok"): |
| return self.sentences[src] |
| |
    def baseline_split(self,lang="default"):
        """default split for languages where we have issues re-aligning tokens for various reasons

        this just splits at every token that is a hard punctuation

        FIXME : this is not complete
        """
        sentence_id = 1
        sentences = []
        current = []
        orig_doc = self.sentences["tok"][0]
        hard_punct = self._hard_punct.get(lang,self._hard_punct["default"])
        for token in orig_doc:
            current.append(token)
            if token.lemma in hard_punct:
                meta = {"doc_id":orig_doc.meta["doc_id"],
                        "sent_id" : sentence_id,
                        "text": " ".join([x.form for x in current])
                        }
                sentences.append(Sentence(current,meta))
                current = []
                sentence_id += 1
        if current!=[]:
            meta = {"doc_id":orig_doc.meta["doc_id"],
                    "sent_id" : sentence_id,
                    "text": " ".join([x.form for x in current])
                    }
            sentences.append(Sentence(current,meta))
        return sentences
|
|
|
|
    def cutoff_split(self,cutoff=120,lang="default"):
        """
        default split for corpora with little or no punctuation (transcriptions etc)

        just starts a new sentence as soon as the current one reaches cutoff tokens
        """
        sentence_id = 1
        sentences = []
        current = []
        current_cpt = 1
        orig_doc = self.sentences["tok"][0]
| for token in orig_doc: |
| token.id = current_cpt |
| current_cpt += 1 |
| current.append(token) |
| |
| if len(current) >= cutoff: |
| |
| meta = {"doc_id":orig_doc.meta["doc_id"], |
| "sent_id" : sentence_id, |
| "text": " ".join([x.form for x in current]) |
| } |
| sentences.append(Sentence(current,meta)) |
| current = [] |
| sentence_id += 1 |
| current_cpt = 1 |
| if current!=[]: |
| meta = {"doc_id":orig_doc.meta["doc_id"], |
| "sent_id" : sentence_id, |
| "text": " ".join([x.form for x in current]) |
| } |
| sentences.append(Sentence(current,meta)) |
| return sentences |
|
|
    def ersatz_split(self,doc,lang='default-multilingual',candidates="en"):
        """split raw text with ersatz; lang selects the pretrained model and
        candidates the candidate-site strategy (see module docstring)"""
        if split is None:
            raise ImportError("ersatz is not installed")
        result = split(model=lang,
                       text=doc, output=None,
                       batch_size=16,
                       candidates=candidates,
                       cpu=True, columns=None, delimiter='\t')
        return result
| |
    def stanza_split(self,orig_doc,lang):
        if stanza is None:
            raise ImportError("stanza is not installed")
        nlp = stanza.Pipeline(lang=lang, processors='tokenize',download_method=DownloadMethod.REUSE_RESOURCES)
        doc = nlp(orig_doc)
        return [" ".join([t.text for t in s.tokens]) for s in doc.sentences]
| |
|
|
    def trankit_split(self,orig_doc,lang,pipeline):
        # pipeline is an already-instantiated trankit Pipeline
        trk_sentences = pipeline.ssplit(orig_doc)
        return [s["text"] for s in trk_sentences["sentences"]]
| |
    def sat_split(self, orig_doc, sat_model):
        # wtpsplit's SaT.split returns an iterable of sentence strings
        return list(sat_model.split(str(orig_doc)))
|
|
| |
    def _remap_tokens(self,split_sentences):
        """remap the sentences produced by a splitter onto the original tokens

        splitters work on raw text, so their tokens may be merged or split
        differently from the original .tok tokens; this realigns them heuristically
        """
        orig_token_nb = sum(map(len,self.sentences["tok"]))
        split_token_nb = len(list(chain(*[x.split() for x in split_sentences])))
        if orig_token_nb!=split_token_nb:
            print("WARNING wrong nb of tokens",orig_token_nb,"initially but",split_token_nb,"after split",file=sys.stderr)

        new_sentences = []
        position = 0
        skip_first_token = False

        orig_doc = self.sentences["tok"][0]
        for i,s in enumerate(split_sentences):
            new_toks = s.split()
            if skip_first_token:
                # the first token of this sentence was already consumed by the
                # previous one (the splitter cut inside an original token)
                new_toks = new_toks[1:]
            toks = orig_doc.toks[position:position+len(new_toks)]
            meta = {"doc_id":orig_doc.meta["doc_id"],
                    "sent_id" : i+1,
                    "text": " ".join([x.form for x in toks])
                    }
            shift = 0
            for j in range(len(toks)):
                toks[j].id = j+1
                new_j = j + shift
                try:
                    assert toks[j].form==new_toks[new_j]
                    skip_first_token = False
                except (AssertionError, IndexError):
                    # mismatch: the splitter retokenized the raw text
                    if j!= len(toks)-1:
                        if len(toks[j].form)!=len(new_toks[new_j]):
                            # one original token cut in two within the sentence
                            if toks[j].form==new_toks[new_j]+new_toks[new_j+1]:
                                shift = shift + 1
                    else:
                        # mismatch on the last token: the original token may have
                        # been cut across the sentence boundary, in which case the
                        # next sentence's first token belongs to this one
                        if i+1<len(split_sentences):
                            next_token = split_sentences[i+1].split()[0]
                            skip_first_token = True
            if len(toks)>0:
                new_sentences.append(Sentence(toks,meta))
                position = position + len(new_toks) - shift
            else:
                skip_first_token = False
        split_token_nb = sum( [len(s.toks) for s in new_sentences] )
        if orig_token_nb!=split_token_nb:
            print("ERROR wrong nb of tokens",orig_token_nb,"originally but",split_token_nb,"after split+remap",file=sys.stderr)
            sys.exit()
        return new_sentences
|
|
|
|
    def sentence_split(self,model="ersatz",lang="default-multilingual",**kwargs):
        """
        apply a sentence splitter to the document, read as one block from a tok file.
        kwargs may contain an open "pipeline" (eg a trankit pipeline) to pass on
        downstream for splitting sentences, so that it is not re-created for each paragraph
        """
        doc = " ".join([x.form for x in self.sentences["tok"][0]])
        if model=="ersatz":
            # zh/th are not space-delimited, so the "en" candidate set (which
            # requires a space after the punctuation) would miss boundaries there
            candidates = "en" if lang not in {"zh","th"} else "multilingual"
            new_sentences = self.ersatz_split(doc,lang=lang,candidates=candidates)
        elif model=="stanza":
            new_sentences = self.stanza_split(doc,lang=lang)
        elif model=="trankit":
            new_sentences = self.trankit_split(doc,lang=lang,**kwargs)
        elif model=="baseline":
            new_sentences = self.baseline_split(lang=lang)
            self.sentences["split"] = new_sentences
        elif model=="sat":
            sat_model = kwargs.get("sat_model")
            if sat_model is None:
                raise ValueError("sat_model must be provided for SAT sentence splitting.")
            new_sentences = self.sat_split(doc, sat_model)
        elif model == "cutoff":
            new_sentences = self.cutoff_split(lang=lang)
            self.sentences["split"] = new_sentences
        else:
            raise NotImplementedError
        if model not in ("baseline","cutoff"):
            # baseline/cutoff already return Sentence objects over the original
            # tokens; the other splitters return raw strings to be realigned
            self.sentences["split"] = self._remap_tokens(new_sentences)
        return self.sentences["split"]
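
    # Minimal usage sketch (the .tok path here is hypothetical):
    #   corpus = SegmentCorpus(); corpus.from_file("eng.rst.gum/eng.rst.gum_dev.tok")
    #   doc = corpus.docs[0]
    #   doc.sentence_split(model="ersatz", lang=get_language("eng", "ersatz"))
    #   print(doc.format(mode="split"))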
| |
|
|
| def search_word(self,word): |
| return [s for s in self.sentences.get("split",[]) if word in s] |
|
|
    def format(self,mode="split"):
        """format the document in disrpt format;
        mode selects which version of the sentences to write: "conllu", "tok" or "split"
        """
| target = self.sentences[mode] |
| |
| output = "\n".join([s.format()+"\n" for s in target]) |
| meta = f"# doc_id = {self.meta}\n" |
| return meta+output |
|
|
|
|
| class Corpus: |
| META_types = {"newdoc_id":"doc_id", |
| "newdoc id":"doc_id", |
| "doc_id":"doc_id", |
| "sent_id":"sent_id", |
| "newturn_id":"newturn_id", |
| "newutterance":"newutterance", |
| "newutterance_id":"newutterance_id", |
| "text":"text", |
| } |
|
|
|
|
|
|
    def __init__(self,data=None):
        """data, if given, is the content of a disrpt file as a single string
        """
        if data:
            self.docs = self._parse(data.split("\n"))
| |
| @staticmethod |
| def _meta_parse(data_line): |
| """ parse comments as they contain meta information""" |
| if not("=" in data_line): |
| return "","" |
| info, value = data_line[1:].strip().split("=",1) |
| info = info.strip() |
| if info in Corpus.META_types: |
| meta_type = Corpus.META_types[info] |
| else: |
| |
| meta_type, value = "","" |
| return meta_type,value.strip() |
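
    # eg _meta_parse("# newdoc id = geop_3_space") -> ("doc_id", "geop_3_space"),
    # and comments with unknown meta fields yield ("", "")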
|
|
| def search_doc(self,docid): |
| return [x for x in self.docs if x.meta==docid] |
|
|
| def _parse(self,data_lines,src="tok"): |
| """parse disrpt segmentation/connective files""" |
| curr_token_list = [] |
| sentences = [] |
| docs = [] |
| s_idx = 0 |
| doc_idx = 0 |
| meta = {} |
| |
        for data_line in data_lines:
            data_line = data_line.strip()
            if data_line:
                # comment lines carry the meta information
                if data_line.startswith("#"):
                    meta_type,value = Corpus._meta_parse(data_line)
                    # a new doc id closes the current document
                    if meta_type=="doc_id":
                        if doc_idx>0:
                            docs.append(Document(sentences,meta["doc_id"],src=src))
                            sentences = []
                            curr_token_list = []
                            s_idx = 0
                            meta = {}
                        doc_idx += 1
                    if meta_type!="":
                        meta[meta_type] = value
                else:
                    token, label = self.parse_token(meta, data_line)
                    # skip multiword-token ("3-4") and empty-node ("5.1") lines
                    if not("-" in token[0]) and not("." in token[0]):
                        curr_token_list.append(Token(*token,label))
            elif curr_token_list:
                # blank line: close the current sentence
                meta["text"] = " ".join((x.form for x in curr_token_list))
                s_idx += 1
                if "sent_id" not in meta:
                    meta["sent_id"] = s_idx
                sentences.append(Sentence(curr_token_list,meta))
                curr_token_list = []
                meta = {"doc_id":meta["doc_id"]}
        # close the last sentence and document
        if len(curr_token_list)>0 or len(sentences)>0:
            if len(curr_token_list)>0:
                meta["text"] = " ".join((x.form for x in curr_token_list))
                sentences.append(Sentence(curr_token_list,meta))
            docs.append(Document(sentences,meta["doc_id"],src=src))
        return docs

    def parse_token(self, meta, data_line):
        *token, label = data_line.split("\t")
        if len(token)==8:
            # only 9 columns: the label column is missing from this line
            print("ERROR: missing label ",meta,token,file=sys.stderr)
            token = token + [label]
            label = '_'
        # neutralize bad characters in the form (BOM tokens, replacement chars)
        if token[1] == BOM: token[1]="_"
        token[1] = token[1].replace(REPL_CHAR,"_")
        # keep only the label relevant to this corpus type (cf LABELS)
        label_set = set(label.split("|"))
        label = (label_set & set(self.LABELS))
        if label==set():
            label= "_"
        else:
            label = label.pop()
        return token,label
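
    # eg for a SegmentCorpus (see LABELS below),
    #   parse_token({}, "12\tchat\tchat\tNOUN\t_\t_\t10\tobj\t_\tSeg=B-seg")
    # returns (["12","chat","chat","NOUN","_","_","10","obj","_"], "Seg=B-seg")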
|
|
    def from_file(self,filepath):
        """
        reads a conllu or tok file;
        conllu has sentence boundaries, tok does not
        (to parse a string directly, mostly for testing, see from_string)

        TODO: should be a static method
        """
| self.filepath = filepath |
| basename = os.path.basename(filepath) |
| src = basename.split(".")[-1] |
| |
| with open(filepath,"r",encoding="utf8") as f: |
| data_lines = f.readlines() |
| self.docs = self._parse(data_lines,src=src) |
| |
| |
    def from_string(self, text: str, src="conllu"):
        """
        Reads directly from a string (useful for tests or dynamically generated input).
        src may be 'conllu', 'tok', or 'split' to indicate the format.
        """
        self.filepath = None
        if isinstance(text, str):
            data_lines = text.strip().splitlines()
        else:
            raise ValueError("from_string expects a string")
        self.docs = self._parse(data_lines, src=src)

    def format(self,mode="split",file=sys.stdout):
        """write the corpus in disrpt format;
        file may be an open file object or a path (directories are created as needed)"""
        if isinstance(file, str):
            os.makedirs(os.path.dirname(file) or ".", exist_ok=True)
            with open(file,"w",encoding="utf-8") as out:
                for d in self.docs:
                    print(d.format(mode=mode),file=out)
        else:
            for d in self.docs:
                print(d.format(mode=mode),file=file)
|
|
| def align(self,filepath): |
| """load conllu for corresponding tok file""" |
| pass |
| |
    def sentence_split(self,model="ersatz",lang="default-multilingual",**kwargs):
        """apply a sentence splitter to every document, assuming the corpus was
        read from a .tok file

        kwargs may contain an open "pipeline" (eg a trankit pipeline) to pass on
        downstream for splitting sentences, so that it is not re-created for each paragraph
        """
| for doc in tqdm(self.docs): |
| doc.sentence_split(model=model,lang=lang,**kwargs) |
| |
|
|
    def eval_sentences(self,mode="split"):
        """eval sentence beginnings as segment beginnings
        TODO rename -> precision

        only .tok for now but could be used to eval the re-splitting of conllu;
        more complex for pdtb: would need to align tok and conllu
        """
        tp = 0
        total_s = 0
        labels = []
        for doc in self.docs:
            for s in doc.get_sentences(mode):
                if len(s.toks)==0:
                    print("WARNING empty sentence in ",s.meta,file=sys.stderr)
                    continue
                tp += (s.toks[0].label=="Seg=B-seg")
                total_s += 1
                labels.extend([x.label for x in s])
        counts = Counter(labels)
        return tp, total_s, counts["Seg=B-seg"]
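
    # tp/total_s is the precision of sentence starts as segment starts; the
    # third value is the gold count of "Seg=B-seg" labels, so tp over that
    # count gives the corresponding recall of segment beginnings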
|
|
| class SegmentCorpus(Corpus): |
| LABELS = ["Seg=O","Seg=B-seg"] |
|
|
| class ConnectiveCorpus(Corpus): |
| LABELS = ['Conn=O', 'Conn=B-conn', 'Conn=I-conn'] |
| id2label = {i: label for i, label in enumerate( LABELS )} |
| label2id = {v: k for k,v in id2label.items()} |
|
|
| class RelationCorpus(Corpus): |
|
|
| def from_file(self,filepath): |
| pass |
|
|
if __name__=="__main__":

    from pathlib import PurePath

| if len(sys.argv)>1: |
| test_path = sys.argv[1] |
| else: |
| test_path = "../jiant/tests/test_data/eng.pdtb.pdtb/eng.pdtb.pdtb_debug.tok" |
| |
| |
| basename = os.path.basename(test_path) |
| lang = basename.split(".")[0] |
| |
| |
| path = PurePath(test_path) |
| |
| output_path = "out" |
| |
| if "pdtb" in test_path: |
| corpus = ConnectiveCorpus() |
| else: |
| corpus = SegmentCorpus() |
| corpus.from_file(test_path) |
|
|
| sat = SaT("sat-3l") |
| |
| |
| print( sat.split("This is a test This is another test.") ) |
| doc1 = corpus.docs[0] |
| s0 = doc1.sentences["tok"][0] |
| print(doc1) |
| print(list(sat.split(str(doc1)))) |
| corpus.sentence_split(model="sat", sat_model=sat) |
    tp, total_s, gold_b = corpus.eval_sentences()
    print(tp, total_s, gold_b)
| |
| corpus.format(file=output_path) |