| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
from __future__ import division
from __future__ import print_function

import argparse
import io
import sys
import unicodedata
import unittest
|
|
__version__ = "2.1.1-dev"


# Zero-based indices of the ten tab-separated CoNLL-U columns.
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)


# Relations treated as "content" relations: only words whose (subtype-stripped)
# DEPREL is in this set are counted by the CLAS, MLAS and BLEX metrics.
CONTENT_DEPRELS = {
    "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative",
    "expl", "dislocated", "advcl", "advmod", "discourse", "nmod", "appos",
    "nummod", "acl", "amod", "conj", "fixed", "flat", "compound", "list",
    "parataxis", "orphan", "goeswith", "reparandum", "root", "dep"
}


# Relations treated as "functional": words attached with one of these are
# collected as functional children of their parent and compared by MLAS.
FUNCTIONAL_DEPRELS = {
    "aux", "cop", "mark", "det", "clf", "case", "cc"
}


# Feature names that are kept in the FEATS column; every other feature is
# dropped when a word is loaded.
UNIVERSAL_FEATURES = {
    "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Gender",
    "Animacy", "Number", "Case", "Definite", "Degree", "VerbForm", "Mood",
    "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite"
}
|
|
| |
class UDError(Exception):
    """Raised when a CoNLL-U input is malformed or two inputs cannot be compared."""
    pass
|
|
| |
def load_conllu(file, single_root=1):
    """Parse a CoNLL-U stream into an in-memory representation.

    Args:
        file: Object whose ``readline()`` returns successive lines of a
            CoNLL-U file and an empty string at end of input.
        single_root: When truthy, a sentence with more than one word whose
            HEAD is ``0`` is rejected.

    Returns:
        A ``UDRepresentation`` with four lists: ``characters`` (all FORM
        characters, spaces removed), ``tokens`` and ``sentences`` (``UDSpan``
        ranges into ``characters``) and ``words`` (``UDWord`` instances with
        ``parent`` and ``functional_children`` links filled in).

    Raises:
        UDError: On a line without 10 columns, an unparsable or out-of-order
            ID, an unparsable/negative/out-of-range HEAD, an empty FORM, a
            dependency cycle, multiple roots (if ``single_root``), or a file
            that does not end with an empty line.
    """
    class UDRepresentation:
        def __init__(self):
            # Characters of all token FORMs, concatenated (spaces removed).
            self.characters = []
            # UDSpan per token, indexing into `characters`.
            self.tokens = []
            # UDWord per syntactic word.
            self.words = []
            # UDSpan per sentence, indexing into `characters`.
            self.sentences = []

    class UDSpan:
        def __init__(self, start, end):
            self.start = start
            # `end` is the first position after the span.
            self.end = end

    class UDWord:
        def __init__(self, span, columns, is_multiword):
            # Span of this word; for a word inside a multi-word token this is
            # the span of the whole multi-word token.
            self.span = span
            # The 10 CoNLL-U columns (FEATS and DEPREL are normalised below).
            self.columns = columns
            # True when the word belongs to a multi-word token.
            self.is_multiword = is_multiword
            # Set after the sentence is read: the head UDWord, None for HEAD 0,
            # or the string "missing" when HEAD is "_".
            self.parent = None
            # Children attached with a relation from FUNCTIONAL_DEPRELS.
            self.functional_children = []
            # Keep only universal features, in sorted order.
            self.columns[FEATS] = "|".join(sorted(feat for feat in columns[FEATS].split("|")
                                                  if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
            # Drop the relation subtype (the part after ":").
            self.columns[DEPREL] = columns[DEPREL].split(":")[0]
            # Precompute the deprel class memberships.
            self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS
            self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS

    ud = UDRepresentation()

    def process_word(word, sentence_start):
        """Resolve `word.parent` (recursively for its ancestors), detecting cycles."""
        # "remapping" marks a word whose ancestors are currently being
        # resolved; reaching it again means the HEAD links form a cycle.
        if word.parent == "remapping":
            raise UDError("There is a cycle in a sentence")
        if word.parent is None:
            if word.columns[HEAD] == "_":
                word.parent = "missing"
            else:
                # Words inside multi-word tokens are not validated when read,
                # so an unparsable HEAD must be reported here.
                try:
                    head = int(word.columns[HEAD])
                except ValueError:
                    raise UDError("Cannot parse HEAD '{}'".format(word.columns[HEAD]))
                if head < 0 or head > len(ud.words) - sentence_start:
                    raise UDError("HEAD '{}' points outside of the sentence".format(word.columns[HEAD]))
                if head:
                    parent = ud.words[sentence_start + head - 1]
                    word.parent = "remapping"
                    process_word(parent, sentence_start)
                    word.parent = parent

    # `index` is the current offset into ud.characters; `sentence_start` is the
    # index into ud.words of the current sentence's first word, or None when
    # no sentence is open.
    index, sentence_start = 0, None
    while True:
        line = file.readline()
        if not line:
            break
        line = line.rstrip("\r\n")

        # Handle sentence start boundaries.
        if sentence_start is None:
            # Comment lines are skipped only before the first word of a sentence.
            if line.startswith("#"):
                continue
            # Open a new sentence; its end is filled in when it is closed.
            ud.sentences.append(UDSpan(index, 0))
            sentence_start = len(ud.words)
        if not line:
            # Empty line: the sentence is complete. Link parents first...
            for word in ud.words[sentence_start:]:
                process_word(word, sentence_start)
            # ...then register functional children in a separate pass, so the
            # recursion above cannot add the same child twice. Only real
            # UDWord parents qualify (not None or the "missing" marker).
            for word in ud.words[sentence_start:]:
                if isinstance(word.parent, UDWord) and word.is_functional_deprel:
                    word.parent.functional_children.append(word)

            # Check there is a single root node.
            if single_root:
                if sum(1 for word in ud.words[sentence_start:] if word.parent is None) > 1:
                    raise UDError("There are multiple roots in a sentence")

            # Close the sentence.
            ud.sentences[-1].end = index
            sentence_start = None
            continue

        # Read next token/word.
        columns = line.split("\t")
        if len(columns) != 10:
            raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(line))

        # Skip lines whose ID contains "." (empty nodes).
        if "." in columns[ID]:
            continue

        # Remove every character of Unicode category Zs from FORM so that the
        # character sequences of two files match regardless of spaces in tokens.
        # On Python 2 byte strings are decoded first and re-encoded afterwards.
        if sys.version_info < (3, 0) and isinstance(line, str):
            columns[FORM] = columns[FORM].decode("utf-8")
        columns[FORM] = "".join(c for c in columns[FORM] if unicodedata.category(c) != "Zs")
        if sys.version_info < (3, 0) and isinstance(line, str):
            columns[FORM] = columns[FORM].encode("utf-8")
        if not columns[FORM]:
            raise UDError("There is an empty FORM in the CoNLL-U file")

        # Save token.
        ud.characters.extend(columns[FORM])
        ud.tokens.append(UDSpan(index, index + len(columns[FORM])))
        index += len(columns[FORM])

        # Multi-word token: the following (end - start + 1) lines are its words.
        if "-" in columns[ID]:
            try:
                start, end = map(int, columns[ID].split("-"))
            except ValueError:
                raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))

            for _ in range(start, end + 1):
                word_line = file.readline().rstrip("\r\n")
                word_columns = word_line.split("\t")
                if len(word_columns) != 10:
                    raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(word_line))
                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
        # Basic token/word.
        else:
            try:
                word_id = int(columns[ID])
            except ValueError:
                raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
            if word_id != len(ud.words) - sentence_start + 1:
                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))

            if columns[HEAD] != "_":
                try:
                    head_id = int(columns[HEAD])
                except ValueError:
                    raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
                if head_id < 0:
                    raise UDError("HEAD cannot be negative")

            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))

    if sentence_start is not None:
        raise UDError("The CoNLL-U file does not end with empty line")

    return ud
|
|
| |
def evaluate(gold_ud, system_ud):
    """Score a system analysis against a gold analysis.

    Args:
        gold_ud: Representation returned by ``load_conllu`` for the gold data.
        system_ud: Representation returned by ``load_conllu`` for the system data.

    Returns:
        Dict mapping each metric name ("Tokens", "Sentences", "Words", "UPOS",
        "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS",
        "BLEX") to a ``Score`` with ``correct``, ``gold_total``,
        ``system_total``, ``aligned_total``, ``precision``, ``recall``, ``f1``
        and ``aligned_accuracy`` attributes.

    Raises:
        UDError: If the concatenated token characters of the two inputs differ.
    """
    class Score:
        def __init__(self, gold_total, system_total, correct, aligned_total=None):
            self.correct = correct
            self.gold_total = gold_total
            self.system_total = system_total
            self.aligned_total = aligned_total
            self.precision = correct / system_total if system_total else 0.0
            self.recall = correct / gold_total if gold_total else 0.0
            self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
            # When aligned_total is falsy (None or 0) it is passed through unchanged.
            self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total

    class AlignmentWord:
        def __init__(self, gold_word, system_word):
            self.gold_word = gold_word
            self.system_word = system_word

    class Alignment:
        def __init__(self, gold_words, system_words):
            self.gold_words = gold_words
            self.system_words = system_words
            # Aligned (gold, system) pairs in order of alignment.
            self.matched_words = []
            # Maps an aligned system word to its gold counterpart.
            self.matched_words_map = {}

        def append_aligned_words(self, gold_word, system_word):
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word

    def lower(text):
        # On Python 2, byte strings are decoded before lowercasing.
        if sys.version_info < (3, 0) and isinstance(text, str):
            return text.decode("utf-8").lower()
        return text.lower()

    def spans_score(gold_spans, system_spans):
        # Walk both span lists in parallel; a span is correct when a gold and a
        # system span share both start and end.
        correct, gi, si = 0, 0, 0
        while gi < len(gold_spans) and si < len(system_spans):
            if system_spans[si].start < gold_spans[gi].start:
                si += 1
            elif gold_spans[gi].start < system_spans[si].start:
                gi += 1
            else:
                correct += gold_spans[gi].end == system_spans[si].end
                si += 1
                gi += 1

        return Score(len(gold_spans), len(system_spans), correct)

    def alignment_score(alignment, key_fn=None, filter_fn=None):
        # Totals are restricted to words accepted by filter_fn, when given.
        if filter_fn is not None:
            gold = sum(1 for gold in alignment.gold_words if filter_fn(gold))
            system = sum(1 for system in alignment.system_words if filter_fn(system))
            aligned = sum(1 for word in alignment.matched_words if filter_fn(word.gold_word))
        else:
            gold = len(alignment.gold_words)
            system = len(alignment.system_words)
            aligned = len(alignment.matched_words)

        if key_fn is None:
            # No key: score the alignment itself.
            return Score(gold, system, aligned)

        # key_fn receives a word and a mapper that expresses a related word
        # (e.g. the parent) in terms of gold words, so both sides are comparable.
        def gold_aligned_gold(word):
            return word

        def gold_aligned_system(word):
            return alignment.matched_words_map.get(word, "NotAligned") if word is not None else None

        correct = 0
        for words in alignment.matched_words:
            if filter_fn is None or filter_fn(words.gold_word):
                if key_fn(words.gold_word, gold_aligned_gold) == key_fn(words.system_word, gold_aligned_system):
                    correct += 1

        return Score(gold, system, correct, aligned)

    def beyond_end(words, i, multiword_span_end):
        if i >= len(words):
            return True
        if words[i].is_multiword:
            return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end

    def extend_end(word, multiword_span_end):
        if word.is_multiword and word.span.end > multiword_span_end:
            return word.span.end
        return multiword_span_end

    def find_multiword_span(gold_words, system_words, gi, si):
        # Precondition (from the caller): gold_words[gi] or system_words[si]
        # is part of a multi-word token.
        # Initialise the span end from whichever side is multi-word, skipping a
        # plain word on the other side that starts before the span.
        if gold_words[gi].is_multiword:
            multiword_span_end = gold_words[gi].span.end
            if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
                si += 1
        else:
            multiword_span_end = system_words[si].span.end
            if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
                gi += 1
        gs, ss = gi, si

        # Advance gi and si until both point past the (possibly extended) span.
        while not beyond_end(gold_words, gi, multiword_span_end) or \
                not beyond_end(system_words, si, multiword_span_end):
            if gi < len(gold_words) and (si >= len(system_words) or
                                         gold_words[gi].span.start <= system_words[si].span.start):
                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
                gi += 1
            else:
                multiword_span_end = extend_end(system_words[si], multiword_span_end)
                si += 1
        return gs, ss, gi, si

    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
        # lcs[g][s] = length of the longest common subsequence (by lowercased
        # FORM) of gold_words[gs+g:gi] and system_words[ss+s:si].
        lcs = [[0] * (si - ss) for _ in range(gi - gs)]
        for g in reversed(range(gi - gs)):
            for s in reversed(range(si - ss)):
                if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                    lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
        return lcs

    def align_words(gold_words, system_words):
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words):
            if gold_words[gi].is_multiword or system_words[si].is_multiword:
                # A multi-word token is involved: align by LCS inside the
                # whole multi-word span.
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

                if si > ss and gi > gs:
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                    # Follow the LCS table to collect the aligned pairs.
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss:
                        if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
                            g += 1
                        else:
                            s += 1
            else:
                # No multi-word token: align words with identical spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start:
                    gi += 1
                else:
                    si += 1

        return alignment

    # Both inputs must consist of the same character sequence.
    if gold_ud.characters != system_ud.characters:
        index = 0
        while index < len(gold_ud.characters) and index < len(system_ud.characters) and \
                gold_ud.characters[index] == system_ud.characters[index]:
            index += 1

        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(gold_ud.characters[index:index + 20]),
                "".join(system_ud.characters[index:index + 20])
            )
        )

    # Align words.
    alignment = align_words(gold_ud.words, system_ud.words)

    # Compute all metrics.
    return {
        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
        "Words": alignment_score(alignment),
        "UPOS": alignment_score(alignment, lambda w, _: w.columns[UPOS]),
        "XPOS": alignment_score(alignment, lambda w, _: w.columns[XPOS]),
        "UFeats": alignment_score(alignment, lambda w, _: w.columns[FEATS]),
        "AllTags": alignment_score(alignment, lambda w, _: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
        # The lemma is compared as "_" on both sides when the gold lemma is "_".
        "Lemmas": alignment_score(alignment, lambda w, ga: w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
        "UAS": alignment_score(alignment, lambda w, ga: ga(w.parent)),
        "LAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL])),
        "CLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL]),
                                filter_fn=lambda w: w.is_content_deprel),
        "MLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS],
                                                         [(ga(c), c.columns[DEPREL], c.columns[UPOS], c.columns[FEATS])
                                                          for c in w.functional_children]),
                                filter_fn=lambda w: w.is_content_deprel),
        "BLEX": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL],
                                                          w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
                                filter_fn=lambda w: w.is_content_deprel),
    }
|
|
|
|
def load_conllu_file(path, single_root=1):
    """Open the CoNLL-U file at `path` and load it with `load_conllu`.

    The file is opened as UTF-8 text on Python 3 and with the default mode on
    Python 2. It is closed before returning, including when loading raises.

    Args:
        path: Path of the CoNLL-U file.
        single_root: Passed through to `load_conllu`.

    Returns:
        The representation produced by `load_conllu`.
    """
    open_kwargs = {"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}
    with open(path, mode="r", **open_kwargs) as _file:
        return load_conllu(_file, single_root)
|
|
def evaluate_wrapper(args):
    """Load the gold and system files named in `args` and evaluate them.

    Args:
        args: Object with `gold_file`, `system_file` and `single_root`
            attributes (as produced by the argument parser in `main`).

    Returns:
        The metric dictionary returned by `evaluate`.
    """
    # Load CoNLL-U files.
    gold_ud = load_conllu_file(args.gold_file, args.single_root)
    system_ud = load_conllu_file(args.system_file, args.single_root)
    return evaluate(gold_ud, system_ud)
|
|
def main():
    """Command-line entry point: parse arguments, evaluate, print the results."""
    # Parse arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("gold_file", type=str,
                        help="Name of the CoNLL-U file with the gold data.")
    parser.add_argument("system_file", type=str,
                        help="Name of the CoNLL-U file with the predicted data.")
    parser.add_argument("--verbose", "-v", default=False, action="store_true",
                        help="Print all metrics.")
    parser.add_argument("--counts", "-c", default=False, action="store_true",
                        help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
    parser.add_argument("--no_single_root", dest="single_root", default=True, action="store_false",
                        help="Allow multiple roots in a sentence.")
    args = parser.parse_args()

    # Evaluate.
    evaluation = evaluate_wrapper(args)

    # Print the evaluation.
    if not args.verbose and not args.counts:
        print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
        print("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1))
        print("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1))
    else:
        if args.counts:
            print("Metric | Correct | Gold | Predicted | Aligned")
        else:
            print("Metric | Precision | Recall | F1 Score | AligndAcc")
        print("-----------+-----------+-----------+-----------+-----------")
        for metric in ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"]:
            score = evaluation[metric]
            if args.counts:
                # A metric without an aligned count (None) shows the correct
                # count for "Words" and a blank otherwise; a real count of 0
                # is printed as 0 rather than being treated as missing.
                aligned = score.aligned_total
                if aligned is None:
                    aligned = score.correct if metric == "Words" else ""
                print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
                    metric,
                    score.correct,
                    score.gold_total,
                    score.system_total,
                    aligned
                ))
            else:
                print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
                    metric,
                    100 * score.precision,
                    100 * score.recall,
                    100 * score.f1,
                    "{:10.2f}".format(100 * score.aligned_accuracy) if score.aligned_accuracy is not None else ""
                ))


if __name__ == "__main__":
    main()
|
|
| |
class TestAlignment(unittest.TestCase):
    """Unit tests for word alignment as reflected in the "Words" metric.

    Each input is a list of strings: a plain string is a single word, while
    "token w1 w2 ..." is a multi-word token followed by its words.
    """

    @staticmethod
    def _load_words(words):
        """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
        lines, num_words = [], 0
        for w in words:
            parts = w.split(" ")
            if len(parts) == 1:
                num_words += 1
                # The first word gets HEAD 0, every other word gets HEAD 1.
                lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
            else:
                # Multi-word token line, followed by one line per word.
                lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
                for part in parts[1:]:
                    num_words += 1
                    lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
        return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])))

    def _test_exception(self, gold, system):
        """Assert that evaluating `system` against `gold` raises UDError."""
        self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))

    def _test_ok(self, gold, system, correct):
        """Assert that exactly `correct` words are aligned between `gold` and `system`."""
        metrics = evaluate(self._load_words(gold), self._load_words(system))
        # A multi-word entry contributes its words (all parts but the token form).
        gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
        system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
        self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
                         (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))

    def test_exception(self):
        self._test_exception(["a"], ["b"])

    def test_equal(self):
        self._test_ok(["a"], ["a"], 1)
        self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)

    def test_equal_with_multiword(self):
        self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
        self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
        self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)

    def test_alignment(self):
        self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
        self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
        self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
        self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
        self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
|
|