Upload 66 files
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +3 -0
- dictionary/accents.json.gz +3 -0
- dictionary/accents_nn.json.gz +3 -0
- dictionary/omographs.json.gz +3 -0
- dictionary/rule_engine/accents.json +0 -0
- dictionary/rule_engine/forms.json +85 -0
- dictionary/yo_homographs.json.gz +3 -0
- dictionary/yo_omographs.json.gz +3 -0
- dictionary/yo_words.json.gz +3 -0
- koziev/rulemma/rulemma.dat +3 -0
- koziev/rulemma/rulemma.py +237 -0
- koziev/rupostagger/__init__.py +3 -0
- koziev/rupostagger/database/ruword2tags.db +3 -0
- koziev/rupostagger/rupostagger.config +11 -0
- koziev/rupostagger/rupostagger.model +3 -0
- koziev/rupostagger/rupostagger.py +173 -0
- koziev/rupostagger/rusyllab.py +589 -0
- koziev/rupostagger/ruword2tags.dat +3 -0
- koziev/rupostagger/ruword2tags.py +391 -0
- nn/nn_accent/big.onnx +3 -0
- nn/nn_accent/config.json +37 -0
- nn/nn_accent/model.onnx +3 -0
- nn/nn_accent/ort_config.json +30 -0
- nn/nn_accent/special_tokens_map.json +6 -0
- nn/nn_accent/tokenizer_config.json +10 -0
- nn/nn_accent/vocab.txt +45 -0
- nn/nn_omograph/big_poetry/added_tokens.json +4 -0
- nn/nn_omograph/big_poetry/config.json +31 -0
- nn/nn_omograph/big_poetry/model.onnx +3 -0
- nn/nn_omograph/big_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/big_poetry/tokenizer.json +0 -0
- nn/nn_omograph/big_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/big_poetry/vocab.txt +0 -0
- nn/nn_omograph/medium_poetry/added_tokens.json +4 -0
- nn/nn_omograph/medium_poetry/config.json +31 -0
- nn/nn_omograph/medium_poetry/model.onnx +3 -0
- nn/nn_omograph/medium_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/medium_poetry/tokenizer.json +0 -0
- nn/nn_omograph/medium_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/medium_poetry/vocab.txt +0 -0
- nn/nn_omograph/small_poetry/added_tokens.json +4 -0
- nn/nn_omograph/small_poetry/config.json +23 -0
- nn/nn_omograph/small_poetry/model.onnx +3 -0
- nn/nn_omograph/small_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/small_poetry/tokenizer.json +0 -0
- nn/nn_omograph/small_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/small_poetry/vocab.txt +0 -0
- nn/nn_omograph/turbo/added_tokens.json +4 -0
- nn/nn_omograph/turbo/config.json +28 -0
- nn/nn_omograph/turbo/merges.txt +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
koziev/rulemma/rulemma.dat filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
koziev/rupostagger/database/ruword2tags.db filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
koziev/rupostagger/ruword2tags.dat filter=lfs diff=lfs merge=lfs -text
|
dictionary/accents.json.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa460ebba90de00fbbf3d41d121961f605b98667e45efb7920f127473b15515e
|
| 3 |
+
size 20954156
|
dictionary/accents_nn.json.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8395664000b80c1afe09bfea3650945b0933482b8e3dee5bb9d429eb18c44935
|
| 3 |
+
size 845996
|
dictionary/omographs.json.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04a9e81c68d65f65ba493fe0110f99e79087548c2beeec3032e2b66e28706f36
|
| 3 |
+
size 219047
|
dictionary/rule_engine/accents.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
dictionary/rule_engine/forms.json
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"diminutive": "",
|
| 3 |
+
"perfective/imperfective": "Aspect=Perf|Aspect=Imp",
|
| 4 |
+
"dative/prepositional": "Case=Dat|Case=Prep",
|
| 5 |
+
"inanimate": "Animacy=Inan",
|
| 6 |
+
"animate/inanimate": "Animacy=Anim|Animacy=Inan",
|
| 7 |
+
"dative": "Case=Dat",
|
| 8 |
+
"second-person": "Person=2",
|
| 9 |
+
"imperative": "Mood=Imp",
|
| 10 |
+
"archaic": "",
|
| 11 |
+
"nominative": "Case=Nom",
|
| 12 |
+
"locative": "Case=Loc",
|
| 13 |
+
"masculine": "Gender=Masc",
|
| 14 |
+
"female": "",
|
| 15 |
+
"canonical": "",
|
| 16 |
+
"plural": "Number=Plur",
|
| 17 |
+
"short": "Variant=Short",
|
| 18 |
+
"imperfective": "Aspect=Imp",
|
| 19 |
+
"form": "",
|
| 20 |
+
"augmentative": "",
|
| 21 |
+
"masculine/feminine": "Gender=Masc|Gender=Fem",
|
| 22 |
+
"superlative": "Degree=Sup",
|
| 23 |
+
"nominative/accusative": "Case=Nom|Case=Acc",
|
| 24 |
+
"third-person": "Person=3",
|
| 25 |
+
"nonstandard": "",
|
| 26 |
+
"genitive": "Case=Gen",
|
| 27 |
+
"feminine": "Gender=Fem",
|
| 28 |
+
"masculine/neuter": "Gender=Masc|Gender=Neut",
|
| 29 |
+
"dative/locative": "Case=Dat|Case=Loc",
|
| 30 |
+
"genitive/accusative/prepositional": "Case=Gen|Case=Acc|Case=Prep",
|
| 31 |
+
"partitive": "Case=Par",
|
| 32 |
+
"genitive/prepositional": "Case=Gen|Case=Prep",
|
| 33 |
+
"equivalent": "",
|
| 34 |
+
"endearing": "",
|
| 35 |
+
"degree": "Degree=",
|
| 36 |
+
"comparative": "Degree=Cmp",
|
| 37 |
+
"imperfective/perfective": "Aspect=Imp|Aspect=Perf",
|
| 38 |
+
"mainly": "",
|
| 39 |
+
"passive": "Voice=Pass",
|
| 40 |
+
"first-person": "Person=1",
|
| 41 |
+
"perfective": "Aspect=Perf",
|
| 42 |
+
"genitive/dative/instrumental/prepositional": "Case=Gen|Case=Dat|Case=Ins|Case=Prep",
|
| 43 |
+
"pejorative": "",
|
| 44 |
+
"accusative": "Case=Acc",
|
| 45 |
+
"spelling": "",
|
| 46 |
+
"dative/partitive": "Case=Dat|Case=Par",
|
| 47 |
+
"old-fashion": "",
|
| 48 |
+
"possessive": "Poss=Yes",
|
| 49 |
+
"dative/instrumental": "Case=Dat|Case=Ins",
|
| 50 |
+
"adverbial": "",
|
| 51 |
+
"neuter": "Gender=Neut",
|
| 52 |
+
"future": "Tense=Fut",
|
| 53 |
+
"neuter/masculine": "Gender=Neut|Gender=Masc",
|
| 54 |
+
"inanimate/animate": "Animacy=Inan|Animacy=Anim",
|
| 55 |
+
"(singular": "Number=Sing",
|
| 56 |
+
"alternative,": "",
|
| 57 |
+
"participle": "VerbForm=Part",
|
| 58 |
+
"genitive/accusative": "Case=Gen|Case=Acc",
|
| 59 |
+
"indicative": "Mood=Ind",
|
| 60 |
+
"dative/accusative": "Case=Dat|Case=Acc",
|
| 61 |
+
"singular/plural": "Number=Sing|Number=Plur",
|
| 62 |
+
"instrumental": "Case=Ins",
|
| 63 |
+
"&": "",
|
| 64 |
+
"vocative": "Case=Voc",
|
| 65 |
+
"prepositional": "Case=Prep",
|
| 66 |
+
"active": "Voice=Act",
|
| 68 |
+
"past": "Tense=Past",
|
| 69 |
+
"first/second/third-person": "Person=1|Person=2|Person=3",
|
| 70 |
+
"second-personal": "Person=2",
|
| 71 |
+
"reflexive": "Reflex=Yes",
|
| 72 |
+
"singular": "Number=Sing",
|
| 73 |
+
"accusative/genitive": "Case=Acc|Case=Gen",
|
| 74 |
+
"acronym": "",
|
| 75 |
+
"(animated)": "Animacy=Anim",
|
| 76 |
+
"euphemistic": "",
|
| 77 |
+
"genitive/dative/prepositional": "Case=Gen|Case=Dat|Case=Prep",
|
| 78 |
+
"colloquial": "",
|
| 79 |
+
"a": "",
|
| 80 |
+
"initialism": "",
|
| 81 |
+
"present": "Tense=Pres",
|
| 82 |
+
"obsolete": "",
|
| 83 |
+
"singulative": "",
|
| 84 |
+
"animate": "Animacy=Anim"
|
| 85 |
+
}
|
dictionary/yo_homographs.json.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4ee777bbbab87f9eac838f370ad92974e079d02b21903e480c54b5f0c8c60d1
|
| 3 |
+
size 5747
|
dictionary/yo_omographs.json.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b91cc78dacb5a43e4d5e2e62efdbe5a57799195e5868db35282bee0d9e215a0d
|
| 3 |
+
size 7949
|
dictionary/yo_words.json.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a19fa89a964a0691d9fe4ee384783e3934904891843d8f59a1c480d67947a82a
|
| 3 |
+
size 548914
|
koziev/rulemma/rulemma.dat
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf2b3ef3ff7a0aa6e4250aa4e9c8ed568e25f825deebdb12dee1b46b785ba9fc
|
| 3 |
+
size 16703198
|
koziev/rulemma/rulemma.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Лемматизатор для R&D прототипирования NLP задач в Питоне
|
| 4 |
+
25.03.2020 добавлена ефикация в get_lemma2
|
| 5 |
+
05.04.2020 добавлено декодирование для частей речи CONJ, PART и PUNCT
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import division
|
| 9 |
+
from __future__ import print_function
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import pickle
|
| 13 |
+
import pathlib
|
| 14 |
+
import gzip
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def decode_pos(pos):
    """Collapse verb-like part-of-speech names onto the single verb label.

    Gerund, verb and infinitive names all map to the verb label; any other
    value is returned unchanged.
    """
    verb_like = (u'ДЕЕПРИЧАСТИЕ', u'ГЛАГОЛ', u'ИНФИНИТИВ')
    return u'ГЛАГОЛ' if pos in verb_like else pos
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class Lemmatizer(object):
    """Dictionary-backed lemmatizer for Russian word forms.

    ``load()`` must be called before any lookup: it populates ``forms``
    (word -> lemma), ``forms2`` (word -> list of ``(lemma, part_of_speech,
    tag)`` triples), ``special_lemmas`` (word -> lemma) and
    ``key2transducer`` (``"<4-char ending>|<part of speech>"`` ->
    ``(chars_to_strip, suffix_to_append)``).
    """

    # Internal part-of-speech names used as keys below.
    _NOUN = u'СУЩЕСТВИТЕЛЬНОЕ'
    _VERB = u'ГЛАГОЛ'
    _ADJECTIVE = u'ПРИЛАГАТЕЛЬНОЕ'
    _ADVERB = u'НАРЕЧИЕ'

    # Tagger part-of-speech label -> internal part-of-speech name.
    _POS_NAMES = {
        u'NOUN': _NOUN,
        u'VERB': _VERB,
        u'ADJ': _ADJECTIVE,
        u'ADV': _ADVERB,
        u'PRON': u'МЕСТОИМЕНИЕ',
        u'ADP': u'ПРЕДЛОГ',
        u'CONJ': u'СОЮЗ',
        u'PART': u'ЧАСТИЦА',
        u'PUNCT': u'ПУНКТУАТОР',
    }

    # Cases shared by nouns and adjectives.
    _CASE_TAGS = {
        u'Case=Nom': (u'ПАДЕЖ', u'ИМ'),
        u'Case=Acc': (u'ПАДЕЖ', u'ВИН'),
        u'Case=Dat': (u'ПАДЕЖ', u'ДАТ'),
        u'Case=Ins': (u'ПАДЕЖ', u'ТВОР'),
        u'Case=Prep': (u'ПАДЕЖ', u'ПРЕДЛ'),
        # Locative is reported as prepositional (the original code carried a
        # dated note, 03-02-2020, mentioning u'МЕСТ' here).
        u'Case=Loc': (u'ПАДЕЖ', u'ПРЕДЛ'),
        u'Case=Gen': (u'ПАДЕЖ', u'РОД'),
    }

    # Number and gender, shared by nouns, adjectives and verbs.
    _NUMBER_GENDER_TAGS = {
        u'Number=Sing': (u'ЧИСЛО', u'ЕД'),
        u'Number=Plur': (u'ЧИСЛО', u'МН'),
        u'Gender=Masc': (u'РОД', u'МУЖ'),
        u'Gender=Fem': (u'РОД', u'ЖЕН'),
        u'Gender=Neut': (u'РОД', u'СР'),
    }

    # Degrees of comparison, shared by adjectives and adverbs.
    _DEGREE_TAGS = {
        u'Degree=Cmp': (u'СТЕПЕНЬ', u'СРАВН'),
        u'Degree=Pos': (u'СТЕПЕНЬ', u'АТРИБ'),
    }

    _NOUN_TAGS = dict(_CASE_TAGS)
    _NOUN_TAGS.update(_NUMBER_GENDER_TAGS)
    _NOUN_TAGS[u'Case=Voc'] = (u'ПАДЕЖ', u'ЗВАТ')

    _ADJECTIVE_TAGS = dict(_CASE_TAGS)
    _ADJECTIVE_TAGS.update(_NUMBER_GENDER_TAGS)
    _ADJECTIVE_TAGS.update(_DEGREE_TAGS)

    _VERB_TAGS = dict(_NUMBER_GENDER_TAGS)
    _VERB_TAGS.update({
        u'Mood=Ind': (u'НАКЛОНЕНИЕ', u'ИЗЪЯВ'),
        u'Mood=Imp': (u'НАКЛОНЕНИЕ', u'ПОБУД'),
        u'Tense=Past': (u'ВРЕМЯ', u'ПРОШЕДШЕЕ'),
        u'Tense=Fut': (u'ВРЕМЯ', u'БУДУЩЕЕ'),
        u'Tense=Notpast': (u'ВРЕМЯ', u'НАСТОЯЩЕЕ'),
        u'Tense=Pres': (u'ВРЕМЯ', u'НАСТОЯЩЕЕ'),
        u'Person=1': (u'ЛИЦО', u'1'),
        u'Person=2': (u'ЛИЦО', u'2'),
        u'Person=3': (u'ЛИЦО', u'3'),
    })

    # Verb tags that are accepted but produce no decoded tag.
    _IGNORED_VERB_TAGS = (u'VerbForm=Fin', u'VerbForm=Inf', u'VerbForm=Conv')

    # Adjective tags that flip the short-form marker to u'1'.
    _SHORT_FORM_TAGS = (u'Variant=Short', u'Variant=Brev')

    # Only these parts of speech have their "Name=Value" tags decoded.
    _TAGS_BY_POS = {
        _NOUN: _NOUN_TAGS,
        _ADJECTIVE: _ADJECTIVE_TAGS,
        _VERB: _VERB_TAGS,
        _ADVERB: _DEGREE_TAGS,
    }

    def __init__(self):
        pass

    def load(self, dict_path=None):
        """Load the lemmatization model.

        :param dict_path: path to the gzip-compressed pickle. When ``None``,
            ``../tmp/rulemma.dat`` relative to this module is tried first and
            ``rulemma.dat`` next to this module is used otherwise.
        """
        dict_filename = 'rulemma.dat'
        if dict_path is None:
            module_folder = str(pathlib.Path(__file__).resolve().parent)
            p = os.path.join(module_folder, '../tmp', dict_filename)
            if not os.path.exists(p):
                p = os.path.join(module_folder, dict_filename)
        else:
            p = dict_path

        # SECURITY NOTE(review): unpickling executes arbitrary code embedded
        # in the file. Only load dictionaries from a trusted source.
        with gzip.open(p, 'rb') as f:
            self.forms, self.forms2, self.special_lemmas, self.key2transducer = pickle.load(f)

    def get_lemma(self, word):
        """Return the lemma of ``word`` without using part-of-speech tags.

        Lookup order: ``forms``, then the first candidate in ``forms2``, then
        ``special_lemmas``. The word itself is returned when it is unknown.
        The word is looked up as given (no case or ``ё`` normalization).
        """
        if word in self.forms:
            return self.forms[word]
        if word in self.forms2:
            # Entries are (lemma, part_of_speech, tag) triples; return only
            # the lemma so every branch yields a string.
            return self.forms2[word][0][0]
        if word in self.special_lemmas:
            return self.special_lemmas[word]
        return word

    def decode_pos_tags(self, pos_tags):
        """Decode a ``|``-separated tag string.

        :param pos_tags: e.g. ``u'NOUN|Number=Plur|Case=Nom'``.
        :return: ``(part_of_speech, tags)`` where ``tags`` is a list of
            ``(category, value)`` pairs in input order. ``part_of_speech`` is
            ``u'unk'`` when no known part-of-speech label is present.
            Adjectives always get a leading ``(u'КРАТКИЙ', u'0' or u'1')``
            pair.
        :raises NotImplementedError: unknown ``Name=Value`` tag for a noun,
            adjective or adverb.
        :raises RuntimeError: unknown ``Name=Value`` tag for a verb.
        """
        stags1 = []
        part_of_speech = u'unk'
        short_tag_index = -1

        for tag in pos_tags.split('|'):
            if tag in self._POS_NAMES:
                part_of_speech = self._POS_NAMES[tag]
                if part_of_speech == self._ADJECTIVE:
                    # Remember where the marker lives instead of assuming
                    # index 0, so earlier entries are never overwritten.
                    short_tag_index = len(stags1)
                    stags1.append((u'КРАТКИЙ', u'0'))
                continue

            if '=' not in tag:
                # Unknown bare labels are ignored.
                continue

            known_tags = self._TAGS_BY_POS.get(part_of_speech)
            if known_tags is None:
                # Grammatical tags of other parts of speech are not decoded.
                continue

            if tag in known_tags:
                stags1.append(known_tags[tag])
            elif part_of_speech == self._ADJECTIVE and tag in self._SHORT_FORM_TAGS:
                stags1[short_tag_index] = (u'КРАТКИЙ', u'1')
            elif part_of_speech == self._VERB and tag in self._IGNORED_VERB_TAGS:
                pass
            else:
                msg = u'неизвестный тэг "{}"'.format(tag)
                if part_of_speech == self._VERB:
                    raise RuntimeError(msg)
                raise NotImplementedError(msg)

        return part_of_speech, stags1

    def get_lemma2(self, word, pos_tags):
        """Return ``(lemma, part_of_speech, decoded_tags)`` for a tagged word.

        The word is lower-cased and ``ё`` is replaced by ``е`` before the
        dictionary lookups. For words with several candidates (``forms2``)
        the part of speech, and for nouns also the case, selects the lemma.
        Unknown words longer than four characters are handled by the
        ending-based transducer table; the normalized word is the fallback.
        """
        part_of_speech, decoded_tags = self.decode_pos_tags(pos_tags)

        nword = word.lower().replace(u'ё', u'е')

        if nword in self.special_lemmas:
            return self.special_lemmas[nword], part_of_speech, decoded_tags

        if nword in self.forms:
            return self.forms[nword], part_of_speech, decoded_tags
        elif nword in self.forms2:
            if part_of_speech == self._NOUN:
                # For nouns the case disambiguates between candidates.
                required_case = next(
                    (value for name, value in decoded_tags if name == u'ПАДЕЖ'),
                    None)
                for lemma, lemma_part_of_speech, case in self.forms2[nword]:
                    if lemma_part_of_speech == part_of_speech and case == required_case:
                        return lemma, part_of_speech, decoded_tags
            else:
                for lemma, lemma_part_of_speech, _ in self.forms2[nword]:
                    if lemma_part_of_speech == part_of_speech:
                        return lemma, part_of_speech, decoded_tags
        elif len(word) > 4:
            # Out-of-vocabulary word: rewrite the ending with the transducer
            # registered for this 4-character ending and part of speech.
            key = nword[-4:] + u'|' + part_of_speech
            if key in self.key2transducer:
                strip_count, suffix = self.key2transducer[key][0], self.key2transducer[key][1]
                if strip_count > 0:
                    lemma = word[:-strip_count] + suffix
                else:
                    lemma = word + suffix
                return lemma.lower(), part_of_speech, decoded_tags

        # Fallback: the normalized word itself serves as the lemma.
        return nword, part_of_speech, decoded_tags

    def lemmatize(self, tagged_words):
        """Extend each ``(word, tags)`` pair with the ``get_lemma2`` result.

        :return: list of ``(word, tags, lemma, part_of_speech, decoded_tags)``.
        """
        return [(word, tags,) + tuple(self.get_lemma2(word, tags)) for (word, tags) in tagged_words]
|
koziev/rupostagger/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import absolute_import
|
| 2 |
+
from .rupostagger import RuPosTagger
|
| 3 |
+
from .rupostagger import run_tests
|
koziev/rupostagger/database/ruword2tags.db
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a06848e656bef642aafb4440c03554fa78f2f32dde92ea66f3f86ce9977b167e
|
| 3 |
+
size 168816640
|
koziev/rupostagger/rupostagger.config
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"w2v_filename": "w2v.CBOW=1_WIN=5_DIM=64.bin",
|
| 3 |
+
"wc2v_filename": "wordchar2vector.dat",
|
| 4 |
+
"winspan": 3,
|
| 5 |
+
"use_w2v": false,
|
| 6 |
+
"use_gren": true,
|
| 7 |
+
"use_syllabs": false,
|
| 8 |
+
"use_shingles": false,
|
| 9 |
+
"ending_len": 0,
|
| 10 |
+
"model_filename": "rupostagger.model"
|
| 11 |
+
}
|
koziev/rupostagger/rupostagger.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21b7b0bfd7427b5fdc1604052176db8aa3b139b3ce03be440cfce48536f8e5ef
|
| 3 |
+
size 2417464
|
koziev/rupostagger/rupostagger.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Модель частеречной разметки для русскоязычных текстов (проект https://github.com/Koziev/rupostagger)
|
| 4 |
+
03.08.2019 небольшой баг с нормализацией (замена "ё" на "е") перед поиском в грамматическом словаре
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import print_function
|
| 8 |
+
from __future__ import division # for python2 compatibility
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import json
|
| 12 |
+
import pathlib
|
| 13 |
+
import re
|
| 14 |
+
|
| 15 |
+
import pycrfsuite
|
| 16 |
+
from .ruword2tags import RuWord2Tags
|
| 17 |
+
from .rusyllab import split_word
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Sentinel tokens for the start and the end of a sentence.
BEG_TOKEN = '<beg>'
END_TOKEN = '<end>'

# Each sentinel token is tagged with itself.
token2tag = dict([(BEG_TOKEN, BEG_TOKEN), (END_TOKEN, END_TOKEN)])


def is_num(token):
    """Return a match object when ``token`` is made of ASCII digits only.

    ``None`` is returned otherwise, so the result is meant to be used in a
    boolean context. Because the pattern ends with ``$``, one trailing
    newline after the digits is still accepted.
    """
    digits_only = '^[0-9]+$'
    return re.match(digits_only, token)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class RuPosTagger(object):
    """Part-of-speech tagger for Russian built on a CRF model.

    ``load()`` reads ``rupostagger.config``, the word-to-tags dictionary and
    the CRF model; ``tag()`` then labels a list of tokens.
    """

    def __init__(self):
        # Placeholders; the real values come from the config in load().
        self.winspan = -1
        self.use_w2v = -1
        self.use_gren = -1
        self.use_syllabs = -1
        self.ending_len = -1
        self.word2tags = None

    def load(self, word2tags_path=None):
        """Load the configuration, the grammatical dictionary and the model.

        Data files are searched in ``../tmp`` relative to this module first;
        when no config is found there, the module folder itself is used.

        :param word2tags_path: forwarded to ``RuWord2Tags.load``.
        """
        config_filename = 'rupostagger.config'
        default_model_filename = 'rupostagger.model'

        module_folder = str(pathlib.Path(__file__).resolve().parent)
        data_folder = os.path.join(module_folder, '../tmp')

        config_path = os.path.join(data_folder, config_filename)
        if not os.path.exists(config_path):
            data_folder = module_folder
            config_path = os.path.join(data_folder, config_filename)

        with open(config_path, 'r') as rdr:
            self.config = json.load(rdr)

        self.winspan = self.config['winspan']
        self.use_gren = self.config['use_gren']
        self.use_w2v = self.config['use_w2v']
        self.use_syllabs = self.config['use_syllabs']
        self.ending_len = self.config['ending_len']

        self.word2tags = RuWord2Tags()
        self.word2tags.load(word2tags_path)

        # Honor the model name given in the config, but keep working with
        # the historical hard-coded name if that file is not present.
        model_filename = self.config.get('model_filename', default_model_filename)
        model_path = os.path.join(data_folder, model_filename)
        if not os.path.exists(model_path):
            model_path = os.path.join(data_folder, default_model_filename)

        self.tagger = pycrfsuite.Tagger()
        self.tagger.open(model_path)

    @staticmethod
    def __normalize_word(word):
        """Normalize a word for dictionary lookup.

        Spaces around a hyphen are removed, the word is lower-cased and ``ё``
        is replaced by ``е``. Lower-casing happens first so that an
        upper-case ``Ё`` is replaced as well.
        """
        return word.replace(' - ', '-').lower().replace(u'ё', u'е')

    def get_word_features(self, word, prefix):
        """Build the CRF features of one token.

        :param word: non-empty token.
        :param prefix: window position label embedded in every feature name.
        :return: list of ``(feature_name, 1.0)`` pairs.
        """
        assert(len(word) > 0)

        if word in token2tag:
            return [(u'tag[{}]={}'.format(prefix, token2tag[word]), 1.0)]

        if is_num(word):
            return [(u'tag[{}]=<num> tag[{}]=<num_{}>'.format(prefix, prefix, word[-1]), 1.0)]

        if len(word) == 1 and word[0] in u'‼≠™®•·[¡+<>`~;.,‚?!-…№”“„{}|‹›/\'"–—_:«»*]()‘’≈':
            return [(u'tag[{}]=punct_{}'.format(prefix, ord(word[0])), 1.0)]

        first_char = word[0]
        if first_char in u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
            # Words starting with a Latin letter get a single marker feature.
            return [(u'word[{}]=<latin>'.format(prefix), 1.0)]

        features = []
        uword = self.__normalize_word(word)

        if first_char in u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ':
            features.append((u'word[{}]=<upper1>'.format(prefix), 1.0))

        if self.ending_len > 0:
            # '~' marks a truncated ending; short words are used whole.
            if len(uword) > self.ending_len:
                ending = '~' + uword[-self.ending_len:]
            else:
                ending = uword
            features.append((u'ending[{}]={}'.format(prefix, ending), 1.0))

        if self.use_syllabs and first_char.lower() in u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя':
            syllabs = split_word(uword)
            if len(syllabs) == 1:
                features.append((u'slb[{}]={}'.format(prefix, syllabs[0] + '~'), 1.0))
            elif len(syllabs) > 1:
                # '~' marks the side(s) on which a syllable has neighbours.
                features.append((u'slb[{}]={}'.format(prefix, syllabs[0] + '~'), 1.0))
                for middle in syllabs[1:-1]:
                    features.append((u'slb[{}]={}'.format(prefix, '~' + middle + '~'), 1.0))
                features.append((u'slb[{}]={}'.format(prefix, '~' + syllabs[-1]), 1.0))

        if self.use_gren:
            # Union of all space-separated tags over every dictionary entry.
            tags = set()
            for tagset in self.word2tags[uword]:
                tags.update(tagset.split(' '))
            for tag in tags:
                features.append((u'tag[{}]={}'.format(prefix, tag), 1.0))

        return features

    def vectorize_sample(self, words):
        """Return one feature dict per token, using a +/- ``winspan`` window.

        Window positions that fall outside the sentence contribute a
        ``<beg>`` or ``<end>`` marker feature.
        """
        nb_words = len(words)
        samples = []
        for iword in range(nb_words):
            word_features = dict()
            for offset in range(-self.winspan, self.winspan + 1):
                neighbour = iword + offset
                if neighbour < 0:
                    word_features.update([('word[{}]=<beg>'.format(offset), 1.0)])
                elif neighbour >= nb_words:
                    word_features.update([('word[{}]=<end>'.format(offset), 1.0)])
                else:
                    word_features.update(self.get_word_features(words[neighbour], str(offset)))
            samples.append(word_features)
        return samples

    def tag(self, words):
        """Tag a list of tokens.

        :return: ``zip`` of ``(word, label)`` pairs (a lazy iterator on
            Python 3). No sentence-boundary tokens are added to the input.
        """
        X = self.vectorize_sample(words)
        y_pred = self.tagger.tag(X)
        return zip(words, y_pred)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def test1(tagger, phrase, required_labels):
|
| 140 |
+
pred_labels = list(tagger.tag(phrase.split()))
|
| 141 |
+
assert(len(required_labels.split()) == len(pred_labels))
|
| 142 |
+
for required_label, (word, pred_label) in zip(required_labels.split(), pred_labels):
|
| 143 |
+
for tag in required_label.split('|'):
|
| 144 |
+
if tag not in pred_label:
|
| 145 |
+
print(u'Error: phrase={} word={} required_label={} pred_label={}'.format(phrase, word, required_label, pred_label))
|
| 146 |
+
return False
|
| 147 |
+
|
| 148 |
+
return True
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def run_tests():
    """Load the tagger and run it over a fixed set of sample phrases.

    Prints ``Tests FAILED`` and stops at the first phrase whose predicted
    labels lack a required tag; prints ``Tests PASSED OK`` otherwise.
    """
    tagger = RuPosTagger()
    tagger.load()

    # (phrase, required labels) pairs; labels are matched per token.
    cases = [
        (u'Кошки спят', u'NOUN|Number=Plur|Case=Nom VERB|Mood=Ind|Number=Plur|Person=3|Tense=Notpast|VerbForm=Fin'),
        (u'Я рою колодец', u'PRON VERB NOUN|Number=Sing|Case=Acc'),
        (u'Я мою окно', u'PRON VERB NOUN|Number=Sing|Case=Acc'),
        (u'Ира мыла окно', u'NOUN|Case=Nom VERB NOUN|Number=Sing|Case=Acc'),
        (u'Возьми мою пилу', u'VERB ADJ|Case=Acc NOUN|Case=Acc'),
        (u'рой колодец', u'VERB NOUN|Number=Sing|Case=Acc'),
        (u'У меня живёт черепаха', u'ADP PRON VERB NOUN'),
        (u'какую еду ты любишь ?', u'ADJ NOUN PRON VERB PUNCT'),
    ]

    for phrase, required_labels in cases:
        if test1(tagger, phrase, required_labels):
            continue
        print('Tests FAILED')
        return

    print('Tests PASSED OK')


if __name__ == '__main__':
    run_tests()
|
| 173 |
+
|
koziev/rupostagger/rusyllab.py
ADDED
|
@@ -0,0 +1,589 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
# autogenerated 2019-01-19 10:52:09.746954
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def V(c):
    """
    Tell whether ``c`` is a single Russian vowel letter (upper or lower case).

    :param c: unicode string to classify
    :return: True only for a one-character string that is a vowel

    The length guard is needed because ``in`` on strings is a substring test:
    without it the empty string and runs such as u"оу" would be reported as vowels.
    """
    return len(c) == 1 and c in u"АЕЁИОУЫЭЮЯаеёиоуыэюя"
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def C(c):
    """
    Tell whether ``c`` is a single Russian consonant letter (upper or lower case).

    The letter "Й" and the signs "Ъ"/"Ь" are not in this set.

    :param c: unicode string to classify
    :return: True only for a one-character string that is a consonant

    The length guard is needed because ``in`` on strings is a substring test:
    without it the empty string and runs such as u"бв" would be reported as consonants.
    """
    return len(c) == 1 and c in u"БВГДЖЗКЛМНПРСТФХЦЧШЩбвгджзклмнпрстфхцчшщ"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def S(c):
    """
    Tell whether ``c`` is the single letter "Й" (upper or lower case).

    :param c: unicode string to classify
    :return: True only for u"Й" or u"й"

    The length guard is needed because ``in`` on strings is a substring test:
    without it the empty string and u"Йй" would also be accepted.
    """
    return len(c) == 1 and c in u"Йй"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def M(c):
    """
    Tell whether ``c`` is a single hard or soft sign ("Ъ" / "Ь", upper or lower case).

    :param c: unicode string to classify
    :return: True only for a one-character string that is one of the two signs

    The length guard is needed because ``in`` on strings is a substring test:
    without it the empty string and runs such as u"Ьъ" would also be accepted.
    """
    return len(c) == 1 and c in u"ЪЬъь"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def BEG(c):
    """Return True when ``c`` equals u"[", the marker split() puts in front of the word."""
    is_begin_marker = (u"[" == c)
    return is_begin_marker
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def END(c):
    """Return True when ``c`` equals u"]", the marker split() appends after the word."""
    is_end_marker = (u"]" == c)
    return is_end_marker
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def split(s):
    """
    Cut a word into pieces by repeatedly applying the rules of apply1().

    The word is wrapped into u"[" ... u"]" markers and turned into a list of
    single characters. At every cursor position the remaining tail is offered
    to apply1(); a match replaces the consumed prefix of the tail with the
    rule's output and moves the cursor by the amount the rule asks for, a miss
    moves the cursor one item forward.

    :param s: unicode string with the word
    :return: list of unicode strings (the markers are stripped)
    """
    symbols = list(u"[" + s + u"]")
    pos = 0
    while pos < len(symbols):
        tail = symbols[pos:]
        match = apply1(tail)
        if match is None:
            pos += 1
            continue

        replacement, consumed, advance = match
        symbols = symbols[:pos] + replacement + tail[consumed:]
        pos += advance

    return symbols[1:-1]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def apply1(s):
    """
    Try to match one syllabification rule at the head of ``s``.

    This is an autogenerated decision tree: it classifies s[0], s[1], ... with
    C / V / S / M / BEG / END and returns as soon as a leaf rule is reached.
    Each leaf is tagged with its rule id (``# SYLLABER_n``).

    :param s: list of unicode strings; split() passes the tail of its item list,
              which ends with the u"]" marker
    :return: tuple ``(items, consumed, advance)`` where ``items`` is the list that
             replaces the first ``consumed`` elements of ``s`` and ``advance`` is how
             far split() moves its cursor afterwards; ``None`` (implicit) when no
             rule matches
    """
    if C(s[0]):
        if V(s[1]):
            if C(s[2]):
                if V(s[3]):
                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_1

                if C(s[3]):
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_5

                    if C(s[4]):
                        if C(s[5]):
                            if END(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_11

                            if not END(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_12

                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_36

                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_120

                        if M(s[5]):
                            if END(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_330

                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_52

                    if M(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_76

                        if C(s[5]):
                            if V(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_250

                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_260

                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_6

                if M(s[3]):
                    if C(s[4]):
                        if not END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_13

                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_39

                        # NOTE(review): the two tests above return for both END and
                        # not-END, so this branch looks unreachable.
                        if C(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_350

                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_14

                    if V(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_20

            if END(s[2]):
                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_7

            if S(s[2]):
                if C(s[3]):
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_8

                    if C(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_9

                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_280

                    if M(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_400

                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_10

                return ([s[0]+s[1]+s[2]], 3, 1) # SYLLABER_64

            if V(s[2]):
                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_31

        if C(s[1]):
            if C(s[2]):
                if V(s[3]):
                    if C(s[4]):
                        if C(s[5]):
                            if V(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_2

                            if M(s[6]):
                                if END(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_310

                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_3

                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_4

                        if M(s[5]):
                            if C(s[6]):
                                if M(s[7]):
                                    if END(s[8]):
                                        return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6]+s[7], s[8]], 9, 1) # SYLLABER_300

                            return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1) # SYLLABER_200

                    if S(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1) # SYLLABER_54

                    if V(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_68

                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_170

                    return ([s[0]+s[1]+s[2]+s[3]], 4, 1) # SYLLABER_210

                if C(s[3]):
                    if V(s[4]):
                        if S(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1) # SYLLABER_220

                        return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1) # SYLLABER_98

            if V(s[2]):
                if C(s[3]):
                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_15

                        if C(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_370

                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_80

                        if M(s[5]):
                            if V(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_340

                            if C(s[6]):
                                if V(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1) # SYLLABER_390

                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_470

                    if M(s[4]):
                        if not C(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_21

                        if C(s[5]):
                            if V(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_48

                            if C(s[6]):
                                if V(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6], s[7]], 8, 1) # SYLLABER_240

                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_62

                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_230

                if V(s[3]):
                    if C(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_17

                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_82

                if S(s[3]):
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_33

                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_92

                        if C(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_450

                    return ([s[0]+s[1]+s[2]+s[3]], 4, 1) # SYLLABER_190

                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_66

            if M(s[2]):
                if V(s[3]):
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_410

                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_480

        if M(s[1]):
            if V(s[2]):
                if C(s[3]):
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_16

                    if C(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_19

                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_290

                        if C(s[5]):
                            if C(s[6]):
                                if V(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1) # SYLLABER_430

                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_22

                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_94

            if C(s[2]):
                if V(s[3]):
                    if S(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_320

                    if V(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_360


    if V(s[0]):
        if C(s[1]):
            if C(s[2]):
                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_18

                if V(s[3]):
                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_28

                if C(s[3]):
                    if V(s[4]):
                        if C(s[5]):
                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_96

                        return ([s[0]+s[1], s[2], s[3], s[4]], 5, 1) # SYLLABER_50

                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_460

                if M(s[3]):
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_72

            if V(s[2]):
                return ([s[0], s[1], s[2]], 3, 1) # SYLLABER_35

            if M(s[2]):
                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_40

                if C(s[3]):
                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_42

                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_84

                if V(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_78

            if END(s[2]):
                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_44

            return ([s[0]+s[1]], 2, 1) # SYLLABER_56

        if END(s[1]):
            return ([s[0], s[1]], 2, 1) # SYLLABER_30

        if V(s[1]):
            return ([s[0], s[1]], 2, 1) # SYLLABER_34

        if S(s[1]):
            if END(s[2]):
                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_46

            if C(s[2]):
                if V(s[3]):
                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_180


    # Rules anchored at the word start keep the u"[" marker as its own item,
    # which is why they advance the cursor by 2 instead of 1.
    if BEG(s[0]):
        if C(s[1]):
            if C(s[2]):
                if V(s[3]):
                    if C(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_23

                        if C(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_60

                            if M(s[6]):
                                if END(s[7]):
                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_74

                    if S(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_24

                    if END(s[4]):
                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_27

                if END(s[3]):
                    return ([s[0], s[1]+s[2], s[3]], 4, 2) # SYLLABER_70

                if C(s[3]):
                    if C(s[4]):
                        if V(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_88

                    if V(s[4]):
                        if C(s[5]):
                            if M(s[6]):
                                if END(s[7]):
                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_90

                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_140

            if V(s[2]):
                if C(s[3]):
                    if C(s[4]):
                        if M(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_26

                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_37

                    if M(s[4]):
                        if C(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_440

                if S(s[3]):
                    if C(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_160

            if END(s[2]):
                return ([s[0], s[1], s[2]], 3, 2) # SYLLABER_32

            if M(s[2]):
                if C(s[3]):
                    if V(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_58

                        if C(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_100

                            if V(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 2) # SYLLABER_420

                if V(s[3]):
                    if END(s[4]):
                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_86

                    if S(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_110

                    if C(s[4]):
                        if M(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_150

        if V(s[1]):
            if C(s[2]):
                if M(s[3]):
                    if END(s[4]):
                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_25

                if END(s[3]):
                    return ([s[0], s[1]+s[2], s[3]], 4, 2) # SYLLABER_29

                if C(s[3]):
                    if C(s[4]):
                        if C(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_130

        if S(s[1]):
            if V(s[2]):
                if C(s[3]):
                    if V(s[4]):
                        return ([s[0], s[1]+s[2], s[3], s[4]], 5, 2) # SYLLABER_380
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
if __name__ == "__main__":
    # Smoke check: print the pieces of a sample word joined with "|".
    print(u"|".join(split(u"спросил")))
|
| 567 |
+
|
| 568 |
+
def split_word(word):
    """
    Break one word into syllables (thin wrapper around split()).

    :param word: unicode string holding a Russian word
    :return: list of unicode strings as produced by split()
    """
    syllables = split(word)
    return syllables
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
def split_words(words):
    """
    Syllabify every word and chain the results into one flat token list,
    putting a single-space token between consecutive words.

    :param words: list of words (unicode strings)
    :return: list of tokens - syllables and u' ' separators
    """
    result = []
    for current in words:
        pieces = split(current)
        # The separator depends on whether anything has been collected so far,
        # not on the position of the word in the input.
        if result:
            result.append(u' ')
        result.extend(pieces)
    return result
|
koziev/rupostagger/ruword2tags.dat
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dde47b5f1d48ff899887ac07812dcabd2966e48e84646f3065bfd06627c2af58
|
| 3 |
+
size 9683765
|
koziev/rupostagger/ruword2tags.py
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
19.04.2019 - при парсинге словарной базы Solarix пропускаются словоформы с
|
| 4 |
+
отрицательным скорингом (неупотребимые слова).
|
| 5 |
+
|
| 6 |
+
26-10-2019 - переход на хранение части словарной базы в SQLite3
|
| 7 |
+
|
| 8 |
+
17-06-2020 refs #1 возникает ошибка при работе из нескольких тредов, добавил check_same_thread=False
|
| 9 |
+
|
| 10 |
+
13.06.2022 если файла БД ruword2tags.db нет, скачаем его и оставим в домашнем каталоге пользователя
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import gzip
|
| 14 |
+
import pathlib
|
| 15 |
+
import os
|
| 16 |
+
import pickle
|
| 17 |
+
import io
|
| 18 |
+
import argparse
|
| 19 |
+
import sqlite3
|
| 20 |
+
import threading
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def create_trie_node(char):
    """
    Build an empty trie node.

    :param char: the character this node stands for
    :return: tuple ``(char, tagset indices list, children dict keyed by next char)``
    """
    tagset_indices = []
    children = {}
    return (char, tagset_indices, children)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def add_to_trie_node(node, next_chars, tagset_index):
    """
    Register ``tagset_index`` for the path ``next_chars`` below ``node``.

    Missing nodes along the path are created with create_trie_node(); the index
    is appended to the list of the node where the path ends.

    :param node: trie node tuple to start from
    :param next_chars: remaining characters of the word (may be empty)
    :param tagset_index: value appended to the final node's index list
    """
    current = node
    for ch in next_chars:
        children = current[2]
        if ch not in children:
            children[ch] = create_trie_node(ch)
        current = children[ch]

    current[1].append(tagset_index)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def find_tagsets_in_trie_node(node, word):
    """
    Look up the tagset indices stored for ``word`` below ``node``.

    :param node: trie node tuple to start from
    :param word: characters to follow
    :return: for an empty ``word`` the node's own index list (the same object);
             otherwise a new list with the indices stored at the end of the path,
             or an empty list when the path does not exist
    """
    if not word:
        return node[1]

    current = node
    for ch in word:
        children = current[2]
        if ch not in children:
            return []
        current = children[ch]

    # A non-empty lookup always hands back a fresh list, never the stored one.
    return list(current[1])
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def trie_constructed(trie_node, tagset2id):
    """
    Convert a trie whose nodes carry index lists into one whose nodes carry a
    single id per distinct (sorted) index combination.

    :param trie_node: node tuple ``(char, index list, children dict)``
    :param tagset2id: dict from sorted index tuple to id; updated in place, new
                      combinations get ``len(tagset2id) + 1``
    :return: new node tuple ``(char, id, converted children dict)``
    """
    key = tuple(sorted(trie_node[1]))
    # The default is evaluated before insertion, so a new key gets "current size + 1".
    id_tagsets = tagset2id.setdefault(key, len(tagset2id) + 1)

    converted = {
        next_char: trie_constructed(child, tagset2id)
        for next_char, child in trie_node[2].items()
    }

    return (trie_node[0], id_tagsets, converted)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class RuWord2Tags:
    """
    Lookup of tagset strings for a word form.

    Data comes from two places, both located next to this module: a pickled
    dictionary (ending tables and tagset tables) and a SQLite database with a
    ``word_tagsets`` table holding per-word tagset ids. Call load() once, then
    index the object with a word; the result is a generator of tagset strings.
    """
    dict_filename = 'ruword2tags.dat'

    def __init__(self):
        # ``ending_len`` is the historical attribute name; it is kept so existing
        # code that reads it does not break. load() fills ``ending_lens``.
        self.ending_len = None
        self.ending_lens = None
        self.index2tagset = None
        self.ending2tagsets = None
        self.trie_root = None
        self.all_ending2tagsets = None
        self.id2tagsets = None
        self.trie_tagsets = None
        self.db_filepath = None
        self.cnx = None
        self.cur = None
        self.lock = threading.Lock()
        self.word2tagsets_cache = dict()

    def load(self, dict_path=None):
        """
        Open the SQLite database and read the pickled tables.

        :param dict_path: accepted for interface compatibility; not used, both
                          files are always taken from the module's own folder
        :raises RuntimeError: when the database connection cannot be opened
        """
        module_folder = str(pathlib.Path(__file__).resolve().parent)
        self.db_filepath = os.path.join(module_folder, 'database', 'ruword2tags.db')
        try:
            # 17-06-2020 refs #1: using the object from several threads raised an
            # error, hence check_same_thread=False.
            self.cnx = sqlite3.connect(self.db_filepath, check_same_thread=False)
        except Exception as ex:
            msg = u'Could not open db file "{}", error: {}'.format(self.db_filepath, ex)
            raise RuntimeError(msg) from ex

        self.cnx.isolation_level = None
        self.cur = self.cnx.cursor()

        # NOTE(review): pickle.load can execute arbitrary code from the file; this
        # is acceptable only as long as the file is the one stored beside the module.
        with open(os.path.join(module_folder, self.dict_filename), 'rb') as f:
            data = pickle.load(f)
            self.ending_lens = data['ending_lens']
            self.index2tagset = data['index2tagset']
            self.ending2tagsets = data['ending2tagsets']
            self.all_ending2tagsets = data['all_ending2tagsets']
            self.id2tagsets = data['id2tagsets']

    def __getitem__(self, word):
        """
        Generate the tagset strings known for ``word``.

        Lookup order: (1) the ``ending2tagsets`` table, (2) the per-word cache and
        the ``word_tagsets`` database table, (3) the ``all_ending2tagsets`` table
        as a fallback. A later stage runs only when the earlier ones yielded nothing.

        :param word: word form to look up
        :return: generator of tagset strings (possibly empty)
        """
        hit = False

        # Stage 1: the first ending length whose suffix is listed decides.
        for ending_len in self.ending_lens:
            ending = word[-ending_len:] if len(word) > ending_len else u''
            if ending in self.ending2tagsets:
                for itagset in self.ending2tagsets[ending]:
                    yield self.index2tagset[itagset]
                    hit = True
                break

        # Stage 2: exact word lookup, cache first, database second.
        if not hit:
            if word in self.word2tagsets_cache:
                found_ids = [self.word2tagsets_cache[word]]
            else:
                # The lock protects the shared cursor and the cache when the object
                # is used from several threads. All rows are fetched before leaving
                # the block: yielding while holding the lock would keep it locked
                # for as long as the caller leaves the generator unfinished.
                with self.lock:
                    rows = self.cur.execute('SELECT id_tagsets FROM word_tagsets WHERE word=:word',
                                            {'word': word}).fetchall()
                    found_ids = [int(r[0]) for r in rows]
                    if found_ids:
                        # Same as before: the cache remembers the last row's id.
                        self.word2tagsets_cache[word] = found_ids[-1]

            for id_tagsets in found_ids:
                for itagset in self.id2tagsets[id_tagsets]:
                    yield self.index2tagset[itagset]
                    hit = True

        # Stage 3: fallback on the complete ending table, lengths in reverse order.
        if not hit:
            for ending_len in reversed(self.ending_lens):
                ending = word[-ending_len:] if len(word) > ending_len else u''
                if ending in self.all_ending2tagsets:
                    for itagset in self.all_ending2tagsets[ending]:
                        yield self.index2tagset[itagset]
                        hit = True
                    break
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def run_tests(dict_path=None):
    """
    Self-check: load the model and compare its output with reference tagsets.

    For every reference word the number of predicted tagsets must equal the
    number of expected ones and every predicted tagset must be among them.

    :param dict_path: forwarded to RuWord2Tags.load()
    :raises AssertionError: on the first mismatch
    """
    print('Start testing...')
    tagger = RuWord2Tags()
    tagger.load(dict_path)

    cases = [
        (u'очень', [u'НАРЕЧИЕ СТЕПЕНЬ=АТРИБ ТИП_МОДИФ=ГЛАГ ТИП_МОДИФ=НАРЕЧ ТИП_МОДИФ=ПРИЛ']),
        (u'поскорее', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН ТИП_МОДИФ=ГЛАГ']),
        (u'поскорей', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН ТИП_МОДИФ=ГЛАГ']),
        (u'сильнее', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН',
                      u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 СТЕПЕНЬ=СРАВН']),
        (u'синее', [u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 ПАДЕЖ=ВИН РОД=СР СТЕПЕНЬ=АТРИБ ЧИСЛО=ЕД',
                    u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 ПАДЕЖ=ИМ РОД=СР СТЕПЕНЬ=АТРИБ ЧИСЛО=ЕД']),
        (u'трахее', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ДАТ РОД=ЖЕН ЧИСЛО=ЕД',
                     u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ПРЕДЛ РОД=ЖЕН ЧИСЛО=ЕД']),
        (u'полдня', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ИМ ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД',
                     u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ВИН ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД',
                     u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=РОД ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД']),
        (u'а', [u'СОЮЗ', u'ЧАСТИЦА']),
        (u'кошки', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=ИМ РОД=ЖЕН ЧИСЛО=МН',
                    u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=РОД РОД=ЖЕН ЧИСЛО=ЕД']),
        # Two further readings of this word are deliberately left out of the reference:
        #   u'ГЛАГОЛ ВИД=НЕСОВЕРШ ЛИЦО=2 НАКЛОНЕНИЕ=ПОБУД ТИП_ГЛАГОЛА=СТАТИЧ ЧИСЛО=ЕД'
        #   u'ЧАСТИЦА'
        (u'на', [u'ПРЕДЛОГ ПАДЕЖ=ВИН ПАДЕЖ=МЕСТ ПАДЕЖ=ПРЕДЛ']),
        (u'заводим', [u'ГЛАГОЛ ВИД=НЕСОВЕРШ ВРЕМЯ=НАСТОЯЩЕЕ ЛИЦО=1 НАКЛОНЕНИЕ=ИЗЪЯВ ПАДЕЖ=ВИН ПАДЕЖ=РОД ПАДЕЖ=ТВОР ЧИСЛО=МН']),
    ]

    for word, expected in cases:
        predicted = list(tagger[word])

        if len(predicted) != len(expected):
            # To debug a mismatch, print every item of ``predicted`` here.
            raise AssertionError(u'word="{}": {} tagset(s) required, {} found'.format(word, len(expected), len(predicted)))

        for tagset in predicted:
            if tagset in expected:
                continue
            raise AssertionError(u'Predicted tagset "{}" for word "{}" is not valid'.format(tagset, word))

    print('All tests PASSED.')
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def normalize_word(s):
    """
    Bring a dictionary word to its canonical spelling.

    Steps: drop a pair of enclosing apostrophes (only when the string is longer
    than two characters), turn ' - ' into '-', trim surrounding whitespace,
    lowercase, and replace 'ё' with 'е'.

    :param s: raw word string
    :return: normalized word

    The 'ё' replacement runs after lowercasing so that an uppercase 'Ё' is
    folded too; doing it first left 'Ёлка' as 'ёлка' while 'ёлка' became 'елка'.
    """
    if len(s) > 2 and s[0] == "'" and s[-1] == "'":
        s = s[1:-1]

    return s.replace(' - ', '-').strip().lower().replace('ё', 'е')
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
ignore_tags = set('ПАДЕЖВАЛ:РОД МОДАЛЬНЫЙ:0 ПЕРЕЧИСЛИМОСТЬ:ДА ПЕРЕХОДНОСТЬ:ПЕРЕХОДНЫЙ ПЕРЕХОДНОСТЬ:НЕПЕРЕХОДНЫЙ ПАДЕЖВАЛ:ТВОР ПАДЕЖВАЛ:ИМ ПАДЕЖВАЛ:ДАТ ПАДЕЖВАЛ:ВИН СГД_ВРЕМЯ:Начать ВОЗВРАТНОСТЬ:0 ВОЗВРАТНОСТЬ:1'.split())
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def clean_tagset(tagset):
    """Drop ignored tags from a whitespace-separated tagset and turn ':' into '='.

    Each tag is compared against the module-level ``ignore_tags`` set in its
    original spelling; the remaining tags are re-joined with single spaces and
    every ':' in the joined string is replaced by '='.

    :param tagset: string of whitespace-separated tags.
    :return: the filtered, re-joined string.
    """
    kept = []
    for tag in tagset.split():
        if tag in ignore_tags:
            continue
        kept.append(tag)
    joined = ' '.join(kept)
    return joined.replace(':', '=')
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
if __name__ == '__main__':
|
| 200 |
+
parser = argparse.ArgumentParser(description='Сборка грамматического словаря')
|
| 201 |
+
parser.add_argument('--src', type=str, default='../data/word2tags.dat', help='Source grammatical dictionary file path')
|
| 202 |
+
parser.add_argument('--output', type=str, default='../output/ruword2tags.dat', help='Result dictionary file path')
|
| 203 |
+
parser.add_argument('--words', type=str, help='List of known words (all dictionary words are included by default)')
|
| 204 |
+
|
| 205 |
+
args = parser.parse_args()
|
| 206 |
+
knownwords_file = args.words
|
| 207 |
+
word2tags_path = args.src
|
| 208 |
+
output_file = args.output
|
| 209 |
+
|
| 210 |
+
# Строим словарь из исходных данных
|
| 211 |
+
|
| 212 |
+
known_words = None
|
| 213 |
+
if knownwords_file is not None:
|
| 214 |
+
# Загружаем из указанного файла список слов, которые попадут в итоговую модель.
|
| 215 |
+
print('Загружаем список слов для сборки кастомного словаря из {}'.format(knownwords_file))
|
| 216 |
+
known_words = set()
|
| 217 |
+
with io.open(knownwords_file, 'r', encoding='utf-8') as rdr:
|
| 218 |
+
for line in rdr:
|
| 219 |
+
word = line.replace(chr(65279), '').strip()
|
| 220 |
+
known_words.add(word.lower())
|
| 221 |
+
print('Загружено {} слов из {}'.format(len(known_words), knownwords_file))
|
| 222 |
+
|
| 223 |
+
word2tagsets = dict()
|
| 224 |
+
tagset2index = dict()
|
| 225 |
+
nb_words = 0
|
| 226 |
+
filter_negative_scores = True
|
| 227 |
+
print('Loading dictionary from {}'.format(word2tags_path))
|
| 228 |
+
|
| 229 |
+
# В первом проходе по списку словоформ отберем формы, которые будем игнорировать из-за присвоенной
|
| 230 |
+
# им частоты < 0. Если все варианты распознавания слова имеют присвоенную частоту < 0, то не будем отсекать
|
| 231 |
+
# такие формы.
|
| 232 |
+
wordform2max_score = dict()
|
| 233 |
+
with io.open(word2tags_path, 'r', encoding='utf-8') as rdr:
|
| 234 |
+
for line in rdr:
|
| 235 |
+
tx = line.replace(chr(65279), '').strip().split('\t')
|
| 236 |
+
if len(tx) == 5:
|
| 237 |
+
score = int(tx[4])
|
| 238 |
+
word = normalize_word(tx[0])
|
| 239 |
+
wordform2max_score[word] = max(score, wordform2max_score.get(word, -1000000))
|
| 240 |
+
|
| 241 |
+
# Основной, второй проход.
|
| 242 |
+
with io.open(word2tags_path, 'r', encoding='utf-8') as rdr:
|
| 243 |
+
for line in rdr:
|
| 244 |
+
tx = line.replace(chr(65279), '').strip().split('\t')
|
| 245 |
+
if len(tx) == 5:
|
| 246 |
+
word = normalize_word(tx[0])
|
| 247 |
+
if filter_negative_scores and wordform2max_score[word] >= 0 and int(tx[4]) < 0:
|
| 248 |
+
# пропускаем формы, которые помечены как редкие или неграмматические (частотность < 0),
|
| 249 |
+
# и для которых есть альтернативы с частотой >= 0.
|
| 250 |
+
continue
|
| 251 |
+
|
| 252 |
+
if known_words is None or word in known_words:
|
| 253 |
+
pos = tx[1]
|
| 254 |
+
lemma = normalize_word(tx[2])
|
| 255 |
+
tags = clean_tagset(tx[3]) if len(tx) == 5 else u''
|
| 256 |
+
|
| 257 |
+
tagset = (pos + ' ' + tags).strip()
|
| 258 |
+
|
| 259 |
+
if tagset not in tagset2index:
|
| 260 |
+
tagset2index[tagset] = len(tagset2index)
|
| 261 |
+
|
| 262 |
+
itagset = tagset2index[tagset]
|
| 263 |
+
|
| 264 |
+
if word not in word2tagsets:
|
| 265 |
+
word2tagsets[word] = [itagset]
|
| 266 |
+
else:
|
| 267 |
+
word2tagsets[word].append(itagset)
|
| 268 |
+
|
| 269 |
+
nb_words += 1
|
| 270 |
+
|
| 271 |
+
print('Number of wordentries={}'.format(nb_words))
|
| 272 |
+
print('Number of tagsets={}'.format(len(tagset2index)))
|
| 273 |
+
|
| 274 |
+
for word in u'а и у с к'.split():
|
| 275 |
+
assert(word in word2tagsets)
|
| 276 |
+
|
| 277 |
+
ending_lens = [3, 4, 5]
|
| 278 |
+
processed_words = set()
|
| 279 |
+
ending2tagsets = dict()
|
| 280 |
+
all_ending2tagsets = dict()
|
| 281 |
+
|
| 282 |
+
for ending_len in ending_lens:
|
| 283 |
+
print('Start processing ending_len={}'.format(ending_len))
|
| 284 |
+
e2tagsets = dict()
|
| 285 |
+
for word, tagsets in word2tagsets.items():
|
| 286 |
+
if word not in processed_words and len(word) > ending_len:
|
| 287 |
+
ending = word[-ending_len:]
|
| 288 |
+
if ending not in e2tagsets:
|
| 289 |
+
e2tagsets[ending] = set(tagsets)
|
| 290 |
+
else:
|
| 291 |
+
e2tagsets[ending].update(tagsets)
|
| 292 |
+
|
| 293 |
+
all_ending2tagsets.update(e2tagsets)
|
| 294 |
+
print('Number of distinct endings={}'.format(len(e2tagsets)))
|
| 295 |
+
|
| 296 |
+
# Уберем окончания, которые дают списки тегов хотя бы с 1 ошибкой
|
| 297 |
+
bad_endings = set()
|
| 298 |
+
for word, word_tagsets in word2tagsets.items():
|
| 299 |
+
if word not in processed_words and len(word) > ending_len:
|
| 300 |
+
ending = word[-ending_len:]
|
| 301 |
+
ending_tagsets = e2tagsets[ending]
|
| 302 |
+
if set(word_tagsets) != ending_tagsets:
|
| 303 |
+
bad_endings.add(ending)
|
| 304 |
+
|
| 305 |
+
print('Number of bad endings={}'.format(len(bad_endings)))
|
| 306 |
+
|
| 307 |
+
e2tagsets = dict(filter(lambda z: z[0] not in bad_endings, e2tagsets.items()))
|
| 308 |
+
|
| 309 |
+
# Теперь пометим слова, которые подходят под оставшиеся хорошие окончания.
|
| 310 |
+
nb_matched_words = 0
|
| 311 |
+
for word in word2tagsets.keys():
|
| 312 |
+
if len(word) > ending_len:
|
| 313 |
+
ending = word[-ending_len:]
|
| 314 |
+
if ending in e2tagsets:
|
| 315 |
+
processed_words.add(word)
|
| 316 |
+
nb_matched_words += 1
|
| 317 |
+
|
| 318 |
+
print('nb_matched_words={}'.format(nb_matched_words))
|
| 319 |
+
|
| 320 |
+
# Переносим оставшиеся хорошие окончания в основной список
|
| 321 |
+
ending2tagsets.update(e2tagsets)
|
| 322 |
+
|
| 323 |
+
print('Number of good endings={}'.format(len(ending2tagsets)))
|
| 324 |
+
print('Number of all endings={}'.format(len(all_ending2tagsets)))
|
| 325 |
+
|
| 326 |
+
print('Building TRIE for {} words...'.format(len(word2tagsets)))
|
| 327 |
+
trie_words = []
|
| 328 |
+
for word, word_tagsets in word2tagsets.items():
|
| 329 |
+
if word not in processed_words:
|
| 330 |
+
# Слово не было обработано окончаниями.
|
| 331 |
+
for itagset in word_tagsets:
|
| 332 |
+
trie_words.append((word, itagset))
|
| 333 |
+
|
| 334 |
+
trie_root = create_trie_node('')
|
| 335 |
+
for word, itagset in trie_words:
|
| 336 |
+
add_to_trie_node(trie_root, word, itagset)
|
| 337 |
+
|
| 338 |
+
print('Number of words in TRIE={}'.format(len(trie_words)))
|
| 339 |
+
|
| 340 |
+
index2tagset = dict((i, t) for (t, i) in tagset2index.items())
|
| 341 |
+
|
| 342 |
+
trie_tagsets = dict()
|
| 343 |
+
trie_root = trie_constructed(trie_root, trie_tagsets)
|
| 344 |
+
|
| 345 |
+
db_filepath = os.path.join(os.path.dirname(output_file), 'ruword2tags.db')
|
| 346 |
+
print('Writing "{}"...'.format(db_filepath))
|
| 347 |
+
with sqlite3.connect(db_filepath) as cnx:
|
| 348 |
+
cursor = cnx.cursor()
|
| 349 |
+
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='word_tagsets'")
|
| 350 |
+
if not cursor.fetchone():
|
| 351 |
+
cnx.execute('CREATE TABLE word_tagsets(word TEXT NOT NULL PRIMARY KEY, id_tagsets INT not null)')
|
| 352 |
+
else:
|
| 353 |
+
cnx.execute('DELETE FROM word_tagsets')
|
| 354 |
+
|
| 355 |
+
for word, word_tagsets in word2tagsets.items():
|
| 356 |
+
if word not in processed_words:
|
| 357 |
+
tagsets2 = tuple(sorted(word_tagsets))
|
| 358 |
+
id_tagsets = trie_tagsets[tagsets2]
|
| 359 |
+
cursor.execute("INSERT INTO word_tagsets(word, id_tagsets) VALUES(:word, :tagsets)",
|
| 360 |
+
{'word': word, 'tagsets': id_tagsets})
|
| 361 |
+
|
| 362 |
+
cnx.commit()
|
| 363 |
+
|
| 364 |
+
lexicon_data = {'ending_lens': ending_lens,
|
| 365 |
+
'index2tagset': index2tagset,
|
| 366 |
+
'ending2tagsets': ending2tagsets,
|
| 367 |
+
'all_ending2tagsets': all_ending2tagsets,
|
| 368 |
+
'id2tagsets': dict((id, tagsets) for (tagsets, id) in trie_tagsets.items())
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
print('Writing "{}"...'.format(output_file))
|
| 372 |
+
with open(output_file, 'wb') as f:
|
| 373 |
+
pickle.dump(lexicon_data, f, protocol=2)
|
| 374 |
+
|
| 375 |
+
trie_filepath = os.path.join(os.path.dirname(output_file), 'ruword2tags_trie.dat')
|
| 376 |
+
print('Writing "{}"...'.format(trie_filepath))
|
| 377 |
+
with gzip.open(trie_filepath, 'wb') as f:
|
| 378 |
+
pickle.dump(trie_root, f)
|
| 379 |
+
|
| 380 |
+
#print('Сохранен файл словаря размером {:d} Мб'.format(int(os.path.getsize(output_file)/1000000)))
|
| 381 |
+
print('All data stored.')
|
| 382 |
+
|
| 383 |
+
# Теперь запускаем проверки для построенного словаря
|
| 384 |
+
run_tests(output_file)
|
| 385 |
+
|
| 386 |
+
word2tags = RuWord2Tags()
|
| 387 |
+
word2tags.load(output_file)
|
| 388 |
+
|
| 389 |
+
for word in u'кошки ккошки на'.split():
|
| 390 |
+
for i, tagset in enumerate(word2tags[word]):
|
| 391 |
+
print(u'{}[{}] => {}'.format(word, i, tagset))
|
nn/nn_accent/big.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47e69d9ae19f2a82e21b1c70f6a4bbfb1abc5759e98b2e67d009c5e9d7af18c9
|
| 3 |
+
size 2285217
|
nn/nn_accent/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "onnx_out",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"RoFormerForTokenClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.2,
|
| 7 |
+
"embedding_size": 128,
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_dropout_prob": 0.2,
|
| 10 |
+
"hidden_size": 128,
|
| 11 |
+
"id2label": {
|
| 12 |
+
"0": "NO",
|
| 13 |
+
"1": "STRESS_PRIMARY",
|
| 14 |
+
"2": "STRESS_SECONDARY"
|
| 15 |
+
},
|
| 16 |
+
"initializer_range": 0.02,
|
| 17 |
+
"intermediate_size": 256,
|
| 18 |
+
"label2id": {
|
| 19 |
+
"NO": 0,
|
| 20 |
+
"STRESS_PRIMARY": 1,
|
| 21 |
+
"STRESS_SECONDARY": 2
|
| 22 |
+
},
|
| 23 |
+
"layer_norm_eps": 1e-12,
|
| 24 |
+
"max_length": 40,
|
| 25 |
+
"max_position_embeddings": 60,
|
| 26 |
+
"max_relative_positions": 60,
|
| 27 |
+
"model_type": "roformer",
|
| 28 |
+
"num_attention_heads": 8,
|
| 29 |
+
"num_hidden_layers": 4,
|
| 30 |
+
"pad_token_id": 0,
|
| 31 |
+
"relative_attention": true,
|
| 32 |
+
"rotary_value": false,
|
| 33 |
+
"transformers_version": "4.29.2",
|
| 34 |
+
"type_vocab_size": 2,
|
| 35 |
+
"use_cache": true,
|
| 36 |
+
"vocab_size": 45
|
| 37 |
+
}
|
nn/nn_accent/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e393144e45626f6f1062a0784ef06f921b97321a8e7b87ac2a09a892286500a
|
| 3 |
+
size 803402
|
nn/nn_accent/ort_config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"one_external_file": true,
|
| 3 |
+
"opset": null,
|
| 4 |
+
"optimization": {},
|
| 5 |
+
"optimum_version": "1.8.5",
|
| 6 |
+
"quantization": {
|
| 7 |
+
"activations_dtype": "QUInt8",
|
| 8 |
+
"activations_symmetric": false,
|
| 9 |
+
"format": "QOperator",
|
| 10 |
+
"is_static": false,
|
| 11 |
+
"mode": "IntegerOps",
|
| 12 |
+
"nodes_to_exclude": [],
|
| 13 |
+
"nodes_to_quantize": [],
|
| 14 |
+
"operators_to_quantize": [
|
| 15 |
+
"MatMul",
|
| 16 |
+
"Add"
|
| 17 |
+
],
|
| 18 |
+
"per_channel": false,
|
| 19 |
+
"qdq_add_pair_to_weight": false,
|
| 20 |
+
"qdq_dedicated_pair": false,
|
| 21 |
+
"qdq_op_type_per_channel_support_to_axis": {
|
| 22 |
+
"MatMul": 1
|
| 23 |
+
},
|
| 24 |
+
"reduce_range": false,
|
| 25 |
+
"weights_dtype": "QInt8",
|
| 26 |
+
"weights_symmetric": true
|
| 27 |
+
},
|
| 28 |
+
"transformers_version": "4.29.2",
|
| 29 |
+
"use_external_data_format": false
|
| 30 |
+
}
|
nn/nn_accent/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "[bos]",
|
| 3 |
+
"eos_token": "[eos]",
|
| 4 |
+
"pad_token": "[pad]",
|
| 5 |
+
"unk_token": "[unk]"
|
| 6 |
+
}
|
nn/nn_accent/tokenizer_config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "[bos]",
|
| 3 |
+
"clean_up_tokenization_spaces": true,
|
| 4 |
+
"do_lower_case": true,
|
| 5 |
+
"eos_token": "[eos]",
|
| 6 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 7 |
+
"pad_token": "[pad]",
|
| 8 |
+
"tokenizer_class": "CharTokenizer",
|
| 9 |
+
"unk_token": "[unk]"
|
| 10 |
+
}
|
nn/nn_accent/vocab.txt
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pad]
|
| 2 |
+
[unk]
|
| 3 |
+
[bos]
|
| 4 |
+
[eos]
|
| 5 |
+
'
|
| 6 |
+
-
|
| 7 |
+
.
|
| 8 |
+
?
|
| 9 |
+
`
|
| 10 |
+
c
|
| 11 |
+
e
|
| 12 |
+
́
|
| 13 |
+
а
|
| 14 |
+
б
|
| 15 |
+
в
|
| 16 |
+
г
|
| 17 |
+
д
|
| 18 |
+
е
|
| 19 |
+
ж
|
| 20 |
+
з
|
| 21 |
+
и
|
| 22 |
+
й
|
| 23 |
+
к
|
| 24 |
+
л
|
| 25 |
+
м
|
| 26 |
+
н
|
| 27 |
+
о
|
| 28 |
+
п
|
| 29 |
+
р
|
| 30 |
+
с
|
| 31 |
+
т
|
| 32 |
+
у
|
| 33 |
+
ф
|
| 34 |
+
х
|
| 35 |
+
ц
|
| 36 |
+
ч
|
| 37 |
+
ш
|
| 38 |
+
щ
|
| 39 |
+
ъ
|
| 40 |
+
ы
|
| 41 |
+
ь
|
| 42 |
+
э
|
| 43 |
+
ю
|
| 44 |
+
я
|
| 45 |
+
ё
|
nn/nn_omograph/big_poetry/added_tokens.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</w>": 120139,
|
| 3 |
+
"<w>": 120138
|
| 4 |
+
}
|
nn/nn_omograph/big_poetry/config.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "rubert_base/",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"BertForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"directionality": "bidi",
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"pooler_fc_size": 768,
|
| 21 |
+
"pooler_num_attention_heads": 12,
|
| 22 |
+
"pooler_num_fc_layers": 3,
|
| 23 |
+
"pooler_size_per_head": 128,
|
| 24 |
+
"pooler_type": "first_token_transform",
|
| 25 |
+
"position_embedding_type": "absolute",
|
| 26 |
+
"problem_type": "single_label_classification",
|
| 27 |
+
"transformers_version": "4.29.2",
|
| 28 |
+
"type_vocab_size": 2,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"vocab_size": 120140
|
| 31 |
+
}
|
nn/nn_omograph/big_poetry/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7d1d58e5ad908f4187d3c44f640106b721e293ec954c9c4603abc25ba5f7e8a
|
| 3 |
+
size 713508364
|
nn/nn_omograph/big_poetry/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
nn/nn_omograph/big_poetry/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nn/nn_omograph/big_poetry/tokenizer_config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"clean_up_tokenization_spaces": true,
|
| 3 |
+
"cls_token": "[CLS]",
|
| 4 |
+
"do_basic_tokenize": true,
|
| 5 |
+
"do_lower_case": true,
|
| 6 |
+
"mask_token": "[MASK]",
|
| 7 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 8 |
+
"never_split": null,
|
| 9 |
+
"pad_token": "[PAD]",
|
| 10 |
+
"sep_token": "[SEP]",
|
| 11 |
+
"strip_accents": null,
|
| 12 |
+
"tokenize_chinese_chars": true,
|
| 13 |
+
"tokenizer_class": "BertTokenizer",
|
| 14 |
+
"unk_token": "[UNK]"
|
| 15 |
+
}
|
nn/nn_omograph/big_poetry/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nn/nn_omograph/medium_poetry/added_tokens.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</w>": 64001,
|
| 3 |
+
"<w>": 64000
|
| 4 |
+
}
|
nn/nn_omograph/medium_poetry/config.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "SRUElectra-medium/checkpoint-4500000/",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"ElectraForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"embedding_size": 576,
|
| 9 |
+
"generator_size": "0.25",
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.1,
|
| 12 |
+
"hidden_size": 576,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 2304,
|
| 15 |
+
"layer_norm_eps": 1e-12,
|
| 16 |
+
"max_position_embeddings": 512,
|
| 17 |
+
"model_type": "electra",
|
| 18 |
+
"num_attention_heads": 9,
|
| 19 |
+
"num_hidden_layers": 12,
|
| 20 |
+
"pad_token_id": 0,
|
| 21 |
+
"position_embedding_type": "absolute",
|
| 22 |
+
"problem_type": "single_label_classification",
|
| 23 |
+
"summary_activation": "gelu",
|
| 24 |
+
"summary_last_dropout": 0.1,
|
| 25 |
+
"summary_type": "first",
|
| 26 |
+
"summary_use_proj": true,
|
| 27 |
+
"transformers_version": "4.29.2",
|
| 28 |
+
"type_vocab_size": 2,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"vocab_size": 64002
|
| 31 |
+
}
|
nn/nn_omograph/medium_poetry/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:689752e4bff9eb0b8837482d9ea724f72356aab19822c2e4ae3de6b5a2fc08b1
|
| 3 |
+
size 341725861
|
nn/nn_omograph/medium_poetry/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
nn/nn_omograph/medium_poetry/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nn/nn_omograph/medium_poetry/tokenizer_config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"clean_up_tokenization_spaces": true,
|
| 3 |
+
"cls_token": "[CLS]",
|
| 4 |
+
"do_basic_tokenize": true,
|
| 5 |
+
"do_lower_case": true,
|
| 6 |
+
"mask_token": "[MASK]",
|
| 7 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 8 |
+
"never_split": null,
|
| 9 |
+
"pad_token": "[PAD]",
|
| 10 |
+
"sep_token": "[SEP]",
|
| 11 |
+
"strip_accents": null,
|
| 12 |
+
"tokenize_chinese_chars": true,
|
| 13 |
+
"tokenizer_class": "ElectraTokenizer",
|
| 14 |
+
"unk_token": "[UNK]"
|
| 15 |
+
}
|
nn/nn_omograph/medium_poetry/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nn/nn_omograph/small_poetry/added_tokens.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</w>": 30523,
|
| 3 |
+
"<w>": 30522
|
| 4 |
+
}
|
nn/nn_omograph/small_poetry/config.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "output/checkpoint-440000/",
|
| 3 |
+
"activation": "gelu",
|
| 4 |
+
"architectures": [
|
| 5 |
+
"DistilBertForSequenceClassification"
|
| 6 |
+
],
|
| 7 |
+
"attention_dropout": 0.1,
|
| 8 |
+
"dim": 264,
|
| 9 |
+
"dropout": 0.1,
|
| 10 |
+
"hidden_dim": 792,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"max_position_embeddings": 512,
|
| 13 |
+
"model_type": "distilbert",
|
| 14 |
+
"n_heads": 12,
|
| 15 |
+
"n_layers": 3,
|
| 16 |
+
"pad_token_id": 0,
|
| 17 |
+
"problem_type": "single_label_classification",
|
| 18 |
+
"qa_dropout": 0.1,
|
| 19 |
+
"seq_classif_dropout": 0.2,
|
| 20 |
+
"sinusoidal_pos_embds": false,
|
| 21 |
+
"transformers_version": "4.29.2",
|
| 22 |
+
"vocab_size": 30524
|
| 23 |
+
}
|
nn/nn_omograph/small_poetry/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fcea1b8d8c164276d2e593d53261ca3c21d6fc9fed4f04abb8f69e2b95ba842d
|
| 3 |
+
size 41532079
|
nn/nn_omograph/small_poetry/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
nn/nn_omograph/small_poetry/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nn/nn_omograph/small_poetry/tokenizer_config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"clean_up_tokenization_spaces": true,
|
| 3 |
+
"cls_token": "[CLS]",
|
| 4 |
+
"do_basic_tokenize": true,
|
| 5 |
+
"do_lower_case": false,
|
| 6 |
+
"mask_token": "[MASK]",
|
| 7 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 8 |
+
"never_split": null,
|
| 9 |
+
"pad_token": "[PAD]",
|
| 10 |
+
"sep_token": "[SEP]",
|
| 11 |
+
"strip_accents": null,
|
| 12 |
+
"tokenize_chinese_chars": true,
|
| 13 |
+
"tokenizer_class": "DistilBertTokenizer",
|
| 14 |
+
"unk_token": "[UNK]"
|
| 15 |
+
}
|
nn/nn_omograph/small_poetry/vocab.txt
ADDED
|
Binary file (382 kB). View file
|
|
|
nn/nn_omograph/turbo/added_tokens.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</w>": 50257,
|
| 3 |
+
"<w>": 50256
|
| 4 |
+
}
|
nn/nn_omograph/turbo/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "rudeberta_distilled/checkpoint-220000/",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"DebertaForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"hidden_act": "gelu",
|
| 8 |
+
"hidden_dropout_prob": 0.1,
|
| 9 |
+
"hidden_size": 768,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 3072,
|
| 12 |
+
"layer_norm_eps": 1e-07,
|
| 13 |
+
"max_position_embeddings": 512,
|
| 14 |
+
"max_relative_positions": -1,
|
| 15 |
+
"model_type": "deberta",
|
| 16 |
+
"num_attention_heads": 12,
|
| 17 |
+
"num_hidden_layers": 6,
|
| 18 |
+
"pad_token_id": 0,
|
| 19 |
+
"pooler_dropout": 0,
|
| 20 |
+
"pooler_hidden_act": "gelu",
|
| 21 |
+
"pooler_hidden_size": 768,
|
| 22 |
+
"pos_att_type": null,
|
| 23 |
+
"position_biased_input": true,
|
| 24 |
+
"relative_attention": false,
|
| 25 |
+
"transformers_version": "4.28.1",
|
| 26 |
+
"type_vocab_size": 0,
|
| 27 |
+
"vocab_size": 50258
|
| 28 |
+
}
|
nn/nn_omograph/turbo/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|