class Doc:
    """A text document parsed into lines and wrapped in a root ``Container``.

    Attributes:
        params: Parsing configuration shared with every ``Line`` and with the
            root ``Container``.
        lines: Non-blank input lines as ``Line`` objects, without the title
            line when one was extracted.
        title: Title of the document.
        container: Root ``Container`` built from ``lines``.
        fulltext: The raw text given to the constructor.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: dict = None):
        """Parse ``fulltext`` and build the container tree.

        Args:
            fulltext: Raw document text; it is split on newlines and blank
                lines are dropped.
            title: Title to use unless ``params['type']`` is ``'input_text'``,
                in which case the title is derived from the first line.
            params: Parsing configuration. ``None`` means an empty
                configuration.
        """
        # A fresh dict per instance: a ``{}`` default would be one object
        # shared by every Doc created without explicit params.
        self.params = {} if params is None else params
        self.lines = [
            Line(text.strip(), self.params)
            for text in fulltext.split("\n")
            if text.strip()
        ]
        self.title, self.lines = self._get_title(title)
        self.container = Container(
            lines=self.lines,
            title=self.title,
            father=self,
            params=self.params,
        )
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return ``(title, lines)`` for this document.

        For documents whose ``params['type']`` is ``'input_text'`` the title
        is taken from the first line when that line has type ``'title'`` (the
        line is then removed from the returned lines); otherwise a placeholder
        title is returned. For every other type, including a missing
        ``'type'`` key, ``title`` and the current lines are returned as-is.

        Args:
            title: Title supplied by the caller.

        Returns:
            A ``(title, lines)`` tuple.
        """
        lines = self.lines
        # ``.get`` so that a configuration without a 'type' entry is treated
        # as "not input_text" instead of raising KeyError.
        if self.params.get('type') == 'input_text':
            if self.lines and self.lines[0] and self.lines[0].type == 'title':
                title = self.lines[0].text
                lines = lines[1:]
            else:
                title = 'the title is missing'
        return title, lines


class WikiPage(Doc):
    """A ``Doc`` preconfigured for wiki-style ``== Heading ==`` markup.

    The configuration maps six opening markers (``'== '`` to ``'======= '``)
    to the type strings ``'1'`` to ``'6'``, lists the matching closing marker
    for each of them, and names the section titles that are dropped from the
    root container.
    """

    def __init__(self, fulltext='', title=''):
        """Build the wiki configuration and parse ``fulltext`` with it.

        Args:
            fulltext: Raw page text.
            title: Title of the page.
        """
        self.params = {
            'type': 'wiki',
            # Opening marker -> type string; a digit type becomes the line's
            # heading level.
            'startswith_': {
                '== ': '1',
                '=== ': '2',
                '==== ': '3',
                '===== ': '4',
                '====== ': '5',
                '======= ': '6',
            },
            # Closing markers, index-aligned with 'startswith_'. The last
            # entry has seven '=' so that it mirrors its opening marker.
            'endswith_': [
                ' ==',
                ' ===',
                ' ====',
                ' =====',
                ' ======',
                ' =======',
            ],
            # Titles of top-level sections removed from the container tree.
            'discarded': [
                "See also",
                "Notes",
                "References",
                "Sources",
                "External links",
                "Bibliography",
                "Cinematic adaptations",
                "Further reading",
                "Maps",
            ],
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the paragraphs produced by the root container.

        Args:
            chunk: Size threshold forwarded to ``Container.get_paragraphs``.

        Returns:
            Whatever ``self.container.get_paragraphs(chunk)`` returns.
        """
        return self.container.get_paragraphs(chunk)


class Container:
    """One node of a document outline: its own lines plus nested sections.

    Attributes:
        children: Direct sub-sections, in document order.
        level: Depth of this node (the value passed to the constructor).
        title: Title of the section.
        father: Object given as parent by the creator of this node.
        lines: Lines that belong to this node itself, i.e. those that appear
            before its first sub-section heading.
        containers: This node followed by every descendant, depth-first.
        root_text: Text of ``lines`` joined with single spaces.
        text: ``root_text`` followed by the text of every child, joined with
            single spaces (empty parts are skipped).
    """

    def __init__(self, lines=None, level=0, title='', father=None, params=None):
        """Build the node and, recursively, its sub-sections.

        Args:
            lines: Line objects exposing ``text``, ``level`` and
                ``is_structure``. ``None`` means no lines.
            level: Depth of this node.
            title: Title of the section.
            father: Parent object, stored as-is.
            params: Optional configuration. When it has a ``'discarded'``
                entry, direct children whose title is listed there are
                removed.
        """
        self.children = []
        self.level = level
        self.title = title
        self.father = father
        self.lines = []
        self._expand([] if lines is None else lines)
        if params and 'discarded' in params:
            discarded = params['discarded']
            self.children = [
                child for child in self.children if child.title not in discarded
            ]
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        # Text of this node alone; get_paragraphs relies on it when the whole
        # section is too long to be returned as a single paragraph.
        self.root_text = ' '.join(line.text for line in self.lines)
        parts = [self.root_text] + [child.text for child in self.children]
        self.text = ' '.join(part for part in parts if part)

    def _expand(self, lines):
        """Distribute ``lines`` between this node and new child containers.

        Lines seen before the first heading are kept in ``self.lines``. The
        first heading opens a child and is re-levelled to ``self.level + 1``.
        After that, a heading at exactly ``self.level + 1`` closes the current
        child and opens the next one, while deeper headings and ordinary lines
        are handed to the current child.
        """
        in_child = False
        child_lines = []
        child_title = ''
        for line in lines:
            if not in_child:
                if line.is_structure:
                    in_child = True
                    child_lines = []
                    child_title = line.text
                    # Whatever its declared level, the first heading becomes a
                    # direct child of this node.
                    line.level = self.level + 1
                else:
                    self.lines.append(line)
            elif self.level + 1 < line.level or not line.is_structure:
                child_lines.append(line)
            elif self.level + 1 == line.level:
                self.children.append(Container(lines=child_lines,
                                               level=self.level + 1,
                                               title=child_title,
                                               father=self))
                child_lines = []
                child_title = line.text
        if in_child:
            # Flush the child that was still open when the input ended.
            self.children.append(Container(lines=child_lines,
                                           level=self.level + 1,
                                           title=child_title,
                                           father=self))

    def get_paragraphs(self, chunk=500):
        """Split the section into paragraphs along the outline.

        Args:
            chunk: Length threshold in characters.

        Returns:
            ``[self.text]`` when the whole text is shorter than ``chunk``;
            otherwise ``self.root_text`` followed by the paragraphs of each
            child, obtained recursively.
        """
        if len(self.text) < chunk:
            return [self.text]
        paragraphs = [self.root_text]
        for child in self.children:
            paragraphs += child.get_paragraphs(chunk)
        return paragraphs


class Line:
    """A single line of text classified by its opening marker.

    Attributes:
        text: The line with its opening/closing markers removed and
            surrounding whitespace stripped.
        params: The configuration used for classification.
        type: Type string of the first matching opening marker, or
            ``'normal'`` when none matches.
        level: ``int(type)`` when ``type`` is made of digits, else ``-1``.
        is_structure: ``True`` when ``level`` is positive (the line is a
            heading).
    """

    def __init__(self, text, params):
        """Classify ``text`` according to ``params``.

        Args:
            text: Raw line content.
            params: Mapping that may hold ``'startswith_'`` (opening marker ->
                type string) and ``'endswith_'`` (closing markers,
                index-aligned with ``'startswith_'``).
        """
        self.text = text
        self.params = params
        self.type, self.text = self._parse_text()
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    def _parse_text(self):
        """Return ``(type, text)`` for the raw line held in ``self.text``.

        The first opening marker (in ``'startswith_'`` order) that prefixes
        the line decides the type. The marker is sliced off the front, and the
        last occurrence of the matching closing marker, if any, ends the text.
        Markers repeated inside the text are therefore preserved.
        """
        starters = self.params.get('startswith_', {})
        enders = self.params.get('endswith_', [])
        for index, (starter, type_) in enumerate(starters.items()):
            if not self.text.startswith(starter):
                continue
            # Slice instead of split: the marker may occur again further on.
            content = self.text[len(starter):]
            # A missing or short 'endswith_' table means "no closing marker".
            ender = enders[index] if index < len(enders) else ""
            if ender:
                content = content.rsplit(ender, 1)[0]
            return type_, content.strip()
        return 'normal', self.text.strip()

|