Spaces:

YvesP
/

GenDoc

Runtime error

App Files Files Community

GenDoc / src /domain /wikidoc.py

YvesP

initial commit

4cf88e8 over 2 years ago

raw

history blame contribute delete

4.69 kB

	class Doc:
	    """A text document split into parsed lines and a tree of containers.

	    Attributes:
	        params: parsing options handed to every ``Line`` and to the root
	            ``Container``.
	        lines: ``Line`` objects for every non-blank line of ``fulltext``
	            (minus the title line when it was consumed as the title).
	        title: document title as resolved by ``_get_title``.
	        container: root ``Container`` built from ``lines``.
	        fulltext: the raw text the document was built from.
	    """

	    def __init__(self, fulltext: str = '', title: str = '', params: 'dict | None' = None):
	        """Parse ``fulltext`` line by line and build the container tree.

	        Args:
	            fulltext: raw text; it is split on newlines, each piece is
	                stripped, and blank pieces are dropped.
	            title: title to use unless ``params['type']`` is 'input_text'.
	            params: parsing options; ``None`` means an empty dict.
	        """
	        # A fresh dict per call: a ``{}`` default would be shared by all instances.
	        self.params = {} if params is None else params
	        self.lines = [Line(stripped, self.params)
	                      for stripped in (raw.strip() for raw in fulltext.split("\n"))
	                      if stripped]
	        self.title, self.lines = self._get_title(title)
	        self.container = Container(lines=self.lines, title=self.title, father=self, params=self.params)
	        self.fulltext = fulltext

	    def _get_title(self, title):
	        """Return ``(title, lines)`` for this document.

	        For documents whose ``params['type']`` is 'input_text', the title is
	        the text of the first line when that line has type 'title' (the line
	        is then removed from the returned list); otherwise the placeholder
	        'the title is missing' is returned. For any other document type,
	        including a missing 'type' key, ``title`` and all lines are returned
	        unchanged.
	        """
	        lines = self.lines
	        # .get() so that a document built without a 'type' option does not raise.
	        if self.params.get('type') != 'input_text':
	            return title, lines
	        if lines and lines[0].type == 'title':
	            return lines[0].text, lines[1:]
	        return 'the title is missing', lines


	class WikiPage(Doc):
	    """A ``Doc`` preconfigured with wiki-style heading markers."""

	    def __init__(self, fulltext='', title=''):
	        """Build the wiki parsing options and delegate to ``Doc.__init__``.

	        Args:
	            fulltext: raw page text.
	            title: page title.
	        """
	        self.params = {
	            'type': 'wiki',
	            # Opening heading marker -> heading level, stored as a digit string.
	            'startswith_': {
	                '== ': '1',
	                '=== ': '2',
	                '==== ': '3',
	                '===== ': '4',
	                '====== ': '5',
	                '======= ': '6',
	            },
	            # Closing markers, in the same order as the opening markers above.
	            # The last entry now mirrors its opener; it used to repeat the
	            # fifth entry.
	            'endswith_': [' ==', ' ===', ' ====', ' =====', ' ======', ' ======='],
	            # Section titles listed under the 'discarded' option.
	            'discarded': [
	                "See also", "Notes", "References", "Sources", "External links", "Bibliography",
	                "Cinematic adaptations", "Further reading", "Maps",
	            ],
	        }
	        super().__init__(fulltext=fulltext, title=title, params=self.params)

	    def get_paragraphs(self, chunk=500):
	        """Return the paragraphs of the root container for the given ``chunk`` size."""
	        return self.container.get_paragraphs(chunk)


	class Container:
	    """A section of a document: its own lines plus nested child sections.

	    Attributes:
	        children: child ``Container`` objects, one per heading found at
	            ``level + 1``.
	        level: depth of this container (0 by default).
	        title: heading text of this container.
	        father: the object that created this container.
	        lines: lines that appear before the first heading of this container.
	        containers: this container followed by all containers below it.
	        root_text: texts of ``lines`` joined with single spaces.
	        text: ``root_text`` followed by the ``text`` of every child, joined
	            with single spaces (empty parts are skipped).
	    """

	    def __init__(self, lines=None, level=0, title='', father=None, params=None):
	        """Build the container and, recursively, its children.

	        Args:
	            lines: objects exposing ``text``, ``level`` and ``is_structure``.
	            level: depth of this container.
	            title: heading text of this container.
	            father: parent object.
	            params: optional options; when it has a 'discarded' entry, direct
	                children whose title is listed there are dropped.
	        """
	        self.children = []
	        self.level = level
	        self.title = title
	        self.father = father
	        self.lines = []
	        self._expand(lines or [])
	        if params and 'discarded' in params:
	            self.children = [child for child in self.children if child.title not in params['discarded']]
	        self.containers = [self]
	        for child in self.children:
	            self.containers += child.containers
	        # root_text was read by get_paragraphs but never assigned, and text
	        # ignored the container's own lines; both are now built from self.lines.
	        self.root_text = ' '.join(line.text for line in self.lines)
	        parts = [self.root_text] + [child.text for child in self.children]
	        self.text = ' '.join(part for part in parts if part)

	    def _add_child(self, title, lines):
	        """Append a child container one level below this one."""
	        self.children.append(Container(lines=lines,
	                                       level=self.level + 1,
	                                       title=title,
	                                       father=self))

	    def _expand(self, lines):
	        """Split ``lines`` into this container's own lines and its children.

	        Lines before the first heading are kept in ``self.lines``. The first
	        heading opens a child and has its ``level`` set to ``self.level + 1``.
	        Later lines go into the open child until a heading at exactly
	        ``self.level + 1`` closes it and opens the next one.
	        """
	        child_level = self.level + 1
	        child_open = False
	        child_lines = []
	        child_title = ''
	        for line in lines:
	            if not child_open:
	                if line.is_structure:
	                    child_open = True
	                    child_lines = []
	                    child_title = line.text
	                    # The first heading seen defines the child level.
	                    line.level = child_level
	                else:
	                    self.lines.append(line)
	            elif child_level < line.level or not line.is_structure:
	                child_lines.append(line)
	            elif child_level == line.level:
	                self._add_child(child_title, child_lines)
	                child_lines = []
	                child_title = line.text
	            # A heading shallower than child_level matches neither branch
	            # above and is skipped.
	        if child_open:
	            self._add_child(child_title, child_lines)

	    def get_paragraphs(self, chunk=500):
	        """Return this container's text as a list of paragraphs.

	        When the whole text is shorter than ``chunk`` characters it is
	        returned as a single paragraph. Otherwise the result is this
	        container's own ``root_text`` followed by the paragraphs of each child.
	        """
	        if len(self.text) < chunk:
	            return [self.text]
	        paragraphs = [self.root_text]
	        for child in self.children:
	            paragraphs += child.get_paragraphs(chunk)
	        return paragraphs


	class Line:
	    """One line of a document, classified by its leading marker.

	    Attributes:
	        text: the line with its markers removed and whitespace stripped.
	        params: parsing options ('startswith_' and optional 'endswith_').
	        type: value mapped to the matching prefix in ``params['startswith_']``,
	            or 'normal' when no prefix matches.
	        level: ``int(type)`` when ``type`` is made of digits, else -1.
	        is_structure: True when ``level`` is positive.
	    """

	    def __init__(self, text, params):
	        """Store ``text`` and ``params`` and classify the line."""
	        self.text = text
	        self.params = params
	        self.type, self.text = self._parse_text()
	        self.level = int(self.type) if self.type.isdigit() else -1
	        self.is_structure = 0 < self.level

	    def _parse_text(self):
	        """Return ``(type, cleaned_text)`` for this line.

	        The first key of ``params['startswith_']`` that prefixes the text
	        decides the type. The prefix is removed and, when a closing marker is
	        configured at the same position in ``params['endswith_']``, the text
	        is cut at the first occurrence of that marker. Lines matching no
	        prefix, and lines parsed without a 'startswith_' option, are 'normal'.
	        """
	        options = self.params or {}
	        starters = options.get('startswith_', {})
	        enders = options.get('endswith_', [])
	        for position, (starter, type_) in enumerate(starters.items()):
	            if not self.text.startswith(starter):
	                continue
	            # A missing or too-short 'endswith_' list means "no closing marker".
	            ender = enders[position] if position < len(enders) else ""
	            # Drop only the leading marker; later repeats stay in the text.
	            cleaned = self.text[len(starter):]
	            if ender:
	                cleaned = cleaned.partition(ender)[0]
	            return type_, cleaned.strip()
	        return 'normal', self.text.strip()