Spaces:

Hexamind
/

GenProp

Runtime error

GenProp / src /reader /reader_for_requirements.py

adrien.aribaut-gaudin

feat: new public GenProp

498db6b about 2 years ago

5.61 kB

	import os
	from collections import Counter

	import docx
	from docx.document import Document as _Document
	from docx.oxml.table import CT_Tbl
	from docx.oxml.text.paragraph import CT_P
	from docx.table import _Cell, Table
	from docx.text.paragraph import Paragraph

	from src.domain.requirements_paragraphs import Requirement_Paragraph

	class WordReader:
	    """Read a .docx file into an ordered list of ``Requirement_Paragraph`` objects.

	    Top-level paragraphs and tables are visited in document order. Each
	    non-empty paragraph becomes one object; each non-empty table is flattened
	    into a single text paragraph.
	    """

	    def __init__(self, path):
	        """Store *path* and parse the document immediately.

	        Args:
	            path: location of the .docx file to read.

	        Raises:
	            FileNotFoundError: if *path* does not exist.
	            ValueError: if the document cannot be read.
	        """
	        self.path = path
	        self.paragraphs = self.get_paragraphs()

	    def iter_block_items(self, parent):
	        """Yield the paragraphs and tables directly under *parent*, in order.

	        Args:
	            parent: a python-docx document or a table cell.

	        Yields:
	            A ``Paragraph`` or ``Table`` wrapper for each matching child
	            element; any other kind of child element is skipped.

	        Raises:
	            ValueError: if *parent* is neither a document nor a cell. Because
	                this is a generator, the error surfaces on first iteration.
	        """
	        if isinstance(parent, _Document):
	            parent_elm = parent.element.body
	        elif isinstance(parent, _Cell):
	            parent_elm = parent._tc
	        else:
	            raise ValueError("Unsupported parent type")

	        for child in parent_elm.iterchildren():
	            if isinstance(child, CT_P):
	                yield Paragraph(child, parent)
	            elif isinstance(child, CT_Tbl):
	                yield Table(child, parent)

	    def get_paragraphs(self):
	        """Parse the document at ``self.path`` into ``Requirement_Paragraph`` objects.

	        Empty paragraphs and empty tables are skipped. Ids are assigned
	        sequentially from 0 across paragraphs and tables. A paragraph's page
	        id is estimated from the number of paragraph characters that precede
	        it; a table reuses the page id of the most recent paragraph (1 if
	        there was none).

	        Returns:
	            The extracted objects, in document order.

	        Raises:
	            FileNotFoundError: if ``self.path`` does not exist.
	            ValueError: if anything goes wrong while reading or converting the
	                document; the original exception is attached as the cause.
	        """
	        if not os.path.exists(self.path):
	            raise FileNotFoundError(f"The file {self.path} does not exist.")
	        try:
	            doc = docx.Document(self.path)
	            paragraph_objects = []
	            page_id = 1
	            total_characters = 0
	            for block in self.iter_block_items(doc):
	                if isinstance(block, Paragraph):
	                    paragraph_info = self.extract_paragraph_info(block)
	                    if not paragraph_info:
	                        continue  # empty paragraph
	                    # Estimate the page *before* counting this paragraph's text.
	                    page_id = self.estimate_page_number(total_characters)
	                    text = paragraph_info['text']
	                    style = paragraph_info['style']
	                    total_characters += len(text)
	                elif isinstance(block, Table):
	                    # NOTE(review): table text does not advance the character
	                    # count used for page estimation — confirm this is intended.
	                    text, style = self.table_to_paragraph(block)
	                    if not text.strip():
	                        continue  # empty table
	                else:
	                    continue
	                paragraph_objects.append(
	                    Requirement_Paragraph(
	                        text=text,
	                        font_style=style,
	                        id_=len(paragraph_objects),
	                        page_id=page_id,
	                    )
	                )
	            return paragraph_objects
	        except Exception as e:
	            raise ValueError(
	                f"Error reading the .docx file. Original error: {str(e)}"
	            ) from e

	    def determine_predominant_style(self, styles):
	        """Return the style that occurs most often in *styles*.

	        Args:
	            styles: iterable of style names, one entry per occurrence.

	        Returns:
	            The most frequent style; on a tie, the one encountered first.
	            The string ``"None"`` when *styles* is empty.
	        """
	        style_counts = Counter(styles)
	        return max(style_counts, key=style_counts.get, default="None")

	    def estimate_page_number(self, total_characters, avg_chars_per_page=2000):
	        """Estimate a 1-based page number from a running character count.

	        Args:
	            total_characters: number of characters preceding the position.
	            avg_chars_per_page: assumed number of characters on one page.

	        Returns:
	            ``total_characters // avg_chars_per_page + 1``.
	        """
	        return total_characters // avg_chars_per_page + 1

	    def extract_paragraph_info(self, paragraph):
	        """Collect the text, style name and per-run formatting of *paragraph*.

	        Args:
	            paragraph: object exposing ``text``, ``style`` and ``runs``.

	        Returns:
	            ``None`` when the paragraph text is empty or whitespace only,
	            otherwise a dict with keys ``'text'``, ``'style'`` (the style name,
	            or the string ``'None'`` when the paragraph has no style) and
	            ``'runs'`` (one dict of formatting details per run).
	        """
	        if not paragraph.text.strip():
	            return None

	        paragraph_style = paragraph.style.name if paragraph.style else 'None'

	        runs = [
	            {
	                'text': run.text,
	                'font_name': run.font.name,
	                'font_size': run.font.size.pt if run.font.size else None,
	                'bold': run.bold,
	                'italic': run.italic,
	                'underline': run.underline,
	            }
	            for run in paragraph.runs
	        ]

	        return {
	            'text': paragraph.text,
	            'style': paragraph_style,
	            'runs': runs,
	        }

	    def table_to_paragraph(self, table):
	        """Flatten *table* into one text block and pick its predominant style.

	        Each cell contributes the concatenated run text of its paragraphs
	        (paragraphs separated by a space) followed by `` | ``; rows are
	        separated by newlines.

	        Args:
	            table: object exposing ``rows`` -> ``cells`` -> ``paragraphs``.

	        Returns:
	            ``(text, style)`` where *text* is the flattened, stripped table
	            text and *style* is the most frequent paragraph style name found
	            in the cells (``"None"`` for a table without paragraphs).
	        """
	        table_text = ""
	        # One entry per paragraph (not a set) so that frequencies are real and
	        # the predominant style is deterministic.
	        table_styles = []

	        for row in table.rows:
	            for cell in row.cells:
	                cell_parts = []
	                for paragraph in cell.paragraphs:
	                    table_styles.append(
	                        paragraph.style.name if paragraph.style else 'None'
	                    )
	                    cell_parts.append(
	                        "".join(run.text for run in paragraph.runs) + " "
	                    )
	                table_text += "".join(cell_parts).strip() + " | "
	            table_text = table_text.strip() + "\n"

	        predominant_style = self.determine_predominant_style(table_styles)

	        return table_text.strip(), predominant_style

	    def print_paragraphs_and_tables(self):
	        """Print the text of every extracted item, re-reading the document.

	        Errors are reported on stdout instead of being raised.
	        """
	        try:
	            print("start")
	            for item in self.get_paragraphs():
	                # NOTE(review): assumes Requirement_Paragraph exposes its text
	                # as ``.text`` (mirroring the constructor keyword) — confirm.
	                # The object itself is printed as a fallback.
	                print("Paragraph:", getattr(item, "text", item))
	                print('-' * 40)  # separator for clarity
	        except Exception as e:
	            print(f"Error: {str(e)}")