import os
from collections import Counter

import docx
from docx.document import Document as _Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

from src.domain.requirements_paragraphs import Requirement_Paragraph
|
|
class WordReader:
    """Read a .docx file into an ordered list of ``Requirement_Paragraph`` objects.

    Paragraphs and tables are visited in document order. Every non-blank
    paragraph becomes one entry; every table is flattened into a single
    text entry (cells separated by ``" | "``, rows by newlines).
    """

    def __init__(self, path):
        """Remember *path* and parse the document immediately.

        Args:
            path: Location of the .docx file to read.

        Raises:
            FileNotFoundError: If *path* does not exist.
            ValueError: If the document cannot be read or parsed.
        """
        self.path = path
        self.paragraphs = self.get_paragraphs()

    def iter_block_items(self, parent):
        """Yield the paragraphs and tables directly under *parent*, in order.

        Args:
            parent: A python-docx ``Document`` or a table ``_Cell``.

        Yields:
            ``Paragraph`` or ``Table`` wrappers for each matching child
            element; other child element types are skipped.

        Raises:
            ValueError: If *parent* is neither a document nor a cell.
        """
        if isinstance(parent, _Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("Unsupported parent type")

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    def get_paragraphs(self):
        """Parse the document at ``self.path`` into ``Requirement_Paragraph`` objects.

        Blank paragraphs and tables whose flattened text is blank are
        skipped. Ids are assigned sequentially from 0 in document order.

        Returns:
            A list of ``Requirement_Paragraph`` objects.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
            ValueError: If anything goes wrong while reading the document;
                the original exception is chained as ``__cause__``.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"The file {self.path} does not exist.")
        try:
            doc = docx.Document(self.path)
            paragraph_objects = []
            page_id = 1
            total_characters = 0
            for block in self.iter_block_items(doc):
                if isinstance(block, Paragraph):
                    paragraph_info = self.extract_paragraph_info(block)
                    if not paragraph_info:
                        continue
                    text = paragraph_info['text']
                    style = paragraph_info['style']
                    # The page is estimated from the characters seen *before*
                    # this paragraph, i.e. where the paragraph starts.
                    page_id = self.estimate_page_number(total_characters)
                    total_characters += len(text)
                elif isinstance(block, Table):
                    text, style = self.table_to_paragraph(block)
                    if not text.strip():
                        continue
                    # NOTE: table text does not advance the character count,
                    # so a table inherits the page of the preceding paragraph.
                else:
                    continue

                paragraph_objects.append(
                    Requirement_Paragraph(
                        text=text,
                        font_style=style,
                        # Ids are sequential, so the next id is the current length.
                        id_=len(paragraph_objects),
                        page_id=page_id,
                    )
                )
            return paragraph_objects
        except Exception as e:
            raise ValueError(
                f"Error reading the .docx file. Original error: {str(e)}"
            ) from e

    def determine_predominant_style(self, styles):
        """Return the most frequent entry of *styles*.

        Args:
            styles: Iterable of style names; duplicates are what make a
                style "predominant", so pass a list rather than a set.

        Returns:
            The most common style name. Ties go to the style that appears
            first. Returns the string ``"None"`` when *styles* is empty.
        """
        style_counts = Counter(styles)
        return max(style_counts, key=style_counts.get, default="None")

    def estimate_page_number(self, total_characters, avg_chars_per_page=2000):
        """Estimate a 1-based page number from a running character count.

        Args:
            total_characters: Number of characters preceding the position.
            avg_chars_per_page: Assumed characters per page (default 2000).

        Returns:
            ``total_characters // avg_chars_per_page + 1``.

        Raises:
            ValueError: If *avg_chars_per_page* is not positive.
        """
        if avg_chars_per_page <= 0:
            raise ValueError("avg_chars_per_page must be positive")
        return total_characters // avg_chars_per_page + 1

    def extract_paragraph_info(self, paragraph):
        """Collect the text, style name and run formatting of *paragraph*.

        Args:
            paragraph: A python-docx ``Paragraph``.

        Returns:
            ``None`` when the paragraph text is empty or whitespace only;
            otherwise a dict with keys ``'text'``, ``'style'`` (the style
            name, or the string ``'None'`` when no style is set) and
            ``'runs'`` (one dict per run with ``'text'``, ``'font_name'``,
            ``'font_size'`` in points or ``None``, ``'bold'``, ``'italic'``
            and ``'underline'``).
        """
        if not paragraph.text.strip():
            return None

        runs = [
            {
                'text': run.text,
                'font_name': run.font.name,
                'font_size': run.font.size.pt if run.font.size else None,
                'bold': run.bold,
                'italic': run.italic,
                'underline': run.underline,
            }
            for run in paragraph.runs
        ]

        return {
            'text': paragraph.text,
            'style': paragraph.style.name if paragraph.style else 'None',
            'runs': runs,
        }

    def table_to_paragraph(self, table):
        """Flatten *table* into one text block plus its predominant style.

        Within a cell, the run texts of each paragraph are concatenated and
        paragraphs are separated by a space. Each cell is followed by
        ``" | "`` and each row ends with a newline; the overall result is
        stripped of surrounding whitespace.

        Args:
            table: A python-docx ``Table``.

        Returns:
            A ``(text, style_name)`` tuple, where ``style_name`` is the most
            frequent paragraph style among all cell paragraphs (``"None"``
            if the table has no paragraphs).
        """
        table_text = ""
        # A list (not a set) so that repeated styles are actually counted
        # when picking the predominant one.
        table_styles = []

        for row in table.rows:
            row_parts = []
            for cell in row.cells:
                paragraph_texts = []
                for paragraph in cell.paragraphs:
                    table_styles.append(
                        paragraph.style.name if paragraph.style else 'None'
                    )
                    paragraph_texts.append(
                        "".join(run.text for run in paragraph.runs) + " "
                    )
                row_parts.append("".join(paragraph_texts).strip() + " | ")
            # Strip the accumulated text before appending the row break so
            # the trailing space after the last "|" is dropped.
            table_text = (table_text + "".join(row_parts)).strip() + "\n"

        predominant_style = self.determine_predominant_style(table_styles)
        return table_text.strip(), predominant_style

    def print_paragraphs_and_tables(self):
        """Print the document's paragraphs and tables to stdout.

        Walks the document blocks directly instead of the objects returned
        by ``get_paragraphs`` (those are ``Requirement_Paragraph`` instances,
        not dicts keyed by ``'paragraph'``/``'table'``). Blank paragraphs are
        skipped. This is a best-effort debugging helper: any error is
        printed rather than raised.
        """
        try:
            print("start")
            doc = docx.Document(self.path)
            for block in self.iter_block_items(doc):
                if isinstance(block, Paragraph):
                    paragraph_info = self.extract_paragraph_info(block)
                    if paragraph_info is None:
                        continue
                    print("Paragraph:", paragraph_info['text'])
                elif isinstance(block, Table):
                    print("Table:")
                    for row in block.rows:
                        for cell in row.cells:
                            for paragraph in cell.paragraphs:
                                print(" Cell Paragraph:", paragraph.text)
                else:
                    continue
                print('-' * 40)
        except Exception as e:
            print(f"Error: {str(e)}")
|
|