| ''' |
| This module contains helper functions to load PDFs, extract their text and generate additional metadata. |
| |
| It was initially created for the businessresponsibility.ch project of the Prototype Fund. For more |
| information visit https://github.com/bizres |
| |
| ''' |
| from pdfminer.high_level import extract_pages |
| from pdfminer.layout import LTTextContainer |
| from pdfminer.high_level import extract_text |
|
|
| import fitz |
|
|
| import langid |
# Import-time configuration: limit langid to the four language codes listed below.
| langid.set_languages(['en', 'de','fr','it']) |
|
|
| import pandas as pd |
|
|
def pdf_to_text(file):
    '''
    This function extracts the text of a pdf and splits it into paragraphs.

    Parameters:
    file: the pdf to read; it is passed unchanged to pdfminer's extract_text

    Returns:
    A list of strings, obtained by splitting the extracted text at every
    blank line (two consecutive newlines). Empty pieces are kept.
    '''
    return extract_text(file).split('\n\n')
|
|
|
|
def detect_language(text):
    '''
    This function classifies the language of a text with langid.

    Parameters:
    text: the text to classify

    Returns:
    Whatever langid.classify returns for the text, unchanged.
    NOTE(review): according to the langid documentation this is a
    (language code, score) pair - confirm against the installed version.
    '''
    classification = langid.classify(text)
    return classification
|
|
def count_pages(pdf_file):
    '''
    This function counts the pages of a pdf.

    Parameters:
    pdf_file: the pdf to inspect; it is passed unchanged to pdfminer's extract_pages

    Returns:
    The number of items produced by extract_pages, as an int
    '''
    # Count the pages while iterating instead of collecting every page layout
    # in a list first, so the layouts do not all have to be held in memory at once.
    return sum(1 for _ in extract_pages(pdf_file))
|
|
def pdf_text_to_sections(text):
    '''
    This function turns extracted text into a pandas DataFrame with one row per section.

    The text is first cut at every blank line (two consecutive newlines); each
    piece is treated as one page and numbered starting at 1. Every page is then
    cut at single newlines, and each resulting line becomes a section.

    NOTE(review): "page" here means a blank-line-delimited chunk of the text.
    Whether that matches the real page breaks of the pdf depends on how the
    text was produced - confirm against the caller.

    Parameters:
    text: the extracted text as a single string

    Returns:
    A DataFrame with the columns 'page', 'section_index' and 'section_text'.
    section_index starts at 0 and keeps counting across pages.
    '''
    rows = []
    for page_number, page_text in enumerate(text.split('\n\n'), start=1):
        for section_text in page_text.split('\n'):
            # The number of rows collected so far is exactly the running,
            # document-wide index of the section being added.
            rows.append([page_number, len(rows), section_text])

    return pd.DataFrame(rows, columns=['page', 'section_index', 'section_text'])
|
|