| from pptx import Presentation |
| import gradio as gr |
| from pdf2image import convert_from_path |
| import pdfplumber |
| from docx import Document |
| import subprocess |
| import os |
| from typing import Optional, List |
| import string |
| import random |
| import re |
|
|
|
|
def extract_text_from_pptx(file_path):
    """Return the text of every slide in a .pptx presentation.

    The text of all shapes on one slide that expose a ``text`` attribute is
    joined with single newlines; slides are separated by a blank line.
    """
    deck = Presentation(file_path)
    per_slide = [
        "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        for slide in deck.slides
    ]
    return "\n\n".join(per_slide)
|
|
|
|
def extract_text_from_ppt(file_path):
    """Extract the text of a legacy .ppt file by converting it to .pptx first.

    Runs ``unoconv -f pptx`` on *file_path* and then reads the converted file,
    which is expected at the same location with a ``.pptx`` extension. The
    converted file is removed afterwards, whether or not extraction succeeded.

    Returns:
        The extracted text, or the string
        ``"Error extracting text from PPT file"`` if any step fails.
    """
    converted_path = None
    try:
        converted_path = os.path.splitext(file_path)[0] + ".pptx"
        subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
        return extract_text_from_pptx(converted_path)
    except Exception as e:
        # Best-effort contract: callers get a message string, never an exception.
        print(f"Error extracting text from PPT file: {e}")
        return "Error extracting text from PPT file"
    finally:
        # Clean up the intermediate file even when parsing failed, but never
        # delete the caller's own input if it already had a .pptx name.
        if (
            converted_path
            and converted_path != file_path
            and os.path.exists(converted_path)
        ):
            try:
                os.remove(converted_path)
            except OSError as e:
                print(f"Could not remove temporary file {converted_path}: {e}")
|
|
|
|
def extract_text_from_ppt_or_pptx(file_path):
    """Dispatch a PowerPoint upload to the matching text extractor.

    Args:
        file_path: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object exposing the path as ``.name``. Gradio's ``File``
            component can deliver either form depending on its configuration.

    Returns:
        The extracted text, or an "Unsupported file type" message when the
        extension is neither ``.ppt`` nor ``.pptx`` (matched case-insensitively).
    """
    if isinstance(file_path, (str, os.PathLike)):
        path = os.fspath(file_path)
    else:
        path = file_path.name

    lowered = path.lower()
    if lowered.endswith(".pptx"):
        return extract_text_from_pptx(path)
    if lowered.endswith(".ppt"):
        return extract_text_from_ppt(path)
    return "Unsupported file type. Please provide a .ppt or .pptx file."
|
|
|
|
def convert_pdf_to_image(file):
    """Render every page of a PDF and return the result of ``convert_from_path``.

    Args:
        file: A filesystem path (``str`` / ``os.PathLike``) or an upload
            object exposing the path as ``.name``.
    """
    # convert_from_path needs a real path, so unwrap upload objects first.
    if isinstance(file, (str, os.PathLike)):
        pdf_path = os.fspath(file)
    else:
        pdf_path = file.name
    return convert_from_path(pdf_path)
|
|
|
|
def extract_text_from_pdf(file):
    """Return the text of all pages of a PDF, each page followed by a newline.

    *file* is handed to ``pdfplumber.open`` unchanged.
    """
    with pdfplumber.open(file) as pdf:
        # A page without extractable text (e.g. a scanned image) may yield
        # None rather than a string; treat that as an empty page instead of
        # failing on ``None + "\n"``.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
|
|
|
|
def extract_text_from_docx(file):
    """Return the text of every paragraph in a .docx file, one per line.

    Args:
        file: A filesystem path (``str`` / ``os.PathLike``) or an upload
            object exposing the path as ``.name``.
    """
    if isinstance(file, (str, os.PathLike)):
        docx_path = os.fspath(file)
    else:
        docx_path = file.name
    doc = Document(docx_path)
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
|
|
|
|
def convert_doc_to_text(doc_path):
    """Convert a legacy .doc file to plain text with ``unoconv`` and return it.

    ``unoconv --format txt`` is run on *doc_path*; the resulting text file is
    expected next to the input with a ``.txt`` extension. It is read, stripped
    of any leading byte-order mark, and removed again.

    Returns:
        The document text, or ``""`` if ``unoconv`` exits with an error.
    """
    # Swap only the final extension. A plain str.replace(".doc", ".txt") would
    # also rewrite ".doc" inside directory names, and for an upper-case ".DOC"
    # it would return the input path itself, which would then be deleted.
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"

    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    try:
        # NOTE(review): the file is read with the platform default encoding,
        # as before; confirm the encoding unoconv actually writes.
        with open(txt_file_path, "r") as f:
            text = f.read()
    finally:
        # Remove the intermediate file even if reading failed, but never the
        # caller's own input.
        if txt_file_path != doc_path and os.path.exists(txt_file_path):
            os.remove(txt_file_path)

    return text.lstrip("\ufeff")
|
|
|
|
def extract_text_from_doc_or_docx(file):
    """Dispatch a Word upload to the matching text extractor.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object exposing the path as ``.name``. Gradio's ``File``
            component can deliver either form depending on its configuration.

    Returns:
        The extracted text, or an "Unsupported file type" message when the
        name ends in neither ``.doc`` nor ``.docx``.
    """
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    if path.endswith(".docx"):
        return extract_text_from_docx(file)
    if path.endswith(".doc"):
        return convert_doc_to_text(path)
    return "Unsupported file type. Please upload a .doc or .docx file."
|
|
|
|
| |
def generate_random_string(length=23):
    """Return *length* random characters drawn from ASCII letters and digits.

    Used only to build unique-looking HTML element ids; not suitable for
    anything security-sensitive.
    """
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])


# A run of two or more underscores marks a blank to fill in.
_BLANK_RE = re.compile(r"_{2,}")


def _card_html(element_suffix, inner_html):
    """Wrap *inner_html* in the fixed-position rich-text <div> of a card face."""
    return (
        f'<div id="element-richtextarea-{element_suffix}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
        f"{inner_html}</div>"
    )


def handle_json_output(json_list: list):
    """Add HTML renderings and a term type to each flashcard dict, in place.

    Every element must provide ``"frontText"`` and ``"backText"``. Each one
    gains ``"frontHTML"``, ``"backtHTML"`` and ``"termType"``:

    * ``"basic"`` - both texts are wrapped in a paragraph as they are.
    * ``"cloze"`` - each blank in the front text is replaced by the answer in
      a ``<span class="closure">``, ``"backText"`` is emptied and the back
      face becomes an empty paragraph.

    Returns:
        The same list object that was passed in.
    """
    for element in json_list:
        front_id = generate_random_string()
        back_id = generate_random_string()
        front = element["frontText"]
        back = element["backText"]

        blanks = _BLANK_RE.findall(front)
        # The original condition was `cloze_matches != [] & len(cloze_matches != 2)`,
        # which evaluates len() of a bool and raises TypeError for every card.
        # NOTE(review): the "exactly two blanks stays basic" rule is kept as
        # literally written; confirm that `!= 2` is the intended threshold.
        is_cloze = bool(blanks) and len(blanks) != 2

        if is_cloze:
            filled = f'</p><p><span class="closure">{back}</span></p><p>'
            # A callable replacement keeps backslashes in the answer literal.
            front = _BLANK_RE.sub(lambda _match: filled, front)
            back_inner = "<p><br></p>"
            element["backText"] = ""
        else:
            back_inner = f"<p>{back}</p>"

        # NOTE(review): front/back are inserted into HTML unescaped, as before.
        element["frontHTML"] = _card_html(front_id, f"<p>{front}</p>")
        # NOTE(review): the "backtHTML" key looks like a typo for "backHTML";
        # it is kept because consumers of this JSON may depend on it.
        element["backtHTML"] = _card_html(back_id, back_inner)
        element["termType"] = "cloze" if is_cloze else "basic"

    return json_list
|
|
|
|
def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
    """Parse a ``[[front, back], ...]`` literal out of *text* into flashcards.

    Everything before the first ``[`` and after the last ``]`` is discarded,
    so surrounding chatter around the list is tolerated. Each ``(front, back)``
    pair becomes ``{"frontText": front, "backText": back}`` and the result is
    passed through ``handle_json_output``.

    If an entry cannot be unpacked into exactly two values, processing stops
    there and the pairs collected so far are used.

    Returns:
        The list of flashcard dicts, or ``None`` when nothing usable could be
        parsed.
    """
    # Local import keeps this function self-contained.
    import ast

    if not text:
        return None

    start = text.find("[")
    end = text.rfind("]")
    if start == -1 or end < start:
        print("Error parsing the list of lists: no bracketed list found")
        return None

    try:
        # SECURITY: this text comes straight from the user, so it must never
        # be passed to eval(), which would execute arbitrary expressions.
        # literal_eval accepts only Python literals.
        parsed = ast.literal_eval(text[start : end + 1])
    except Exception as e:
        print(f"Error parsing the list of lists: {e}")
        return None

    if not isinstance(parsed, list):
        print("The evaluated object is not a list.")
        return None

    out = []
    try:
        for front, back in parsed:
            out.append({"frontText": front, "backText": back})
    except Exception as e:
        # Malformed entry: keep whatever was collected before it.
        print(e)
        if not out:
            return None

    try:
        return handle_json_output(out)
    except Exception as e:
        print(f"Error parsing the list of lists: {e}")
        return None
|
|
|
|
# One Gradio interface per conversion; each is also exposed as a named API
# endpoint via ``api_name``.

# PDF upload -> gallery of page images.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

# PDF upload -> extracted text.
pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

# Word upload (.doc / .docx) -> extracted text.
doc_or_docx_to_text = gr.Interface(
    fn=extract_text_from_doc_or_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(
        placeholder="Extracted text from DOC or DOCX will appear here"
    ),
    api_name="doc_or_docx_to_text",
)

# PowerPoint upload (.ppt / .pptx) -> extracted text.
pptx_or_ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt_or_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
    api_name="pptx_or_ppt_to_text",
)

# Free text containing a list of [front, back] pairs -> flashcard JSON.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)
|
|
# Combine the individual interfaces into one tabbed app; the tab names are
# listed in the same order as the interfaces.
demo = gr.TabbedInterface(
    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, str_to_json],
    [
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC/DOCX Text",
        "Extract PPTX/PPT Text",
        "Extract Json",
    ],
)


# Listen on all interfaces. The address previously read "0.0.0.0." with a
# stray trailing dot, which is not a valid IPv4 literal.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
|