Spaces:

LazyHuman
/

plexi

Running

App Files Files Community

LazyHuman10 committed on Apr 4

Commit

8657f7c

1 Parent(s): dcfde3c

Sync lightweight pure-Python PDF conversion to space

Browse files

Files changed (3) hide show

pages/Study_Material_Hub.py +58 -19
requirements.txt +3 -0
utils.py +169 -0

pages/Study_Material_Hub.py CHANGED Viewed

@@ -4,10 +4,10 @@ from pathlib import Path
 from urllib.parse import quote
 import streamlit as st
-import streamlit.components.v1 as components
 from streamlit_pdf_viewer import pdf_viewer
 from utils import (
     APP_ICON_PATH,
     download_github_file,
     get_manifest,
     get_mime_type,
@@ -24,12 +24,7 @@ st.set_page_config(
 )
 inject_theme()
-OFFICE_PREVIEW_MIMES = {
-    "application/msword",
-    "application/vnd.ms-powerpoint",
-    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-}
 def format_file_label(filename):
@@ -55,15 +50,54 @@ def display_pdf(file_content):
     pdf_viewer(file_content, width="100%", height=700)
-def display_office_document(download_url, filename):
-    """Display Word and PowerPoint files using Office Web Viewer."""
-    encoded_url = quote(download_url, safe="")
-    preview_url = f"https://view.officeapps.live.com/op/embed.aspx?src={encoded_url}"
-    components.iframe(preview_url, width=None, height=700, scrolling=True)
-    st.caption(
-        f"Inline preview for `{format_file_label(filename)}` is powered by Office Web Viewer. "
-        "If it does not load, use the download button."
-    )
 try:
     manifest = get_manifest()
@@ -214,12 +248,17 @@ with preview_col:
         unsafe_allow_html=True,
     )
-    if file_mime == "application/pdf":
         display_pdf(file_content)
-    elif file_mime in OFFICE_PREVIEW_MIMES:
         display_office_document(
-            selected_file_obj["download_url"], selected_file_obj["name"]
         )
     else:
         st.info(
             "Preview is not available for this file type. Download it to inspect the content."

 from urllib.parse import quote
 import streamlit as st
 from streamlit_pdf_viewer import pdf_viewer
 from utils import (
     APP_ICON_PATH,
+    convert_office_to_pdf,
     download_github_file,
     get_manifest,
     get_mime_type,
 )
 inject_theme()
+# Office previews are now selected by file extension, so the MIME-type allow-list was removed.
 def format_file_label(filename):
     pdf_viewer(file_content, width="100%", height=700)
+def display_office_document(file_content, download_url, filename):
+    """Preview a Word / PowerPoint file, falling back to Office Web Viewer.
+
+    The file is first converted to PDF with ``convert_office_to_pdf`` and
+    shown inline.  When the converter returns a falsy result, a callout and
+    a link button pointing at Microsoft Office Web Viewer are rendered
+    instead.
+
+    Parameters
+    ----------
+    file_content
+        Raw content of the Office file; passed unchanged to the converter.
+    download_url
+        URL of the file; only used to build the fallback viewer link.
+    filename
+        File name; its extension selects the wording ("presentation" for
+        ``ppt``/``pptx``, "document" otherwise) and it is shown as a label.
+    """
+    from html import escape  # local import: only the fallback markup needs it
+    suffix = Path(filename).suffix.lower().lstrip(".")
+    type_label = "presentation" if suffix in ("ppt", "pptx") else "document"
+    with st.spinner(f"Converting {type_label} to PDF for preview…"):
+        pdf_bytes = convert_office_to_pdf(file_content, filename)
+    if pdf_bytes:
+        pdf_viewer(pdf_bytes, width="100%", height=700)
+        st.caption(
+            f"Inline preview of `{format_file_label(filename)}` "
+            "(converted to PDF on the server)."
+        )
+        return
+    # Fallback – open in Office Web Viewer in a new tab
+    encoded_url = quote(download_url, safe="")
+    preview_url = (
+        f"https://view.officeapps.live.com/op/view.aspx?src={encoded_url}"
+    )
+    # The label is derived from a file name and is rendered with
+    # unsafe_allow_html=True, so it must be HTML-escaped before interpolation.
+    safe_label = escape(format_file_label(filename))
+    st.markdown(
+        f"""
+        <section class="plexi-callout" style="text-align:center;padding:2.5rem 1.5rem;">
+            <div style="font-size:3rem;margin-bottom:0.6rem;">📄</div>
+            <div class="plexi-sidecard-title">{safe_label}</div>
+            <div class="plexi-muted" style="margin-bottom:1rem;">
+                Server-side conversion is not available right now.
+                Open the {type_label} in Microsoft Office Web Viewer instead.
+            </div>
+        </section>
+        """,
+        unsafe_allow_html=True,
+    )
+    st.link_button(
+        f"🔗  Open {type_label.capitalize()} in Office Viewer",
+        preview_url,
+        use_container_width=True,
+        type="primary",
+    )
+    st.caption(
+        "Powered by Microsoft Office Web Viewer. "
+        "You can also download the file directly using the button on the right."
+    )
 try:
     manifest = get_manifest()
         unsafe_allow_html=True,
     )
+    ext = Path(selected_file_obj["name"]).suffix.lower()
+    if ext == ".pdf":
         display_pdf(file_content)
+    elif ext in (".ppt", ".pptx", ".doc", ".docx"):
         display_office_document(
+            file_content, selected_file_obj["download_url"], selected_file_obj["name"]
         )
+    elif file_mime.startswith("text/"):
+        # Basic text preview support (optional, if needed)
+        st.code(file_content.decode("utf-8", errors="replace"))
     else:
         st.info(
             "Preview is not available for this file type. Download it to inspect the content."

requirements.txt CHANGED Viewed

@@ -3,6 +3,9 @@ python-dotenv
 requests
 PyPDF2
 streamlit-pdf-viewer
 llama-index-core
 llama-index-embeddings-huggingface
 sentence-transformers

 requests
 PyPDF2
 streamlit-pdf-viewer
+python-pptx
+python-docx
+fpdf2
 llama-index-core
 llama-index-embeddings-huggingface
 sentence-transformers

utils.py CHANGED Viewed

@@ -789,6 +789,175 @@ def get_mime_type(filename):
     return mime or "application/octet-stream"
 def render_sidebar_intro():
     """Render the shared sidebar intro card."""
     with st.sidebar:

     return mime or "application/octet-stream"
+# ── Lightweight Office → PDF conversion (pure Python) ──────────────────
+#
+# Uses python-pptx / python-docx to extract content and fpdf2 to render
+# PDF pages.  No system packages (LibreOffice, etc.) required.
+# The output is a readable *preview*, not a pixel-perfect replica.
+def _pptx_to_pdf(file_bytes):
+    """Convert PPTX bytes to a text-and-images PDF preview.
+
+    Each slide becomes one PDF page (sized like the slide) that contains a
+    "Slide N" chip, any pictures found on the slide, a title and the
+    remaining text.  Pictures are drawn while the shapes are scanned, so
+    they appear above the title and body text.
+
+    Parameters
+    ----------
+    file_bytes : bytes
+        Content of the ``.pptx`` file.
+
+    Returns
+    -------
+    bytes
+        The rendered PDF.  Errors raised by python-pptx or fpdf2 propagate
+        to the caller.
+    """
+    import io  # local import keeps this helper self-contained
+    from fpdf import FPDF
+    from pptx import Presentation as PptxPresentation
+    from pptx.enum.shapes import MSO_SHAPE_TYPE
+    EMU_PER_INCH = 914400
+    MM_PER_INCH = 25.4
+    TITLE_SIZE = 18
+    BODY_SIZE = 11
+    MARGIN = 14
+    def _latin1(text):
+        """Replace characters the built-in Helvetica font cannot encode."""
+        # Core PDF fonts are Latin-1 only; without this, a single smart quote
+        # or bullet character makes fpdf2 raise and the whole preview is lost.
+        return text.encode("latin-1", errors="replace").decode("latin-1")
+    prs = PptxPresentation(io.BytesIO(file_bytes))
+    # Slide dimensions in mm; fall back to the default 10" × 7.5" when the
+    # presentation does not store a slide size.
+    slide_w_mm = (prs.slide_width or 9144000) / EMU_PER_INCH * MM_PER_INCH
+    slide_h_mm = (prs.slide_height or 6858000) / EMU_PER_INCH * MM_PER_INCH
+    pdf = FPDF(orientation="L", unit="mm", format=(slide_h_mm, slide_w_mm))
+    pdf.set_auto_page_break(auto=True, margin=12)
+    # Images are scaled to the usable page width, capped at 120 mm.
+    image_w = min(slide_w_mm - 2 * MARGIN, 120)
+    for slide_idx, slide in enumerate(prs.slides, start=1):
+        pdf.add_page()
+        pdf.set_left_margin(MARGIN)
+        pdf.set_right_margin(MARGIN)
+        pdf.set_y(MARGIN)
+        # ── Slide number chip ──
+        pdf.set_font("Helvetica", "I", 8)
+        pdf.set_text_color(120, 120, 120)
+        pdf.cell(0, 5, f"Slide {slide_idx}", ln=True)
+        pdf.ln(2)
+        # ── Extract text from shapes ──
+        title_text = ""
+        body_parts = []
+        for shape in slide.shapes:
+            if shape.has_text_frame:
+                for para in shape.text_frame.paragraphs:
+                    text = para.text.strip()
+                    if not text:
+                        continue
+                    # Heuristic: the first non-empty paragraph found in a
+                    # placeholder with idx 0 or 1 is used as the title.
+                    # `placeholder_format` raises ValueError on shapes that
+                    # are not placeholders, so test `is_placeholder` first.
+                    if not title_text and shape.is_placeholder:
+                        if shape.placeholder_format.idx in (0, 1):
+                            title_text = text
+                            continue
+                    body_parts.append(text)
+            # ── Embedded images ──
+            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                try:
+                    img_stream = io.BytesIO(shape.image.blob)
+                    pdf.image(img_stream, x=MARGIN, w=image_w)
+                    pdf.ln(4)
+                except Exception:
+                    # Best effort: an unreadable or unsupported image must
+                    # not abort the whole preview.
+                    pass
+        # ── Render title ──
+        if title_text:
+            pdf.set_font("Helvetica", "B", TITLE_SIZE)
+            pdf.set_text_color(22, 49, 44)  # plexi-ink dark
+            pdf.multi_cell(0, TITLE_SIZE * 0.5, _latin1(title_text))
+            pdf.ln(4)
+        # ── Render body text ──
+        if body_parts:
+            pdf.set_font("Helvetica", "", BODY_SIZE)
+            pdf.set_text_color(50, 50, 50)
+            for part in body_parts:
+                pdf.multi_cell(0, BODY_SIZE * 0.45, _latin1(part))
+                pdf.ln(2)
+    if len(prs.slides) == 0:
+        # Emit a single placeholder page so the result is still a valid PDF.
+        pdf.add_page()
+        pdf.set_font("Helvetica", "I", 12)
+        pdf.cell(0, 10, "This presentation has no slides.", ln=True)
+    return bytes(pdf.output())
+def _docx_to_pdf(file_bytes):
+    """Convert DOCX bytes to a text-and-images PDF preview.
+
+    Paragraphs are rendered in order on A4 pages: "Heading 1"–"Heading 3"
+    styles get larger bold type, other paragraphs are set in body size and
+    are bold when any of their runs is bold.  Images referenced by the
+    document are appended after all text, not at their original position.
+
+    Parameters
+    ----------
+    file_bytes : bytes
+        Content of the ``.docx`` file.
+
+    Returns
+    -------
+    bytes
+        The rendered PDF.  Errors raised by python-docx or fpdf2 propagate
+        to the caller.
+    """
+    import io  # local import keeps this helper self-contained
+    from docx import Document as DocxDocument
+    from fpdf import FPDF
+    MARGIN = 16
+    BODY_SIZE = 11
+    HEADING_SIZES = {"Heading 1": 20, "Heading 2": 16, "Heading 3": 14}
+    def _latin1(text):
+        """Replace characters the built-in Helvetica font cannot encode."""
+        # Core PDF fonts are Latin-1 only; without this, a single smart quote
+        # or bullet character makes fpdf2 raise and the whole preview is lost.
+        return text.encode("latin-1", errors="replace").decode("latin-1")
+    doc = DocxDocument(io.BytesIO(file_bytes))
+    pdf = FPDF(orientation="P", unit="mm", format="A4")
+    pdf.set_auto_page_break(auto=True, margin=15)
+    pdf.add_page()
+    pdf.set_left_margin(MARGIN)
+    pdf.set_right_margin(MARGIN)
+    for para in doc.paragraphs:
+        text = para.text.strip()
+        if not text:
+            # Keep a little vertical space for empty paragraphs.
+            pdf.ln(3)
+            continue
+        style_name = para.style.name if para.style else ""
+        if style_name in HEADING_SIZES:
+            size = HEADING_SIZES[style_name]
+            pdf.set_font("Helvetica", "B", size)
+            pdf.set_text_color(22, 49, 44)
+            pdf.ln(4)
+            pdf.multi_cell(0, size * 0.5, _latin1(text))
+            pdf.ln(3)
+        else:
+            # `run.bold` is True, False or None (inherited); only an
+            # explicit True marks the paragraph as bold.
+            is_bold = any(run.bold for run in para.runs)
+            pdf.set_font("Helvetica", "B" if is_bold else "", BODY_SIZE)
+            pdf.set_text_color(50, 50, 50)
+            pdf.multi_cell(0, BODY_SIZE * 0.45, _latin1(text))
+            pdf.ln(1.5)
+    # ── Images (appended after the text) ──
+    for rel in doc.part.rels.values():
+        if "image" in rel.reltype:
+            try:
+                img_stream = io.BytesIO(rel.target_part.blob)
+                pdf.image(img_stream, x=MARGIN, w=100)
+                pdf.ln(4)
+            except Exception:
+                # Best effort: an unreadable or unsupported image must not
+                # abort the whole preview.
+                pass
+    return bytes(pdf.output())
+def convert_office_to_pdf(file_bytes, filename):
+    """Build a PDF preview for an Office file, chosen by file extension.
+
+    ``.pptx`` / ``.ppt`` are handed to ``_pptx_to_pdf`` and ``.docx`` /
+    ``.doc`` to ``_docx_to_pdf``.  Any exception raised by the converter is
+    printed and swallowed, so callers only need to check for ``None``.
+
+    NOTE(review): the legacy ``.ppt`` / ``.doc`` extensions are routed to
+    the same converters as the OOXML formats; presumably those files fail
+    to parse and end up as ``None`` — confirm.
+
+    Returns
+    -------
+    bytes | None
+        PDF bytes on success; ``None`` for unsupported extensions or when
+        the conversion raised.
+    """
+    suffix = Path(filename).suffix.lower()
+    if suffix in (".pptx", ".ppt"):
+        converter = _pptx_to_pdf
+    elif suffix in (".docx", ".doc"):
+        converter = _docx_to_pdf
+    else:
+        return None
+    try:
+        return converter(file_bytes)
+    except Exception as err:
+        print(f"Office-to-PDF conversion error ({filename}): {err}")
+        return None
 def render_sidebar_intro():
     """Render the shared sidebar intro card."""
     with st.sidebar: