LazyHuman10 committed on
Commit ·
8657f7c
1
Parent(s): dcfde3c
Sync lightweight pure-Python PDF conversion to space
Browse files- pages/Study_Material_Hub.py +58 -19
- requirements.txt +3 -0
- utils.py +169 -0
pages/Study_Material_Hub.py
CHANGED
|
@@ -4,10 +4,10 @@ from pathlib import Path
|
|
| 4 |
from urllib.parse import quote
|
| 5 |
|
| 6 |
import streamlit as st
|
| 7 |
-
import streamlit.components.v1 as components
|
| 8 |
from streamlit_pdf_viewer import pdf_viewer
|
| 9 |
from utils import (
|
| 10 |
APP_ICON_PATH,
|
|
|
|
| 11 |
download_github_file,
|
| 12 |
get_manifest,
|
| 13 |
get_mime_type,
|
|
@@ -24,12 +24,7 @@ st.set_page_config(
|
|
| 24 |
)
|
| 25 |
inject_theme()
|
| 26 |
|
| 27 |
-
|
| 28 |
-
"application/msword",
|
| 29 |
-
"application/vnd.ms-powerpoint",
|
| 30 |
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
| 31 |
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
| 32 |
-
}
|
| 33 |
|
| 34 |
|
| 35 |
def format_file_label(filename):
|
|
@@ -55,15 +50,54 @@ def display_pdf(file_content):
|
|
| 55 |
pdf_viewer(file_content, width="100%", height=700)
|
| 56 |
|
| 57 |
|
| 58 |
-
def display_office_document(download_url, filename):
|
| 59 |
-
"""Display Word
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
try:
|
| 69 |
manifest = get_manifest()
|
|
@@ -214,12 +248,17 @@ with preview_col:
|
|
| 214 |
unsafe_allow_html=True,
|
| 215 |
)
|
| 216 |
|
| 217 |
-
|
|
|
|
|
|
|
| 218 |
display_pdf(file_content)
|
| 219 |
-
elif
|
| 220 |
display_office_document(
|
| 221 |
-
selected_file_obj["download_url"], selected_file_obj["name"]
|
| 222 |
)
|
|
|
|
|
|
|
|
|
|
| 223 |
else:
|
| 224 |
st.info(
|
| 225 |
"Preview is not available for this file type. Download it to inspect the content."
|
|
|
|
| 4 |
from urllib.parse import quote
|
| 5 |
|
| 6 |
import streamlit as st
|
|
|
|
| 7 |
from streamlit_pdf_viewer import pdf_viewer
|
| 8 |
from utils import (
|
| 9 |
APP_ICON_PATH,
|
| 10 |
+
convert_office_to_pdf,
|
| 11 |
download_github_file,
|
| 12 |
get_manifest,
|
| 13 |
get_mime_type,
|
|
|
|
| 24 |
)
|
| 25 |
inject_theme()
|
| 26 |
|
| 27 |
+
# No longer need strict MIME-type mappings since we check extensions robustly.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def format_file_label(filename):
|
|
|
|
| 50 |
pdf_viewer(file_content, width="100%", height=700)
|
| 51 |
|
| 52 |
|
| 53 |
+
def display_office_document(file_content, download_url, filename):
    """Display a Word / PowerPoint file by converting it to PDF server-side.

    The raw bytes are handed to ``convert_office_to_pdf``. When that yields a
    PDF it is rendered inline with ``pdf_viewer``; otherwise a callout and a
    link button point the user at Microsoft Office Web Viewer, which opens
    ``download_url`` in a new tab.

    Parameters
    ----------
    file_content : bytes
        Raw bytes of the Office document.
    download_url : str
        URL of the file; percent-encoded and passed to the Office Web Viewer
        in the fallback path.
    filename : str
        File name; its suffix decides whether the UI says "presentation"
        (``ppt`` / ``pptx``) or "document" (anything else).
    """
    import html  # local import: only needed to escape the fallback markup

    suffix = Path(filename).suffix.lower().lstrip(".")
    type_label = "presentation" if suffix in ("ppt", "pptx") else "document"
    file_label = format_file_label(filename)

    with st.spinner(f"Converting {type_label} to PDF for preview…"):
        pdf_bytes = convert_office_to_pdf(file_content, filename)

    if pdf_bytes:
        pdf_viewer(pdf_bytes, width="100%", height=700)
        st.caption(
            f"Inline preview of `{file_label}` "
            "(converted to PDF on the server)."
        )
        return

    # Fallback — open in Office Web Viewer in a new tab.
    encoded_url = quote(download_url, safe="")
    preview_url = f"https://view.officeapps.live.com/op/view.aspx?src={encoded_url}"

    # The label is derived from a file name and is rendered with
    # unsafe_allow_html=True, so it must be escaped before interpolation.
    safe_label = html.escape(str(file_label))

    # Built without leading indentation so Markdown cannot mistake the
    # markup for an indented code block.
    callout_html = (
        '<section class="plexi-callout" style="text-align:center;padding:2.5rem 1.5rem;">\n'
        '<div style="font-size:3rem;margin-bottom:0.6rem;">📄</div>\n'
        f'<div class="plexi-sidecard-title">{safe_label}</div>\n'
        '<div class="plexi-muted" style="margin-bottom:1rem;">\n'
        "Server-side conversion is not available right now.\n"
        f"Open the {type_label} in Microsoft Office Web Viewer instead.\n"
        "</div>\n"
        "</section>\n"
    )
    st.markdown(callout_html, unsafe_allow_html=True)

    st.link_button(
        f"🔗 Open {type_label.capitalize()} in Office Viewer",
        preview_url,
        use_container_width=True,
        type="primary",
    )
    st.caption(
        "Powered by Microsoft Office Web Viewer. "
        "You can also download the file directly using the button on the right."
    )
|
| 101 |
|
| 102 |
try:
|
| 103 |
manifest = get_manifest()
|
|
|
|
| 248 |
unsafe_allow_html=True,
|
| 249 |
)
|
| 250 |
|
| 251 |
+
ext = Path(selected_file_obj["name"]).suffix.lower()
|
| 252 |
+
|
| 253 |
+
if ext == ".pdf":
|
| 254 |
display_pdf(file_content)
|
| 255 |
+
elif ext in (".ppt", ".pptx", ".doc", ".docx"):
|
| 256 |
display_office_document(
|
| 257 |
+
file_content, selected_file_obj["download_url"], selected_file_obj["name"]
|
| 258 |
)
|
| 259 |
+
elif file_mime.startswith("text/"):
|
| 260 |
+
# Basic text preview support (optional, if needed)
|
| 261 |
+
st.code(file_content.decode("utf-8", errors="replace"))
|
| 262 |
else:
|
| 263 |
st.info(
|
| 264 |
"Preview is not available for this file type. Download it to inspect the content."
|
requirements.txt
CHANGED
|
@@ -3,6 +3,9 @@ python-dotenv
|
|
| 3 |
requests
|
| 4 |
PyPDF2
|
| 5 |
streamlit-pdf-viewer
|
|
|
|
|
|
|
|
|
|
| 6 |
llama-index-core
|
| 7 |
llama-index-embeddings-huggingface
|
| 8 |
sentence-transformers
|
|
|
|
| 3 |
requests
|
| 4 |
PyPDF2
|
| 5 |
streamlit-pdf-viewer
|
| 6 |
+
python-pptx
|
| 7 |
+
python-docx
|
| 8 |
+
fpdf2
|
| 9 |
llama-index-core
|
| 10 |
llama-index-embeddings-huggingface
|
| 11 |
sentence-transformers
|
utils.py
CHANGED
|
@@ -789,6 +789,175 @@ def get_mime_type(filename):
|
|
| 789 |
return mime or "application/octet-stream"
|
| 790 |
|
| 791 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
def render_sidebar_intro():
|
| 793 |
"""Render the shared sidebar intro card."""
|
| 794 |
with st.sidebar:
|
|
|
|
| 789 |
return mime or "application/octet-stream"
|
| 790 |
|
| 791 |
|
| 792 |
+
# ── Lightweight Office → PDF conversion (pure Python) ──────────────────
|
| 793 |
+
#
|
| 794 |
+
# Uses python-pptx / python-docx to extract content and fpdf2 to render
|
| 795 |
+
# PDF pages. No system packages (LibreOffice, etc.) required.
|
| 796 |
+
# The output is a readable *preview*, not a pixel-perfect replica.
|
| 797 |
+
|
| 798 |
+
def _pptx_to_pdf(file_bytes):
    """Convert PPTX bytes to PDF bytes using python-pptx + fpdf2.

    Each slide becomes one (or more, if the text overflows) landscape PDF
    page sized like the slide. A page carries a small "Slide N" label, the
    slide title, the remaining paragraph text, and then any pictures found
    on the slide. This is a readable preview, not a layout-faithful render.

    Parameters
    ----------
    file_bytes : bytes
        Raw contents of a ``.pptx`` file.

    Returns
    -------
    bytes
        The generated PDF document.

    Raises
    ------
    Exception
        Whatever python-pptx / fpdf2 raise for an unreadable presentation;
        the caller is expected to handle it.
    """
    from fpdf import FPDF
    from pptx import Presentation as PptxPresentation
    from pptx.enum.shapes import MSO_SHAPE_TYPE
    from pptx.util import Emu

    TITLE_SIZE = 18
    BODY_SIZE = 11
    MARGIN = 14
    MAX_IMAGE_W = 120
    EMU_PER_INCH = Emu(914400)

    # Common typographic characters that have no Latin-1 code point but do
    # have an obvious ASCII stand-in.
    typographic = str.maketrans({
        "\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"',
        "\u2013": "-", "\u2014": "-", "\u2026": "...", "\u2022": "*",
        "\u00a0": " ",
    })

    def _core_font_safe(text):
        # The built-in Helvetica font only covers Latin-1; anything outside
        # it would make fpdf2 raise and abort the whole preview, so such
        # characters are replaced with "?" instead.
        return text.translate(typographic).encode("latin-1", "replace").decode("latin-1")

    prs = PptxPresentation(io.BytesIO(file_bytes))

    # Slide dimensions in mm; fall back to the default 10" x 7.5" when the
    # presentation does not declare a slide size.
    slide_w_mm = (prs.slide_width or Emu(9144000)) / EMU_PER_INCH * 25.4
    slide_h_mm = (prs.slide_height or Emu(6858000)) / EMU_PER_INCH * 25.4

    # With orientation="L" fpdf2 swaps the format tuple, so (height, width)
    # yields a page that is slide_w_mm wide and slide_h_mm tall.
    pdf = FPDF(orientation="L", unit="mm", format=(slide_h_mm, slide_w_mm))
    pdf.set_auto_page_break(auto=True, margin=12)

    for slide_idx, slide in enumerate(prs.slides, start=1):
        pdf.add_page()
        pdf.set_left_margin(MARGIN)
        pdf.set_right_margin(MARGIN)
        pdf.set_y(MARGIN)

        # ── Slide number chip ──
        pdf.set_font("Helvetica", "I", 8)
        pdf.set_text_color(120, 120, 120)
        pdf.cell(0, 5, f"Slide {slide_idx}", new_x="LMARGIN", new_y="NEXT")
        pdf.ln(2)

        # ── Collect text and pictures from the shapes ──
        title_text = ""
        body_parts = []
        image_blobs = []

        for shape in slide.shapes:
            if shape.has_text_frame:
                # `placeholder_format` raises ValueError on shapes that are
                # not placeholders, so test `is_placeholder` first.
                is_title_candidate = (
                    shape.is_placeholder
                    and shape.placeholder_format.idx in (0, 1)
                )
                for para in shape.text_frame.paragraphs:
                    text = para.text.strip()
                    if not text:
                        continue
                    # Heuristic: first non-empty text in a title placeholder.
                    if not title_text and is_title_candidate:
                        title_text = text
                        continue
                    body_parts.append(text)

            # ── Embedded images ──
            try:
                is_picture = shape.shape_type == MSO_SHAPE_TYPE.PICTURE
            except NotImplementedError:
                # NOTE(review): python-pptx appears to raise this for shape
                # kinds it cannot classify — treat those as "not a picture".
                is_picture = False
            if is_picture:
                try:
                    image_blobs.append(shape.image.blob)
                except Exception:
                    pass  # deliberate best effort: skip unreadable images

        # ── Render title ──
        if title_text:
            pdf.set_font("Helvetica", "B", TITLE_SIZE)
            pdf.set_text_color(22, 49, 44)  # plexi-ink dark
            pdf.multi_cell(
                0, TITLE_SIZE * 0.5, _core_font_safe(title_text),
                new_x="LMARGIN", new_y="NEXT",
            )
            pdf.ln(4)

        # ── Render body text ──
        if body_parts:
            pdf.set_font("Helvetica", "", BODY_SIZE)
            pdf.set_text_color(50, 50, 50)
            for part in body_parts:
                pdf.multi_cell(
                    0, BODY_SIZE * 0.45, _core_font_safe(part),
                    new_x="LMARGIN", new_y="NEXT",
                )
                pdf.ln(2)

        # ── Render pictures after the text so the title stays on top ──
        max_w = slide_w_mm - 2 * MARGIN
        for blob in image_blobs:
            try:
                # Scale image to fit page width (with margins).
                pdf.image(io.BytesIO(blob), x=MARGIN, w=min(max_w, MAX_IMAGE_W))
                pdf.ln(4)
            except Exception:
                pass  # deliberate best effort: skip images fpdf2 cannot embed

    if len(prs.slides) == 0:
        pdf.add_page()
        pdf.set_font("Helvetica", "I", 12)
        pdf.cell(
            0, 10, "This presentation has no slides.",
            new_x="LMARGIN", new_y="NEXT",
        )

    return bytes(pdf.output())
|
| 881 |
+
|
| 882 |
+
|
| 883 |
+
def _docx_to_pdf(file_bytes):
    """Convert DOCX bytes to PDF bytes using python-docx + fpdf2.

    Body paragraphs are written in document order on A4 portrait pages.
    Paragraphs styled "Heading 1" to "Heading 3" are rendered larger and
    bold; any other paragraph is bold when at least one of its runs is
    explicitly bold. Images referenced by the document part are appended
    after the text (not at their original position). Only ``doc.paragraphs``
    is read, so this is a readable preview rather than a faithful render.

    Parameters
    ----------
    file_bytes : bytes
        Raw contents of a ``.docx`` file.

    Returns
    -------
    bytes
        The generated PDF document.

    Raises
    ------
    Exception
        Whatever python-docx / fpdf2 raise for an unreadable document; the
        caller is expected to handle it.
    """
    from docx import Document as DocxDocument
    from fpdf import FPDF

    MARGIN = 16
    BODY_SIZE = 11
    IMAGE_W = 100
    HEADING_SIZES = {"Heading 1": 20, "Heading 2": 16, "Heading 3": 14}

    # Common typographic characters that have no Latin-1 code point but do
    # have an obvious ASCII stand-in.
    typographic = str.maketrans({
        "\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"',
        "\u2013": "-", "\u2014": "-", "\u2026": "...", "\u2022": "*",
        "\u00a0": " ",
    })

    def _core_font_safe(text):
        # The built-in Helvetica font only covers Latin-1; anything outside
        # it would make fpdf2 raise and abort the whole preview, so such
        # characters are replaced with "?" instead.
        return text.translate(typographic).encode("latin-1", "replace").decode("latin-1")

    doc = DocxDocument(io.BytesIO(file_bytes))

    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_left_margin(MARGIN)
    pdf.set_right_margin(MARGIN)

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            pdf.ln(3)  # keep a little vertical rhythm for blank paragraphs
            continue

        style_name = para.style.name if para.style else ""
        heading_size = HEADING_SIZES.get(style_name)

        if heading_size is not None:
            pdf.set_font("Helvetica", "B", heading_size)
            pdf.set_text_color(22, 49, 44)
            pdf.ln(4)
            pdf.multi_cell(
                0, heading_size * 0.5, _core_font_safe(text),
                new_x="LMARGIN", new_y="NEXT",
            )
            pdf.ln(3)
            continue

        # run.bold is None when the run inherits its weight; only an
        # explicit True counts.
        is_bold = any(run.bold for run in para.runs if run.bold is not None)
        pdf.set_font("Helvetica", "B" if is_bold else "", BODY_SIZE)
        pdf.set_text_color(50, 50, 50)
        pdf.multi_cell(
            0, BODY_SIZE * 0.45, _core_font_safe(text),
            new_x="LMARGIN", new_y="NEXT",
        )
        pdf.ln(1.5)

    # ── Images referenced by the document, appended after the text ──
    for rel in doc.part.rels.values():
        if "image" not in rel.reltype:
            continue
        try:
            pdf.image(io.BytesIO(rel.target_part.blob), x=MARGIN, w=IMAGE_W)
            pdf.ln(4)
        except Exception:
            pass  # deliberate best effort: skip images that cannot be embedded

    return bytes(pdf.output())
|
| 936 |
+
|
| 937 |
+
|
| 938 |
+
def convert_office_to_pdf(file_bytes, filename):
    """Convert an Office document to PDF bytes for inline preview.

    Uses pure-Python libraries (python-pptx, python-docx, fpdf2) so no
    system packages like LibreOffice are needed. The output is a readable
    preview rather than a pixel-perfect replica.

    The converter is chosen from the file extension: ``.pptx`` / ``.ppt``
    go to ``_pptx_to_pdf`` and ``.docx`` / ``.doc`` go to ``_docx_to_pdf``.
    Both converters read ZIP-based (OOXML) files only, so a payload that is
    empty or not a ZIP container (for example a legacy binary ``.doc`` or
    ``.ppt``) is rejected up front instead of being parsed and failing.

    Parameters
    ----------
    file_bytes : bytes
        Raw contents of the document.
    filename : str
        File name; only its suffix is inspected.

    Returns
    -------
    bytes | None
        PDF bytes on success, or ``None`` when the extension is not
        supported, the payload is not an OOXML container, or conversion
        fails.
    """
    import io
    import logging
    import zipfile

    log = logging.getLogger(__name__)

    ext = Path(filename).suffix.lower()
    is_presentation = ext in (".pptx", ".ppt")
    is_document = ext in (".docx", ".doc")
    if not (is_presentation or is_document):
        return None

    # Cheap container check: avoids a guaranteed parser exception for legacy
    # binary formats while still accepting OOXML files with a legacy suffix.
    if not file_bytes or not zipfile.is_zipfile(io.BytesIO(file_bytes)):
        log.info(
            "Skipping Office-to-PDF conversion (%s): not an OOXML (ZIP) file",
            filename,
        )
        return None

    try:
        if is_presentation:
            return _pptx_to_pdf(file_bytes)
        return _docx_to_pdf(file_bytes)
    except Exception:
        # Boundary handler: any parser/renderer failure means "no preview";
        # the traceback is logged so the failure stays diagnosable.
        log.warning(
            "Office-to-PDF conversion error (%s)", filename, exc_info=True
        )
    return None
|
| 959 |
+
|
| 960 |
+
|
| 961 |
def render_sidebar_intro():
|
| 962 |
"""Render the shared sidebar intro card."""
|
| 963 |
with st.sidebar:
|