LazyHuman10 committed on
Commit
8657f7c
·
1 Parent(s): dcfde3c

Sync lightweight pure-Python PDF conversion to space

Browse files
Files changed (3) hide show
  1. pages/Study_Material_Hub.py +58 -19
  2. requirements.txt +3 -0
  3. utils.py +169 -0
pages/Study_Material_Hub.py CHANGED
@@ -4,10 +4,10 @@ from pathlib import Path
4
  from urllib.parse import quote
5
 
6
  import streamlit as st
7
- import streamlit.components.v1 as components
8
  from streamlit_pdf_viewer import pdf_viewer
9
  from utils import (
10
  APP_ICON_PATH,
 
11
  download_github_file,
12
  get_manifest,
13
  get_mime_type,
@@ -24,12 +24,7 @@ st.set_page_config(
24
  )
25
  inject_theme()
26
 
27
- OFFICE_PREVIEW_MIMES = {
28
- "application/msword",
29
- "application/vnd.ms-powerpoint",
30
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
31
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
32
- }
33
 
34
 
35
  def format_file_label(filename):
@@ -55,15 +50,54 @@ def display_pdf(file_content):
55
  pdf_viewer(file_content, width="100%", height=700)
56
 
57
 
58
def display_office_document(download_url, filename):
    """Embed a Word or PowerPoint file through the Office Web Viewer.

    ``download_url`` is percent-encoded in full (no characters are treated
    as safe) and passed as the ``src`` query parameter of the viewer's embed
    endpoint. The viewer is shown in a 700px-high scrolling iframe, followed
    by a caption that names the file.
    """
    viewer_src = quote(download_url, safe="")
    components.iframe(
        f"https://view.officeapps.live.com/op/embed.aspx?src={viewer_src}",
        width=None,
        height=700,
        scrolling=True,
    )

    caption_text = (
        f"Inline preview for `{format_file_label(filename)}` is powered by Office Web Viewer. "
        "If it does not load, use the download button."
    )
    st.caption(caption_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  try:
69
  manifest = get_manifest()
@@ -214,12 +248,17 @@ with preview_col:
214
  unsafe_allow_html=True,
215
  )
216
 
217
- if file_mime == "application/pdf":
 
 
218
  display_pdf(file_content)
219
- elif file_mime in OFFICE_PREVIEW_MIMES:
220
  display_office_document(
221
- selected_file_obj["download_url"], selected_file_obj["name"]
222
  )
 
 
 
223
  else:
224
  st.info(
225
  "Preview is not available for this file type. Download it to inspect the content."
 
4
  from urllib.parse import quote
5
 
6
  import streamlit as st
 
7
  from streamlit_pdf_viewer import pdf_viewer
8
  from utils import (
9
  APP_ICON_PATH,
10
+ convert_office_to_pdf,
11
  download_github_file,
12
  get_manifest,
13
  get_mime_type,
 
24
  )
25
  inject_theme()
26
 
27
+ # No longer need strict MIME-type mappings since we check extensions robustly.
 
 
 
 
 
28
 
29
 
30
  def format_file_label(filename):
 
50
  pdf_viewer(file_content, width="100%", height=700)
51
 
52
 
53
def display_office_document(file_content, download_url, filename):
    """Display a Word / PowerPoint file by converting it to PDF server-side.

    The file bytes are handed to ``convert_office_to_pdf``. When that returns
    PDF bytes they are shown in the inline PDF viewer. When it returns a
    falsy value (conversion failed or is unsupported), a callout plus a link
    button is rendered that opens the file in Microsoft Office Web Viewer.

    Parameters
    ----------
    file_content : bytes
        Raw content of the Office file.
    download_url : str
        URL of the file; percent-encoded in full and passed to the Office
        Web Viewer as its ``src`` parameter in the fallback path.
    filename : str
        File name; its extension selects the "presentation" / "document"
        wording and it is used for the displayed label.
    """
    import html  # local import: only the fallback markup needs it

    suffix = Path(filename).suffix.lower().lstrip(".")
    type_label = "presentation" if suffix in ("ppt", "pptx") else "document"

    with st.spinner(f"Converting {type_label} to PDF for preview…"):
        pdf_bytes = convert_office_to_pdf(file_content, filename)

    if pdf_bytes:
        pdf_viewer(pdf_bytes, width="100%", height=700)
        st.caption(
            f"Inline preview of `{format_file_label(filename)}` "
            "(converted to PDF on the server)."
        )
        return

    # Fallback – open in Office Web Viewer in a new tab.
    encoded_url = quote(download_url, safe="")
    preview_url = f"https://view.officeapps.live.com/op/view.aspx?src={encoded_url}"

    # The label is derived from a file name and is injected into raw HTML
    # (unsafe_allow_html=True), so it must be escaped to keep characters
    # such as "<" or "&" from being interpreted as markup.
    safe_label = html.escape(format_file_label(filename))

    st.markdown(
        f"""
        <section class="plexi-callout" style="text-align:center;padding:2.5rem 1.5rem;">
            <div style="font-size:3rem;margin-bottom:0.6rem;">📄</div>
            <div class="plexi-sidecard-title">{safe_label}</div>
            <div class="plexi-muted" style="margin-bottom:1rem;">
                Server-side conversion is not available right now.
                Open the {type_label} in Microsoft Office Web Viewer instead.
            </div>
        </section>
        """,
        unsafe_allow_html=True,
    )
    st.link_button(
        f"🔗 Open {type_label.capitalize()} in Office Viewer",
        preview_url,
        use_container_width=True,
        type="primary",
    )
    st.caption(
        "Powered by Microsoft Office Web Viewer. "
        "You can also download the file directly using the button on the right."
    )
101
 
102
  try:
103
  manifest = get_manifest()
 
248
  unsafe_allow_html=True,
249
  )
250
 
251
+ ext = Path(selected_file_obj["name"]).suffix.lower()
252
+
253
+ if ext == ".pdf":
254
  display_pdf(file_content)
255
+ elif ext in (".ppt", ".pptx", ".doc", ".docx"):
256
  display_office_document(
257
+ file_content, selected_file_obj["download_url"], selected_file_obj["name"]
258
  )
259
+ elif file_mime.startswith("text/"):
260
+ # Basic text preview support (optional, if needed)
261
+ st.code(file_content.decode("utf-8", errors="replace"))
262
  else:
263
  st.info(
264
  "Preview is not available for this file type. Download it to inspect the content."
requirements.txt CHANGED
@@ -3,6 +3,9 @@ python-dotenv
3
  requests
4
  PyPDF2
5
  streamlit-pdf-viewer
 
 
 
6
  llama-index-core
7
  llama-index-embeddings-huggingface
8
  sentence-transformers
 
3
  requests
4
  PyPDF2
5
  streamlit-pdf-viewer
6
+ python-pptx
7
+ python-docx
8
+ fpdf2
9
  llama-index-core
10
  llama-index-embeddings-huggingface
11
  sentence-transformers
utils.py CHANGED
@@ -789,6 +789,175 @@ def get_mime_type(filename):
789
  return mime or "application/octet-stream"
790
 
791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
  def render_sidebar_intro():
793
  """Render the shared sidebar intro card."""
794
  with st.sidebar:
 
789
  return mime or "application/octet-stream"
790
 
791
 
792
+ # ── Lightweight Office → PDF conversion (pure Python) ──────────────────
793
+ #
794
+ # Uses python-pptx / python-docx to extract content and fpdf2 to render
795
+ # PDF pages. No system packages (LibreOffice, etc.) required.
796
+ # The output is a readable *preview*, not a pixel-perfect replica.
797
+
798
def _pptx_to_pdf(file_bytes):
    """Convert PPTX bytes to PDF bytes using python-pptx + fpdf2.

    Every slide starts a new PDF page. For each slide the function writes a
    small "Slide N" label, draws picture shapes as they are encountered while
    scanning the shapes, then writes the detected title followed by the
    remaining text paragraphs. The output is a readable preview, not a
    faithful rendering of the slide layout.

    Parameters
    ----------
    file_bytes : bytes
        Raw content of a ``.pptx`` file.

    Returns
    -------
    bytes
        The generated PDF document.

    Notes
    -----
    Errors raised by python-pptx or fpdf2 (for example on input that is not
    a valid PPTX package) are not caught here.
    """
    from pptx import Presentation as PptxPresentation
    from pptx.util import Emu
    from fpdf import FPDF

    # Typographic characters that Office inserts automatically, mapped to
    # plain ASCII so they stay readable with the built-in font.
    ascii_punctuation = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": "-",
        "\u2014": "-",
        "\u2022": "*",
        "\u2026": "...",
    })

    def core_font_safe(text):
        """Return *text* restricted to latin-1, the built-in fonts' charset.

        fpdf2 raises on characters its core fonts cannot encode, which would
        abort the whole preview; unsupported characters become "?" instead.
        """
        cleaned = text.translate(ascii_punctuation)
        return cleaned.encode("latin-1", "replace").decode("latin-1")

    prs = PptxPresentation(io.BytesIO(file_bytes))

    # Slide dimensions in mm (default is 10" × 7.5"): EMU → inches → mm.
    slide_w_mm = prs.slide_width / Emu(914400) * 25.4
    slide_h_mm = prs.slide_height / Emu(914400) * 25.4

    # NOTE(review): the size is passed as (height, width) together with
    # landscape orientation — presumably fpdf2 swaps the pair for "L";
    # confirm against the fpdf2 page-format documentation.
    pdf = FPDF(orientation="L", unit="mm", format=(slide_h_mm, slide_w_mm))
    pdf.set_auto_page_break(auto=True, margin=12)

    # Use built-in Helvetica (no font file needed).
    TITLE_SIZE = 18
    BODY_SIZE = 11
    MARGIN = 14

    for slide_idx, slide in enumerate(prs.slides, start=1):
        pdf.add_page()
        pdf.set_left_margin(MARGIN)
        pdf.set_right_margin(MARGIN)
        pdf.set_y(MARGIN)

        # ── Slide number chip ──
        pdf.set_font("Helvetica", "I", 8)
        pdf.set_text_color(120, 120, 120)
        pdf.cell(0, 5, f"Slide {slide_idx}", ln=True)
        pdf.ln(2)

        # ── Extract text from shapes ──
        title_text = ""
        body_parts = []

        for shape in slide.shapes:
            if shape.has_text_frame:
                # `placeholder_format` raises ValueError on shapes that are
                # not placeholders, so it must be guarded by
                # `is_placeholder`; `hasattr` only swallows AttributeError
                # and would let that ValueError abort the conversion.
                # Heuristic kept from the original: placeholder idx 0 or 1
                # is treated as the slide title.
                is_title_candidate = (
                    shape.is_placeholder
                    and shape.placeholder_format.idx in (0, 1)
                )
                for para in shape.text_frame.paragraphs:
                    text = para.text.strip()
                    if not text:
                        continue
                    # Only the first non-empty title-candidate paragraph
                    # becomes the title; everything else is body text.
                    if not title_text and is_title_candidate:
                        title_text = text
                        continue
                    body_parts.append(text)

            # ── Embedded images ──
            if shape.shape_type == 13:  # MSO_SHAPE_TYPE.PICTURE
                try:
                    img_stream = io.BytesIO(shape.image.blob)
                    # Scale image to fit page width (with margins),
                    # capped at 120 mm.
                    max_w = slide_w_mm - 2 * MARGIN
                    pdf.image(img_stream, x=MARGIN, w=min(max_w, 120))
                    pdf.ln(4)
                except Exception:
                    # Best-effort preview: skip images that cannot be
                    # read or embedded instead of failing the slide.
                    pass

        # ── Render title ──
        if title_text:
            pdf.set_font("Helvetica", "B", TITLE_SIZE)
            pdf.set_text_color(22, 49, 44)  # plexi-ink dark
            pdf.multi_cell(0, TITLE_SIZE * 0.5, core_font_safe(title_text))
            pdf.ln(4)

        # ── Render body text ──
        if body_parts:
            pdf.set_font("Helvetica", "", BODY_SIZE)
            pdf.set_text_color(50, 50, 50)
            for part in body_parts:
                pdf.multi_cell(0, BODY_SIZE * 0.45, core_font_safe(part))
                pdf.ln(2)

    if len(prs.slides) == 0:
        # Emit a single explanatory page rather than an empty PDF.
        pdf.add_page()
        pdf.set_font("Helvetica", "I", 12)
        pdf.cell(0, 10, "This presentation has no slides.", ln=True)

    return bytes(pdf.output())
881
+
882
+
883
def _docx_to_pdf(file_bytes):
    """Convert DOCX bytes to PDF bytes using python-docx + fpdf2.

    Paragraphs are written in document order onto A4 portrait pages:
    "Heading 1/2/3" styles get larger bold type, other paragraphs use the
    body size (bold when any run in the paragraph is bold). Images found in
    the document part's relationships are appended after all text, not at
    their original position. The output is a readable preview only.

    Parameters
    ----------
    file_bytes : bytes
        Raw content of a ``.docx`` file.

    Returns
    -------
    bytes
        The generated PDF document.

    Notes
    -----
    Errors raised by python-docx or fpdf2 (for example on input that is not
    a valid DOCX package) are not caught here.
    """
    from docx import Document as DocxDocument
    from fpdf import FPDF

    # Typographic characters that Word inserts automatically, mapped to
    # plain ASCII so they stay readable with the built-in font.
    ascii_punctuation = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": "-",
        "\u2014": "-",
        "\u2022": "*",
        "\u2026": "...",
    })

    def core_font_safe(text):
        """Return *text* restricted to latin-1, the built-in fonts' charset.

        fpdf2 raises on characters its core fonts cannot encode, which would
        abort the whole preview; unsupported characters become "?" instead.
        """
        cleaned = text.translate(ascii_punctuation)
        return cleaned.encode("latin-1", "replace").decode("latin-1")

    doc = DocxDocument(io.BytesIO(file_bytes))

    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    MARGIN = 16
    pdf.set_left_margin(MARGIN)
    pdf.set_right_margin(MARGIN)

    HEADING_SIZES = {"Heading 1": 20, "Heading 2": 16, "Heading 3": 14}
    BODY_SIZE = 11

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            # Empty paragraphs become a small vertical gap.
            pdf.ln(3)
            continue

        style_name = para.style.name if para.style else ""

        if style_name in HEADING_SIZES:
            size = HEADING_SIZES[style_name]
            pdf.set_font("Helvetica", "B", size)
            pdf.set_text_color(22, 49, 44)
            pdf.ln(4)
            pdf.multi_cell(0, size * 0.5, core_font_safe(text))
            pdf.ln(3)
        else:
            # `run.bold` is True / False / None (inherit); None is falsy,
            # so only explicitly bold runs make the paragraph bold.
            is_bold = any(run.bold for run in para.runs)
            pdf.set_font("Helvetica", "B" if is_bold else "", BODY_SIZE)
            pdf.set_text_color(50, 50, 50)
            pdf.multi_cell(0, BODY_SIZE * 0.45, core_font_safe(text))
            pdf.ln(1.5)

    # ── Images (appended after the text) ──
    for rel in doc.part.rels.values():
        if "image" in rel.reltype:
            try:
                img_stream = io.BytesIO(rel.target_part.blob)
                pdf.image(img_stream, x=MARGIN, w=100)
                pdf.ln(4)
            except Exception:
                # Best-effort preview: skip images that cannot be read
                # or embedded instead of failing the document.
                pass

    return bytes(pdf.output())
936
+
937
+
938
def convert_office_to_pdf(file_bytes, filename):
    """Convert an Office document to PDF bytes for previewing.

    Uses pure-Python libraries (python-pptx, python-docx, fpdf2) so no
    system packages like LibreOffice are needed. The output is a readable
    preview rather than a pixel-perfect replica.

    Only the OOXML formats (``.pptx``, ``.docx``) are converted. The legacy
    binary formats (``.ppt``, ``.doc``) and any other extension return
    ``None`` immediately: python-pptx and python-docx cannot parse them, so
    attempting the conversion would only produce a guaranteed failure and a
    misleading error message.

    Parameters
    ----------
    file_bytes : bytes
        Raw content of the file.
    filename : str
        File name; only its (case-insensitive) extension is inspected.

    Returns
    -------
    bytes | None
        PDF bytes on success, or ``None`` when the format is unsupported or
        the conversion fails.
    """
    import logging  # local import keeps this block self-contained

    ext = Path(filename).suffix.lower()
    if ext not in (".pptx", ".docx"):
        return None

    try:
        if ext == ".pptx":
            return _pptx_to_pdf(file_bytes)
        return _docx_to_pdf(file_bytes)
    except Exception as err:
        # Deliberate catch-all boundary: a failed preview must never break
        # the page, the caller falls back to another viewer on ``None``.
        logging.getLogger(__name__).warning(
            "Office-to-PDF conversion error (%s): %s", filename, err
        )
        return None
959
+
960
+
961
  def render_sidebar_intro():
962
  """Render the shared sidebar intro card."""
963
  with st.sidebar: