Harsha committed
Commit · 81ab677
1 Parent(s): 167e4ef

Add AI Notes Maker app
Files changed:
- README.md +28 -6
- app.py +261 -0
- requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,34 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: AI Notes Maker
+emoji: π
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: 4.16.0
 app_file: app.py
 pinned: false
+license: mit
 ---
 
-
+# AI Notes Maker π
+
+Turn your PDF documents into concise summaries, bullet points, and study questions instantly.
+
+## Features
+- **PDF Text Extraction**: Handles text-based PDFs.
+- **Smart Summarization**: Uses `facebook/bart-large-cnn` to distill long content.
+- **Auto-Chunking**: Automatically splits large files to stay within model context limits.
+- **Key Notes**: Converts prose into easy-to-read bullet points.
+- **Study Questions**: Generates 10 relevant questions using `valhalla/t5-small-e2e-qg` (see the standalone sketch below this diff).
+
+## How to run locally
+
+1. Clone the repository.
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Run the app:
+   ```bash
+   python app.py
+   ```
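
To sanity-check the two models named above outside the app, a minimal sketch (the model names and the "generate questions: " prefix come from this commit; the sample text and generation settings are illustrative):

```python
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
qg = pipeline("text2text-generation", model="valhalla/t5-small-e2e-qg")

text = (
    "Photosynthesis converts light energy into chemical energy. "
    "It takes place in the chloroplasts of plant cells."
)
# Summarize the passage (length constraints kept below the input length).
print(summarizer(text, max_length=20, min_length=5, do_sample=False)[0]["summary_text"])
# Ask the end-to-end QG model for questions about the same passage.
print(qg("generate questions: " + text, max_length=64)[0]["generated_text"])
```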
app.py ADDED
import gradio as gr
from transformers import pipeline
from pypdf import PdfReader
import torch

# --- Configuration & Model Loading ---

# Use GPU if available, otherwise CPU
# (transformers pipeline convention: 0 = first CUDA device, -1 = CPU)
device = 0 if torch.cuda.is_available() else -1

print(f"Loading models on device: {'GPU' if device == 0 else 'CPU'}...")

# 1. Summarization model
# 'facebook/bart-large-cnn' is excellent for abstractive summarization
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=device
)

# 2. Question generation model
# A lightweight model chosen specifically for question quality;
# running it on CPU is fast enough if a GPU isn't available.
qg_pipeline = pipeline(
    "text2text-generation",
    model="valhalla/t5-small-e2e-qg",
    device=device
)

print("Models loaded successfully.")

# --- Core Logic Functions ---

def extract_text_from_pdf(pdf_file):
    """Extracts text from the uploaded PDF file."""
    if pdf_file is None:
        return ""

    try:
        # Depending on the Gradio version, gr.File may pass a filepath string
        # or a tempfile-like object with a .name attribute; handle both.
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        reader = PdfReader(path)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
        return text.strip()
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

def split_text_into_chunks(text, max_chunk_len=3000):
    """
    Splits text into chunks safe for the model (BART's limit is ~1024 tokens).
    Character length is used as a conservative proxy (~4 chars per token).
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + len(word) + 1 > max_chunk_len:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += len(word) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
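
# At ~4 chars per token, the default max_chunk_len of 3000 characters is
# roughly 750 tokens, comfortably under BART's 1024-token input limit.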

def generate_summary(text, length_mode="Medium"):
    """
    Summarizes text, chunking long inputs.
    Recursive summarization is applied if the combined result is still too long.
    """
    if not text:
        return "No text provided."

    # Define constraints based on user choice
    if length_mode == "Short":
        max_len, min_len = 100, 30
    elif length_mode == "Long":
        max_len, min_len = 400, 150
    else:  # Medium
        max_len, min_len = 250, 60

    # If text is short enough, summarize directly
    if len(text) < 3000:
        try:
            # Clamp constraints to the input length to avoid model errors on very short inputs
            input_len = len(text.split())
            adjusted_max = min(max_len, max(input_len // 2, 20))
            adjusted_min = min(min_len, max(adjusted_max - 10, 5))
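            # Worked example: a 60-word input in "Medium" mode gives
            # adjusted_max = min(250, max(30, 20)) = 30 and
            # adjusted_min = min(60, max(20, 5)) = 20, so the target stays
            # shorter than the input instead of triggering a model error.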

            summary = summarizer(text, max_length=adjusted_max, min_length=adjusted_min, do_sample=False)[0]['summary_text']
            return summary
        except Exception as e:
            return f"Error in summarization: {str(e)}"

    # If text is long, chunk it
    chunks = split_text_into_chunks(text, max_chunk_len=3000)
    chunk_summaries = []

    for chunk in chunks:
        try:
            # Summarize each chunk
            res = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
            chunk_summaries.append(res[0]['summary_text'])
        except Exception as e:
            print(f"Skipping chunk due to error: {e}")
            continue

    # Combine chunk summaries
    combined_text = " ".join(chunk_summaries)

    # Recursive pass: if the combined summary is still too long, summarize it again;
    # otherwise return the concatenated summaries to avoid losing too much detail.
    if len(combined_text) > 4000:
        return generate_summary(combined_text, length_mode)
    return combined_text

def generate_questions_list(text, num_questions=10):
    """Generates a list of questions based on the text."""
    if not text:
        return []

    # QG models work best on shorter contexts, and generating 10 distinct
    # questions usually requires either providing answers or using an
    # end-to-end generator. valhalla/t5-small-e2e-qg generates questions
    # directly from raw text.
    try:
        # Process the text in segments to collect enough questions,
        # limiting the number of chunks so generation doesn't take forever.
        chunks = split_text_into_chunks(text, max_chunk_len=2000)
        questions = []
        selected_chunks = chunks[:5]

        for chunk in selected_chunks:
            # This model expects a "generate questions: " prefix and emits
            # one or more questions, typically separated by "<sep>".
            input_text = "generate questions: " + chunk

            # Generate multiple sequences per chunk
            outputs = qg_pipeline(
                input_text,
                max_length=64,
                num_return_sequences=2,
                do_sample=True,
                top_k=50,
                top_p=0.95
            )

            for out in outputs:
                # Split on "<sep>" in case several questions come back in one
                # sequence; a single question passes through unchanged.
                for q in out['generated_text'].split("<sep>"):
                    q = q.strip()
                    if q and q not in questions:
                        questions.append(q)

            if len(questions) >= num_questions:
                break

        return questions[:num_questions]
    except Exception as e:
        return [f"Could not generate questions: {str(e)}"]
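
# Illustrative example of the end-to-end QG format the code above assumes:
#   in:  "generate questions: Photosynthesis converts light energy ..."
#   out: "What does photosynthesis convert? <sep> Where does it take place?"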

def format_bullet_notes(summary_text):
    """Parses a prose summary into bullet points by splitting on sentence boundaries."""
    sentences = summary_text.replace(". ", ".\n").split("\n")
    bullets = [f"- {s.strip()}" for s in sentences if s.strip()]
    return "\n".join(bullets)
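
# e.g. format_bullet_notes("Cells divide. DNA is copied first.")
#   -> "- Cells divide.\n- DNA is copied first."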

# --- Main App Logic ---

def process_pdf_data(file_obj, length_mode, enable_questions):
    if file_obj is None:
        return "Please upload a PDF file.", "", ""

    # 1. Extract text (extract_text_from_pdf returns an error string on failure)
    raw_text = extract_text_from_pdf(file_obj)
    if not raw_text or raw_text.startswith("Error reading PDF") or len(raw_text) < 50:
        return "Error: Could not extract text from PDF or PDF is empty.", "", ""

    status_msg = f"Extracted {len(raw_text)} characters. Processing..."
    print(status_msg)

    # 2. Summarize (generate_summary handles chunking internally)
    final_summary = generate_summary(raw_text, length_mode)

    # 3. Create notes (formatted summary)
    notes_markdown = "### π Key Bullet Notes\n\n" + format_bullet_notes(final_summary)

    # 4. Generate questions (if requested)
    questions_markdown = ""
    if enable_questions:
        # Use the summary as question context so questions focus on key points;
        # if the summary is too short, fall back to the start of the raw text.
        context_for_q = final_summary if len(final_summary) > 500 else raw_text[:2000]
        qs = generate_questions_list(context_for_q, num_questions=10)

        questions_markdown = "### β Important Questions\n\n"
        for i, q in enumerate(qs, 1):
            questions_markdown += f"{i}. {q}\n"

    # Format the summary for display
    summary_markdown = f"### π Summary\n\n{final_summary}"

    return summary_markdown, notes_markdown, questions_markdown

# --- Gradio UI ---

theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
)

with gr.Blocks(theme=theme, title="AI Notes Maker") as app:
    gr.Markdown(
        """
        # π AI Notes Maker
        Upload a PDF lecture, paper, or article. Get a summary, key notes, and study questions instantly.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])

            with gr.Accordion("Settings", open=True):
                length_slider = gr.Radio(
                    ["Short", "Medium", "Long"],
                    label="Notes Length",
                    value="Medium"
                )
                question_check = gr.Checkbox(
                    label="Generate Important Questions",
                    value=True
                )

            submit_btn = gr.Button("Generate Notes", variant="primary")

        with gr.Column(scale=2):
            output_summary = gr.Markdown(label="Summary")
            output_notes = gr.Markdown(label="Key Notes")
            output_questions = gr.Markdown(label="Questions")

    submit_btn.click(
        fn=process_pdf_data,
        inputs=[pdf_input, length_slider, question_check],
        outputs=[output_summary, output_notes, output_questions]
    )

if __name__ == "__main__":
    app.launch()
requirements.txt ADDED

gradio
transformers
torch
pypdf
sentencepiece
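# note: sentencepiece is required by the T5 tokenizer behind valhalla/t5-small-e2e-qg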