Harsha committed on
Commit
81ab677
Β·
1 Parent(s): 167e4ef

Add AI Notes Maker app

Browse files
Files changed (3) hide show
  1. README.md +28 -6
  2. app.py +261 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,34 @@
1
  ---
2
- title: Ai Notes Maker
3
- emoji: πŸ“Š
4
- colorFrom: gray
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: AI Notes Maker
3
+ emoji: πŸ“‘
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.16.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # AI Notes Maker πŸ“‘
14
+
15
+ Turn your PDF documents into concise summaries, bullet points, and study questions instantly.
16
+
17
+ ## Features
18
+ - **PDF Text Extraction**: Handles text-based PDFs.
19
+ - **Smart Summarization**: Uses `facebook/bart-large-cnn` to distill long content.
20
+ - **Auto-Chunking**: Automatically splits large files to handle context limits.
21
+ - **Key Notes**: Converts prose into easy-to-read bullet points.
22
+ - **Study Questions**: Generates 10 relevant questions using `valhalla/t5-small-e2e-qg`.
23
+
24
+ ## How to run locally
25
+
26
+ 1. Clone the repository.
27
+ 2. Install dependencies:
28
+ ```bash
29
+ pip install -r requirements.txt
30
+ ```
31
+ 3. Run the app:
32
+ ```bash
33
+ python app.py
34
+ ```
app.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ from pypdf import PdfReader
4
+ import torch
5
+ import math
6
+
7
+ # --- Configuration & Model Loading ---
8
+
9
+ # Use GPU if available, otherwise CPU
10
+ device = 0 if torch.cuda.is_available() else -1
11
+
12
+ print(f"Loading models on device: {'GPU' if device == 0 else 'CPU'}...")
13
+
14
+ # 1. Summarization Model
15
+ # 'facebook/bart-large-cnn' is excellent for abstractive summarization
16
+ summarizer = pipeline(
17
+ "summarization",
18
+ model="facebook/bart-large-cnn",
19
+ device=device
20
+ )
21
+
22
+ # 2. Question Generation Model
23
+ # Using a specific lightweight model for QG to ensure quality questions
24
+ # Running this on CPU is fast enough if GPU isn't available
25
+ qg_pipeline = pipeline(
26
+ "text2text-generation",
27
+ model="valhalla/t5-small-e2e-qg",
28
+ device=device
29
+ )
30
+
31
+ print("Models loaded successfully.")
32
+
33
+ # --- Core Logic Functions ---
34
+
35
def extract_text_from_pdf(pdf_file):
    """Extract all page text from an uploaded PDF.

    Args:
        pdf_file: the value delivered by ``gr.File``. In Gradio 4.x this is a
            plain filepath string by default; older Gradio versions pass a
            tempfile-like wrapper exposing ``.name``. Both are accepted.

    Returns:
        The concatenated page text (stripped), ``""`` for no file, or a
        human-readable ``"Error reading PDF: ..."`` string on failure.
    """
    if pdf_file is None:
        return ""

    try:
        # BUG FIX: the original did `PdfReader(pdf_file.name)`, which raises
        # AttributeError when Gradio 4.x hands over a plain path string
        # (gr.File defaults to type="filepath"). Accept either form.
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        reader = PdfReader(path)

        pages = []
        for page in reader.pages:
            page_text = page.extract_text()
            # extract_text() may return None/"" for image-only pages; skip those.
            if page_text:
                pages.append(page_text)
        return "\n".join(pages).strip()
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
51
def split_text_into_chunks(text, max_chunk_len=3000):
    """Split whitespace-delimited text into chunks of at most ~max_chunk_len chars.

    Character length is used as a cheap proxy for tokens (~4 chars/token), so
    each chunk stays comfortably inside BART's ~1024-token context window.
    Words are never split; chunk boundaries fall between words.
    """
    chunks = []
    buffer = []
    buffer_len = 0

    for token in text.split():
        # +1 accounts for the joining space that would precede this token.
        projected = buffer_len + len(token) + 1
        if projected > max_chunk_len:
            chunks.append(" ".join(buffer))
            buffer = [token]
            buffer_len = len(token)
        else:
            buffer.append(token)
            buffer_len = projected

    if buffer:
        chunks.append(" ".join(buffer))
    return chunks
74
def generate_summary(text, length_mode="Medium"):
    """Summarize *text*, chunking long inputs and recursing when needed.

    Short inputs (< 3000 chars) are summarized in one pass with length limits
    clamped to the input size. Longer inputs are split into chunks, each chunk
    summarized, and the concatenation returned — or re-summarized recursively
    if it is still over 4000 chars. Any unknown length_mode falls back to
    "Medium".
    """
    if not text:
        return "No text provided."

    # (max_length, min_length) presets per user-selected mode.
    presets = {
        "Short": (100, 30),
        "Long": (400, 150),
        "Medium": (250, 60),
    }
    max_len, min_len = presets.get(length_mode, presets["Medium"])

    # Single-pass path for inputs that fit the model context.
    if len(text) < 3000:
        try:
            # Clamp limits to the input size so very short texts don't
            # trigger model errors (max_length longer than the input).
            word_count = len(text.split())
            capped_max = min(max_len, max(word_count // 2, 20))
            capped_min = min(min_len, max(capped_max - 10, 5))

            result = summarizer(
                text, max_length=capped_max, min_length=capped_min, do_sample=False
            )
            return result[0]['summary_text']
        except Exception as e:
            return f"Error in summarization: {str(e)}"

    # Long input: summarize each chunk independently, best-effort.
    partial_summaries = []
    for piece in split_text_into_chunks(text, max_chunk_len=3000):
        try:
            out = summarizer(piece, max_length=150, min_length=40, do_sample=False)
            partial_summaries.append(out[0]['summary_text'])
        except Exception as e:
            # A single bad chunk should not sink the whole document.
            print(f"Skipping chunk due to error: {e}")
            continue

    combined_text = " ".join(partial_summaries)

    # Recursive pass only when still too long; otherwise keep the
    # concatenated chunk summaries to preserve detail.
    if len(combined_text) > 4000:
        return generate_summary(combined_text, length_mode)
    return combined_text
126
def generate_questions_list(text, num_questions=10):
    """Generate up to *num_questions* study questions from *text*.

    The text is split into chunks (only the first few are processed to bound
    latency) and fed to the end-to-end question-generation pipeline with the
    "generate questions: " task prefix the valhalla e2e-qg models expect.

    Returns:
        A list of distinct question strings (possibly fewer than requested),
        or a one-element list with an error message on failure.
    """
    if not text:
        return []

    try:
        chunks = split_text_into_chunks(text, max_chunk_len=2000)
        questions = []

        # Bound the work: a handful of chunks is plenty to reach the target
        # question count.
        for chunk in chunks[:5]:
            input_text = "generate questions: " + chunk

            # Sample a couple of sequences per chunk for variety.
            outputs = qg_pipeline(
                input_text,
                max_length=64,
                num_return_sequences=2,
                do_sample=True,
                top_k=50,
                top_p=0.95,
            )

            for out in outputs:
                # BUG FIX: the valhalla e2e-qg models emit SEVERAL questions
                # in one generated sequence, separated by the literal token
                # "<sep>". The original treated each whole sequence as a
                # single question; split and deduplicate individually.
                for candidate in out['generated_text'].split("<sep>"):
                    candidate = candidate.strip()
                    if candidate and candidate not in questions:
                        questions.append(candidate)

            if len(questions) >= num_questions:
                break

        return questions[:num_questions]
    except Exception as e:
        return [f"Could not generate questions: {str(e)}"]
173
def format_bullet_notes(summary_text):
    """Render a prose summary as markdown bullets, one sentence per line.

    Sentence boundaries are approximated by ". " (period + space); blank
    fragments are dropped.
    """
    fragments = summary_text.replace(". ", ".\n").split("\n")
    return "\n".join(f"- {frag.strip()}" for frag in fragments if frag.strip())
178
+
179
+ # --- Main App Logic ---
180
+
181
def process_pdf_data(file_obj, length_mode, enable_questions):
    """Pipeline entry point wired to the Gradio button.

    Extracts text from the uploaded PDF, summarizes it, formats bullet notes,
    and optionally generates study questions.

    Returns:
        (summary_markdown, notes_markdown, questions_markdown) — the third
        element is "" when questions are disabled or the file is invalid.
    """
    if file_obj is None:
        return "Please upload a PDF file.", "", ""

    # 1. Extract text; very short output means extraction failed or the PDF
    #    is effectively empty (e.g. scanned images only).
    raw_text = extract_text_from_pdf(file_obj)
    if not raw_text or len(raw_text) < 50:
        return "Error: Could not extract text from PDF or PDF is empty.", "", ""

    print(f"Extracted {len(raw_text)} characters. Processing...")

    # 2. Summarize (chunking is handled inside generate_summary).
    final_summary = generate_summary(raw_text, length_mode)

    # 3. Bullet notes derived from the summary.
    notes_markdown = "### πŸ“ Key Bullet Notes\n\n" + format_bullet_notes(final_summary)

    # 4. Optional study questions.
    questions_markdown = ""
    if enable_questions:
        # Use the summary as question context when it is substantial so the
        # questions track key points; otherwise fall back to the raw text.
        context_for_q = final_summary if len(final_summary) > 500 else raw_text[:2000]
        question_items = generate_questions_list(context_for_q, num_questions=10)

        numbered = "".join(f"{i}. {q}\n" for i, q in enumerate(question_items, 1))
        questions_markdown = "### ❓ Important Questions\n\n" + numbered

    summary_markdown = f"### πŸ“– Summary\n\n{final_summary}"

    return summary_markdown, notes_markdown, questions_markdown
216
+
217
# --- Gradio UI ---

theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
)

with gr.Blocks(theme=theme, title="AI Notes Maker") as app:
    gr.Markdown(
        """
        # πŸ“‘ AI Notes Maker
        Upload a PDF lecture, paper, or article. Get a summary, key notes, and study questions instantly.
        """
    )

    with gr.Row():
        # Left column: inputs and settings.
        with gr.Column(scale=1):
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])

            with gr.Accordion("Settings", open=True):
                length_slider = gr.Radio(
                    ["Short", "Medium", "Long"],
                    label="Notes Length",
                    value="Medium",
                )
                question_check = gr.Checkbox(
                    label="Generate Important Questions",
                    value=True,
                )

            submit_btn = gr.Button("Generate Notes", variant="primary")

        # Right column: the three markdown output panes.
        with gr.Column(scale=2):
            output_summary = gr.Markdown(label="Summary")
            output_notes = gr.Markdown(label="Key Notes")
            output_questions = gr.Markdown(label="Questions")

    submit_btn.click(
        fn=process_pdf_data,
        inputs=[pdf_input, length_slider, question_check],
        outputs=[output_summary, output_notes, output_questions],
    )

if __name__ == "__main__":
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ pypdf
5
+ sentencepiece