Babajaan committed on
Commit
9d97b7e
·
verified ·
1 Parent(s): 73bbc7f

Add app.py

Browse files
Files changed (1) hide show
  1. manuscript_mimic/app.py +236 -0
manuscript_mimic/app.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — Manuscript-Mimic Gradio UI
3
+
4
+ Upload a reference PDF (or paste reference text) and paste your AI-generated
5
+ draft. The agent rewrites the draft to statistically match the reference's
6
+ academic writing style.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import os
13
+ import traceback
14
+
15
+ import gradio as gr
16
+
17
+ from style_extractor import extract_style_metrics, StyleExtractorTool
18
+ from rewrite_agent import run_mimic, DEMO_REFERENCE, DEMO_TARGET
19
+
20
+
21
+ # ── PDF extraction helper ───────────────────────────────────────────────────────
22
+
23
def extract_pdf_text(pdf_path: str) -> str:
    """Best-effort plain-text extraction from a PDF file.

    PyMuPDF is tried first, then pypdf.  A backend that is not installed is
    skipped.  A backend that is installed but fails on this file is skipped
    as well and its error message is collected, so ordinary extraction
    failures are reported as text instead of being raised.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages joined by blank lines and stripped, or a
        message starting with "⚠️" when no backend produced text.
    """
    failures = []

    try:
        import pymupdf  # PyMuPDF (fitz)
    except ImportError:
        pymupdf = None
    if pymupdf is not None:
        try:
            doc = pymupdf.open(pdf_path)
            try:
                pages = [page.get_text() for page in doc]
            finally:
                # Release the document even if a page fails to extract.
                doc.close()
            return "\n\n".join(pages).strip()
        except Exception as exc:  # boundary: report instead of crashing the caller
            failures.append(f"pymupdf: {exc}")

    try:
        from pypdf import PdfReader
    except ImportError:
        PdfReader = None
    if PdfReader is not None:
        try:
            reader = PdfReader(pdf_path)
            pages = [p.extract_text() or "" for p in reader.pages]
            return "\n\n".join(pages).strip()
        except Exception as exc:  # boundary: report instead of crashing the caller
            failures.append(f"pypdf: {exc}")

    if failures:
        # At least one backend was available but could not read this file.
        details = "\n".join(f"  {failure}" for failure in failures)
        return "⚠️ Could not extract PDF text:\n" + details

    return (
        "⚠️ Could not extract PDF text. Install pymupdf or pypdf:\n"
        " pip install pymupdf # OR pip install pypdf"
    )
46
+
47
+
48
+ # ── Metric formatting ──────────────────────────────────────────────────────────
49
+
50
def fmt_metrics(m: dict) -> str:
    """Pretty-print a style-metrics dict, one labelled line per metric.

    Args:
        m: Mapping that provides the keys ``num_sentences``,
            ``total_words``, ``avg_sentence_length``,
            ``sentence_length_variance``, ``hedging_density`` and
            ``passive_voice_density``.

    Returns:
        Six newline-joined lines in the order listed above.

    Raises:
        KeyError: If any of the expected keys is missing from ``m``.
    """
    lines = [
        f" Sentences : {m['num_sentences']}",
        f" Total words : {m['total_words']}",
        f" Avg sentence length : {m['avg_sentence_length']:.2f}",
        # The label previously contained a mis-encoded sigma.
        f" Sentence length σ : {m['sentence_length_variance']:.4f}",
        f" Hedging density : {m['hedging_density']:.4f} (hedge words / sentence)",
        f" Passive voice density: {m['passive_voice_density']:.4f} (passives / sentence)",
    ]
    return "\n".join(lines)
61
+
62
+
63
def metric_delta(ref: dict, other: dict) -> str:
    """Show the relative change of ``other`` against ``ref`` for three metrics.

    The metrics compared are ``sentence_length_variance``,
    ``hedging_density`` and ``passive_voice_density``.

    Args:
        ref: Baseline metrics dict.
        other: Metrics dict to compare against the baseline.

    Returns:
        One line per metric showing both values and the percentage change
        relative to ``ref``.  When the baseline is zero the change is
        reported as "N/A" (both zero) or as a signed infinity.

    Raises:
        KeyError: If either dict lacks one of the three compared keys.
    """
    keys = ("sentence_length_variance", "hedging_density", "passive_voice_density")
    lines = []
    for k in keys:
        r, o = ref[k], other[k]
        if r == 0:
            # A percentage is undefined against a zero baseline.
            if o == 0:
                pct = "N/A"
            else:
                pct = "+∞" if o > 0 else "-∞"
        else:
            pct = f"{((o - r) / r) * 100:+.1f}%"
        lines.append(f" {k:>28s}: ref={r:.4f} now={o:.4f} Δ={pct}")
    return "\n".join(lines)
75
+
76
+
77
+ # ── Main Gradio callback ───────────────────────────────────────────────────────
78
+
79
def process(
    pdf_file,
    reference_text: str,
    target_text: str,
    model_id: str,
    max_steps: int,
):
    """Run the full rewrite pipeline for the Gradio "run" button.

    Steps:
        1. Take the reference text from the PDF if one is given, otherwise
           from the textbox.
        2. Compute style metrics for the reference and the target draft.
        3. Run the agent to rewrite the draft.
        4. Compute metrics on the rewritten text.

    Args:
        pdf_file: Uploaded PDF (a path string or an object with a ``name``
            attribute), or ``None`` when no file was uploaded.
        reference_text: Pasted reference text; replaced by the PDF text when
            ``pdf_file`` is given.
        target_text: Draft to rewrite.
        model_id: Model identifier forwarded to ``run_mimic``.
        max_steps: Step limit forwarded to ``run_mimic`` (coerced to int).

    Returns:
        A 5-tuple in the order of the wired outputs: reference metrics,
        target metrics plus delta, rewritten draft, post-rewrite metrics,
        extracted reference text.  On invalid input the first element
        carries the error message and the rest are empty.  On an agent
        failure the error text is placed in the rewritten-draft slot.
    """
    # ── 1. Reference text ────────────────────────────────────────────────────
    if pdf_file is not None:
        pdf_path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
        reference_text = extract_pdf_text(pdf_path)
        # extract_pdf_text reports failure as a message with this prefix.
        if reference_text.startswith("⚠️"):
            return reference_text, "", "", "", ""

    if not reference_text or not reference_text.strip():
        return "❌ Please provide reference text (paste or upload PDF).", "", "", "", ""
    if not target_text or not target_text.strip():
        return "❌ Please provide a target draft to rewrite.", "", "", "", ""

    # ── 2. Compute pre-rewrite metrics ───────────────────────────────────────
    ref_metrics = extract_style_metrics(reference_text)
    tgt_metrics = extract_style_metrics(target_text)

    ref_report = "📖 REFERENCE TEXT METRICS\n" + fmt_metrics(ref_metrics)
    tgt_report = "📝 TARGET DRAFT METRICS (before)\n" + fmt_metrics(tgt_metrics)
    pre_delta = "📐 DELTA (target vs reference)\n" + metric_delta(ref_metrics, tgt_metrics)
    target_panel = tgt_report + "\n\n" + pre_delta

    # ── 3. Run the agent ─────────────────────────────────────────────────────
    try:
        rewritten = run_mimic(
            reference_text=reference_text,
            target_text=target_text,
            model_id=model_id,
            max_steps=int(max_steps),
            verbosity=1,
        )
    except Exception as e:
        tb = traceback.format_exc()
        # Keep the same slot layout as the success path: the delta stays with
        # the target metrics and the error goes where the rewrite would go.
        return ref_report, target_panel, f"❌ Agent error:\n{e}\n\n{tb}", "", ""

    # ── 4. Post-rewrite metrics ──────────────────────────────────────────────
    new_metrics = extract_style_metrics(rewritten)
    post_report = (
        "✅ REWRITTEN TEXT METRICS\n"
        + fmt_metrics(new_metrics)
        + "\n\n📐 DELTA (rewritten vs reference)\n"
        + metric_delta(ref_metrics, new_metrics)
    )

    return ref_report, target_panel, rewritten, post_report, reference_text
136
+
137
+
138
+ # ── Gradio UI ───────────────────────────────────────────────────────────────────
139
+
140
# Markdown shown at the top of the page (rendered by gr.Markdown below).
DESCRIPTION = """\
# 🖋️ Manuscript-Mimic

**AI Style Transfer for Scientific Writing**

Upload a pre-2022 human-authored reference manuscript (PDF or pasted text) and
an AI-generated draft. The agent rewrites your draft to statistically match
the reference's writing style — sentence complexity, hedging language, and
passive voice patterns.

**Three metrics are matched:**
- **Sentence Length Variance** — σ of word counts per sentence
- **Hedging Density** — frequency of hedge words (*suggest, may, putative, indicate*…)
- **Passive Voice Density** — frequency of academic passives (*was performed, were analyzed*…)
"""
155
+
156
# Page layout: two input columns, model/step controls, action buttons, then
# the metric and result boxes.  Event handlers are wired at the bottom.
with gr.Blocks(
    title="Manuscript-Mimic",
) as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📖 Reference Style")
            pdf_input = gr.File(
                label="Upload Reference PDF (optional)",
                file_types=[".pdf"],
                type="filepath",
            )
            ref_textbox = gr.Textbox(
                label="Or paste Reference Text",
                placeholder="Paste a paragraph from a pre-2022 manuscript…",
                lines=10,
            )

        with gr.Column(scale=1):
            gr.Markdown("### 📝 Target Draft")
            target_textbox = gr.Textbox(
                label="Paste your AI-generated draft",
                placeholder="Paste the text you want rewritten…",
                lines=10,
            )

    with gr.Row():
        model_dropdown = gr.Dropdown(
            label="LLM Model",
            choices=[
                "Qwen/Qwen2.5-Coder-32B-Instruct",
                "meta-llama/Llama-3.3-70B-Instruct",
                "mistralai/Mixtral-8x7B-Instruct-v0.1",
            ],
            value="Qwen/Qwen2.5-Coder-32B-Instruct",
        )
        steps_slider = gr.Slider(
            label="Max Agent Steps",
            minimum=4,
            maximum=20,
            step=1,
            value=12,
        )

    run_btn = gr.Button("🔄 Rewrite to Match Style", variant="primary", size="lg")

    # Demo loader
    demo_btn = gr.Button("📋 Load Demo Texts", variant="secondary")

    with gr.Row():
        ref_metrics_box = gr.Textbox(label="Reference Metrics", lines=8, interactive=False)
        tgt_metrics_box = gr.Textbox(label="Target Metrics & Delta", lines=10, interactive=False)

    gr.Markdown("### ✅ Rewritten Text")
    rewritten_box = gr.Textbox(label="Rewritten Draft", lines=12, interactive=False)

    post_metrics_box = gr.Textbox(label="Post-Rewrite Metrics & Delta", lines=10, interactive=False)

    # Hidden box that receives the reference text actually used by process().
    extracted_ref_box = gr.Textbox(label="Extracted Reference (from PDF)", lines=6, visible=False)

    # ── Event wiring ────────────────────────────────────────────────────────
    # The outputs list fixes the order of the 5-tuple returned by process().
    run_btn.click(
        fn=process,
        inputs=[pdf_input, ref_textbox, target_textbox, model_dropdown, steps_slider],
        outputs=[ref_metrics_box, tgt_metrics_box, rewritten_box, post_metrics_box, extracted_ref_box],
    )

    def load_demo():
        """Return the bundled demo reference and target texts."""
        return DEMO_REFERENCE, DEMO_TARGET

    demo_btn.click(
        fn=load_demo,
        inputs=[],
        outputs=[ref_textbox, target_textbox],
    )
233
+
234
+
235
if __name__ == "__main__":
    # Serve on all interfaces, port 7860, with the Soft theme.
    # NOTE(review): passing ``theme`` to launch() depends on the installed
    # Gradio version accepting it there — confirm against the pinned version.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        theme=gr.themes.Soft(),
    )