Babajaan committed on
Commit
73bbc7f
·
verified ·
1 Parent(s): f7b6ec4

Add rewrite_agent.py

Browse files
Files changed (1) hide show
  1. manuscript_mimic/rewrite_agent.py +200 -0
manuscript_mimic/rewrite_agent.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ rewrite_agent.py — Manuscript-Mimic CodeAgent Orchestrator
3
+
4
+ Builds a smolagents CodeAgent that executes a three-step agentic loop:
5
+
6
+ Step 1: Run style_extractor on the Reference Text to obtain target metrics.
7
+ Step 2: Run style_extractor on the AI-generated Target Draft.
8
+ Step 3: Rewrite the Target Draft paragraph-by-paragraph so its statistical
9
+ metrics (sentence_length_variance, hedging_density, passive_voice_density)
10
+ converge toward the Reference Text's profile.
11
+
12
+ The agent intentionally injects complex multi-clause sentences, hedging
13
+ language, and passive-voice constructions typical of pre-2022 human-authored
14
+ methods sections in biomedical / genetics manuscripts.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import os
20
+ import textwrap
21
+ from typing import Optional
22
+
23
+ from smolagents import CodeAgent, InferenceClientModel
24
+
25
+ from style_extractor import StyleExtractorTool
26
+
27
+
28
+ # ── System Instructions (injected into the CodeAgent's system prompt) ───────────
29
+
30
# Prompt text handed to the CodeAgent through its ``instructions`` argument
# (see build_agent). It describes the profile -> profile -> rewrite -> verify
# workflow and names the three metrics the rewrite must converge on.
# The non-ASCII characters (em dash, bullet, sigma) are intentional; keep this
# file saved as UTF-8 so they reach the model unmangled.
SYSTEM_INSTRUCTIONS = textwrap.dedent("""\
    You are **Manuscript-Mimic**, an expert scientific style-transfer agent.

    Your mission: rewrite an AI-generated "Target Draft" so that its *measurable
    stylistic statistics* match a human-authored "Reference Text" from a pre-2022
    academic manuscript.

    ### Workflow — execute EXACTLY these steps:

    **Step 1 — Profile the Reference Text**
    Call `style_extractor(text=<reference_text>)` to obtain the Reference metrics.
    Note the three key values:
      • `sentence_length_variance` (σ of word counts per sentence)
      • `hedging_density` (hedge words like *suggest, may, putative, indicate, could* per sentence)
      • `passive_voice_density` (academic passives like *was performed, were analyzed* per sentence)

    **Step 2 — Profile the Target Draft**
    Call `style_extractor(text=<target_text>)` to obtain the current draft metrics.

    **Step 3 — Rewrite paragraph-by-paragraph**
    For each paragraph in the Target Draft, rewrite it so that:
      1. **Sentence length variance** matches the Reference — mix short declarative
         sentences with long, multi-clause sentences containing subordinate clauses,
         parenthetical asides, and embedded lists (common in methods sections).
      2. **Hedging density** matches — inject hedging language (*suggest, indicate,
         may, could, putative, appear to, it is plausible that, these findings
         imply*) at the Reference frequency.
      3. **Passive voice density** matches — use academic passives (*was performed,
         were identified, has been reported, can be observed, were subsequently
         analyzed*) at the Reference frequency.

    Preserve ALL scientific facts, gene names, variant identifiers, and technical
    terminology from the original draft. Do NOT invent new data.

    After rewriting, call `style_extractor` on your rewritten text to verify the
    metrics are close to the Reference. If any metric deviates by more than 30%,
    revise and re-check.

    **Final output**: Return ONLY the rewritten text (all paragraphs concatenated),
    with no commentary or meta-text.
    """)
71
+
72
+
73
+ # ── Agent Builder ───────────────────────────────────────────────────────────────
74
+
75
def build_agent(
    model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct",
    max_steps: int = 12,
    verbosity: int = 1,
) -> CodeAgent:
    """
    Create a CodeAgent that has the StyleExtractorTool as its only tool.

    Parameters
    ----------
    model_id : str
        HuggingFace model ID served via the Inference API.
    max_steps : int
        Upper bound on agentic reasoning steps (default 12, sized for the
        extract -> extract -> rewrite -> verify loop).
    verbosity : int
        0 = silent, 1 = info, 2 = debug; forwarded as ``verbosity_level``.

    Returns
    -------
    CodeAgent
        Agent configured with SYSTEM_INSTRUCTIONS as its instructions.
    """
    # Modules passed to the agent as additional authorized imports.
    extra_imports = ["re", "string", "statistics", "json", "textwrap", "math"]

    # The token is read from the environment; it is None when HF_TOKEN is unset.
    llm = InferenceClientModel(model_id=model_id, token=os.getenv("HF_TOKEN"))

    return CodeAgent(
        tools=[StyleExtractorTool()],
        model=llm,
        max_steps=max_steps,
        additional_authorized_imports=extra_imports,
        instructions=SYSTEM_INSTRUCTIONS,
        verbosity_level=verbosity,
    )
113
+
114
+
115
def run_mimic(
    reference_text: str,
    target_text: str,
    model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct",
    max_steps: int = 12,
    verbosity: int = 1,
) -> str:
    """
    End-to-end convenience function: build the agent, compose the prompt, run.

    Parameters
    ----------
    reference_text : str
        Human-authored text whose style metrics are the target.
    target_text : str
        Draft to be rewritten toward the reference style.
    model_id, max_steps, verbosity
        Forwarded unchanged to ``build_agent``.

    Returns
    -------
    str
        The value returned by ``agent.run`` converted with ``str()``.
    """
    agent = build_agent(model_id=model_id, max_steps=max_steps, verbosity=verbosity)

    # The prompt is joined from explicit lines instead of dedenting an
    # f-string. textwrap.dedent would run *after* interpolation, so a
    # multi-line reference/target text would alter the common margin: the
    # headings could stay indented (rendering as a Markdown code block) or
    # the inserted texts could lose their own leading indentation.
    prompt_lines = [
        "## Reference Text (pre-2022 human-authored manuscript)",
        "",
        reference_text.strip(),
        "",
        "## Target Draft (AI-generated — to be rewritten)",
        "",
        target_text.strip(),
        "",
        "Rewrite the Target Draft so its style metrics match the Reference Text.",
        "Follow your instructions exactly.",
        "",  # keeps the trailing newline the template ended with
    ]
    prompt = "\n".join(prompt_lines)

    result = agent.run(prompt)
    return str(result)
145
+
146
+
147
+ # ── Demo ────────────────────────────────────────────────────────────────────────
148
+
149
# Sample human-style reference passage used by the demo under __main__.
# dedent removes the source indentation; strip drops the trailing newline.
DEMO_REFERENCE = textwrap.dedent("""\
    The computational analysis of genetic variants was performed using a custom
    bioinformatics pipeline that integrated several publicly available annotation
    databases, including ClinVar, gnomAD, and the CADD scoring framework. Variants
    with a minor allele frequency exceeding 0.01 in any reference population were
    excluded from downstream analysis, as these were considered unlikely to
    represent pathogenic mutations. Putative loss-of-function variants — including
    nonsense mutations, canonical splice-site alterations, and frameshift
    insertions or deletions — were prioritized for further investigation. These
    results suggest that a substantial proportion of the identified variants may
    contribute to the phenotypic heterogeneity observed across the patient cohort,
    although it could be argued that additional functional studies are needed before
    definitive genotype-phenotype correlations can be established. Segregation
    analysis was subsequently carried out in all available family members, and
    variants that did not co-segregate with the disease phenotype were deprioritized.
    """).strip()
165
+
166
# Sample terse draft used by the demo under __main__ as the text to rewrite.
# Built from explicit lines; the result has no leading or trailing whitespace.
DEMO_TARGET = "\n".join(
    (
        "We used a bioinformatics pipeline for genetic variant analysis. The pipeline",
        "used ClinVar, gnomAD, and CADD. We removed variants with allele frequency above",
        "0.01. We focused on loss-of-function variants like nonsense mutations, splice-site",
        "changes, and frameshifts. Many identified variants contribute to phenotypic",
        "differences in patients. We did segregation analysis on family members and removed",
        "variants that didn't match the disease.",
    )
)
174
+
175
+
176
if __name__ == "__main__":
    # Demo: print the style metrics of the bundled reference and draft texts,
    # run the agent on them, then print the metrics of the rewritten output.
    # Imports are local so they are only needed when the file runs as a script.
    from style_extractor import extract_style_metrics
    import json

    print("=" * 70)
    print("MANUSCRIPT-MIMIC — Demo Execution")
    print("=" * 70)

    print("\n📊 Reference Metrics:")
    ref_m = extract_style_metrics(DEMO_REFERENCE)
    print(json.dumps(ref_m, indent=2))

    print("\n📊 Target Metrics (before rewriting):")
    tgt_m = extract_style_metrics(DEMO_TARGET)
    print(json.dumps(tgt_m, indent=2))

    # verbosity=2 requests the agent's debug-level logging for the demo run.
    print("\n🔄 Running Manuscript-Mimic agent...")
    rewritten = run_mimic(DEMO_REFERENCE, DEMO_TARGET, verbosity=2)

    print("\n✅ Rewritten Text:")
    print(rewritten)

    print("\n📊 Rewritten Metrics:")
    new_m = extract_style_metrics(rewritten)
    print(json.dumps(new_m, indent=2))