Babajaan
/

manuscript-mimic

Model card Files Files and versions

xet

Community

Babajaan committed 22 days ago

Commit

73bbc7f

verified ·

1 Parent(s): f7b6ec4

Add rewrite_agent.py

Browse files

Files changed (1) hide show

manuscript_mimic/rewrite_agent.py +200 -0

manuscript_mimic/rewrite_agent.py ADDED Viewed

	@@ -0,0 +1,200 @@

+"""
+rewrite_agent.py — Manuscript-Mimic CodeAgent Orchestrator
+Builds a smolagents CodeAgent that executes a three-step agentic loop:
+  Step 1: Run style_extractor on the Reference Text to obtain target metrics.
+  Step 2: Run style_extractor on the AI-generated Target Draft.
+  Step 3: Rewrite the Target Draft paragraph-by-paragraph so its statistical
+          metrics (sentence_length_variance, hedging_density, passive_voice_density)
+          converge toward the Reference Text's profile.
+The agent intentionally injects complex multi-clause sentences, hedging
+language, and passive-voice constructions typical of pre-2022 human-authored
+methods sections in biomedical / genetics manuscripts.
+"""
+from __future__ import annotations
+import os
+import textwrap
+from typing import Optional
+from smolagents import CodeAgent, InferenceClientModel
+from style_extractor import StyleExtractorTool
+# ── System instructions: passed to CodeAgent via `instructions=` in build_agent ──
+SYSTEM_INSTRUCTIONS = textwrap.dedent("""\
+You are **Manuscript-Mimic**, an expert scientific style-transfer agent.
+Your mission: rewrite an AI-generated "Target Draft" so that its *measurable
+stylistic statistics* match a human-authored "Reference Text" from a pre-2022
+academic manuscript.
+### Workflow — execute EXACTLY these steps:
+**Step 1 — Profile the Reference Text**
+Call `style_extractor(text=<reference_text>)` to obtain the Reference metrics.
+Note the three key values:
+  • `sentence_length_variance` (σ of word counts per sentence)
+  • `hedging_density` (hedge words like *suggest, may, putative, indicate, could* per sentence)
+  • `passive_voice_density` (academic passives like *was performed, were analyzed* per sentence)
+**Step 2 — Profile the Target Draft**
+Call `style_extractor(text=<target_text>)` to obtain the current draft metrics.
+**Step 3 — Rewrite paragraph-by-paragraph**
+For each paragraph in the Target Draft, rewrite it so that:
+  1. **Sentence length variance** matches the Reference — mix short declarative
+     sentences with long, multi-clause sentences containing subordinate clauses,
+     parenthetical asides, and embedded lists (common in methods sections).
+  2. **Hedging density** matches — inject hedging language (*suggest, indicate,
+     may, could, putative, appear to, it is plausible that, these findings
+     imply*) at the Reference frequency.
+  3. **Passive voice density** matches — use academic passives (*was performed,
+     were identified, has been reported, can be observed, were subsequently
+     analyzed*) at the Reference frequency.
+Preserve ALL scientific facts, gene names, variant identifiers, and technical
+terminology from the original draft.  Do NOT invent new data.
+After rewriting, call `style_extractor` on your rewritten text to verify the
+metrics are close to the Reference.  If any metric deviates by more than 30%,
+revise and re-check.
+**Final output**: Return ONLY the rewritten text (all paragraphs concatenated),
+with no commentary or meta-text.
+""")
+# ── Agent Builder ───────────────────────────────────────────────────────────────
+def build_agent(
+    model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct",
+    max_steps: int = 12,
+    verbosity: int = 1,
+) -> CodeAgent:
+    """
+    Construct a ready-to-run CodeAgent with the StyleExtractorTool attached.
+    Parameters
+    ----------
+    model_id : str
+        HuggingFace model ID served via the Inference API.
+    max_steps : int
+        Maximum agentic reasoning steps (default 12 — enough for
+        extract→extract→rewrite→verify loop).
+    verbosity : int
+        0 = silent, 1 = info, 2 = debug.
+    Returns
+    -------
+    CodeAgent
+    """
+    model = InferenceClientModel(
+        model_id=model_id,
+        token=os.environ.get("HF_TOKEN"),
+    )
+    agent = CodeAgent(
+        tools=[StyleExtractorTool()],
+        model=model,
+        max_steps=max_steps,
+        additional_authorized_imports=[
+            "re", "string", "statistics", "json", "textwrap", "math",
+        ],
+        instructions=SYSTEM_INSTRUCTIONS,
+        verbosity_level=verbosity,
+    )
+    return agent
+def run_mimic(
+    reference_text: str,
+    target_text: str,
+    model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct",
+    max_steps: int = 12,
+    verbosity: int = 1,
+) -> str:
+    """
+    End-to-end convenience function:
+      1. Build the agent.
+      2. Compose the user prompt.
+      3. Run and return the rewritten text.
+    """
+    agent = build_agent(model_id=model_id, max_steps=max_steps, verbosity=verbosity)
+    prompt = textwrap.dedent(f"""\
+    ## Reference Text (pre-2022 human-authored manuscript)
+    {reference_text.strip()}
+    ## Target Draft (AI-generated — to be rewritten)
+    {target_text.strip()}
+    Rewrite the Target Draft so its style metrics match the Reference Text.
+    Follow your instructions exactly.
+    """)
+    result = agent.run(prompt)
+    return str(result)
+# ── Demo fixtures: reference passage and draft used by the __main__ demo ────────
+DEMO_REFERENCE = textwrap.dedent("""\
+The computational analysis of genetic variants was performed using a custom
+bioinformatics pipeline that integrated several publicly available annotation
+databases, including ClinVar, gnomAD, and the CADD scoring framework.  Variants
+with a minor allele frequency exceeding 0.01 in any reference population were
+excluded from downstream analysis, as these were considered unlikely to
+represent pathogenic mutations.  Putative loss-of-function variants — including
+nonsense mutations, canonical splice-site alterations, and frameshift
+insertions or deletions — were prioritized for further investigation.  These
+results suggest that a substantial proportion of the identified variants may
+contribute to the phenotypic heterogeneity observed across the patient cohort,
+although it could be argued that additional functional studies are needed before
+definitive genotype-phenotype correlations can be established.  Segregation
+analysis was subsequently carried out in all available family members, and
+variants that did not co-segregate with the disease phenotype were deprioritized.
+""").strip()
+DEMO_TARGET = textwrap.dedent("""\
+We used a bioinformatics pipeline for genetic variant analysis. The pipeline
+used ClinVar, gnomAD, and CADD. We removed variants with allele frequency above
+0.01. We focused on loss-of-function variants like nonsense mutations, splice-site
+changes, and frameshifts. Many identified variants contribute to phenotypic
+differences in patients. We did segregation analysis on family members and removed
+variants that didn't match the disease.
+""").strip()
+if __name__ == "__main__":
+    from style_extractor import extract_style_metrics
+    import json
+    print("=" * 70)
+    print("MANUSCRIPT-MIMIC  —  Demo Execution")
+    print("=" * 70)
+    print("\n📊 Reference Metrics:")
+    ref_m = extract_style_metrics(DEMO_REFERENCE)
+    print(json.dumps(ref_m, indent=2))
+    print("\n📊 Target Metrics (before rewriting):")
+    tgt_m = extract_style_metrics(DEMO_TARGET)
+    print(json.dumps(tgt_m, indent=2))
+    print("\n🔄 Running Manuscript-Mimic agent...")
+    rewritten = run_mimic(DEMO_REFERENCE, DEMO_TARGET, verbosity=2)
+    print("\n✅ Rewritten Text:")
+    print(rewritten)
+    print("\n📊 Rewritten Metrics:")
+    new_m = extract_style_metrics(rewritten)
+    print(json.dumps(new_m, indent=2))