Babajaan
/

manuscript-mimic

Model card Files Files and versions

xet

Community

Babajaan committed 21 days ago

Commit

f7b6ec4

verified ·

1 Parent(s): 951ffbe

Add style_extractor.py

Browse files

Files changed (1) hide show

manuscript_mimic/style_extractor.py +193 -0

manuscript_mimic/style_extractor.py ADDED Viewed

	@@ -0,0 +1,193 @@

+"""
+style_extractor.py — Manuscript-Mimic Style Analysis Tool
+A smolagents Tool that ingests a reference text and computes three
+stylometric metrics used to quantify "human academic writing style":
+  1. Sentence Length Variance  — σ of word counts per sentence
+  2. Hedging Density           — frequency of hedge words per sentence
+  3. Structural Passive Voice  — frequency of academic passive constructions per sentence
+"""
+from __future__ import annotations
+import re
+import statistics
+from typing import Any
+from smolagents import Tool
+# ── Linguistic Resources ────────────────────────────────────────────────────────
+# Lexicon of hedge words counted by hedging_density().  Lookup is an exact,
+# whole-token membership test against lowercased alphabetic tokens, so every
+# inflected form that should count must be listed explicitly (there is no
+# stemming), and multi-word hedges cannot be represented here.
+HEDGE_WORDS = {
+    "suggest", "suggests", "suggested", "suggesting",
+    "indicate", "indicates", "indicated", "indicating",
+    "putative", "putatively",
+    "may", "might", "could", "would",
+    "possibly", "perhaps", "likely", "unlikely",
+    "probable", "probably", "plausible", "plausibly",
+    "appear", "appears", "appeared", "appearing",
+    "seem", "seems", "seemed", "seeming",
+    "generally", "typically", "approximately", "roughly",
+    "tend", "tends", "tended", "tendency",
+    "potential", "potentially",
+    "hypothesize", "hypothesized", "hypothetical",
+    "speculate", "speculated", "speculative",
+    "imply", "implies", "implied", "implying",
+    "conceivable", "conceivably",
+    "arguable", "arguably",
+    "presumably", "ostensibly",
+    "largely", "partly", "partially",
+}
+# Passive-voice patterns common in methods/results sections.
+# We match auxiliary + past participle patterns like:
+#   "was performed", "were analyzed", "has been reported", "can be observed"
+# The "participle" is recognised purely by its suffix, so this is a heuristic:
+#   - it over-matches non-participles with the same ending that directly
+#     follow an auxiliary (e.g. "is then", "is often", "is indeed");
+#   - it misses irregular participles with other endings (e.g. "was found",
+#     "was made") and passives with an intervening adverb
+#     (e.g. "was subsequently performed").
+# The pattern contains no capturing groups, so findall() returns one string
+# per whole match.  Matching is case-insensitive.
+PASSIVE_RE = re.compile(
+    r"\b(?:"
+    r"(?:was|were|is|are|been|be|being|has\s+been|have\s+been|had\s+been|"
+    r"will\s+be|can\s+be|could\s+be|may\s+be|might\s+be|should\s+be|"
+    r"would\s+be|shall\s+be|must\s+be)"
+    r")\s+"
+    r"(?:[a-z]+(?:ed|en|ized|ised|ated|uted|ted|sed|ied|yed|own|ung|awn|orn))"
+    r"\b",
+    re.IGNORECASE,
+)
+# ── Sentence Splitter ───────────────────────────────────────────────────────────
+# Abbreviations whose trailing dot must not be treated as a sentence end.
+# The leading \b anchors each one at a word boundary, so ordinary words that
+# merely end in the same letters (e.g. "Africa." vs. "ca.") are not shielded.
+_ABBREVIATION_RE = re.compile(
+    r"\b(?:et al\.|e\.g\.|i\.e\.|Fig\.|Dr\.|Mr\.|Mrs\.|vs\.|approx\.|ca\.)"
+)
+# Temporary stand-in for the dots inside a shielded abbreviation.
+_DOT_PLACEHOLDER = "@@DOT@@"
+def split_sentences(text: str) -> list[str]:
+    """
+    Split text into sentences.
+    A boundary is sentence-ending punctuation (., !, ?) followed by whitespace
+    and then an uppercase letter, a double quote, or an opening parenthesis.
+    Decimal numbers are never split because no whitespace follows their dot.
+    A fixed list of abbreviations (et al., e.g., i.e., Fig., Dr., Mr., Mrs.,
+    vs., approx., ca.) is shielded from splitting, but only when the
+    abbreviation starts at a word boundary, so words such as "Africa." or
+    "silica." still end a sentence.
+    Args:
+        text: Raw passage to segment.
+    Returns:
+        The non-empty sentences, whitespace-stripped, in document order.
+    """
+    # Hide the dots of whole-word abbreviations from the boundary regex.
+    protected = _ABBREVIATION_RE.sub(
+        lambda match: match.group(0).replace(".", _DOT_PLACEHOLDER),
+        text,
+    )
+    # Split on sentence-ending punctuation followed by whitespace + uppercase,
+    # a double quote, or an opening parenthesis.
+    parts = re.split(r'(?<=[.!?])\s+(?=[A-Z"\(])', protected)
+    # Restore the hidden dots and drop fragments that are only whitespace.
+    restored = (part.replace(_DOT_PLACEHOLDER, ".").strip() for part in parts)
+    return [sentence for sentence in restored if sentence]
+# ── Core Metric Functions ───────────────────────────────────────────────────────
+def sentence_length_variance(sentences: list[str]) -> float:
+    """
+    Sample standard deviation of the per-sentence word counts.
+    Words are whitespace-separated tokens.  Returns 0.0 when fewer than two
+    sentences are given; otherwise the value is rounded to 4 decimal places.
+    """
+    # stdev() needs at least two data points.
+    if len(sentences) >= 2:
+        counts_per_sentence = [len(sentence.split()) for sentence in sentences]
+        spread = statistics.stdev(counts_per_sentence)
+        return round(spread, 4)
+    return 0.0
+def hedging_density(sentences: list[str]) -> float:
+    """
+    Mean number of hedge words per sentence, rounded to 4 decimal places.
+    Each sentence is lowercased and broken into runs of a-z letters; every
+    token found in HEDGE_WORDS counts once.  Returns 0.0 for an empty list.
+    """
+    if not sentences:
+        return 0.0
+    # Booleans sum as 0/1, giving the number of hedge tokens overall.
+    hedge_total = sum(
+        token in HEDGE_WORDS
+        for sentence in sentences
+        for token in re.findall(r"[a-z]+", sentence.lower())
+    )
+    return round(hedge_total / len(sentences), 4)
+def passive_voice_density(sentences: list[str]) -> float:
+    """
+    Mean number of PASSIVE_RE matches per sentence, rounded to 4 decimal places.
+    Matches are counted non-overlapping within each sentence.  Returns 0.0
+    for an empty list.
+    """
+    if not sentences:
+        return 0.0
+    # One tick per regex match, accumulated across all sentences.
+    passive_total = sum(
+        1 for sentence in sentences for _ in PASSIVE_RE.finditer(sentence)
+    )
+    return round(passive_total / len(sentences), 4)
+def word_count(sentences: list[str]) -> int:
+    """Sum of whitespace-separated token counts over every sentence."""
+    running_total = 0
+    for sentence in sentences:
+        running_total += len(sentence.split())
+    return running_total
+def avg_sentence_length(sentences: list[str]) -> float:
+    """
+    Mean words per sentence, rounded to 2 decimal places.
+    Returns 0.0 for an empty list so callers never divide by zero.
+    """
+    if sentences:
+        mean_words = word_count(sentences) / len(sentences)
+        return round(mean_words, 2)
+    return 0.0
+# ── Public convenience function ─────────────────────────────────────────────────
+def extract_style_metrics(text: str) -> dict[str, Any]:
+    """
+    One-call entry point: segment ``text`` and compute every style metric.
+    Returns a dict with the keys num_sentences, total_words,
+    avg_sentence_length, sentence_length_variance, hedging_density and
+    passive_voice_density, inserted in that order.
+    """
+    sentences = split_sentences(text)
+    # (key, metric function) pairs, listed in the order the keys must appear.
+    per_sentence_metrics = (
+        ("total_words", word_count),
+        ("avg_sentence_length", avg_sentence_length),
+        ("sentence_length_variance", sentence_length_variance),
+        ("hedging_density", hedging_density),
+        ("passive_voice_density", passive_voice_density),
+    )
+    report: dict[str, Any] = {"num_sentences": len(sentences)}
+    for key, metric_fn in per_sentence_metrics:
+        report[key] = metric_fn(sentences)
+    return report
+# ── smolagents Tool ─────────────────────────────────────────────────────────────
+class StyleExtractorTool(Tool):
+    """
+    smolagents-compatible tool that extracts stylometric features from text.
+    This is a thin wrapper: ``forward`` passes its argument straight to
+    ``extract_style_metrics`` and returns that function's dict unchanged.
+    Returns a dict with:
+      - num_sentences           (int)
+      - total_words             (int)
+      - avg_sentence_length     (float)  — mean words per sentence
+      - sentence_length_variance(float)  — stdev of words per sentence
+      - hedging_density         (float)  — hedge words per sentence
+      - passive_voice_density   (float)  — passive constructions per sentence
+    """
+    # Tool metadata.  Presumably read by the smolagents framework to expose
+    # the tool to an agent — confirm against the smolagents Tool contract.
+    name = "style_extractor"
+    description = (
+        "Analyzes a block of academic text and returns style metrics: "
+        "sentence_length_variance (σ of word counts per sentence), "
+        "hedging_density (hedge words per sentence), and "
+        "passive_voice_density (passive constructions per sentence). "
+        "Also reports num_sentences, total_words, and avg_sentence_length. "
+        "Input: a string of text.  Output: a dict of float/int metrics."
+    )
+    # Single input whose key matches the ``text`` parameter of forward().
+    inputs = {
+        "text": {
+            "type": "string",
+            "description": "The academic text passage to analyze.",
+        }
+    }
+    output_type = "object"
+    def forward(self, text: str) -> dict:
+        """Return the style-metric dict for ``text`` via ``extract_style_metrics``."""
+        return extract_style_metrics(text)
+# ── Self-test ───────────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    # Four-sentence methods/results-style passage used as a smoke test when
+    # the module is run directly as a script.
+    demo_passage = (
+        "The computational pipeline was performed using custom Python scripts. "
+        "Variants were filtered based on allele frequency, and putative pathogenic "
+        "mutations were identified through a multi-step annotation process. "
+        "These results suggest that the observed variants may contribute to the "
+        "phenotypic heterogeneity reported in previous studies. "
+        "However, it could be argued that additional functional validation is "
+        "needed before definitive conclusions can be drawn."
+    )
+    # Compute first so a failure surfaces before any output is printed.
+    report = extract_style_metrics(demo_passage)
+    print("=== Style Extractor Self-Test ===")
+    # Right-align metric names in a 28-character column.
+    for metric_name, metric_value in report.items():
+        print(f"  {metric_name:>28s}: {metric_value}")