Babajaan committed on
Commit
f7b6ec4
·
verified ·
1 Parent(s): 951ffbe

Add style_extractor.py

Browse files
Files changed (1) hide show
  1. manuscript_mimic/style_extractor.py +193 -0
manuscript_mimic/style_extractor.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ style_extractor.py — Manuscript-Mimic Style Analysis Tool
3
+
4
+ A smolagents Tool that ingests a reference text and computes three
5
+ stylometric metrics used to quantify "human academic writing style":
6
+
7
+ 1. Sentence Length Variance — σ of word counts per sentence
8
+ 2. Hedging Density — frequency of hedge words per sentence
9
+ 3. Structural Passive Voice — frequency of academic passive constructions per sentence
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ import statistics
16
+ from typing import Any
17
+
18
+ from smolagents import Tool
19
+
20
+
21
+ # ── Linguistic Resources ────────────────────────────────────────────────────────
22
+
23
# Lexicon of hedging cues, one word family per line. Every entry is a single
# lowercase alphabetic word, which is the token shape hedging_density() looks up.
HEDGE_WORDS = {
    # Reporting verbs
    "suggest", "suggests", "suggested", "suggesting",
    "indicate", "indicates", "indicated", "indicating",
    "imply", "implies", "implied", "implying",
    "hypothesize", "hypothesized", "hypothetical",
    "speculate", "speculated", "speculative",
    # Appearance verbs
    "appear", "appears", "appeared", "appearing",
    "seem", "seems", "seemed", "seeming",
    "tend", "tends", "tended", "tendency",
    # Modal auxiliaries
    "may", "might", "could", "would",
    # Likelihood adjectives and adverbs
    "putative", "putatively",
    "possibly", "perhaps", "likely", "unlikely",
    "probable", "probably", "plausible", "plausibly",
    "potential", "potentially",
    "conceivable", "conceivably",
    "arguable", "arguably",
    "presumably", "ostensibly",
    # Approximators and degree qualifiers
    "generally", "typically", "approximately", "roughly",
    "largely", "partly", "partially",
}
43
+
44
# Heuristic detector for passive constructions of the kind common in
# methods/results sections: an auxiliary followed by whitespace and one word
# carrying a participle-like suffix, e.g.
#   "was performed", "were analyzed", "has been reported", "can be observed"
# NOTE(review): the participle test is purely suffix-based, so any word with
# one of these endings after an auxiliary is counted (e.g. "is often").
PASSIVE_RE = re.compile(
    r"\b(?:(?:"
    # plain forms of "to be"
    r"was|were|is|are|been|be|being|"
    # perfect aspect
    r"has\s+been|have\s+been|had\s+been|"
    # modal + be
    r"will\s+be|can\s+be|could\s+be|may\s+be|might\s+be|"
    r"should\s+be|would\s+be|shall\s+be|must\s+be"
    r"))\s+"
    # one alphabetic word ending in a regular or common irregular participle suffix
    r"(?:[a-z]+(?:ed|en|ized|ised|ated|uted|ted|sed|ied|yed|own|ung|awn|orn))\b",
    flags=re.IGNORECASE,
)
57
+
58
+
59
+ # ── Sentence Splitter ───────────────────────────────────────────────────────────
60
+
61
def split_sentences(text: str) -> list[str]:
    """
    Split text into sentences.

    A boundary is sentence-ending punctuation (``.``, ``!`` or ``?``) followed
    by whitespace and then an uppercase ASCII letter, a double quote, or an
    opening parenthesis. Periods belonging to a fixed list of abbreviations
    (e.g. "et al.", "Fig.", "Dr.") are masked first so they never act as
    boundaries. Decimal numbers are not split because no whitespace follows
    their dot.

    Args:
        text: Raw text to segment.

    Returns:
        The non-empty sentences, stripped of surrounding whitespace, in
        their original order. Empty input yields an empty list.
    """
    abbreviations = (
        "et al.", "e.g.", "i.e.", "Fig.", "Dr.", "Mr.", "Mrs.", "vs.", "approx.", "ca.",
    )
    # Anchor every abbreviation at a word boundary. A plain substring replace
    # would also mask the final period of ordinary words that merely end in
    # the same letters ("Africa." contains "ca."), gluing two sentences together.
    abbreviation_pattern = r"\b(?:" + "|".join(re.escape(a) for a in abbreviations) + r")"
    protected = re.sub(
        abbreviation_pattern,
        lambda match: match.group(0).replace(".", "@@DOT@@"),
        text,
    )

    # Split on sentence-ending punctuation followed by whitespace and a
    # character that plausibly opens a new sentence.
    parts = re.split(r'(?<=[.!?])\s+(?=[A-Z"\(])', protected)

    # Unmask the abbreviation periods and drop empty fragments.
    restored = (part.replace("@@DOT@@", ".").strip() for part in parts)
    return [sentence for sentence in restored if sentence]
80
+
81
+
82
+ # ── Core Metric Functions ───────────────────────────────────────────────────────
83
+
84
def sentence_length_variance(sentences: list[str]) -> float:
    """
    Return the sample standard deviation of per-sentence word counts.

    Words are whitespace-delimited tokens. With fewer than two sentences the
    spread is undefined and 0.0 is returned. The result is rounded to four
    decimal places.
    """
    if len(sentences) >= 2:
        words_per_sentence = [len(tokens) for tokens in map(str.split, sentences)]
        spread = statistics.stdev(words_per_sentence)
        return round(spread, 4)
    return 0.0
90
+
91
+
92
def hedging_density(sentences: list[str]) -> float:
    """
    Return the mean number of hedge words per sentence.

    Each sentence is lower-cased and broken into runs of the letters a-z;
    every run found in HEDGE_WORDS counts as one hedge. An empty input
    yields 0.0. The result is rounded to four decimal places.
    """
    if not sentences:
        return 0.0
    hedge_hits = sum(
        token in HEDGE_WORDS
        for sentence in sentences
        for token in re.findall(r"[a-z]+", sentence.lower())
    )
    return round(hedge_hits / len(sentences), 4)
101
+
102
+
103
def passive_voice_density(sentences: list[str]) -> float:
    """
    Return the mean number of PASSIVE_RE matches per sentence.

    Matches are counted independently within each sentence. An empty input
    yields 0.0. The result is rounded to four decimal places.
    """
    if not sentences:
        return 0.0
    matches_per_sentence = (len(PASSIVE_RE.findall(sentence)) for sentence in sentences)
    return round(sum(matches_per_sentence) / len(sentences), 4)
111
+
112
+
113
def word_count(sentences: list[str]) -> int:
    """Return the number of whitespace-delimited words summed over all sentences."""
    total = 0
    for sentence in sentences:
        total += len(sentence.split())
    return total
116
+
117
+
118
def avg_sentence_length(sentences: list[str]) -> float:
    """
    Return the mean number of words per sentence, rounded to two decimals.

    An empty input yields 0.0.
    """
    if sentences:
        mean_words = word_count(sentences) / len(sentences)
        return round(mean_words, 2)
    return 0.0
123
+
124
+
125
+ # ── Public convenience function ─────────────────────────────────────────────────
126
+
127
def extract_style_metrics(text: str) -> dict[str, Any]:
    """
    Segment ``text`` into sentences and compute every style metric in one call.

    Returns a dict whose keys, in order, are ``num_sentences``,
    ``total_words``, ``avg_sentence_length``, ``sentence_length_variance``,
    ``hedging_density`` and ``passive_voice_density``.
    """
    sentences = split_sentences(text)
    # (output key, metric function) pairs; the order here is the key order
    # of the returned dict.
    metric_table = (
        ("num_sentences", len),
        ("total_words", word_count),
        ("avg_sentence_length", avg_sentence_length),
        ("sentence_length_variance", sentence_length_variance),
        ("hedging_density", hedging_density),
        ("passive_voice_density", passive_voice_density),
    )
    return {key: compute(sentences) for key, compute in metric_table}
140
+
141
+
142
+ # ── smolagents Tool ─────────────────────────────────────────────────────────────
143
+
144
class StyleExtractorTool(Tool):
    """
    smolagents-compatible tool that extracts stylometric features from text.

    Returns a dict with:
        - num_sentences            (int)
        - total_words              (int)
        - avg_sentence_length      (float) — mean words per sentence
        - sentence_length_variance (float) — stdev of words per sentence
        - hedging_density          (float) — hedge words per sentence
        - passive_voice_density    (float) — passive constructions per sentence
    """

    # Tool metadata. The description is runtime text, so it must be free of
    # mis-encoded characters.
    name = "style_extractor"
    description = (
        "Analyzes a block of academic text and returns style metrics: "
        "sentence_length_variance (σ of word counts per sentence), "
        "hedging_density (hedge words per sentence), and "
        "passive_voice_density (passive constructions per sentence). "
        "Also reports num_sentences, total_words, and avg_sentence_length. "
        "Input: a string of text. Output: a dict of float/int metrics."
    )
    inputs = {
        "text": {
            "type": "string",
            "description": "The academic text passage to analyze.",
        }
    }
    output_type = "object"

    def forward(self, text: str) -> dict:
        """Return the metrics dict that ``extract_style_metrics`` computes for ``text``."""
        return extract_style_metrics(text)
176
+
177
+
178
+ # ── Self-test ───────────────────────────────────────────────────────────────────
179
+
180
if __name__ == "__main__":
    # Smoke test: run the extractor on a short methods/results-style passage
    # and print each metric, with the metric names right-aligned.
    sample = (
        "The computational pipeline was performed using custom Python scripts. "
        "Variants were filtered based on allele frequency, and putative pathogenic "
        "mutations were identified through a multi-step annotation process. "
        "These results suggest that the observed variants may contribute to the "
        "phenotypic heterogeneity reported in previous studies. "
        "However, it could be argued that additional functional validation is "
        "needed before definitive conclusions can be drawn."
    )
    print("=== Style Extractor Self-Test ===")
    for metric_name, metric_value in extract_style_metrics(sample).items():
        print(f" {metric_name:>28s}: {metric_value}")