Spaces:
Sleeping
Sleeping
Update code
Browse files- stylometry.py +39 -21
stylometry.py
CHANGED
|
@@ -535,29 +535,47 @@ class AdvancedStylometryAnalyzer:
|
|
| 535 |
else:
|
| 536 |
scores['time_pattern'] = 0.0
|
| 537 |
|
| 538 |
-
# ===
|
| 539 |
-
# Feature Vector
|
| 540 |
-
#
|
| 541 |
-
#
|
| 542 |
-
#
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
-
|
| 553 |
-
if scores['embedding_cosine'] is None:
|
| 554 |
-
weights['feature_cosine'] += 0.10
|
| 555 |
-
weights['bigram_overlap'] += 0.05
|
| 556 |
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
if scores.get(key) is not None:
|
| 560 |
-
overall += weight * scores[key]
|
| 561 |
|
| 562 |
return overall, scores
|
| 563 |
|
|
|
|
| 535 |
else:
|
| 536 |
scores['time_pattern'] = 0.0
|
| 537 |
|
| 538 |
+
# === Threshold-based scoring ===
|
| 539 |
+
# Feature Vector is the most reliable discriminator. Use it as a gate:
|
| 540 |
+
# - Below 90%: heavy penalty (likely different people); 90-94%: moderate penalty
|
| 541 |
+
# - 94-96%: neutral (base score is left unscaled)
|
| 542 |
+
# - 96% and above: bonus (likely same person)
|
| 543 |
+
|
| 544 |
+
feature_score = scores['feature_cosine']
|
| 545 |
+
bigram_score = scores['bigram_overlap']
|
| 546 |
+
|
| 547 |
+
# Base score from key metrics (feature vector is primary)
|
| 548 |
+
base_score = (
|
| 549 |
+
feature_score * 0.50 +
|
| 550 |
+
bigram_score * 0.30 +
|
| 551 |
+
scores['trigram_overlap'] * 0.10 +
|
| 552 |
+
(scores['embedding_cosine'] * 0.10 if scores['embedding_cosine'] is not None else 0)
|
| 553 |
+
)
|
| 554 |
+
|
| 555 |
+
# Apply threshold-based multipliers
|
| 556 |
+
if feature_score >= 0.96:
|
| 557 |
+
# Very high feature similarity - likely same person
|
| 558 |
+
multiplier = 1.15
|
| 559 |
+
elif feature_score >= 0.94:
|
| 560 |
+
# High similarity - possible match
|
| 561 |
+
multiplier = 1.0
|
| 562 |
+
elif feature_score >= 0.90:
|
| 563 |
+
# Moderate similarity - penalize
|
| 564 |
+
multiplier = 0.75
|
| 565 |
+
else:
|
| 566 |
+
# Low similarity - heavy penalty
|
| 567 |
+
multiplier = 0.5
|
| 568 |
+
|
| 569 |
+
# Bigram adjustment: extra penalty when overlap is low, small bonus when it is high
|
| 570 |
+
if bigram_score < 0.80:
|
| 571 |
+
multiplier *= 0.85
|
| 572 |
+
elif bigram_score >= 0.85:
|
| 573 |
+
multiplier *= 1.05
|
| 574 |
|
| 575 |
+
overall = base_score * multiplier
|
|
|
|
|
|
|
|
|
|
| 576 |
|
| 577 |
+
# Cap at 100% (the bonus multipliers can push the product above 1.0)
|
| 578 |
+
overall = min(overall, 1.0)
|
|
|
|
|
|
|
| 579 |
|
| 580 |
return overall, scores
|
| 581 |
|