rottg committed on
Commit
a922175
·
verified ·
1 Parent(s): b83fda7

Update code

Browse files
Files changed (1) hide show
  1. stylometry.py +39 -21
stylometry.py CHANGED
@@ -535,29 +535,47 @@ class AdvancedStylometryAnalyzer:
535
  else:
536
  scores['time_pattern'] = 0.0
537
 
538
- # === Weighted combination ===
539
- # Feature Vector and Character Bigrams are the most reliable for identifying
540
- # same-person accounts. AI Embedding tends to capture general "Hebrew chat style"
541
- # rather than individual fingerprints, so it gets reduced weight.
542
- # Time/Word patterns have low discriminative power in practice.
543
- weights = {
544
- 'feature_cosine': 0.40, # Most reliable - individual fingerprint
545
- 'embedding_cosine': 0.15 if scores['embedding_cosine'] is not None else 0.0, # General style only
546
- 'bigram_overlap': 0.25, # Very reliable character patterns
547
- 'trigram_overlap': 0.10, # Good character patterns
548
- 'word_bigram_overlap': 0.05, # Low discriminative power
549
- 'time_pattern': 0.05, # Low discriminative power
550
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
 
552
- # Redistribute embedding weight if not available
553
- if scores['embedding_cosine'] is None:
554
- weights['feature_cosine'] += 0.10
555
- weights['bigram_overlap'] += 0.05
556
 
557
- overall = 0.0
558
- for key, weight in weights.items():
559
- if scores.get(key) is not None:
560
- overall += weight * scores[key]
561
 
562
  return overall, scores
563
 
 
535
  else:
536
  scores['time_pattern'] = 0.0
537
 
538
+ # === Threshold-based scoring ===
539
+ # Feature Vector is the most reliable discriminator. Use it as a gate:
540
+ # - Below 94%: heavy penalty (likely different people)
541
+ # - 94-96%: moderate score
542
+ # - Above 96%: bonus (likely same person)
543
+
544
+ feature_score = scores['feature_cosine']
545
+ bigram_score = scores['bigram_overlap']
546
+
547
+ # Base score from key metrics (feature vector is primary)
548
+ base_score = (
549
+ feature_score * 0.50 +
550
+ bigram_score * 0.30 +
551
+ scores['trigram_overlap'] * 0.10 +
552
+ (scores['embedding_cosine'] * 0.10 if scores['embedding_cosine'] is not None else 0)
553
+ )
554
+
555
+ # Apply threshold-based multipliers
556
+ if feature_score >= 0.96:
557
+ # Very high feature similarity - likely same person
558
+ multiplier = 1.15
559
+ elif feature_score >= 0.94:
560
+ # High similarity - possible match
561
+ multiplier = 1.0
562
+ elif feature_score >= 0.90:
563
+ # Moderate similarity - penalize
564
+ multiplier = 0.75
565
+ else:
566
+ # Low similarity - heavy penalty
567
+ multiplier = 0.5
568
+
569
+ # Additional penalty if bigrams are low
570
+ if bigram_score < 0.80:
571
+ multiplier *= 0.85
572
+ elif bigram_score >= 0.85:
573
+ multiplier *= 1.05
574
 
575
+ overall = base_score * multiplier
 
 
 
576
 
577
+ # Cap at 100%
578
+ overall = min(overall, 1.0)
 
 
579
 
580
  return overall, scores
581