Transformers
Italian
English
semantic-search
explainable-ai
faiss
ai-ethics
responsible-ai
llm
prompt-engineering
multimodal-ai
ai-transparency
ethical-intelligence
explainable-llm
cognitive-ai
ethical-ai
scientific-retrieval
modular-ai
memory-augmented-llm
trustworthy-ai
reasoning-engine
ai-alignment
next-gen-llm
thinking-machines
open-source-ai
explainability
ai-research
semantic audit
cognitive agent
human-centered-ai
Create ranking_originality.py
src/evaluation/ranking_originality.py
ADDED
@@ -0,0 +1,109 @@
# © 2025 Elena Marziali — Code released under Apache 2.0 license.
# See LICENSE in the repository for details.
# Removal of this copyright notice is prohibited.

import logging

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Sample data for ranking
data = np.array([
    [120, 45, 1, 2023],  # citations, h-index, peer-reviewed flag, publication year
    [50, 30, 1, 2020],
    [10, 15, 0, 2018]
])

labels = [95, 70, 30]  # Academic impact score

# Model training
ranking_model = RandomForestRegressor(n_estimators=100)
ranking_model.fit(data, labels)

# Ranking prediction
def calculate_impact_score(citations, h_index, peer_review, publication_year):
    paper_data = np.array([[citations, h_index, peer_review, publication_year]])
    score = ranking_model.predict(paper_data)
    return max(0, score[0])  # Ensure non-negative

# Usage example
impact_score = calculate_impact_score(80, 40, 1, 2024)
print(f"Estimated score: {impact_score}")

# === Scientific originality evaluation ===
def evaluate_hypothesis_novelty(hypothesis, existing_articles, threshold=0.7):
    """
    Compares the hypothesis with existing articles using semantic embeddings.

    Returns:
    - average similarity score
    - similar articles
    - qualitative assessment of originality
    """
    try:
        # `model_embedding` is expected to be defined elsewhere in the project
        # (a sentence-embedding model exposing an `encode` method).
        articles_with_abstracts = [a for a in existing_articles if "abstract" in a]
        emb_hypothesis = model_embedding.encode([hypothesis])
        emb_articles = model_embedding.encode([a["abstract"] for a in articles_with_abstracts])

        # Cosine similarity between the hypothesis and each abstract
        similarity = np.dot(emb_hypothesis, emb_articles.T) / (
            np.linalg.norm(emb_hypothesis) * np.linalg.norm(emb_articles, axis=1)
        )
        average = round(float(np.mean(similarity)), 3)

        # Index into the filtered list so titles stay aligned with the embeddings
        similar_articles = [
            articles_with_abstracts[i]["title"]
            for i, score in enumerate(similarity[0]) if score > threshold
        ]

        if average < 0.4:
            assessment = "High originality: hypothesis is rarely present in the literature."
        elif average < 0.7:
            assessment = "Moderate originality: related concepts exist."
        else:
            assessment = "Low originality: hypothesis is already widely discussed."

        return {
            "novelty_score": average,
            "similar_articles": similar_articles,
            "assessment": assessment
        }

    except Exception as e:
        logging.error(f"[evaluate_novelty] Error during originality evaluation: {e}")
        return {
            "novelty_score": 0.0,
            "similar_articles": [],
            "assessment": "Error during originality evaluation."
        }
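
# --- Usage sketch (illustrative, not part of the module) ---
# Assumes `model_embedding` is loaded elsewhere, for example with the
# sentence-transformers library; the package and model name below are
# assumptions, not something this file defines.
#
#   from sentence_transformers import SentenceTransformer
#   model_embedding = SentenceTransformer("all-MiniLM-L6-v2")
#
#   articles = [
#       {"title": "Prior study on topic X", "abstract": "We investigate topic X in depth."},
#       {"title": "Survey of related methods", "abstract": "A review of approaches to topic X."},
#   ]
#   result = evaluate_hypothesis_novelty("A new mechanism underlying topic X", articles)
#   print(result["novelty_score"], result["assessment"])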

# Automated paper review with AI
async def review_paper(paper_text):
    """Checks the methodology and citation quality of a paper."""
    # `verify_methodology` and `verify_citations` are async helpers assumed to be
    # provided by other modules of the project.
    methodology = await verify_methodology(paper_text)
    citations = await verify_citations(paper_text)
    return {"methodology": methodology, "citations": citations}

async def validate_hypothesis(hypothesis):
    # `search_multi_database` is an async helper assumed to be provided elsewhere.
    sources = await search_multi_database(hypothesis)
    # calculate_impact_score expects per-paper metadata (citations, h-index,
    # peer-review flag, year), so score each retrieved source and average.
    # The field names and defaults below mirror the training features and are assumptions.
    per_source_scores = [
        calculate_impact_score(
            s.get("citations", 0), s.get("h_index", 0),
            s.get("peer_review", 0), s.get("year", 0),
        )
        for s in sources if isinstance(s, dict)
    ]
    score = sum(per_source_scores) / len(per_source_scores) if per_source_scores else 0.0
    summary = summarize_evidence(sources)
    return score, summary

def summarize_evidence(sources):
    return "\n".join([
        f"- {a['title'][:80]}…" for a in sources if isinstance(a, dict) and 'title' in a
    ]) if sources else "No evidence found."
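
# --- Async usage sketch (illustrative) ---
# `search_multi_database`, `verify_methodology`, and `verify_citations` are assumed
# to live in other modules of the project; this only shows how the coroutine would
# be driven once those helpers are importable.
#
#   import asyncio
#   score, summary = asyncio.run(validate_hypothesis("A new mechanism underlying topic X"))
#   print(f"Evidence-based score: {score}")
#   print(summary)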