Transformers
Italian
English
semantic-search
explainable-ai
faiss
ai-ethics
responsible-ai
llm
prompt-engineering
multimodal-ai
ai-transparency
ethical-intelligence
explainable-llm
cognitive-ai
ethical-ai
scientific-retrieval
modular-ai
memory-augmented-llm
trustworthy-ai
reasoning-engine
ai-alignment
next-gen-llm
thinking-machines
open-source-ai
explainability
ai-research
semantic audit
cognitive agent
human-centered-ai
Create ranking_originality.py
src/evaluation/ranking_originality.py
ADDED
@@ -0,0 +1,109 @@
# © 2025 Elena Marziali — Code released under Apache 2.0 license.
# See LICENSE in the repository for details.
# Removal of this copyright notice is prohibited.

import logging

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Sample data for ranking
data = np.array([
    [120, 45, 1, 2023],  # citations, h-index, peer-reviewed flag, publication year
    [50, 30, 1, 2020],
    [10, 15, 0, 2018]
])

labels = [95, 70, 30]  # Academic impact score

# Model training
ranking_model = RandomForestRegressor(n_estimators=100)
ranking_model.fit(data, labels)

# Ranking prediction
def calculate_impact_score(citations, h_index, peer_review, publication_year):
    paper_data = np.array([[citations, h_index, peer_review, publication_year]])
    score = ranking_model.predict(paper_data)
    return max(0, score[0])  # Ensure non-negative

# Usage example
impact_score = calculate_impact_score(80, 40, 1, 2024)
print(f"Estimated score: {impact_score}")

# === Scientific originality evaluation ===
def evaluate_hypothesis_novelty(hypothesis, existing_articles, threshold=0.7):
    """
    Compares the hypothesis with existing articles using semantic embeddings.

    Returns:
    - average similarity score
    - similar articles
    - qualitative assessment of originality
    """
    try:
        # `model_embedding` is expected to be defined elsewhere in the project
        # (a sentence-embedding model exposing an `encode` method).
        articles_with_abstracts = [a for a in existing_articles if "abstract" in a]
        emb_hypothesis = model_embedding.encode([hypothesis])
        emb_articles = model_embedding.encode([a["abstract"] for a in articles_with_abstracts])

        # Cosine similarity between the hypothesis and each abstract
        similarity = np.dot(emb_hypothesis, emb_articles.T) / (
            np.linalg.norm(emb_hypothesis) * np.linalg.norm(emb_articles, axis=1)
        )
        average = round(float(np.mean(similarity)), 3)

        # Index into the filtered list so titles stay aligned with the embeddings
        similar_articles = [
            articles_with_abstracts[i]["title"]
            for i, score in enumerate(similarity[0]) if score > threshold
        ]

        if average < 0.4:
            assessment = "High originality: hypothesis is rarely present in the literature."
        elif average < 0.7:
            assessment = "Moderate originality: related concepts exist."
        else:
            assessment = "Low originality: hypothesis is already widely discussed."

        return {
            "novelty_score": average,
            "similar_articles": similar_articles,
            "assessment": assessment
        }

    except Exception as e:
        logging.error(f"[evaluate_novelty] Error during originality evaluation: {e}")
        return {
            "novelty_score": 0.0,
            "similar_articles": [],
            "assessment": "Error during originality evaluation."
        }
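
# --- Usage sketch (illustrative, not part of the module) ---
# Assumes `model_embedding` is loaded elsewhere, for example with the
# sentence-transformers library; the package and model name below are
# assumptions, not something this file defines.
#
#   from sentence_transformers import SentenceTransformer
#   model_embedding = SentenceTransformer("all-MiniLM-L6-v2")
#
#   articles = [
#       {"title": "Prior study on topic X", "abstract": "We investigate topic X in depth."},
#       {"title": "Survey of related methods", "abstract": "A review of approaches to topic X."},
#   ]
#   result = evaluate_hypothesis_novelty("A new mechanism underlying topic X", articles)
#   print(result["novelty_score"], result["assessment"])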

# Automated paper review with AI
async def review_paper(paper_text):
    """Checks the methodology and citation quality of a paper."""
    # `verify_methodology` and `verify_citations` are async helpers assumed to be
    # provided by other modules of the project.
    methodology = await verify_methodology(paper_text)
    citations = await verify_citations(paper_text)
    return {"methodology": methodology, "citations": citations}

async def validate_hypothesis(hypothesis):
    # `search_multi_database` is an async helper assumed to be provided elsewhere.
    sources = await search_multi_database(hypothesis)
    # calculate_impact_score expects per-paper metadata (citations, h-index,
    # peer-review flag, year), so score each retrieved source and average.
    # The field names and defaults below mirror the training features and are assumptions.
    per_source_scores = [
        calculate_impact_score(
            s.get("citations", 0), s.get("h_index", 0),
            s.get("peer_review", 0), s.get("year", 0),
        )
        for s in sources if isinstance(s, dict)
    ]
    score = sum(per_source_scores) / len(per_source_scores) if per_source_scores else 0.0
    summary = summarize_evidence(sources)
    return score, summary

def summarize_evidence(sources):
    return "\n".join([
        f"- {a['title'][:80]}…" for a in sources if isinstance(a, dict) and 'title' in a
    ]) if sources else "No evidence found."
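
# --- Async usage sketch (illustrative) ---
# `search_multi_database`, `verify_methodology`, and `verify_citations` are assumed
# to live in other modules of the project; this only shows how the coroutine would
# be driven once those helpers are importable.
#
#   import asyncio
#   score, summary = asyncio.run(validate_hypothesis("A new mechanism underlying topic X"))
#   print(f"Evidence-based score: {score}")
#   print(summary)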