Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import os
import re
import traceback
from datetime import datetime
from difflib import SequenceMatcher
from urllib.parse import quote_plus

import pandas as pd
import requests
from flask import Flask, request, jsonify, render_template_string
from openai import OpenAI
from pymongo import MongoClient
|
| 12 |
+
|
| 13 |
+
# ==========================================
# 1. Configuration and connections (read from environment variables)
# ==========================================
# Set these under Hugging Face Settings -> Variables and secrets.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MONGO_USER = os.environ.get("MONGO_USER")
MONGO_PASS = os.environ.get("MONGO_PASS")
MONGO_CLUSTER = os.environ.get("MONGO_CLUSTER", "cluster0.mh3esar.mongodb.net")

# Initialize the OpenAI client.
client = OpenAI(api_key=OPENAI_API_KEY)

# Initialize MongoDB. The names are pre-bound to None so that later code always
# sees a defined `collection`, even when the setup below fails.
mongo_client = None
db = None
collection = None
try:
    if not MONGO_USER or not MONGO_PASS:
        raise ValueError("MONGO_USER / MONGO_PASS are not set")
    # Credentials are percent-escaped so characters such as '@', ':' or '/'
    # in the username or password cannot corrupt the connection URI.
    uri = (
        f"mongodb+srv://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASS)}"
        f"@{MONGO_CLUSTER}/?retryWrites=true&w=majority"
    )
    mongo_client = MongoClient(uri, serverSelectionTimeoutMS=5000)
    db = mongo_client["proj810_db"]
    # Bound before the ping so a transient startup failure still leaves a
    # usable handle for later requests.
    collection = db["rankings"]
    # MongoClient connects lazily; ping so the success message reflects a
    # real round trip to the server.
    mongo_client.admin.command("ping")
    print("✅ 数据库连接成功")
except Exception as e:
    print(f"❌ 数据库连接失败: {e}")
|
| 34 |
+
|
| 35 |
+
# ==========================================
|
| 36 |
+
# 2. Tool definitions (logic unchanged)
|
| 37 |
+
# ==========================================
|
| 38 |
+
def fetch_metadata(query):
    """Look up a DOI on OpenAlex and return the venue it was published in.

    Args:
        query: Text containing a DOI. A bare DOI, a DOI URL, or labelled text
            such as ``"doi: 10.1000/xyz"`` are all accepted.

    Returns:
        On success, a JSON string with the keys ``journal``, ``publisher`` and
        ``type`` (plus ``note`` when no source name could be found).
        Otherwise a plain status string: ``"Invalid DOI"``,
        ``"OpenAlex: DOI Not Found"``, ``"OpenAlex: Empty Response"``,
        ``"OpenAlex Error: <status>"`` or ``"Metadata Error: <details>"``.
    """
    # Non-string input (e.g. a missing tool argument) is rejected up front
    # instead of raising TypeError on the membership test.
    if not isinstance(query, str) or "10." not in query:
        return "Invalid DOI"

    # Extract the bare DOI from URLs or labelled text; fall back to the
    # trimmed input when the pattern does not match.
    found = re.search(r"10\.\d{4,9}/\S+", query)
    doi = found.group(0) if found else query.strip()

    try:
        response = requests.get(
            f"https://api.openalex.org/works/doi:{doi}", timeout=10
        )
        if response.status_code == 404:
            return "OpenAlex: DOI Not Found"
        if response.status_code != 200:
            return f"OpenAlex Error: {response.status_code}"

        work = response.json()
        if not work:
            return "OpenAlex: Empty Response"

        source = (work.get('primary_location') or {}).get('source') or {}
        name = source.get('display_name')
        publisher = source.get('host_organization_name') or source.get('publisher')

        # The primary location may lack a named source; use the first
        # alternative location that has one.
        if not name:
            for location in work.get('locations') or []:
                alt = (location or {}).get('source') or {}
                if alt.get('display_name'):
                    name = alt.get('display_name')
                    publisher = alt.get('host_organization_name') or alt.get('publisher')
                    break

        work_type = work.get('type')
        if name:
            return json.dumps({"journal": name, "publisher": publisher, "type": work_type})
        return json.dumps({
            "journal": "Unknown Source",
            "publisher": publisher or "Unknown",
            "type": work_type,
            "note": "Source name not found",
        })
    except Exception as e:
        # Tool boundary: network and parsing failures are reported back to the
        # agent as text rather than raised.
        return f"Metadata Error: {str(e)}"
|
| 68 |
+
|
| 69 |
+
# Words too generic to be useful as a fuzzy-search anchor.
_FUZZY_STOPWORDS = frozenset((
    "the", "of", "and", "in", "on", "for",
    "journal", "international", "proceedings",
))


def _coerce_metric(value, default="-"):
    """Return *value* as a float when it parses as a number, else unchanged."""
    if value == default:
        return default
    try:
        if isinstance(value, str):
            value = value.replace(',', '')  # drop thousands separators
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return value


def _fuzzy_title_match(clean):
    """Return the stored document whose Title is most similar to *clean*, or None."""
    words = [
        w for w in re.split(r'[^a-zA-Z]+', clean.lower())
        if len(w) > 3 and w not in _FUZZY_STOPWORDS
    ]
    if not words:
        return None

    # The longest remaining word narrows the candidate set; it contains only
    # ASCII letters, so it is safe to use as a regex.
    longest_word = max(words, key=len)
    candidates = collection.find(
        {"Title": {"$regex": longest_word, "$options": "i"}}
    ).limit(20)

    target = clean.lower()
    best_score, best_match = 0.85, None  # only matches above 0.85 are accepted
    for cand in candidates:
        title = cand.get('Title')
        if not isinstance(title, str):
            # Skip malformed documents instead of aborting the whole lookup.
            continue
        score = SequenceMatcher(None, target, title.lower()).ratio()
        if score > best_score:
            best_score, best_match = score, cand
    return best_match


def check_ranking(journal_name):
    """Look up a journal's ranking metrics in the MongoDB ``rankings`` collection.

    An exact, case-insensitive title match is tried first, then a fuzzy match
    on similar titles.

    Args:
        journal_name: Venue name to look up; surrounding quotes and whitespace
            are ignored.

    Returns:
        A JSON string with ``Title``, ``Quartile``, ``SJR``, ``H_Index``,
        ``Total_Docs``, ``Citations_Per_Doc``, ``Publisher``, ``Global_Rank``
        and ``Categories`` when found. Otherwise ``"Error: Empty Name"``,
        ``"DB: Not Found (Cleaned: <name>)"`` or ``"DB Error: <details>"``.
    """
    try:
        if not journal_name:
            return "Error: Empty Name"
        clean = str(journal_name).replace('"', '').replace("'", "").strip()
        if not clean:
            # Nothing left after stripping quotes/whitespace: skip the DB round trips.
            return "Error: Empty Name"

        safe = re.escape(clean)
        res = collection.find_one({"Title": {"$regex": f"^{safe}$", "$options": "i"}})

        # Fuzzy matching fallback.
        if not res:
            res = _fuzzy_title_match(clean)

        if not res:
            return f"DB: Not Found (Cleaned: {clean})"

        # Column names may carry a suffix, so locate them by substring.
        keys = res.keys()
        docs_col = next((k for k in keys if "Total Docs" in k), "Total Docs")
        cit_col = next((k for k in keys if "Citations / Doc" in k), "Citations / Doc")

        try:
            gp_val = float(str(res.get('Global_Percentile', '0')).replace('%', ''))
        except ValueError:
            gp_val = 0.0
        rank_str = f"Top {100 - gp_val:.1f}%" if gp_val > 0 else "N/A"

        quartile = res.get("SJR Best Quartile", "-")
        if (not quartile or quartile == "-") and rank_str != "N/A":
            # Derive a quartile from the displayed (one-decimal) percentage so
            # the label always agrees with Global_Rank.
            top_percent = float(f"{100 - gp_val:.1f}")
            if top_percent <= 25:
                quartile = "Q1 (Implied)"
            elif top_percent <= 50:
                quartile = "Q2 (Implied)"
            elif top_percent <= 75:
                quartile = "Q3 (Implied)"
            else:
                quartile = "Q4 (Implied)"

        return json.dumps({
            "Title": res.get("Title"),
            "Quartile": quartile,
            "SJR": _coerce_metric(res.get("SJR", "-")),
            "H_Index": _coerce_metric(res.get("H index", "-")),
            "Total_Docs": _coerce_metric(res.get(docs_col, "-")),
            "Citations_Per_Doc": _coerce_metric(res.get(cit_col, "-")),
            "Publisher": res.get("Publisher", "Unknown"),
            "Global_Rank": rank_str,
            "Categories": res.get("Categories", "")
        })
    except Exception as e:
        # Tool boundary: database failures are reported back to the agent as text.
        return f"DB Error: {str(e)}"
|
| 136 |
+
|
| 137 |
+
# Tool declarations passed to the chat-completions API. The parameter
# descriptions tell the model what each argument should contain and that a
# DOI has to be resolved to a venue name before it can be ranked.
tools_schema = [
    {
        "type": "function",
        "function": {
            "name": "fetch_metadata",
            "description": (
                "Get journal name from DOI. Call this first when the input is "
                "a DOI, then pass the returned journal name to check_ranking."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The DOI to resolve, e.g. 10.1000/xyz123.",
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "check_ranking",
            "description": "Check journal metrics in DB.",
            "parameters": {
                "type": "object",
                "properties": {
                    "journal_name": {
                        "type": "string",
                        "description": "Full venue title (acronyms expanded).",
                    },
                },
                "required": ["journal_name"],
            },
        },
    },
]
|
| 141 |
+
|
| 142 |
+
# ==========================================
|
| 143 |
+
# 3. Agent core
|
| 144 |
+
# ==========================================
|
| 145 |
+
def run_agent_with_logs(user_input):
    """Run the tool-calling agent loop for one user query.

    The model is called up to four times; after each call any requested tools
    are executed and their results are appended to the conversation.

    Args:
        user_input: The DOI, journal name or acronym typed by the user.

    Returns:
        A ``(reply, logs)`` tuple: ``reply`` is the model's final message
        content (or a warning / error string) and ``logs`` is a list of
        human-readable progress lines.
    """
    logs = []

    # Reject empty input before spending a model call on it.
    if not isinstance(user_input, str) or not user_input.strip():
        logs.append("⚠️ Empty input: nothing to analyze.")
        return "⚠️ Please enter a DOI or journal name.", logs

    system_prompt = """
You are an Expert Journal Authority Auditor.
YOUR GOAL: Verify the quality of academic venues using the `check_ranking` tool.

### CRITICAL THINKING PROCESS:
1. Analyze the Input (DOI, name, or acronym).
2. Normalize: Convert acronyms (e.g., CVPR) to full titles.
3. Action: Call `check_ranking` with the Cleaned Full Name.

### REPORT FORMAT (Strict Markdown)
If Found:
| Metric | Value | Status |
| :--- | :--- | :--- |
| 📖 Venue | [Title] | - |
| 🏆 Quartile | [Q1-Q4] | [✅/⚠️] |
| 📉 SJR | [Value] | - |

If Not Found:
> ⚠️ Notice: "[Input]" is not ranked.
"""

    # Maps each tool name to a callable that takes the decoded arguments.
    tool_handlers = {
        "fetch_metadata": lambda a: fetch_metadata(a.get("query")),
        "check_ranking": lambda a: check_ranking(a.get("journal_name")),
    }
    max_turns = 4

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input},
    ]
    logs.append(f"🧠 System: Analyzing '{user_input}'...")

    try:
        for turn_count in range(1, max_turns + 1):
            logs.append(f"🤔 Step {turn_count}: Thinking...")

            resp = client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                tools=tools_schema,
                temperature=0
            )
            msg = resp.choices[0].message
            messages.append(msg)

            # No tool requests means the model has produced its final report.
            if not msg.tool_calls:
                logs.append("📝 Report generated.")
                return msg.content, logs

            # Every tool call gets a "tool" message in reply, even on failure.
            for tc in msg.tool_calls:
                fname = tc.function.name
                try:
                    args = json.loads(tc.function.arguments)
                    logs.append(f"🔍 Checking: {args.get('journal_name') or args.get('query')}")
                    handler = tool_handlers.get(fname)
                    res = handler(args) if handler else "Error: Unknown Tool"
                except Exception as e:
                    res = f"Tool Error: {str(e)}"

                logs.append(f"✅ Result: {str(res)[:80]}...")
                messages.append({
                    "tool_call_id": tc.id,
                    "role": "tool",
                    "name": fname,
                    "content": str(res)
                })
        return "⚠️ Timeout: Analysis too complex.", logs
    except Exception as e:
        print(traceback.format_exc())
        return f"**System Error**: {str(e)}", logs
|
| 217 |
+
|
| 218 |
+
# ==========================================
|
| 219 |
+
# 4. Flask Web Server
|
| 220 |
+
# ==========================================
|
| 221 |
+
# Flask application object; the @app.route handlers in this file register on it.
app = Flask(__name__)
|
| 222 |
+
|
| 223 |
+
# Single-page chat UI served at "/" (a condensed version of the page; the
# fuller variant from the Colab notebook can be dropped in here instead).
# Note: this string is passed through render_template_string, so it must not
# contain Jinja delimiters.
CHAT_HTML = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Journal Authority Auditor</title>
<style>
body { font-family: 'Segoe UI', system-ui; background: #2c3e50; display: flex; justify-content: center; height: 100vh; margin: 0; }
.container { display: flex; width: 95%; max-width: 1400px; height: 95vh; gap: 20px; margin-top: 2.5vh; }
.chat-panel { flex: 1.2; background: #f4f7f6; border-radius: 12px; display: flex; flex-direction: column; overflow: hidden; }
.header { background: #34495e; color: white; padding: 15px; font-weight: bold; }
.messages-area { flex: 1; padding: 20px; overflow-y: auto; display: flex; flex-direction: column; gap: 15px; }
.message { max-width: 85%; padding: 14px; border-radius: 12px; line-height: 1.6; }
.bot { background: white; align-self: flex-start; border-left: 4px solid #3498db; }
.user { background: #3498db; color: white; align-self: flex-end; }
.input-form { padding: 20px; background: white; display: flex; gap: 10px; }
input { flex: 1; padding: 10px; border-radius: 8px; border: 1px solid #ccc; }
.brain-panel { flex: 0.8; background: #1e272e; border-radius: 12px; color: #0fb9b1; padding: 15px; font-family: monospace; overflow-y: auto; }
table { width: 100%; border-collapse: collapse; margin: 10px 0; background: white; }
th, td { border: 1px solid #ddd; padding: 8px; }
</style>
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
</head>
<body>
<div class="container">
    <div class="chat-panel">
        <div class="header">🛡️ Journal Auditor (HF Space Edition)</div>
        <div class="messages-area" id="chat-box"><div class="message bot">System Online.</div></div>
        <!-- The form's submit handler is the single entry point; the button only submits. -->
        <form class="input-form" onsubmit="event.preventDefault(); send();">
            <input type="text" id="inp" placeholder="Type DOI or Journal Name..." autocomplete="off">
            <button type="submit">Analyze</button>
        </form>
    </div>
    <div class="brain-panel" id="log-box"><div>TERMINAL LOG...</div></div>
</div>
<script>
    async function send(){
        let i = document.getElementById('inp');
        let txt = i.value.trim();
        if(!txt) return;
        addMsg(txt, 'user'); i.value = '';

        let logsDiv = document.getElementById('log-box');

        try {
            let r = await fetch('/chat', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify({message: txt})
            });
            let d = await r.json();

            if(d.logs) {
                d.logs.forEach(l => {
                    let div = document.createElement('div');
                    div.innerText = l;
                    div.style.marginBottom = "5px";
                    div.style.borderLeft = "2px solid #555";
                    logsDiv.appendChild(div);
                });
                logsDiv.scrollTop = logsDiv.scrollHeight;
            }
            addMsg(d.reply || "Error", 'bot', true);
        } catch(e) { addMsg("Server Error", 'bot'); }
    }
    // Render Markdown from the server without letting it inject markup:
    // raw HTML is neutralised before parsing, and links that are not
    // http(s) lose their href afterwards.
    function renderMarkdown(md){
        let safe = String(md).replace(/&/g, '&amp;').replace(/</g, '&lt;');
        let box = document.createElement('div');
        box.innerHTML = marked.parse(safe);
        box.querySelectorAll('a[href]').forEach(a => {
            if(!/^https?:/i.test(a.getAttribute('href'))) a.removeAttribute('href');
        });
        return box.innerHTML;
    }
    function addMsg(txt, cls, html=false){
        let d = document.createElement('div');
        d.className = 'message ' + cls;
        if(html) d.innerHTML = renderMarkdown(txt); else d.innerText = txt;
        document.getElementById('chat-box').appendChild(d);
    }
</script></body></html>
"""
|
| 297 |
+
|
| 298 |
+
@app.route('/')
def index():
    """Serve the chat page by rendering the CHAT_HTML template string."""
    page = render_template_string(CHAT_HTML)
    return page
|
| 300 |
+
|
| 301 |
+
@app.route('/chat', methods=['POST'])
def chat():
    """Handle a chat request and return the agent's reply with its logs.

    Expects a JSON object body with a ``message`` field. The response is
    always JSON with ``reply`` and ``logs`` keys. A malformed body yields
    HTTP 400 and an unexpected failure yields HTTP 500.
    """
    # silent=True returns None for a missing or invalid JSON body instead of
    # raising, so the client gets a clear message rather than a raw exception.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"reply": "Error: request body must be a JSON object.", "logs": []}), 400

    message = payload.get('message')
    if message is None:
        message = ''
    elif not isinstance(message, str):
        message = str(message)

    try:
        reply, logs = run_agent_with_logs(message)
        return jsonify({"reply": reply, "logs": logs})
    except Exception as e:
        # Keep the full traceback in the server log; the client only gets the summary.
        print(traceback.format_exc())
        return jsonify({"reply": f"Error: {str(e)}", "logs": []}), 500
|
| 309 |
+
|
| 310 |
+
if __name__ == "__main__":
    # Hugging Face requires the server to listen on 0.0.0.0, port 7860.
    server_options = {"host": '0.0.0.0', "port": 7860}
    app.run(**server_options)
|