Spaces:

jscmp4
/

810proj

Sleeping

App Files Files Community

jscmp4 committed on Dec 3, 2025

Commit

2d1764d

verified ·

1 Parent(s): 1abeda3

Create app.py

Browse files

Files changed (1) hide show

app.py +312 -0

app.py ADDED Viewed

	@@ -0,0 +1,312 @@

+import os
+import json
+import re
+import requests
+import traceback
+import pandas as pd
+from datetime import datetime
+from flask import Flask, request, jsonify, render_template_string
+from openai import OpenAI
+from pymongo import MongoClient
+from difflib import SequenceMatcher
+# ==========================================
+# 1. 配置与连接 (使用环境变量)
+# ==========================================
+# 在 Hugging Face 的 Settings -> Variables and secrets 里设置这些
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+MONGO_USER = os.environ.get("MONGO_USER")
+MONGO_PASS = os.environ.get("MONGO_PASS")
+MONGO_CLUSTER = os.environ.get("MONGO_CLUSTER", "cluster0.mh3esar.mongodb.net")
+# 初始化 OpenAI
+client = OpenAI(api_key=OPENAI_API_KEY)
+# 初始化 MongoDB
+try:
+    uri = f"mongodb+srv://{MONGO_USER}:{MONGO_PASS}@{MONGO_CLUSTER}/?retryWrites=true&w=majority"
+    mongo_client = MongoClient(uri, serverSelectionTimeoutMS=5000)
+    db = mongo_client["proj810_db"]
+    collection = db["rankings"]
+    print("✅ 数据库连接成功")
+except Exception as e:
+    print(f"❌ 数据库连接失败: {e}")
+# ==========================================
+# 2. Tool definitions (logic unchanged)
+# ==========================================
+def fetch_metadata(query):
+    """通过 OpenAlex 获取 DOI 元数据"""
+    if "10." not in query: return "Invalid DOI"
+    try:
+        url = f"https://api.openalex.org/works/doi:{query}"
+        r = requests.get(url, timeout=10)
+        if r.status_code == 200:
+            d = r.json()
+            if not d: return "OpenAlex: Empty Response"
+            src = d.get('primary_location') or {}
+            source = src.get('source') or {}
+            pub = source.get('host_organization_name') or source.get('publisher')
+            name = source.get('display_name')
+            if not name and d.get('locations'):
+                for loc in d.get('locations'):
+                    s = (loc.get('source') or {})
+                    if s.get('display_name'):
+                        name = s.get('display_name')
+                        pub = s.get('host_organization_name') or s.get('publisher')
+                        break
+            type_ = d.get('type')
+            if name:
+                return json.dumps({"journal": name, "publisher": pub, "type": type_})
+            return json.dumps({"journal": "Unknown Source", "publisher": pub or "Unknown", "type": type_, "note": "Source name not found"})
+        elif r.status_code == 404:
+            return "OpenAlex: DOI Not Found"
+        else:
+            return f"OpenAlex Error: {r.status_code}"
+    except Exception as e:
+        return f"Metadata Error: {str(e)}"
+def check_ranking(journal_name):
+    """查询数据库中的期刊排名"""
+    try:
+        if not journal_name: return "Error: Empty Name"
+        clean = journal_name.replace('"', '').replace("'", "").strip()
+        safe = re.escape(clean)
+        res = collection.find_one({"Title": {"$regex": f"^{safe}$", "$options": "i"}})
+        # 模糊匹配逻辑
+        if not res:
+            stopwords = ["the", "of", "and", "in", "on", "for", "journal", "international", "proceedings"]
+            words = [w for w in re.split(r'[^a-zA-Z]+', clean.lower()) if len(w) > 3 and w not in stopwords]
+            if len(words) >= 1:
+                longest_word = max(words, key=len)
+                candidates = collection.find({"Title": {"$regex": longest_word, "$options": "i"}}).limit(20)
+                best_score = 0
+                best_match = None
+                for cand in candidates:
+                    score = SequenceMatcher(None, clean.lower(), cand['Title'].lower()).ratio()
+                    if score > 0.85 and score > best_score:
+                        best_score = score
+                        best_match = cand
+                if best_match: res = best_match
+        if res:
+            keys = res.keys()
+            docs_col = next((k for k in keys if "Total Docs" in k), "Total Docs")
+            cit_col = next((k for k in keys if "Citations / Doc" in k), "Citations / Doc")
+            def get_safe_val(key, default="-"):
+                val = res.get(key, default)
+                if val == default: return default
+                try:
+                    if isinstance(val, str): val = val.replace(',', '')
+                    return float(val)
+                except: return val
+            try:
+                gp = str(res.get('Global_Percentile', '0')).replace('%', '')
+                gp_val = float(gp)
+                rank_str = f"Top {100 - gp_val:.1f}%" if gp_val > 0 else "N/A"
+            except: rank_str = "N/A"
+            quartile = res.get("SJR Best Quartile", "-")
+            if (not quartile or quartile == "-") and rank_str != "N/A":
+                try:
+                    top_percent = float(rank_str.replace("Top ", "").replace("%", ""))
+                    if top_percent <= 25: quartile = "Q1 (Implied)"
+                    elif top_percent <= 50: quartile = "Q2 (Implied)"
+                    elif top_percent <= 75: quartile = "Q3 (Implied)"
+                    else: quartile = "Q4 (Implied)"
+                except: pass
+            return json.dumps({
+                "Title": res.get("Title"),
+                "Quartile": quartile,
+                "SJR": get_safe_val("SJR"),
+                "H_Index": get_safe_val("H index"),
+                "Total_Docs": get_safe_val(docs_col),
+                "Citations_Per_Doc": get_safe_val(cit_col),
+                "Publisher": res.get("Publisher", "Unknown"),
+                "Global_Rank": rank_str,
+                "Categories": res.get("Categories", "")
+            })
+        return f"DB: Not Found (Cleaned: {clean})"
+    except Exception as e:
+        return f"DB Error: {str(e)}"
+tools_schema = [
+    {"type": "function", "function": {"name": "fetch_metadata", "description": "Get journal name from DOI.", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}},
+    {"type": "function", "function": {"name": "check_ranking", "description": "Check journal metrics in DB.", "parameters": {"type": "object", "properties": {"journal_name": {"type": "string"}}, "required": ["journal_name"]}}}
+]
+# ==========================================
+# 3. Agent core
+# ==========================================
+def run_agent_with_logs(user_input):
+    logs = []
+    system_prompt = """
+    You are an Expert Journal Authority Auditor.
+    YOUR GOAL: Verify the quality of academic venues using the `check_ranking` tool.
+    ### CRITICAL THINKING PROCESS:
+    1. Analyze the Input (DOI, name, or acronym).
+    2. Normalize: Convert acronyms (e.g., CVPR) to full titles.
+    3. Action: Call `check_ranking` with the Cleaned Full Name.
+    ### REPORT FORMAT (Strict Markdown)
+    If Found:
+    | Metric | Value | Status |
+    | :--- | :--- | :--- |
+    | 📖 Venue | [Title] | - |
+    | 🏆 Quartile | [Q1-Q4] | [✅/⚠️] |
+    | 📉 SJR | [Value] | - |
+    If Not Found:
+    > ⚠️ Notice: "[Input]" is not ranked.
+    """
+    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_input}]
+    logs.append(f"🧠 System: Analyzing '{user_input}'...")
+    turn_count = 0
+    try:
+        while turn_count < 4:
+            turn_count += 1
+            logs.append(f"🤔 Step {turn_count}: Thinking...")
+            resp = client.chat.completions.create(
+                model="gpt-4o",
+                messages=messages,
+                tools=tools_schema,
+                temperature=0
+            )
+            msg = resp.choices[0].message
+            messages.append(msg)
+            if not msg.tool_calls:
+                logs.append("📝 Report generated.")
+                return msg.content, logs
+            for tc in msg.tool_calls:
+                fname = tc.function.name
+                args_str = tc.function.arguments
+                try:
+                    args = json.loads(args_str)
+                    logs.append(f"🔍 Checking: {args.get('journal_name') or args.get('query')}")
+                    if fname == "fetch_metadata":
+                        res = fetch_metadata(args.get("query"))
+                    elif fname == "check_ranking":
+                        res = check_ranking(args.get("journal_name"))
+                    else:
+                        res = "Error: Unknown Tool"
+                except Exception as e:
+                    res = f"Tool Error: {str(e)}"
+                logs.append(f"✅ Result: {str(res)[:80]}...")
+                messages.append({
+                    "tool_call_id": tc.id,
+                    "role": "tool",
+                    "name": fname,
+                    "content": str(res)
+                })
+        return "⚠️ Timeout: Analysis too complex.", logs
+    except Exception as e:
+        print(traceback.format_exc())
+        return f"**System Error**: {str(e)}", logs
+# ==========================================
+# 4. Flask Web Server
+# ==========================================
+# Flask application object; the route handlers are registered on it.
+app = Flask(__name__)
+# Put your HTML here (slightly condensed for brevity; you can use the full version from your Colab notebook directly).
+# Single-page chat UI: a chat panel on the left and a log panel on the right.
+# The page script POSTs {"message": ...} as JSON to /chat and displays the
+# returned "reply" and "logs".
+# NOTE(review): bot replies are inserted with `innerHTML = marked.parse(txt)` and
+# no sanitising step is visible here -- confirm whether model output can carry
+# untrusted markup.
+# NOTE(review): the Analyze button has `onclick="send()"` inside a form whose
+# `onsubmit` also calls `send()`; looks like one click can invoke it twice (the
+# second call would see the already-cleared input) -- confirm.
+CHAT_HTML = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Journal Authority Auditor</title>
+<style>
+    body { font-family: 'Segoe UI', system-ui; background: #2c3e50; display: flex; justify-content: center; height: 100vh; margin: 0; }
+    .container { display: flex; width: 95%; max-width: 1400px; height: 95vh; gap: 20px; margin-top: 2.5vh; }
+    .chat-panel { flex: 1.2; background: #f4f7f6; border-radius: 12px; display: flex; flex-direction: column; overflow: hidden; }
+    .header { background: #34495e; color: white; padding: 15px; font-weight: bold; }
+    .messages-area { flex: 1; padding: 20px; overflow-y: auto; display: flex; flex-direction: column; gap: 15px; }
+    .message { max-width: 85%; padding: 14px; border-radius: 12px; line-height: 1.6; }
+    .bot { background: white; align-self: flex-start; border-left: 4px solid #3498db; }
+    .user { background: #3498db; color: white; align-self: flex-end; }
+    .input-form { padding: 20px; background: white; display: flex; gap: 10px; }
+    input { flex: 1; padding: 10px; border-radius: 8px; border: 1px solid #ccc; }
+    .brain-panel { flex: 0.8; background: #1e272e; border-radius: 12px; color: #0fb9b1; padding: 15px; font-family: monospace; overflow-y: auto; }
+    table { width: 100%; border-collapse: collapse; margin: 10px 0; background: white; }
+    th, td { border: 1px solid #ddd; padding: 8px; }
+</style>
+<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+</head>
+<body>
+<div class="container">
+    <div class="chat-panel">
+        <div class="header">🛡️ Journal Auditor (HF Space Edition)</div>
+        <div class="messages-area" id="chat-box"><div class="message bot">System Online.</div></div>
+        <form class="input-form" onsubmit="event.preventDefault(); send();">
+            <input type="text" id="inp" placeholder="Type DOI or Journal Name..." autocomplete="off">
+            <button onclick="send()">Analyze</button>
+        </form>
+    </div>
+    <div class="brain-panel" id="log-box"><div>TERMINAL LOG...</div></div>
+</div>
+<script>
+async function send(){
+    let i = document.getElementById('inp');
+    let txt = i.value.trim();
+    if(!txt) return;
+    addMsg(txt, 'user'); i.value = '';
+    let logsDiv = document.getElementById('log-box');
+    try {
+        let r = await fetch('/chat', {
+            method: 'POST',
+            headers: {'Content-Type': 'application/json'},
+            body: JSON.stringify({message: txt})
+        });
+        let d = await r.json();
+        if(d.logs) {
+            d.logs.forEach(l => {
+                let div = document.createElement('div');
+                div.innerText = l;
+                div.style.marginBottom = "5px";
+                div.style.borderLeft = "2px solid #555";
+                logsDiv.appendChild(div);
+            });
+            logsDiv.scrollTop = logsDiv.scrollHeight;
+        }
+        addMsg(d.reply || "Error", 'bot', true);
+    } catch(e) { addMsg("Server Error", 'bot'); }
+}
+function addMsg(txt, cls, html=false){
+    let d = document.createElement('div');
+    d.className = 'message ' + cls;
+    if(html) d.innerHTML = marked.parse(txt); else d.innerText = txt;
+    document.getElementById('chat-box').appendChild(d);
+}
+</script></body></html>
+"""
+@app.route('/')
+def index(): return render_template_string(CHAT_HTML)
+@app.route('/chat', methods=['POST'])
+def chat():
+    try:
+        data = request.json
+        reply, logs = run_agent_with_logs(data.get('message', ''))
+        return jsonify({"reply": reply, "logs": logs})
+    except Exception as e:
+        return jsonify({"reply": f"Error: {str(e)}", "logs": []})
+if __name__ == '__main__':
+    # Hugging Face 需要监听 0.0.0.0 和端口 7860
+    app.run(host='0.0.0.0', port=7860)