Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -893,480 +893,8 @@ def quick_score(text):
|
|
| 893 |
hum=analyze_humanizer(text,sents,words,morphs)
|
| 894 |
fs,v,lv=compute_verdict(sc, sent_avg=sent_avg, ppx_score=ppx["score"], hum_score=hum["score"])
|
| 895 |
return fs,v,lv,sc,ppx,hum
|
| 896 |
-
|
| 897 |
-
"""Brave Search API — 단일 쿼리"""
|
| 898 |
-
if not BRAVE_KEY: return []
|
| 899 |
-
url = f"https://api.search.brave.com/res/v1/web/search?q={query}&count={count}"
|
| 900 |
-
try:
|
| 901 |
-
if HAS_HTTPX:
|
| 902 |
-
r = httpx.get(url, headers={"X-Subscription-Token": BRAVE_KEY, "Accept": "application/json"}, timeout=10)
|
| 903 |
-
if r.status_code == 200:
|
| 904 |
-
data = r.json()
|
| 905 |
-
results = []
|
| 906 |
-
for item in data.get("web", {}).get("results", []):
|
| 907 |
-
results.append({"title": item.get("title",""), "url": item.get("url",""), "snippet": item.get("description",""), "source": "Brave"})
|
| 908 |
-
return results
|
| 909 |
-
except: pass
|
| 910 |
-
return []
|
| 911 |
-
def search_kci(query):
    """Search KCI (Korea Citation Index) for articles whose title matches *query*.

    Args:
        query: Free-text title query. It is percent-encoded before being
            placed in the request URL.

    Returns:
        A list of at most three dicts with the keys ``title``, ``url``,
        ``snippet`` (always empty) and ``source`` (``"KCI"``). An empty list
        is returned when the request fails or nothing matches.
    """
    import urllib.parse

    try:
        # Encode the query so spaces, '&', '#' and Hangul cannot truncate or
        # corrupt the query string (search_arxiv already does the same).
        q = urllib.parse.quote(query)
        url = f"https://open.kci.go.kr/po/openapi/openApiSearch.kci?apiCode=articleSearch&title={q}&displayCount=3"
        resp = http_get(url, timeout=8)
        if resp:
            results = []
            # Each hit is an <article-title> CDATA followed later by a <url> CDATA.
            for m in re.finditer(r'<article-title><!\[CDATA\[(.+?)\]\]></article-title>.*?<url><!\[CDATA\[(.+?)\]\]></url>', resp, re.S):
                results.append({"title": m.group(1), "url": m.group(2), "snippet": "", "source": "KCI"})
            return results[:3]
    except Exception:
        # Best-effort lookup: a failing academic source must not abort the
        # whole check, but KeyboardInterrupt/SystemExit are no longer swallowed.
        pass
    return []
|
| 923 |
-
def search_riss(query):
    """Scrape the RISS search page for entries matching *query*.

    Args:
        query: Free-text query. It is percent-encoded before being placed in
            the request URL.

    Returns:
        A list of at most three dicts with the keys ``title``, ``url``,
        ``snippet`` (always empty) and ``source`` (``"RISS"``). Results
        gathered before an error are still returned.
    """
    import urllib.parse

    results = []
    try:
        # Encode the query so spaces, '&' and Hangul survive in the query string.
        q = urllib.parse.quote(query)
        url = f"http://www.riss.kr/search/Search.do?isDetailSearch=N&searchGubun=true&viewYn=OP&queryText=&strQuery={q}&iStartCount=0&iGroupView=5&icate=all"
        resp = http_get(url, timeout=8)
        if resp:
            for m in re.finditer(r'class="title"[^>]*>.*?<a[^>]*href="([^"]+)"[^>]*>(.*?)</a>', resp, re.S):
                # Strip any inline markup (e.g. highlight tags) from the title.
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
                if not title:
                    continue
                # urljoin handles root-relative, relative and already-absolute
                # hrefs; plain concatenation only worked for root-relative ones.
                link = urllib.parse.urljoin("https://www.riss.kr", m.group(1))
                # Drop non-web targets such as "javascript:" pseudo-links.
                if not link.startswith(("http://", "https://")):
                    continue
                results.append({"title": title, "url": link, "snippet": "", "source": "RISS"})
    except Exception:
        # Best-effort lookup: keep whatever was collected so far.
        pass
    return results[:3]
|
| 936 |
-
def search_arxiv(query):
    """Query the arXiv export API and return up to three matching entries.

    Each entry is a dict with ``title`` (whitespace-collapsed), ``url`` (the
    entry id), ``snippet`` (first 150 characters of the whitespace-collapsed
    summary) and ``source`` (``"arXiv"``). Any exception is swallowed and the
    entries gathered so far are returned.
    """
    hits = []
    entry_pattern = r'<entry>.*?<title>(.*?)</title>.*?<id>(.*?)</id>.*?<summary>(.*?)</summary>'
    try:
        import urllib.parse
        encoded = urllib.parse.quote(query)
        feed = http_get(
            f"https://export.arxiv.org/api/query?search_query=all:{encoded}&start=0&max_results=3&sortBy=relevance",
            timeout=12,
        )
        # An empty or missing response simply yields no matches.
        matches = re.finditer(entry_pattern, feed, re.S) if feed else ()
        for match in matches:
            raw_title, entry_id, summary = match.groups()
            hits.append({
                "title": re.sub(r'\s+', ' ', raw_title).strip(),
                "url": entry_id.strip(),
                "snippet": re.sub(r'\s+', ' ', summary).strip()[:150],
                "source": "arXiv",
            })
    except Exception:
        pass
    return hits[:3]
|
| 951 |
-
def gemini_plagiarism_check(text_chunk):
    """Ask Gemini, with Google Search grounding, whether *text_chunk* exists online.

    Args:
        text_chunk: Text to check; only the first 1000 characters are sent.

    Returns:
        ``None`` when the Gemini client or API key is unavailable. Otherwise a
        dict with ``pct`` (similarity percentage parsed from the reply, capped
        at 100, or 0 when absent), ``response`` (the reply text) and
        ``sources`` (grounding web sources as dicts with ``title``, ``url``
        and ``source``). On any error ``pct`` is 0, ``response`` holds the
        first 100 characters of the error message and ``sources`` is empty.
    """
    if not HAS_GENAI or not GEMINI_KEY:
        return None
    try:
        client = genai.Client(api_key=GEMINI_KEY)
        tool = gtypes.Tool(googleSearch=gtypes.GoogleSearch())
        prompt = f"""다음 텍스트가 인터넷에 존재하는지 Google Search로 확인하세요.
유사한 문장이 발견되면 출처 URL과 유사도(%)를 보고하세요.
마지막 줄에 "유사도: XX%" 형식으로 작성.
[텍스트]
{text_chunk[:1000]}"""
        resp = client.models.generate_content(
            model="gemini-flash-lite-latest",
            contents=prompt,
            config=gtypes.GenerateContentConfig(tools=[tool], temperature=0.1, max_output_tokens=600)
        )
        text_resp = resp.text if resp.text else ""

        sources = []
        candidates = getattr(resp, 'candidates', None) or []
        if candidates:
            metadata = getattr(candidates[0], 'grounding_metadata', None)
            # grounding_chunks can be present but None; iterating it directly
            # would raise and throw away an otherwise valid response.
            for chunk in (getattr(metadata, 'grounding_chunks', None) or []):
                web = getattr(chunk, 'web', None)
                if web:
                    sources.append({"title": web.title or "", "url": web.uri or "", "source": "Google"})

        # The prompt asks for per-source similarities first and the overall
        # verdict on the last line, so the LAST match is the one that counts.
        matches = re.findall(r'유사도[:\s]*(\d+)', text_resp)
        pct = min(int(matches[-1]), 100) if matches else 0
        return {"pct": pct, "response": text_resp, "sources": sources}
    except Exception as e:
        return {"pct": 0, "response": str(e)[:100], "sources": []}
|
| 980 |
-
def parallel_brave_search(queries, max_workers=10):
    """Run ``brave_search`` for every query concurrently.

    Args:
        queries: Iterable of hashable query strings. Duplicates are searched
            only once.
        max_workers: Requested thread count. The effective value is clamped
            to the range 1..20 and never exceeds the number of queries.

    Returns:
        A dict mapping each query to the list returned by ``brave_search``
        for it, or to an empty list when that search raised.
    """
    # De-duplicate while keeping order: results are keyed by query, so a
    # repeated query would only cost an extra API call.
    unique_queries = list(dict.fromkeys(queries))
    if not unique_queries:
        return {}

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp from below as well.
    workers = max(1, min(max_workers, 20, len(unique_queries)))

    all_results = {}
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # 3 is passed as the second positional argument of brave_search
        # (presumably the per-query result count; verify against its signature).
        futures = {executor.submit(brave_search, q, 3): q for q in unique_queries}
        for future in as_completed(futures):
            q = futures[future]
            try:
                all_results[q] = future.result()
            except Exception:
                # One failing query must not discard the others.
                all_results[q] = []
    return all_results
|
| 992 |
-
def duckduckgo_search(query, max_results=5):
    """Scrape DuckDuckGo's HTML endpoint; a fallback that needs no API key.

    Args:
        query: Search text; percent-encoded before the request.
        max_results: Maximum number of hits to return. Values <= 0 yield an
            empty list without issuing a request.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``snippet`` and
        ``source`` (``"Web"``). Hits gathered before an error are returned.
    """
    results = []
    if max_results <= 0:
        return results
    try:
        import urllib.parse
        q = urllib.parse.quote(query)
        url = f"https://html.duckduckgo.com/html/?q={q}"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        resp = http_get(url, headers=headers, timeout=10)
        if resp:
            for m in re.finditer(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>(.*?)</a>.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>', resp, re.S):
                href = m.group(1)
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
                snippet = re.sub(r'<[^>]+>', '', m.group(3)).strip()
                real_url = href
                # Result links are redirects that carry the real target,
                # percent-encoded, in the "uddg" query parameter.
                if 'uddg=' in href:
                    um = re.search(r'uddg=([^&]+)', href)
                    if um:
                        real_url = urllib.parse.unquote(um.group(1))
                # Make protocol-relative links usable outside a browser page.
                if real_url.startswith('//'):
                    real_url = 'https:' + real_url
                if title:
                    results.append({"title": title, "url": real_url, "snippet": snippet, "source": "Web"})
                    if len(results) >= max_results:
                        break
    except Exception:
        # Best-effort scrape: keep what was parsed so far, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare except did.
        pass
    return results
|
| 1015 |
-
def self_crawl_search(query, max_results=3):
    """Collect DuckDuckGo hits for *query*, plus an academic-flavoured retry.

    The plain query is searched with *max_results*. Unless the query already
    mentions '논문' or 'paper' (case-insensitive), a second search with the
    suffix " 논문 학술" adds up to two more hits. Results are concatenated in
    that order without de-duplication.
    """
    hits = list(duckduckgo_search(query, max_results))
    already_academic = '논문' in query or 'paper' in query.lower()
    if not already_academic:
        hits += duckduckgo_search(f"{query} 논문 학술", 2)
    return hits
|
| 1022 |
-
def _extract_key_phrases(text, max_phrases=6):
|
| 1023 |
-
"""텍스트에서 핵심 검색 구문 추출 (Brave/학술 검색용)"""
|
| 1024 |
-
sents = split_sentences(text)
|
| 1025 |
-
phrases = []
|
| 1026 |
-
# 긴 문장 우선 (정보량 많은 문장)
|
| 1027 |
-
ranked = sorted(sents, key=lambda s: len(s), reverse=True)
|
| 1028 |
-
for s in ranked:
|
| 1029 |
-
# 15~80자 사이 문장만 검색 쿼리로 적합
|
| 1030 |
-
if 15 <= len(s) <= 80:
|
| 1031 |
-
phrases.append(s)
|
| 1032 |
-
elif len(s) > 80:
|
| 1033 |
-
phrases.append(s[:80])
|
| 1034 |
-
if len(phrases) >= max_phrases:
|
| 1035 |
-
break
|
| 1036 |
-
# 부족하면 앞부분에서 보충
|
| 1037 |
-
if len(phrases) < 2 and sents:
|
| 1038 |
-
phrases.append(sents[0][:80])
|
| 1039 |
-
return phrases
|
| 1040 |
-
|
| 1041 |
-
|
| 1042 |
-
def run_plagiarism(text, progress=gr.Progress()):
    """Run the three-phase plagiarism check and render an HTML report.

    Phases:
        1. Gemini with Google Search grounding (skipped without client/key).
        2. Brave web search, or DuckDuckGo scraping when no Brave key is set.
        3. Academic databases: arXiv always; KCI and RISS only when the text
           contains Hangul.

    Args:
        text: Document text; needs at least 50 characters after stripping.
        progress: Gradio progress callback, invoked as
            ``progress(fraction, message)``.

    Returns:
        A ``(html, log_text)`` tuple: the report markup and a newline-joined
        per-phase log. Too-short input yields a warning snippet and ``""``.
    """
    # Imported under an alias because externally sourced strings (titles,
    # snippets, URLs, model output) must be escaped before entering the report.
    from html import escape as _esc

    if not text or len(text.strip()) < 50:
        return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자 이상</div>", ""

    text = text.strip()
    now = datetime.now().strftime("%Y-%m-%d %H:%M")
    # Short display id only; md5 is not used for anything security-relevant.
    doc_id = hashlib.md5(text[:100].encode()).hexdigest()[:8].upper()
    log_lines = []

    gemini_pct = 0
    gemini_sources = []
    gemini_response = ""
    brave_sources = []
    academic_sources = []

    def _add_sources(found, label, bucket, seen):
        """Append not-yet-seen results to *bucket*, tagged with *label*."""
        for r in found:
            url = r.get("url", "")
            if url and url not in seen:
                bucket.append({
                    "title": r.get("title", "")[:80],
                    "url": url,
                    "source": label,
                    "snippet": r.get("snippet", "")[:120],
                })
                seen.add(url)

    # ═══════════════════════════════════════
    # PHASE 1: Gemini Google Search Grounding
    # ═══════════════════════════════════════
    if HAS_GENAI and GEMINI_KEY:
        progress(0.10, "① Gemini Google Search Grounding...")
        try:
            client = genai.Client(api_key=GEMINI_KEY)

            prompt = f"""당신은 표절 검사 전문가입니다. 아래 텍스트가 인터넷에 이미 존재하는 내용인지 Google Search로 철저히 검색하세요.

[검사 대상 텍스트]
{text[:3000]}

[응답 형식]
1. 발견된 유사 콘텐츠를 각각 "제목 | URL | 유사도(높음/중간/낮음)" 형식으로 나열
2. 발견 못하면 "유사 콘텐츠 없음"
3. 마지막 줄에 반드시 "표절율: XX%" 형식으로 종합 판정"""

            contents = [
                gtypes.Content(
                    role="user",
                    parts=[gtypes.Part.from_text(text=prompt)],
                )
            ]
            tools = [gtypes.Tool(googleSearch=gtypes.GoogleSearch())]
            generate_content_config = gtypes.GenerateContentConfig(
                thinking_config=gtypes.ThinkingConfig(thinking_budget=0),
                tools=tools,
                temperature=0.1,
                max_output_tokens=4000,
            )

            progress(0.20, "① Google Search 실행 중...")

            # Collect the streamed answer.
            parts = []
            for chunk in client.models.generate_content_stream(
                model="gemini-flash-lite-latest",
                contents=contents,
                config=generate_content_config,
            ):
                if chunk.text:
                    parts.append(chunk.text)
            full_response = "".join(parts)
            gemini_response = full_response

            # The verdict is requested on the last line, so use the last match
            # and cap it: the model may print an out-of-range number.
            pct_matches = re.findall(r'표절율[:\s]*(\d+)', full_response)
            if pct_matches:
                gemini_pct = min(int(pct_matches[-1]), 100)

            # Source extraction from grounding metadata (non-streaming re-call).
            progress(0.28, "① 출처 메타데이터 추출...")
            try:
                resp_full = client.models.generate_content(
                    model="gemini-flash-lite-latest",
                    contents=prompt,
                    config=gtypes.GenerateContentConfig(
                        tools=[gtypes.Tool(googleSearch=gtypes.GoogleSearch())],
                        temperature=0.1,
                        max_output_tokens=2000,
                    )
                )
                if hasattr(resp_full, 'candidates') and resp_full.candidates:
                    cand = resp_full.candidates[0]
                    gm = getattr(cand, 'grounding_metadata', None)
                    if gm:
                        # Remember which grounding chunk produced which source:
                        # chunks without a web URI are skipped, so positions in
                        # gemini_sources do not equal chunk indices.
                        source_by_chunk = {}
                        chunks = getattr(gm, 'grounding_chunks', None) or []
                        for chunk_idx, g_chunk in enumerate(chunks):
                            web = getattr(g_chunk, 'web', None)
                            if not web:
                                continue
                            title = getattr(web, 'title', '') or ''
                            uri = getattr(web, 'uri', '') or ''
                            if uri:
                                entry = {"title": title, "url": uri, "source": "Google", "snippet": ""}
                                gemini_sources.append(entry)
                                source_by_chunk[chunk_idx] = entry

                        # Attach each supported text segment to its source(s).
                        supports = getattr(gm, 'grounding_supports', None) or []
                        for sup in supports:
                            seg = getattr(sup, 'segment', None)
                            snippet_text = getattr(seg, 'text', '') if seg else ''
                            if not snippet_text:
                                continue
                            for idx in (getattr(sup, 'grounding_chunk_indices', []) or []):
                                entry = source_by_chunk.get(idx)
                                if entry is not None:
                                    entry["snippet"] = snippet_text[:120]
            except Exception as e2:
                print(f"Gemini 메타데이터 추출 오류: {e2}")

            # Add URLs mentioned in the answer text that grounding did not list.
            existing_urls = {s["url"] for s in gemini_sources}
            for m in re.finditer(r'https?://[^\s\)\]\,\"\']{10,}', full_response):
                url = m.group(0).rstrip('.')
                if url not in existing_urls:
                    url_parts = url.split('/')
                    domain = url_parts[2] if len(url_parts) > 2 else url
                    gemini_sources.append({"title": domain, "url": url, "source": "Google", "snippet": ""})
                    existing_urls.add(url)

            log_lines.append(f"[Gemini] 표절율={gemini_pct}%, 출처={len(gemini_sources)}건")

        except Exception as e:
            log_lines.append(f"[Gemini] 오류: {str(e)[:100]}")
            print(f"Gemini 오류: {str(e)}")
    else:
        log_lines.append("[Gemini] API 키 없음 — 건너뜀")

    # ═══════════════════════════════════════
    # PHASE 2: Brave Search, parallel web search
    # ═══════════════════════════════════════
    progress(0.40, "② Brave Search 웹 검색...")
    key_phrases = _extract_key_phrases(text, max_phrases=6)

    if BRAVE_KEY and key_phrases:
        try:
            brave_results = parallel_brave_search(key_phrases, max_workers=10)
            seen_urls = {s["url"] for s in gemini_sources}
            for results in brave_results.values():
                _add_sources(results, "Brave", brave_sources, seen_urls)
            log_lines.append(f"[Brave] 쿼리={len(key_phrases)}개, 출처={len(brave_sources)}건")
        except Exception as e:
            log_lines.append(f"[Brave] 오류: {str(e)[:80]}")
    elif not BRAVE_KEY:
        # No Brave key: fall back to DuckDuckGo for the first three phrases.
        try:
            seen_urls = {s["url"] for s in gemini_sources}
            for phrase in key_phrases[:3]:
                _add_sources(duckduckgo_search(phrase, max_results=3), "Web", brave_sources, seen_urls)
            log_lines.append(f"[DuckDuckGo] 폴백, 출처={len(brave_sources)}건")
        except Exception as e:
            log_lines.append(f"[DuckDuckGo] 오류: {str(e)[:80]}")

    # ═══════════════════════════════════════
    # PHASE 3: academic databases (KCI · RISS · arXiv)
    # ═══════════════════════════════════════
    progress(0.60, "③ 학술 DB 검색 (KCI·RISS·arXiv)...")

    # Academic query: the opening of the text, flattened to a single line.
    academic_query = text[:100].replace('\n', ' ')
    # Korean databases are only queried when the text contains Hangul.
    has_korean = bool(re.search(r'[가-힣]', text))

    try:
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = {}
            futures[executor.submit(search_arxiv, academic_query[:60])] = "arXiv"
            if has_korean:
                futures[executor.submit(search_kci, academic_query[:40])] = "KCI"
                futures[executor.submit(search_riss, academic_query[:40])] = "RISS"

            seen_urls = {s["url"] for s in gemini_sources + brave_sources}
            for future in as_completed(futures, timeout=15):
                src_name = futures[future]
                try:
                    _add_sources(future.result(), src_name, academic_sources, seen_urls)
                except Exception:
                    # One failing database must not drop the others.
                    pass
        log_lines.append(f"[학술] KCI·RISS·arXiv 출처={len(academic_sources)}건")
    except Exception as e:
        log_lines.append(f"[학술] 오류: {str(e)[:80]}")

    # ═══════════════════════════════════════
    # Overall verdict
    # ═══════════════════════════════════════
    progress(0.80, "보고서 생성...")

    all_sources = gemini_sources + brave_sources + academic_sources

    # Combined rate: 90% of the Gemini figure plus capped bonuses for the
    # number of web (max 7) and academic (max 3) sources found.
    web_boost = min(len(brave_sources) * 1.5, 7)
    acad_boost = min(len(academic_sources) * 2, 3)
    plag_pct = min(round(gemini_pct * 0.9 + web_boost + acad_boost), 100)

    if plag_pct >= 50:
        grade, grade_color = "🚨 표절 의심", "#FF4444"
    elif plag_pct >= 30:
        grade, grade_color = "⚠️ 주의 필요", "#FF8800"
    elif plag_pct >= 15:
        grade, grade_color = "📌 유사표현", "#DDAA00"
    elif plag_pct >= 5:
        grade, grade_color = "✓ 양호", "#4ECDC4"
    else:
        grade, grade_color = "✅ 우수", "#22AA44"

    word_count = len(split_words(text))
    char_count = len(text)

    # ═══════════════════════════════════════
    # Source table HTML
    # ═══════════════════════════════════════
    def _source_badge(src):
        """Return a small coloured label for the given source name."""
        colors = {"Google": "#4285F4", "Brave": "#FB542B", "Web": "#888",
                  "KCI": "#2E7D32", "RISS": "#1565C0", "arXiv": "#B71C1C"}
        c = colors.get(src, "#666")
        return f'<span style="display:inline-block;padding:2px 6px;border-radius:3px;background:{c};color:#fff;font-size:9px;font-weight:700;">{_esc(src)}</span>'

    row_html = []
    for number, s in enumerate(all_sources[:30], start=1):
        raw_url = s['url']
        url_parts = raw_url.split('/')
        if s['title']:
            title_display = s['title'][:55]
        elif len(url_parts) > 2:
            title_display = url_parts[2]
        else:
            title_display = raw_url[:40]

        # Only web links become clickable; anything else (e.g. "javascript:")
        # is neutralised. All interpolated text is HTML-escaped.
        if raw_url.lower().startswith(("http://", "https://", "//")):
            href = _esc(raw_url, quote=True)
        else:
            href = "#"
        title_safe = _esc(title_display)
        url_safe = _esc(raw_url[:65])
        badge = _source_badge(s.get('source', ''))
        snippet_html = f'<div style="font-size:9px;color:#888;margin-top:2px;">{_esc(s["snippet"][:100])}</div>' if s.get("snippet") else ""

        row_html.append(f"""<tr style="border-bottom:1px solid #E8E8E8;">
<td style="padding:8px;text-align:center;font-size:11px;color:#666;">{number}</td>
<td style="padding:8px;">{badge}</td>
<td style="padding:8px;"><a href="{href}" target="_blank" rel="noopener noreferrer" style="color:#2E86C1;text-decoration:none;font-weight:600;font-size:11px;">{title_safe}</a>{snippet_html}</td>
<td style="padding:8px;font-size:9px;color:#999;word-break:break-all;max-width:200px;"><a href="{href}" target="_blank" rel="noopener noreferrer" style="color:#999;text-decoration:none;">{url_safe}</a></td>
</tr>""")
    src_rows = "".join(row_html)

    if not src_rows:
        src_rows = '<tr><td colspan="4" style="padding:20px;text-align:center;color:#999;">발견된 출처 없음</td></tr>'

    # Collapsible Gemini analysis; the model output is escaped before display.
    gemini_summary = ""
    if gemini_response:
        safe_resp = _esc(gemini_response, quote=False).replace('\n', '<br>')
        gemini_summary = f"""
<div style="padding:16px 24px;border-bottom:1px solid #E0E0E0;">
<details>
<summary style="cursor:pointer;font-size:13px;font-weight:700;color:#1A3C6E;">🤖 Gemini 분석 상세</summary>
<div style="margin-top:10px;padding:12px;background:#F8F9FA;border-radius:6px;font-size:11px;line-height:1.7;color:#333;max-height:300px;overflow-y:auto;">{safe_resp}</div>
</details>
</div>"""

    HDR = '#3B7DD8'
    report_html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:900px;margin:20px auto;background:#fff;border:1px solid #E0E0E0;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,0.06);">
<div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:24px;color:#fff;border-radius:8px 8px 0 0;">
<div style="display:flex;justify-content:space-between;align-items:center;">
<div>
<div style="font-size:24px;font-weight:900;">표절 검사 결과</div>
<div style="font-size:12px;opacity:0.9;margin-top:4px;">Gemini Google Search + Brave + KCI·RISS·arXiv</div>
</div>
<div style="text-align:right;font-size:11px;opacity:0.9;">
<div>문서: {doc_id}</div>
<div>{now}</div>
</div>
</div>
</div>
<div style="padding:24px;background:#FAFBFE;border-bottom:1px solid #E0E0E0;">
<div style="display:grid;grid-template-columns:1fr 1fr 1fr 1fr;gap:12px;">
<div style="text-align:center;padding:16px;background:#fff;border-radius:6px;border:1px solid #E0E0E0;">
<div style="font-size:42px;font-weight:900;color:{grade_color};">{plag_pct}%</div>
<div style="font-size:11px;color:#666;margin-top:6px;">종합 표절율</div>
</div>
<div style="text-align:center;padding:16px;background:#fff;border-radius:6px;border:1px solid #E0E0E0;">
<div style="font-size:22px;font-weight:900;color:{grade_color};margin-top:6px;">{grade}</div>
<div style="font-size:11px;color:#666;margin-top:6px;">판정</div>
</div>
<div style="text-align:center;padding:16px;background:#fff;border-radius:6px;border:1px solid #E0E0E0;">
<div style="font-size:28px;font-weight:900;color:#555;">{len(all_sources)}</div>
<div style="font-size:11px;color:#666;margin-top:6px;">발견 출처</div>
</div>
<div style="text-align:center;padding:16px;background:#fff;border-radius:6px;border:1px solid #E0E0E0;">
<div style="font-size:14px;font-weight:700;color:#4285F4;margin-top:4px;">{len(gemini_sources)}</div>
<div style="font-size:14px;font-weight:700;color:#FB542B;">{len(brave_sources)}</div>
<div style="font-size:14px;font-weight:700;color:#2E7D32;">{len(academic_sources)}</div>
<div style="font-size:9px;color:#666;margin-top:2px;">Google·Brave·학술</div>
</div>
</div>
</div>
<div style="padding:16px 24px;border-bottom:1px solid #E0E0E0;">
<div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:8px;">📋 검사 정보</div>
<div style="display:flex;gap:24px;font-size:12px;">
<span><span style="color:#888;">글자수</span> <b>{char_count:,}자</b></span>
<span><span style="color:#888;">단어수</span> <b>{word_count:,}단어</b></span>
<span><span style="color:#888;">검색엔진</span> <b>Google + Brave + KCI·RISS·arXiv</b></span>
</div>
</div>{gemini_summary}
<div style="padding:24px;">
<div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:12px;">🔍 발견된 출처 ({len(all_sources)}건)</div>
<table style="width:100%;border-collapse:collapse;font-size:11px;">
<thead>
<tr style="background:{HDR};color:white;">
<th style="padding:10px;text-align:center;width:35px;">#</th>
<th style="padding:10px;text-align:center;width:55px;">소스</th>
<th style="padding:10px;text-align:left;">출처</th>
<th style="padding:10px;text-align:left;width:200px;">URL</th>
</tr>
</thead>
<tbody>{src_rows}</tbody>
</table>
</div>
</div>"""

    progress(0.95, "완료!")
    log_text = "\n".join(log_lines)

    return report_html, log_text
|
| 1370 |
def run_detection(text, progress=gr.Progress()):
|
| 1371 |
if not text or len(text.strip())<50: return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자</div>",""
|
| 1372 |
text=text.strip()
|
|
|
|
| 893 |
hum=analyze_humanizer(text,sents,words,morphs)
|
| 894 |
fs,v,lv=compute_verdict(sc, sent_avg=sent_avg, ppx_score=ppx["score"], hum_score=hum["score"])
|
| 895 |
return fs,v,lv,sc,ppx,hum
|
| 896 |
+
from plagiarism_check import run_plagiarism
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 897 |
|
|
|
|
| 898 |
def run_detection(text, progress=gr.Progress()):
|
| 899 |
if not text or len(text.strip())<50: return "<div style='padding:20px;text-align:center;color:#888;'>⚠️ 최소 50자</div>",""
|
| 900 |
text=text.strip()
|