nneans committed on
Commit
f438fbf
·
verified ·
1 Parent(s): 650a480

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +191 -176
app.py CHANGED
@@ -1,298 +1,313 @@
1
  # =========================================================
2
- # KB ๊ธˆ์œต RAG ์ฑ—๋ด‡ (Local Self-Contained Version)
3
- # =========================================================
4
- # ์ด ์ฝ”๋“œ๋Š” ์„œ๋ฒ„๋‚˜ ํด๋ผ์šฐ๋“œ DB ์—†์ด, ์‚ฌ์šฉ์ž๊ฐ€ ์ง์ ‘ PDF๋ฅผ ์—…๋กœ๋“œํ•˜์—ฌ
5
- # ๋กœ์ปฌ์—์„œ ์ง€์‹ ๋ฒ ์ด์Šค๋ฅผ ๊ตฌ์ถ•ํ•˜๊ณ  ์งˆ๋ฌธํ•  ์ˆ˜ ์žˆ๋Š” ๊ตฌ์กฐ์ž…๋‹ˆ๋‹ค.
6
- # Groq(LLM), Google(Voice/Translate) API๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋ฌด๋ฃŒ๋กœ ๋™์ž‘ํ•ฉ๋‹ˆ๋‹ค.
7
  # =========================================================
8
 
9
  import os
10
  import sys
11
  import numpy as np
12
  import traceback
13
- import fitz # PyMuPDF (PDF ์ฒ˜๋ฆฌ)
14
  from typing import List
15
 
16
  # --- ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์ž„ํฌํŠธ ---
17
  import gradio as gr
18
  import speech_recognition as sr
 
 
 
 
 
19
  from deep_translator import GoogleTranslator
20
  from sentence_transformers import SentenceTransformer
21
  from groq import Groq
22
  from qdrant_client import QdrantClient
23
  from qdrant_client.models import Distance, VectorParams, PointStruct
 
24
  try:
25
  from langchain.text_splitter import RecursiveCharacterTextSplitter
26
  except ImportError:
27
- # langchain 0.2.0 ์ด์ƒ์—์„œ ๊ตฌ์กฐ๊ฐ€ ๋ณ€๊ฒฝ๋œ ๊ฒฝ์šฐ
28
  from langchain_text_splitters import RecursiveCharacterTextSplitter
29
 
30
  # =========================================================
31
  # 1. ์„ค์ • ๋ฐ ์ดˆ๊ธฐํ™”
32
  # =========================================================
33
 
34
- # Groq API ํ‚ค (ํ•„์ˆ˜)
35
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_groq_api_key_here")
36
- if not GROQ_API_KEY or GROQ_API_KEY == "your_groq_api_key_here":
37
- print("โš ๏ธ GROQ_API_KEY๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. RAG ๊ธฐ๋Šฅ ์‚ฌ์šฉ ์‹œ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
38
-
39
- # ๋ชจ๋ธ ์„ค์ •
40
  EMBEDDING_MODEL_NAME = "jhgan/ko-sroberta-multitask"
41
  GROQ_MODEL_NAME = "llama-3.3-70b-versatile"
42
  COLLECTION_NAME = "local_kb"
43
 
44
- print("๐Ÿ› ๏ธ ๋ชจ๋ธ ๋ฐ ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™” ์ค‘...")
45
 
46
- # 1. ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ (๋กœ์ปฌ ์‹คํ–‰)
47
  embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
48
  embedding_model.max_seq_length = 512
49
 
50
- # 2. Qdrant ํด๋ผ์ด์–ธํŠธ (๋กœ์ปฌ ๋ฉ”๋ชจ๋ฆฌ DB - ํ”„๋กœ๊ทธ๋žจ ์ข…๋ฃŒ ์‹œ ๋ฐ์ดํ„ฐ ์‚ญ์ œ๋จ)
51
- # ์˜๊ตฌ ์ €์žฅ์„ ์›ํ•˜๋ฉด path="./local_qdrant_db" ๋กœ ๋ณ€๊ฒฝํ•˜์„ธ์š”.
52
- # ์—ฌ๊ธฐ์„œ๋Š” ํฌํŠธํด๋ฆฌ์˜ค์šฉ ๋ฐ๋ชจ๋ฅผ ์œ„ํ•ด ๋งค๋ฒˆ ๊นจ๋—ํ•œ ์ƒํƒœ์ธ ':memory:'๋ฅผ ๊ธฐ๋ณธ์œผ๋กœ ํ•ฉ๋‹ˆ๋‹ค.
53
  qdrant_client = QdrantClient(":memory:")
54
-
55
- # ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ (์ด๋ฏธ ์กด์žฌํ•˜๋ฉด ์‚ญ์ œ ํ›„ ์žฌ์ƒ์„ฑ)
56
  try:
57
  qdrant_client.recreate_collection(
58
  collection_name=COLLECTION_NAME,
59
  vectors_config=VectorParams(size=768, distance=Distance.COSINE),
60
  )
61
- print(f"โœ… ๋กœ์ปฌ Qdrant ์ปฌ๋ ‰์…˜ '{COLLECTION_NAME}' ์ƒ์„ฑ ์™„๋ฃŒ.")
62
  except Exception as e:
63
- print(f"โŒ Qdrant ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ ์‹คํŒจ: {e}")
64
 
65
- # 3. Groq ํด๋ผ์ด์–ธํŠธ
66
- try:
67
- groq_client = Groq(api_key=GROQ_API_KEY)
68
- except Exception as e:
69
- groq_client = None
70
- print(f"โŒ Groq ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™” ์‹คํŒจ: {e}")
 
 
 
71
 
72
- #์ „์—ญ ๋ณ€์ˆ˜: ๋ฌธ์„œ ID ์นด์šดํ„ฐ
73
  doc_id_counter = 0
74
 
75
- print("โœ… ๋ชจ๋“  ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
 
 
 
 
 
77
 
78
  # =========================================================
79
- # 2. ๋ฌธ์„œ ์ฒ˜๋ฆฌ ๋ฐ RAG ํ•ต์‹ฌ ๋กœ์ง
80
  # =========================================================
81
 
82
  def process_uploaded_files(files):
83
- """PDF ํŒŒ์ผ์„ ์ฝ์–ด ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜๊ณ  ๋ฒกํ„ฐ DB์— ์ €์žฅ"""
84
  global doc_id_counter
85
-
86
- if not files:
87
- return "ํŒŒ์ผ์ด ์—…๋กœ๋“œ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค."
88
 
89
  total_chunks = 0
90
  status_msg = ""
91
-
92
- # ํ…์ŠคํŠธ ๋ถ„๋ฆฌ๊ธฐ ์„ค์ •
93
- text_splitter = RecursiveCharacterTextSplitter(
94
- chunk_size=500,
95
- chunk_overlap=50,
96
- length_function=len,
97
- )
98
 
99
  for file in files:
100
  try:
101
- # Gradio ๋ฒ„์ „/์„ค์ •์— ๋”ฐ๋ผ file์ด ๋ฌธ์ž์—ด(๊ฒฝ๋กœ)์ผ ์ˆ˜๋„ ์žˆ๊ณ  ๊ฐ์ฒด์ผ ์ˆ˜๋„ ์žˆ์Œ
102
  file_path = file.name if hasattr(file, 'name') else file
103
-
104
- # 1. PDF ํ…์ŠคํŠธ ์ถ”์ถœ
105
  doc = fitz.open(file_path)
106
  file_text = ""
107
- for page in doc:
108
- file_text += page.get_text()
109
 
110
  if not file_text.strip():
111
- status_msg += f"โš ๏ธ {os.path.basename(file_path)}: ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ (์ด๋ฏธ์ง€ PDF์ผ ์ˆ˜ ์žˆ์Œ)\n"
112
  continue
113
 
114
- # 2. ํ…์ŠคํŠธ ๋ถ„ํ•  (Chunking)
115
  chunks = text_splitter.split_text(file_text)
116
-
117
- # 3. ์ž„๋ฒ ๋”ฉ ๋ฐ ์ €์žฅ
118
  points = []
119
  for i, chunk in enumerate(chunks):
120
  vector = embedding_model.encode(chunk).tolist()
121
-
122
- payload = {
123
- "filename": os.path.basename(file_path),
124
- "text": chunk,
125
- "chunk_id": i
126
- }
127
-
128
  points.append(PointStruct(id=doc_id_counter, vector=vector, payload=payload))
129
  doc_id_counter += 1
130
 
131
- # Qdrant์— ์ €์žฅ
132
  if points:
133
- qdrant_client.upsert(
134
- collection_name=COLLECTION_NAME,
135
- points=points
136
- )
137
  total_chunks += len(points)
138
- status_msg += f"โœ… {os.path.basename(file_path)}: {len(points)}๊ฐœ ์ง€์‹ ์ €์žฅ ์™„๋ฃŒ.\n"
139
 
140
  except Exception as e:
141
- traceback.print_exc()
142
- file_name_debug = getattr(file, 'name', str(file))
143
- status_msg += f"โŒ {os.path.basename(file_name_debug)} ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜: {str(e)}\n"
144
 
145
- print(f"DEBUG: ์ด ์ €์žฅ๋œ ์ฒญํฌ ์ˆ˜: {total_chunks}")
146
- if total_chunks == 0:
147
- return status_msg + "\n(์ €์žฅ๋œ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. PDF๊ฐ€ ๋น„์–ด์žˆ๊ฑฐ๋‚˜ ์ด๋ฏธ์ง€์ผ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.)"
148
-
149
- return f"์ฒ˜๋ฆฌ ์™„๋ฃŒ! ์ด {total_chunks}๊ฐœ์˜ ์ง€์‹ ์กฐ๊ฐ์ด ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\n\n{status_msg}"
150
 
151
  def search_knowledge_base(query, top_k=5):
152
- """๋กœ์ปฌ Qdrant์—์„œ ๊ด€๋ จ ๋ฌธ์„œ ๊ฒ€์ƒ‰"""
153
  try:
154
  query_vector = embedding_model.encode(query).tolist()
155
- # qdrant-client ๋ฒ„์ „์— ๋”ฐ๋ผ .search()๊ฐ€ ์—†๊ฑฐ๋‚˜ ๋‹ค๋ฅด๊ฒŒ ๋™์ž‘ํ•  ์ˆ˜ ์žˆ์–ด .query_points() ์‚ฌ์šฉ
156
- search_result = qdrant_client.query_points(
157
- collection_name=COLLECTION_NAME,
158
- query=query_vector,
159
- limit=top_k,
160
- with_payload=True
161
  )
162
- return search_result.points
163
- except Exception as e:
164
- print(f"๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {e}")
165
  return []
166
 
167
  def generate_answer_groq(query, context_text):
168
- """Groq API๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋‹ต๋ณ€ ์ƒ์„ฑ"""
169
- if not groq_client:
170
- return "Groq API ์„ค์ • ์˜ค๋ฅ˜"
171
-
172
  system_prompt = """
173
- ๋‹น์‹ ์€ ์นœ์ ˆํ•˜๊ณ  ์ „๋ฌธ์ ์ธ ๊ธˆ์œต AI ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค.
174
- ๋ฐ˜๋“œ์‹œ ์•„๋ž˜ ์ œ๊ณต๋œ [์ฐธ๊ณ ์ž๋ฃŒ]๋งŒ์„ ๋ฐ”ํƒ•์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜์„ธ์š”.
175
- ์ฐธ๊ณ ์ž๋ฃŒ์— ๋‚ด์šฉ์ด ์—†๋‹ค๋ฉด ์†”์งํ•˜๊ฒŒ ๋ชจ๋ฅธ๋‹ค๊ณ  ๋Œ€๋‹ตํ•˜์„ธ์š”.
176
- ์ถœ์ฒ˜(ํŒŒ์ผ์ด๋ฆ„)๋ฅผ ๋‹ต๋ณ€ ๋์— ๋ช…์‹œํ•ด์ฃผ์„ธ์š”.
177
  """
178
-
179
- user_prompt = f"์งˆ๋ฌธ: {query}\n\n[์ฐธ๊ณ ์ž๋ฃŒ]\n{context_text}"
180
-
181
  try:
182
  response = groq_client.chat.completions.create(
183
- messages=[
184
- {"role": "system", "content": system_prompt},
185
- {"role": "user", "content": user_prompt},
186
- ],
187
- model=GROQ_MODEL_NAME,
188
- temperature=0.1,
189
  )
190
  return response.choices[0].message.content
191
  except Exception as e:
192
- return f"Groq ์ƒ์„ฑ ์˜ค๋ฅ˜: {e}"
193
 
194
- # RAG ํŒŒ์ดํ”„๋ผ์ธ (ํ†ตํ•ฉ)
195
- def run_rag_pipeline(text_input, detected_lang='ko'):
196
- if not text_input:
197
- return "", "", "", ""
198
-
199
- # 1. ์งˆ๋ฌธ ๋ฒˆ์—ญ (ํ•„์š”์‹œ)
200
- korean_query = text_input
201
- if detected_lang != 'ko':
202
- try:
203
- korean_query = GoogleTranslator(source='auto', target='ko').translate(text_input)
204
- except: pass
205
-
206
- # 2. ๋ฌธ์„œ ๊ฒ€์ƒ‰
207
- hits = search_knowledge_base(korean_query)
208
 
209
- if not hits:
210
- return korean_query, "์ €์žฅ๋œ ์ง€์‹์ด ๋ถ€์กฑํ•˜์—ฌ ๋‹ต๋ณ€ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. PDF๋ฅผ ๋จผ์ € ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.", "", "์ฐธ๊ณ  ๋ฌธ์„œ ์—†์Œ"
211
-
212
- # 3. ์ปจํ…์ŠคํŠธ ๊ตฌ์„ฑ
213
- context_text = ""
214
- references = []
215
- for hit in hits:
216
- context_text += f"{hit.payload['text']}\n\n"
217
- references.append(f"- {hit.payload['filename']} (์œ ์‚ฌ๋„: {hit.score:.2f})")
218
 
219
- ref_str = "\n".join(references)
 
 
 
220
 
221
- # 4. ๋‹ต๋ณ€ ์ƒ์„ฑ
222
- korean_answer = generate_answer_groq(korean_query, context_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
- # 5. ๋‹ต๋ณ€ ๋ฒˆ์—ญ (ํ•„์š”์‹œ)
225
- final_answer = korean_answer
226
- if detected_lang != 'ko':
227
- try:
228
- final_answer = GoogleTranslator(source='ko', target=detected_lang).translate(korean_answer)
229
- except: pass
230
-
231
- return korean_query, korean_answer, final_answer, ref_str
232
-
233
-
234
- # =========================================================
235
- # 3. ์Œ์„ฑ ๋ฐ UI ํ—ฌํผ ํ•จ์ˆ˜
236
- # =========================================================
237
 
238
- def voice_to_text(audio_input):
239
- """์Œ์„ฑ ์ธ์‹ (Google API)"""
240
- if audio_input is None: return "์Œ์„ฑ ์ž…๋ ฅ ์—†์Œ", None
 
241
 
242
  try:
243
- sample_rate, audio_numpy = audio_input
244
  if audio_numpy.dtype == np.float32:
245
  audio_numpy = (audio_numpy * 32767).astype(np.int16)
246
  if len(audio_numpy.shape) > 1:
247
  audio_numpy = audio_numpy.mean(axis=1).astype(np.int16)
248
-
249
  audio_data = sr.AudioData(audio_numpy.tobytes(), sample_rate, 2)
250
  r = sr.Recognizer()
251
- text = r.recognize_google(audio_data, language='ko-KR')
252
- return text, 'ko'
 
 
 
 
 
253
  except sr.UnknownValueError:
254
- return "์ธ์‹ ์‹คํŒจ (๋‹ค์‹œ ๋งํ•ด์ฃผ์„ธ์š”)", None
255
- except Exception as e:
256
- return f"์˜ค๋ฅ˜: {e}", None
257
 
258
  # =========================================================
259
- # 4. Gradio UI ๊ตฌ์„ฑ
260
  # =========================================================
261
 
262
- with gr.Blocks(theme=gr.themes.Soft(), title="KB AI Challenge") as demo:
263
- gr.Markdown("# KB AI Challenge")
264
- gr.Markdown("์„œ๋ฒ„ ์—†์ด ๋กœ์ปฌ์—์„œ ๋™์ž‘ํ•˜๋Š” **๊ฐœ์ธ์šฉ RAG ์‹œ์Šคํ…œ**์ž…๋‹ˆ๋‹ค. PDF๋ฅผ ์—…๋กœ๋“œํ•˜๋ฉด ์ฆ‰์‹œ ํ•™์Šตํ•˜์—ฌ ๋‹ต๋ณ€ํ•ฉ๋‹ˆ๋‹ค.")
265
-
266
- with gr.Accordion("๐Ÿ“‚ 1. ์ง€์‹ ๋ฒ ์ด์Šค ๊ตฌ์ถ• (ํŒŒ์ผ ์—…๋กœ๋“œ)", open=True):
267
- with gr.Row():
268
- file_input = gr.File(label="PDF ์—…๋กœ๋“œ (์—ฌ๋Ÿฌ ๊ฐœ ๊ฐ€๋Šฅ)", file_count="multiple", file_types=[".pdf"])
269
- upload_btn = gr.Button("์ €์žฅํ•˜๊ธฐ", variant="primary")
270
- upload_status = gr.Textbox(label="์ฒ˜๋ฆฌ ์ƒํƒœ", interactive=False)
271
-
272
- gr.Markdown("---")
273
- gr.Markdown("### ๐ŸŽค 2. AI์™€ ๋Œ€ํ™”ํ•˜๊ธฐ")
274
 
 
 
 
 
 
 
 
275
  with gr.Row():
276
- with gr.Column(scale=1):
277
- audio_in = gr.Audio(sources=["microphone", "upload"], type="numpy", label="์Œ์„ฑ ์งˆ๋ฌธ")
278
- asr_btn = gr.Button("์Œ์„ฑ ์ธ์‹ ์‹œ์ž‘", variant="secondary")
279
- text_in = gr.Textbox(label="์ธ์‹๋œ ํ…์ŠคํŠธ (์ง์ ‘ ์ž…๋ ฅ ๊ฐ€๋Šฅ)", lines=3)
280
- chat_btn = gr.Button("์งˆ๋ฌธํ•˜๊ธฐ", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
- with gr.Column(scale=2):
283
- answer_box = gr.Textbox(label="AI ๋‹ต๋ณ€ (ํ•œ๊ตญ์–ด)", lines=6, interactive=False)
284
- ref_box = gr.Textbox(label="์ฐธ๊ณ  ๋ฌธํ—Œ", lines=4, interactive=False)
285
 
286
- # ์ด๋ฒคํŠธ ์—ฐ๊ฒฐ
 
 
 
 
 
 
 
 
 
 
287
  upload_btn.click(process_uploaded_files, inputs=[file_input], outputs=[upload_status])
288
 
289
- asr_btn.click(voice_to_text, inputs=[audio_in], outputs=[text_in, gr.State()])
 
290
 
291
- chat_btn.click(
292
- run_rag_pipeline,
293
- inputs=[text_in, gr.State('ko')], # ์–ธ์–ด๋Š” ๊ธฐ๋ณธ ํ•œ๊ตญ์–ด๋กœ ๊ณ ์ • (๋‹จ์ˆœํ™”)
294
- outputs=[gr.State(), answer_box, gr.State(), ref_box]
295
- )
296
 
297
  if __name__ == "__main__":
298
  demo.launch(share=True)
 
1
  # =========================================================
2
+ # KB AI Challenge - Professional RAG System (Multilingual)
 
 
 
 
3
  # =========================================================
4
 
5
  import os
6
  import sys
7
  import numpy as np
8
  import traceback
9
+ import fitz # PyMuPDF
10
  from typing import List
11
 
12
  # --- 라이브러리 임포트 ---
13
  import gradio as gr
14
  import speech_recognition as sr
15
+ from dotenv import load_dotenv
16
+
17
+ # .env 로드
18
+ load_dotenv()
19
+
20
  from deep_translator import GoogleTranslator
21
  from sentence_transformers import SentenceTransformer
22
  from groq import Groq
23
  from qdrant_client import QdrantClient
24
  from qdrant_client.models import Distance, VectorParams, PointStruct
25
+
26
  try:
27
  from langchain.text_splitter import RecursiveCharacterTextSplitter
28
  except ImportError:
 
29
  from langchain_text_splitters import RecursiveCharacterTextSplitter
30
 
31
  # =========================================================
32
  # 1. 설정 및 초기화
33
  # =========================================================
34
 
 
35
# Groq API key is read from the environment; the placeholder default doubles
# as the "not configured" sentinel checked further down.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_groq_api_key_here")
EMBEDDING_MODEL_NAME = "jhgan/ko-sroberta-multitask"
GROQ_MODEL_NAME = "llama-3.3-70b-versatile"
COLLECTION_NAME = "local_kb"

print("๐Ÿ› ๏ธ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” ์ค‘... (System Init)")

# Sentence-embedding model shared by document indexing and query search.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embedding_model.max_seq_length = 512

# Ask the model for its output width instead of hard-coding it, so changing
# EMBEDDING_MODEL_NAME cannot silently desynchronise the collection schema.
# 768 is the size this file has used for the default model and is kept as a
# fallback in case the model does not report a dimension.
EMBEDDING_DIM = embedding_model.get_sentence_embedding_dimension() or 768

# In-memory Qdrant instance: stored vectors do not outlive the process.
qdrant_client = QdrantClient(":memory:")
try:
    # recreate_collection() is deprecated in qdrant-client; perform the
    # equivalent "drop if present, then create" explicitly.
    if qdrant_client.collection_exists(COLLECTION_NAME):
        qdrant_client.delete_collection(COLLECTION_NAME)
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
    )
    print("โœ… Qdrant Collection Ready.")
except Exception as e:
    # Best effort: the UI still starts, but uploads/searches will report errors.
    print(f"โŒ Qdrant Error: {e}")

# Groq client stays None when no real key is configured; callers check for that.
groq_client = None
if GROQ_API_KEY and GROQ_API_KEY != "your_groq_api_key_here":
    try:
        groq_client = Groq(api_key=GROQ_API_KEY)
    except Exception as e:
        print(f"โŒ Groq Error: {e}")
else:
    print("โš ๏ธ Groq API Key Missing.")

# Monotonically increasing point id used when inserting chunks into Qdrant.
doc_id_counter = 0

print("โœ… System Ready.")
70
+
71
+
72
+ # =========================================================
73
+ # 2. 다국어 지원 로직 (Translation & STT)
74
+ # =========================================================
75
+
76
# Maps each language label (used as the dropdown choice text) to:
#   "code" - target language code passed to GoogleTranslator
#   "stt"  - locale passed to recognize_google() for speech recognition
# NOTE(review): the label text is mis-encoded in this view; it is kept verbatim
# because the same labels are used as lookup keys and as the dropdown default.
LANG_MAP = {
    "ํ•œ๊ตญ์–ด (Korean)": {"code": "ko", "stt": "ko-KR"},
    "English (์˜์–ด)": {"code": "en", "stt": "en-US"},
    "ๆ—ฅๆœฌ่ชž (Japanese)": {"code": "ja", "stt": "ja-JP"},
    "ไธญๆ–‡ (Chinese)": {"code": "zh-CN", "stt": "zh-CN"}
}
82
+
83
def translate_text(text, target_lang_code):
    """Translate ``text`` into ``target_lang_code`` on a best-effort basis.

    Args:
        text: Text to translate (the pipeline passes Korean answers here).
        target_lang_code: Language code understood by GoogleTranslator.
            ``"ko"`` short-circuits and returns ``text`` unchanged.

    Returns:
        The translated text, or the original ``text`` when no translation is
        needed, the input is blank, or the translation call fails.
    """
    # Nothing to do for Korean targets or blank input; skip the remote call.
    if target_lang_code == "ko" or not text or not text.strip():
        return text
    try:
        translated = GoogleTranslator(source='auto', target=target_lang_code).translate(text)
    except Exception as e:
        # Deliberately best-effort, but no longer a bare ``except:`` (which also
        # trapped KeyboardInterrupt/SystemExit) and no longer silent.
        print(f"Translation error ({target_lang_code}): {e}")
        return text
    # Fall back to the source text if the translator yields nothing usable.
    return translated or text
89
 
90
def translate_to_korean(text):
    """Translate ``text`` into Korean on a best-effort basis.

    Args:
        text: User input in any language (source language is auto-detected).

    Returns:
        The Korean translation, or the original ``text`` when the input is
        blank or the translation call fails.
    """
    # Blank input needs no remote call.
    if not text or not text.strip():
        return text
    try:
        translated = GoogleTranslator(source='auto', target='ko').translate(text)
    except Exception as e:
        # Best-effort fallback; a bare ``except:`` would also swallow
        # KeyboardInterrupt/SystemExit and hide the cause entirely.
        print(f"Translation error (ko): {e}")
        return text
    # Fall back to the source text if the translator yields nothing usable.
    return translated or text
95
 
96
  # =========================================================
97
+ # 3. 핵심 로직 (RAG Pipeline)
98
  # =========================================================
99
 
100
def process_uploaded_files(files):
    """Extract text from uploaded PDFs, embed it, and store it in Qdrant.

    Args:
        files: Uploaded items; each is either an object exposing the file
            path as ``.name`` or the path itself. ``None``/empty is allowed.

    Returns:
        A status string: a total line followed by one line per processed
        file (stored, no text found, or error).
    """
    global doc_id_counter
    if not files:
        return "ํŒŒ์ผ์ด ์„ ํƒ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค."

    total_chunks = 0
    status_lines = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, length_function=len)

    for file in files:
        # Resolve the path *before* the try block so the error handler can
        # always name the file (previously it could hit an unbound variable).
        file_path = file.name if hasattr(file, 'name') else file
        display_name = os.path.basename(str(file_path))
        try:
            # Context manager guarantees the PDF handle is released even if
            # text extraction fails part-way through.
            with fitz.open(file_path) as doc:
                file_text = "".join(page.get_text() for page in doc)

            if not file_text.strip():
                status_lines.append(f"โš ๏ธ {display_name}: ํ…์ŠคํŠธ ์—†์Œ.\n")
                continue

            points = []
            for chunk in text_splitter.split_text(file_text):
                vector = embedding_model.encode(chunk).tolist()
                payload = {"filename": display_name, "text": chunk}
                points.append(PointStruct(id=doc_id_counter, vector=vector, payload=payload))
                doc_id_counter += 1

            if points:
                qdrant_client.upsert(collection_name=COLLECTION_NAME, points=points)
                total_chunks += len(points)
                status_lines.append(f"โœ… {display_name} ({len(points)} ๊ฐœ ์ €์žฅ๋จ)\n")

        except Exception as e:
            # One bad file must not abort the rest of the batch.
            status_lines.append(f"โŒ ์˜ค๋ฅ˜: {display_name} - {str(e)}\n")

    status_msg = "".join(status_lines)
    return f"์ด {total_chunks}๊ฐœ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์™„๋ฃŒ.\n\n{status_msg}"
 
 
 
 
137
 
138
def search_knowledge_base(query, top_k=5):
    """Return the ``top_k`` stored chunks most similar to ``query``.

    Args:
        query: Query text; it is embedded with the shared embedding model.
        top_k: Maximum number of hits to request from Qdrant.

    Returns:
        The ``points`` list of the Qdrant query response (payload included),
        or an empty list if embedding or searching fails.
    """
    try:
        query_vector = embedding_model.encode(query).tolist()
        res = qdrant_client.query_points(
            collection_name=COLLECTION_NAME, query=query_vector, limit=top_k, with_payload=True
        )
        return res.points
    except Exception as e:
        # Still best-effort (callers treat [] as "nothing found"), but the
        # cause is now reported, and KeyboardInterrupt/SystemExit propagate.
        print(f"Search error: {e}")
        return []
147
 
148
def generate_answer_groq(query, context_text):
    """Ask the Groq chat model to answer ``query`` using ``context_text``.

    Args:
        query: The (Korean) question text.
        context_text: Retrieved passages inserted into the user prompt.

    Returns:
        The content of the first completion choice, a fixed notice when no
        Groq client is configured, or an error string if the request fails.
    """
    if not groq_client:
        return "API ํ‚ค๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค."

    # NOTE(review): the prompt text is mis-encoded in this view and its original
    # line indentation is not recoverable; the wording is kept verbatim.
    system_prompt = """
    ๋‹น์‹ ์€ KB ๊ธˆ์œต๊ทธ๋ฃน์˜ ์ „๋ฌธ AI ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค.
    ์ œ๊ณต๋œ [๋ฌธ๋งฅ]์— ๊ธฐ๋ฐ˜ํ•˜์—ฌ ์งˆ๋ฌธ์— ๋Œ€ํ•ด ์ •ํ™•ํ•˜๊ณ  ์ „๋ฌธ์ ์ธ ๋‹ต๋ณ€์„ ์ž‘์„ฑํ•˜์„ธ์š”.
    ๋ชจ๋ฅด๋Š” ๋‚ด์šฉ์€ ๋ชจ๋ฅธ๋‹ค๊ณ  ๋‹ตํ•˜๊ณ , ์ถ”์ธกํ•˜์ง€ ๋งˆ์„ธ์š”.
    ๋‹ต๋ณ€์€ ํ•œ๊ตญ์–ด๋กœ ์ž‘์„ฑํ•˜์„ธ์š”.
    """
    user_prompt = f"์งˆ๋ฌธ: {query}\n\n[๋ฌธ๋งฅ]\n{context_text}"

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = groq_client.chat.completions.create(
            messages=chat_messages,
            model=GROQ_MODEL_NAME,
            temperature=0.1,
        )
        # Kept inside the try so a malformed/empty response is reported too.
        return completion.choices[0].message.content
    except Exception as e:
        return f"์‘๋‹ต ์ƒ์„ฑ ์˜ค๋ฅ˜: {e}"
166
 
167
def run_rag_chat(message, history, lang_selection):
    """Run one chat turn: translate, retrieve, generate, translate back.

    Args:
        message: The user's message in the selected language.
        history: Existing chat history as a list of ``{"role", "content"}``
            dicts; ``None`` is treated as an empty history.
        lang_selection: A key of ``LANG_MAP``; unknown values fall back to
            Korean.

    Returns:
        A 3-tuple ``(textbox_value, history, reference_text)``. The textbox
        value is always ``""`` so the input box is cleared.
    """
    # Blank or whitespace-only input: leave the conversation untouched.
    if not message or not message.strip():
        return "", history, ""

    # A cleared/unknown dropdown value must not crash the handler.
    target_lang = LANG_MAP.get(lang_selection, {}).get("code", "ko")

    # 1. Translate the input to Korean (retrieval and generation run in Korean).
    korean_query = message if target_lang == "ko" else translate_to_korean(message)

    # 2. Retrieve context and generate the Korean answer.
    hits = search_knowledge_base(korean_query)
    if not hits:
        bot_response_ko = "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค. ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        reference_text = "์ฐธ๊ณ  ๋ฌธ์„œ ์—†์Œ"
    else:
        context_text = "\n\n".join(h.payload['text'] for h in hits)

        # Group hit scores per source file so each file is listed once.
        scores_by_file = {}
        for h in hits:
            scores_by_file.setdefault(h.payload['filename'], []).append(h.score)

        reference_text = "\n".join(
            f"- {fname} (๊ด€๋ จ ๋‚ด์šฉ {len(scores)}๊ฑด, ์ตœ๊ณ  ์œ ์‚ฌ๋„: {max(scores):.2f})"
            for fname, scores in scores_by_file.items()
        )
        bot_response_ko = generate_answer_groq(korean_query, context_text)

    # 3. Translate the answer back, keeping the Korean original underneath.
    final_response = bot_response_ko
    if target_lang != "ko":
        translated_response = translate_text(bot_response_ko, target_lang)
        final_response = f"{translated_response}\n\n---\n[ํ•œ๊ตญ์–ด ์›๋ฌธ]\n{bot_response_ko}"

    # Append the turn in messages format; ``history`` may be None on first use.
    new_history = list(history or []) + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": final_response}
    ]
    return "", new_history, reference_text
 
210
 
211
def _to_int16_mono(audio_numpy):
    """Convert a numpy audio buffer to mono 16-bit PCM samples."""
    samples = np.asarray(audio_numpy)
    # Remember whether the input was floating point before averaging, since
    # mean() always produces floats.
    is_float = np.issubdtype(samples.dtype, np.floating)
    if samples.ndim > 1:
        # Down-mix multi-channel audio by averaging across channels.
        samples = samples.mean(axis=1)
    if is_float:
        # Float audio is scaled by 32767 as before; clip first so out-of-range
        # values cannot wrap around in the int16 cast. Covers float64 too.
        samples = np.clip(samples, -1.0, 1.0) * 32767
    return samples.astype(np.int16)


def voice_to_text_chat(audio, history, lang_selection):
    """Transcribe recorded audio and feed the text through the chat pipeline.

    Args:
        audio: ``(sample_rate, numpy_array)`` from the audio component, or
            ``None`` when nothing was recorded.
        history: Current chat history, passed through to ``run_rag_chat``.
        lang_selection: A key of ``LANG_MAP``; unknown values fall back to
            Korean speech recognition.

    Returns:
        The 3-tuple returned by ``run_rag_chat`` on success; otherwise
        ``("", history, <status message>)``.
    """
    if audio is None:
        return "", history, "์Œ์„ฑ ์ž…๋ ฅ ์—†์Œ"

    # A cleared/unknown dropdown value must not crash the handler.
    stt_lang = LANG_MAP.get(lang_selection, {}).get("stt", "ko-KR")

    try:
        sample_rate, audio_numpy = audio
        pcm = _to_int16_mono(audio_numpy)
        # Sample width 2 bytes matches the int16 conversion above.
        audio_data = sr.AudioData(pcm.tobytes(), sample_rate, 2)
        r = sr.Recognizer()

        # Recognise speech in the selected language.
        text = r.recognize_google(audio_data, language=stt_lang)

        # Hand the transcript to the regular chat handler.
        return run_rag_chat(text, history, lang_selection)

    except sr.UnknownValueError:
        return "", history, "์Œ์„ฑ์„ ์ดํ•ดํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
    except Exception as e:
        return "", history, f"์˜ค๋ฅ˜: {e}"
235
 
236
  # =========================================================
237
+ # 4. UI Layout (Clean Professional Korean)
238
  # =========================================================
239
 
240
# Soft theme with amber/slate hues; Noto Sans KR web font with a generic
# sans-serif fallback.
theme = gr.themes.Soft(
    primary_hue="amber",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Noto Sans KR"), "sans-serif"]
)

# Custom CSS: hide the page footer and remove the container's minimum height.
css = """
footer {visibility: hidden !important;}
.gradio-container {min-height: 0px !important;}
"""

# NOTE(review): indentation is not preserved in this view of the file, so the
# nesting of the layout below is reconstructed from the statement order;
# confirm it against the real app.py before relying on it.
with gr.Blocks(theme=theme, title="KB AI Challenge", css=css) as demo:

    with gr.Row():
        # --- LEFT SIDEBAR ---
        with gr.Column(scale=1, min_width=300, variant="panel"):
            gr.Markdown("## KB AI Challenge")
            gr.Markdown("**๋‹ค๊ตญ์–ด ๊ธˆ์œต AI ์–ด์‹œ์Šคํ„ดํŠธ**")

            with gr.Group():
                # Language selector; its value is a LANG_MAP key and is passed
                # to both chat handlers.
                lang_dropdown = gr.Dropdown(
                    choices=list(LANG_MAP.keys()),
                    value="ํ•œ๊ตญ์–ด (Korean)",
                    label="์–ธ์–ด ์„ค์ •",
                    interactive=True
                )

            # PDF upload feeding process_uploaded_files.
            file_input = gr.File(label="์ง€์‹ ๋ฒ ์ด์Šค (PDF)", file_count="multiple", file_types=[".pdf"])
            with gr.Row():
                upload_btn = gr.Button("์—…๋กœ๋“œ ๋ฐ ๋ถ„์„", variant="primary", size="sm")
            upload_status = gr.Textbox(show_label=False, placeholder="์ƒํƒœ ๋Œ€๊ธฐ ์ค‘...", interactive=False, lines=1, max_lines=1)

            gr.Markdown("### ์Œ์„ฑ ๋Œ€ํ™”")
            # Microphone input delivered to the handler as (sample_rate, numpy array).
            audio_input = gr.Audio(sources=["microphone"], type="numpy", label="์Œ์„ฑ ์ž…๋ ฅ", show_label=False)

            with gr.Accordion("์‹œ์Šคํ…œ ์•„ํ‚คํ…์ฒ˜", open=False):
                gr.Markdown(
                    """
                    **์ตœ์ ํ™” ๋‚ด์—ญ**
                    1. **STT**: Google Speech API
                    2. **๋ฒˆ์—ญ**: Google Translate API
                    3. **LLM**: Groq LPU (Llama 3)
                    """
                )

        # --- RIGHT MAIN ---
        with gr.Column(scale=3):
            # Chat transcript; handlers return role/content message dicts.
            chatbot = gr.Chatbot(label="๋Œ€ํ™”", height=500, show_label=False)

            # Source documents referenced by the latest answer.
            gr.Markdown("**์ฐธ๊ณ  ๋ฌธ์„œ**")
            ref_output = gr.Textbox(show_label=False, interactive=False, lines=3, max_lines=5, placeholder="๊ด€๋ จ ๋ฌธ์„œ๊ฐ€ ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.")

            # Text input row: message box plus send button.
            with gr.Row():
                msg = gr.Textbox(
                    scale=6,
                    show_label=False,
                    placeholder="์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”...",
                    container=False
                )
                submit_btn = gr.Button("์ „์†ก", scale=1, variant="primary")

    # --- Event Handlers ---
    # Index the uploaded PDFs and show the resulting status text.
    upload_btn.click(process_uploaded_files, inputs=[file_input], outputs=[upload_status])

    # Pressing Enter and clicking the send button run the same chat handler.
    msg.submit(run_rag_chat, [msg, chatbot, lang_dropdown], [msg, chatbot, ref_output])
    submit_btn.click(run_rag_chat, [msg, chatbot, lang_dropdown], [msg, chatbot, ref_output])

    # When a recording stops, transcribe it and run it through the chat handler.
    audio_input.stop_recording(voice_to_text_chat, [audio_input, chatbot, lang_dropdown], [msg, chatbot, ref_output])

# Launch the app (with a public share link) only when run as a script.
if __name__ == "__main__":
    demo.launch(share=True)