rottg commited on
Commit
85ff768
·
verified ·
1 Parent(s): 171ae15

Update code

Browse files
Dockerfile CHANGED
@@ -14,6 +14,8 @@ COPY data_structures.py .
14
  COPY indexer.py .
15
  COPY search.py .
16
  COPY semantic_search.py .
 
 
17
  COPY schema.sql .
18
  COPY static/ static/
19
  COPY templates/ templates/
 
14
  COPY indexer.py .
15
  COPY search.py .
16
  COPY semantic_search.py .
17
+ COPY hybrid_search.py .
18
+ COPY gemini_client.py .
19
  COPY schema.sql .
20
  COPY static/ static/
21
  COPY templates/ templates/
dashboard.py CHANGED
@@ -27,59 +27,96 @@ from collections import defaultdict
27
  # DATABASE DOWNLOAD FROM HF DATASET
28
  # ==========================================
29
  HF_DATASET_REPO = "rottg/telegram-db"
30
- DB_FILENAME = "telegram.db"
31
  APP_DIR = os.path.dirname(os.path.abspath(__file__))
32
- DB_PATH_FULL = os.path.join(APP_DIR, DB_FILENAME)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  def ensure_db_exists():
36
- """Download DB from HF Dataset repo if it doesn't exist locally."""
37
- print(f"[DB] Checking for database at: {DB_PATH_FULL}")
38
  print(f"[DB] Current working directory: {os.getcwd()}")
39
 
 
40
  if os.path.exists(DB_PATH_FULL):
41
  size_mb = os.path.getsize(DB_PATH_FULL) / (1024 * 1024)
42
- print(f"✓ Database found: {DB_PATH_FULL} ({size_mb:.0f} MB)")
43
- return True
 
 
 
 
 
 
 
 
44
 
45
- print(f"[DB] Database not found. Downloading from HF Dataset {HF_DATASET_REPO}...")
46
- try:
47
- from huggingface_hub import hf_hub_download
48
- import shutil
49
-
50
- # Get token from environment
51
- token = os.environ.get("HF_TOKEN")
52
- print(f"[DB] HF_TOKEN from env: {'set' if token else 'NOT SET'}")
53
-
54
- if not token:
55
- token_file = os.path.join(APP_DIR, ".hf_token")
56
- if os.path.exists(token_file):
57
- with open(token_file) as f:
58
- token = f.read().strip()
59
- print(f"[DB] HF_TOKEN from file: set")
60
-
61
- # Download to cache, then copy to app dir
62
- cached_path = hf_hub_download(
63
- repo_id=HF_DATASET_REPO,
64
- filename=DB_FILENAME,
65
- repo_type="dataset",
66
- token=token,
67
- )
68
- print(f"[DB] Downloaded to cache: {cached_path}")
69
 
70
- # Copy to app directory
71
- shutil.copy2(cached_path, DB_PATH_FULL)
72
- size_mb = os.path.getsize(DB_PATH_FULL) / (1024 * 1024)
73
- print(f"✓ Database ready: {DB_PATH_FULL} ({size_mb:.0f} MB)")
74
- return True
75
- except Exception as e:
76
- print(f"✗ Failed to download database: {e}")
77
- import traceback
78
- traceback.print_exc()
79
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
 
82
- # Download DB on module import (for gunicorn)
83
  ensure_db_exists()
84
 
85
  # ==========================================
@@ -256,6 +293,12 @@ def settings_page():
256
  return render_template('settings.html')
257
 
258
 
 
 
 
 
 
 
259
  # ==========================================
260
  # API ENDPOINTS - OVERVIEW STATS
261
  # ==========================================
@@ -1844,6 +1887,130 @@ def api_ai_search():
1844
  return jsonify({'error': str(e), 'query': query})
1845
 
1846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1847
  def fallback_ai_search(query: str):
1848
  """Fallback search when AI is not available."""
1849
  conn = get_db()
 
27
  # DATABASE DOWNLOAD FROM HF DATASET
28
  # ==========================================
29
  HF_DATASET_REPO = "rottg/telegram-db"
 
30
  APP_DIR = os.path.dirname(os.path.abspath(__file__))
31
+ DB_PATH_FULL = os.path.join(APP_DIR, "telegram.db")
32
+ EMBEDDINGS_PATH_FULL = os.path.join(APP_DIR, "embeddings.db")
33
+ CHUNK_EMBEDDINGS_PATH = os.path.join(APP_DIR, "chunk_embeddings.db")
34
+ BM25_INDEX_PATH = os.path.join(APP_DIR, "bm25_index.pkl")
35
+
36
+
37
def download_from_hf(filename, local_path):
    """Fetch *filename* from the HF dataset repo and copy it to *local_path*.

    The auth token is taken from the HF_TOKEN environment variable, with a
    fallback to a `.hf_token` file sitting next to the app. The file is
    first pulled into the huggingface_hub cache, then copied into place.
    Returns True on success (exceptions propagate to the caller).
    """
    import shutil

    from huggingface_hub import hf_hub_download

    token = os.environ.get("HF_TOKEN")
    if not token:
        token_path = os.path.join(APP_DIR, ".hf_token")
        if os.path.exists(token_path):
            with open(token_path) as fh:
                token = fh.read().strip()

    cached = hf_hub_download(
        repo_id=HF_DATASET_REPO,
        filename=filename,
        repo_type="dataset",
        token=token,
    )
    shutil.copy2(cached, local_path)
    return True
57
 
58
 
59
def ensure_db_exists():
    """Download DBs from HF Dataset repo if they don't exist locally.

    telegram.db is required: if it cannot be obtained the function returns
    False. The search-index artifacts (embeddings, chunk embeddings, BM25
    pickle) are optional — failures there are only logged.

    Returns:
        True when the required database is present, False otherwise.
    """
    print(f"[DB] Current working directory: {os.getcwd()}")

    # (filename in the dataset repo, local target path, required?)
    # The four artifacts previously had four copy-pasted download stanzas;
    # this table drives a single loop instead.
    artifacts = [
        ("telegram.db", DB_PATH_FULL, True),
        ("embeddings.db", EMBEDDINGS_PATH_FULL, False),         # semantic search
        ("chunk_embeddings.db", CHUNK_EMBEDDINGS_PATH, False),  # hybrid search
        ("bm25_index.pkl", BM25_INDEX_PATH, False),             # hybrid search
    ]

    for filename, path, required in artifacts:
        if os.path.exists(path):
            size_mb = os.path.getsize(path) / (1024 * 1024)
            print(f"✓ {filename} found ({size_mb:.0f} MB)")
            continue

        print(f"[DB] Downloading {filename} from HF...")
        try:
            download_from_hf(filename, path)
            size_mb = os.path.getsize(path) / (1024 * 1024)
            print(f"✓ {filename} downloaded ({size_mb:.0f} MB)")
        except Exception as e:
            if required:
                print(f"✗ Failed to download {filename}: {e}")
                return False
            # Optional index: degrade gracefully, the app still works.
            print(f"⚠ {filename} not available: {e}")

    return True
117
 
118
 
119
+ # Download DBs on module import (for gunicorn)
120
  ensure_db_exists()
121
 
122
  # ==========================================
 
293
  return render_template('settings.html')
294
 
295
 
296
@app.route('/ai-search')
def ai_search_page():
    """Serve the Gemini-backed AI search page."""
    return render_template('ai_search.html')
300
+
301
+
302
  # ==========================================
303
  # API ENDPOINTS - OVERVIEW STATS
304
  # ==========================================
 
1887
  return jsonify({'error': str(e), 'query': query})
1888
 
1889
 
1890
@app.route('/api/hybrid/search', methods=['POST'])
def api_hybrid_search():
    """
    Hybrid search combining:
    - Chunk-based vector search (conversation context)
    - BM25 keyword search (exact matches)
    - Query expansion (synonyms, variations)

    JSON body: {"query": str, "limit": int = 20, "include_context": bool = True}
    """
    # silent=True: a missing or non-JSON body yields None instead of a 400;
    # coalesce to {} so the .get() calls below cannot crash on None.
    data = request.get_json(silent=True) or {}
    query = data.get('query', '')
    limit = data.get('limit', 20)
    include_context = data.get('include_context', True)

    if not query:
        return jsonify({'error': 'Query required'})

    try:
        from hybrid_search import get_hybrid_search
        hs = get_hybrid_search()

        # Refuse early when no index has been built yet.
        stats = hs.stats()
        if not stats.get('chunks_available') and not stats.get('single_embeddings_available'):
            return jsonify({
                'error': 'No search indexes available. Run the Colab notebook first.',
                'stats': stats
            })

        # Search with or without surrounding-message context.
        if include_context:
            results = hs.search_with_context(query, limit=limit)
        else:
            results = hs.hybrid_search(query, limit=limit)

        # Expose the expanded query variations for UI display.
        expanded = hs.expand_query(query)

        return jsonify({
            'query': query,
            'expanded_queries': expanded,
            'results': results,
            'count': len(results),
            'stats': stats,
            'mode': 'hybrid'
        })

    except ImportError as e:
        return jsonify({'error': f'Hybrid search not available: {str(e)}'})
    except Exception as e:
        import traceback
        return jsonify({
            'error': str(e),
            'traceback': traceback.format_exc()
        })
1944
+
1945
+
1946
@app.route('/api/gemini/search', methods=['POST'])
def api_gemini_search():
    """
    AI-powered search using Gemini 1.5 Flash.
    Combines hybrid search with Gemini for natural language answers.

    JSON body: {"query": str, "limit": int = 5}
    """
    # silent=True: a missing or non-JSON body yields None instead of a 400;
    # coalesce to {} so the .get() calls below cannot crash on None.
    data = request.get_json(silent=True) or {}
    query = data.get('query', '')
    limit = data.get('limit', 5)

    if not query:
        return jsonify({'error': 'Query required'})

    try:
        from gemini_client import ai_search, get_gemini_client

        # Check if Gemini is available
        client = get_gemini_client()
        if not client.is_available():
            # Fall back to hybrid search without AI
            from hybrid_search import get_hybrid_search
            hs = get_hybrid_search()
            results = hs.search_with_context(query, limit=limit)

            return jsonify({
                'query': query,
                'success': False,
                'error': 'Gemini API not available. Set GEMINI_API_KEY environment variable.',
                'search_results': results,
                'count': len(results),
                'mode': 'hybrid_only'
            })

        # Perform AI search (hybrid retrieval + Gemini summarization)
        return jsonify(ai_search(query, limit=limit))

    except ImportError as e:
        return jsonify({'error': f'AI search not available: {str(e)}'})
    except Exception as e:
        import traceback
        return jsonify({
            'error': str(e),
            'traceback': traceback.format_exc()
        })
1992
+
1993
+
1994
@app.route('/api/gemini/status')
def api_gemini_status():
    """Report whether the Gemini API client is usable from this process."""
    try:
        from gemini_client import get_gemini_client

        client = get_gemini_client()
        key = os.environ.get('GEMINI_API_KEY', '')
        # Show only a short prefix of the key, never the full secret.
        preview = f"{key[:8]}..." if len(key) > 8 else None
        return jsonify({
            'available': client.is_available(),
            'api_key_set': bool(key),
            'api_key_preview': preview,
        })
    except Exception as e:
        return jsonify({
            'available': False,
            'error': str(e),
        })
2012
+
2013
+
2014
  def fallback_ai_search(query: str):
2015
  """Fallback search when AI is not available."""
2016
  conn = get_db()
gemini_client.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gemini AI Client for Chat Search
3
+ Uses Gemini 1.5 Flash to summarize search results and answer questions.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ from typing import List, Dict, Optional
9
+
10
+ # Try importing Google Generative AI
11
+ try:
12
+ import google.generativeai as genai
13
+ HAS_GEMINI = True
14
+ except ImportError:
15
+ HAS_GEMINI = False
16
+
17
+
18
class GeminiClient:
    """Client for Gemini AI API.

    Wraps lazy initialization of the google-generativeai SDK and prompt
    construction for answering questions over chat search results.
    """

    def __init__(self, api_key: Optional[str] = None):
        # Explicit key wins; otherwise fall back to the environment.
        self.api_key = api_key or os.environ.get('GEMINI_API_KEY')
        self.model = None          # GenerativeModel, set by _initialize()
        self._initialized = False  # guards repeated configure() calls

    def _initialize(self) -> bool:
        """Lazily configure the Gemini SDK; return True when usable."""
        if self._initialized:
            return True

        if not HAS_GEMINI:
            print("google-generativeai not installed")
            return False

        if not self.api_key:
            print("GEMINI_API_KEY not set")
            return False

        try:
            genai.configure(api_key=self.api_key)
            self.model = genai.GenerativeModel('gemini-1.5-flash')
            self._initialized = True
            print("Gemini client initialized")
            return True
        except Exception as e:
            print(f"Failed to initialize Gemini: {e}")
            return False

    def answer_from_context(self, query: str, search_results: List[Dict],
                            max_results: int = 5) -> Dict:
        """
        Generate an answer based on search results.

        Args:
            query: User's question
            search_results: List of search results with context
            max_results: Max results to include in context

        Returns:
            Dict with 'answer', 'sources', and 'success'
        """
        if not self._initialize():
            return {
                'success': False,
                'error': 'Gemini not available',
                'answer': None
            }

        # Build the prompt context from the top search results.
        context_parts = []
        sources = []

        for i, result in enumerate(search_results[:max_results]):
            # Two result shapes are supported: search_with_context results
            # carry a 'message' dict (+ optional surrounding context), raw
            # hybrid_search results carry 'chunk_text'.
            if 'message' in result:
                msg = result['message']
                context_parts.append(f"""
--- תוצאה {i+1} (ציון: {result.get('score', 0):.2f}) ---
מאת: {msg.get('from_name', 'לא ידוע')}
תאריך: {msg.get('date', 'לא ידוע')}
הודעה: {msg.get('text', '')}
""")
                sources.append({
                    'from_name': msg.get('from_name'),
                    'date': msg.get('date'),
                    'message_id': result.get('message_id')
                })

                # Include surrounding conversation when provided.
                if result.get('context_before'):
                    context_parts.append("הקשר לפני:")
                    for ctx in result['context_before']:
                        context_parts.append(f" [{ctx.get('from_name', '?')}] {ctx.get('text_plain', '')[:100]}")

                if result.get('context_after'):
                    context_parts.append("הקשר אחרי:")
                    for ctx in result['context_after']:
                        context_parts.append(f" [{ctx.get('from_name', '?')}] {ctx.get('text_plain', '')[:100]}")

            elif 'chunk_text' in result:
                # hybrid_search format
                context_parts.append(f"""
--- תוצאה {i+1} (ציון: {result.get('score', 0):.2f}) ---
{result.get('chunk_text', '')}
""")
                sources.append({
                    'message_id': result.get('message_id'),
                    'score': result.get('score')
                })

        context = "\n".join(context_parts)

        # Build prompt (Hebrew instructions: answer briefly, cite sources,
        # never invent facts not present in the results).
        prompt = f"""אתה עוזר שמנתח שיחות מקבוצת טלגרם ועונה על שאלות.

השאלה: {query}

להלן תוצאות חיפוש רלוונטיות מהשיחות:

{context}

הנחיות:
1. ענה בעברית
2. תן תשובה קצרה וממוקדת (1-3 משפטים)
3. אם המידע לא ברור או לא קיים בתוצאות, אמור "לא מצאתי מידע ברור"
4. ציין את המקור (שם השולח והתאריך) אם רלוונטי
5. אל תמציא מידע שלא מופיע בתוצאות

התשובה:"""

        try:
            response = self.model.generate_content(prompt)
            answer = response.text.strip()

            return {
                'success': True,
                'answer': answer,
                'sources': sources,
                'query': query,
                # Count of search results folded into the prompt. (Was
                # len(context_parts), which also counted context-header
                # lines and so over-reported the number of results.)
                'results_used': len(sources)
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'answer': None
            }

    def is_available(self) -> bool:
        """Check if Gemini is available."""
        return self._initialize()
154
+
155
+
156
# Singleton instance
_gemini_client = None


def get_gemini_client() -> GeminiClient:
    """Return the process-wide GeminiClient, creating it on first use."""
    global _gemini_client
    if _gemini_client is None:
        _gemini_client = GeminiClient()
    return _gemini_client
166
+
167
+
168
def ai_search(query: str, limit: int = 5) -> Dict:
    """
    Perform AI-powered search: hybrid search + Gemini summarization.

    Args:
        query: Search query
        limit: Max results to use

    Returns:
        Dict with answer and metadata
    """
    from hybrid_search import get_hybrid_search

    # Retrieve candidate messages (with surrounding context) first.
    hits = get_hybrid_search().search_with_context(query, limit=limit)

    if not hits:
        return {
            'success': False,
            'error': 'No search results found',
            'answer': 'לא נמצאו תוצאות לחיפוש זה',
            'query': query
        }

    # Summarize with Gemini, then attach the raw hits for transparency.
    response = get_gemini_client().answer_from_context(query, hits, max_results=limit)
    response['search_results'] = hits
    return response
201
+
202
+
203
# CLI for testing
if __name__ == '__main__':
    import sys

    if len(sys.argv) < 2:
        # No query: just report whether Gemini can be reached.
        print("Usage: python gemini_client.py 'search query'")
        print("\nChecking Gemini availability...")
        if get_gemini_client().is_available():
            print("Gemini is available!")
        else:
            print("Gemini is NOT available. Set GEMINI_API_KEY environment variable.")
        sys.exit(0)

    query = ' '.join(sys.argv[1:])
    print(f"\n=== AI Search: {query} ===\n")

    result = ai_search(query)

    if result['success']:
        print(f"Answer: {result['answer']}")
        print(f"\nSources: {len(result.get('sources', []))} results used")
    else:
        print(f"Error: {result.get('error')}")
hybrid_search.py ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid Search - Combines Vector Search, BM25, and Query Expansion
3
+
4
+ This provides much better search for chat data by:
5
+ 1. Chunk-based vector search (captures context)
6
+ 2. BM25 keyword search (finds exact matches)
7
+ 3. Query expansion (handles variations)
8
+ """
9
+
10
+ import sqlite3
11
+ import numpy as np
12
+ import pickle
13
+ import re
14
+ import os
15
+ from typing import List, Dict, Any, Optional
16
+
17
+ # Try importing sentence-transformers
18
+ try:
19
+ from sentence_transformers import SentenceTransformer
20
+ HAS_TRANSFORMERS = True
21
+ except ImportError:
22
+ HAS_TRANSFORMERS = False
23
+
24
+ # Try importing BM25
25
+ try:
26
+ from rank_bm25 import BM25Okapi
27
+ HAS_BM25 = True
28
+ except ImportError:
29
+ HAS_BM25 = False
30
+
31
+
32
class HybridSearch:
    """
    Hybrid search combining:
    - Chunk-based vector search (conversation context)
    - BM25 keyword search (exact matches)
    - Query expansion (synonyms, variations)

    All indexes and the embedding model are loaded lazily on first use,
    so constructing an instance (and importing this module) stays cheap.
    """

    def __init__(self,
                 messages_db: str = 'telegram.db',
                 chunk_embeddings_db: str = 'chunk_embeddings.db',
                 bm25_index_path: str = 'bm25_index.pkl',
                 single_embeddings_db: str = 'embeddings.db'):
        # Paths to the message store and the prebuilt search artifacts.
        self.messages_db = messages_db
        self.chunk_embeddings_db = chunk_embeddings_db
        self.bm25_index_path = bm25_index_path
        self.single_embeddings_db = single_embeddings_db

        # Lazy-loaded components (populated by the _load_* helpers).
        self.model = None
        self.chunk_embeddings = None
        self.chunk_data = None
        self.bm25 = None
        self.bm25_message_ids = None
        self.single_embeddings = None
        self.single_message_ids = None

    @staticmethod
    def _normalize_rows(matrix):
        """L2-normalize each row of *matrix*; zero rows are left unchanged.

        Shared by both embedding loaders (previously duplicated inline).
        """
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.where(norms == 0, 1, norms)

    def _load_model(self):
        """Load the embedding model (idempotent)."""
        if self.model is not None:
            return

        if not HAS_TRANSFORMERS:
            raise RuntimeError("sentence-transformers not installed")

        print("Loading embedding model...")
        self.model = SentenceTransformer('intfloat/multilingual-e5-large')
        print("Model loaded!")

    def _embed_query(self, query: str):
        """Encode *query* with the e5 'query: ' prefix, L2-normalized.

        Shared by the chunk and single-message searchers (previously
        duplicated inline).
        """
        self._load_model()
        emb = self.model.encode([f"query: {query}"], convert_to_numpy=True)[0]
        return emb / np.linalg.norm(emb)

    def _load_chunk_embeddings(self):
        """Load chunk embeddings. Returns True when the index is usable."""
        if self.chunk_embeddings is not None:
            return True

        if not os.path.exists(self.chunk_embeddings_db):
            print(f"Chunk embeddings not found: {self.chunk_embeddings_db}")
            return False

        print(f"Loading chunk embeddings from {self.chunk_embeddings_db}...")
        conn = sqlite3.connect(self.chunk_embeddings_db)

        # Check if chunk_type column exists (for backwards compatibility)
        cursor = conn.execute("PRAGMA table_info(chunk_embeddings)")
        columns = [col[1] for col in cursor.fetchall()]

        if 'chunk_type' in columns:
            rows = conn.execute("""
                SELECT chunk_id, chunk_type, text, message_ids, anchor_message_id, embedding
                FROM chunk_embeddings
            """).fetchall()
        else:
            # Older indexes have no chunk_type; default everything to 'window'.
            rows = conn.execute("""
                SELECT chunk_id, 'window' as chunk_type, text, message_ids, anchor_message_id, embedding
                FROM chunk_embeddings
            """).fetchall()
        conn.close()

        if not rows:
            return False

        import json
        self.chunk_data = []
        vectors = []

        for chunk_id, chunk_type, text, msg_ids_json, anchor_id, emb_blob in rows:
            vectors.append(np.frombuffer(emb_blob, dtype=np.float32))
            self.chunk_data.append({
                'chunk_id': chunk_id,
                'chunk_type': chunk_type,
                'text': text,
                'message_ids': json.loads(msg_ids_json),
                'anchor_message_id': anchor_id
            })

        self.chunk_embeddings = self._normalize_rows(np.vstack(vectors))

        print(f"Loaded {len(self.chunk_data)} chunk embeddings")
        return True

    def _load_single_embeddings(self):
        """Load single-message embeddings (fallback). Returns True when usable."""
        if self.single_embeddings is not None:
            return True

        if not os.path.exists(self.single_embeddings_db):
            return False

        print(f"Loading single embeddings from {self.single_embeddings_db}...")
        conn = sqlite3.connect(self.single_embeddings_db)
        rows = conn.execute("""
            SELECT message_id, embedding FROM embeddings
        """).fetchall()
        conn.close()

        if not rows:
            return False

        self.single_message_ids = [msg_id for msg_id, _ in rows]
        vectors = [np.frombuffer(blob, dtype=np.float32) for _, blob in rows]
        self.single_embeddings = self._normalize_rows(np.vstack(vectors))

        print(f"Loaded {len(self.single_message_ids)} single embeddings")
        return True

    def _load_bm25(self):
        """Load BM25 index. Returns True when the index is usable."""
        if self.bm25 is not None:
            return True

        if not os.path.exists(self.bm25_index_path):
            print(f"BM25 index not found: {self.bm25_index_path}")
            return False

        print(f"Loading BM25 index from {self.bm25_index_path}...")
        # NOTE(security): pickle.load executes arbitrary code from the file.
        # Only load index files produced by our own pipeline, never uploads.
        with open(self.bm25_index_path, 'rb') as f:
            data = pickle.load(f)

        self.bm25 = data['bm25']
        self.bm25_message_ids = data['message_ids']
        print(f"Loaded BM25 index with {len(self.bm25_message_ids)} documents")
        return True

    def expand_query(self, query: str) -> List[str]:
        """
        Expand query with variations.
        Returns list of query variations to search (original first, max 5).
        """
        queries = [query]

        # Hebrew question word expansions
        expansions = {
            'איפה': ['איפה', 'היכן', 'מיקום', 'כתובת', 'עיר'],
            'מתי': ['מתי', 'באיזה תאריך', 'מועד', 'זמן'],
            'מי': ['מי', 'מיהו', 'מיהי', 'שם'],
            'כמה': ['כמה', 'מספר', 'כמות'],
            'למה': ['למה', 'מדוע', 'סיבה'],
            'גר': ['גר', 'גרה', 'מתגורר', 'מתגוררת', 'גרים'],
            'עובד': ['עובד', 'עובדת', 'עובדים', 'מועסק', 'עבודה'],
        }

        # Add expanded variations
        for word, synonyms in expansions.items():
            if word in query:
                for syn in synonyms:
                    if syn != word:
                        expanded = query.replace(word, syn)
                        if expanded not in queries:
                            queries.append(expanded)

        return queries[:5]  # Limit to 5 variations

    def search_chunks(self, query: str, limit: int = 20) -> List[Dict]:
        """Search using chunk embeddings (context-aware)."""
        if not self._load_chunk_embeddings():
            return []

        query_norm = self._embed_query(query)

        # Cosine similarity against all (normalized) chunk vectors.
        similarities = np.dot(self.chunk_embeddings, query_norm)
        top_indices = np.argsort(similarities)[::-1][:limit]

        results = []
        for idx in top_indices:
            chunk = self.chunk_data[idx]
            results.append({
                'type': 'chunk',
                'chunk_type': chunk.get('chunk_type', 'window'),  # 'thread' or 'window'
                'chunk_id': chunk['chunk_id'],
                'text': chunk['text'],
                'message_ids': chunk['message_ids'],
                'anchor_message_id': chunk['anchor_message_id'],
                'score': float(similarities[idx])
            })

        return results

    def search_bm25(self, query: str, limit: int = 20) -> List[Dict]:
        """Search using BM25 (keyword-based)."""
        if not self._load_bm25():
            return []

        # Tokenize query
        query_tokens = re.findall(r'\w+', query.lower())

        scores = self.bm25.get_scores(query_tokens)
        top_indices = np.argsort(scores)[::-1][:limit]

        results = []
        for idx in top_indices:
            score = float(scores[idx])
            if score > 0:  # zero score = no shared terms; drop it
                results.append({
                    'type': 'bm25',
                    'message_id': self.bm25_message_ids[idx],
                    'score': score
                })

        return results

    def search_single(self, query: str, limit: int = 20) -> List[Dict]:
        """Search using single-message embeddings (fallback)."""
        if not self._load_single_embeddings():
            return []

        query_norm = self._embed_query(query)

        similarities = np.dot(self.single_embeddings, query_norm)
        top_indices = np.argsort(similarities)[::-1][:limit]

        return [{
            'type': 'single',
            'message_id': self.single_message_ids[idx],
            'score': float(similarities[idx])
        } for idx in top_indices]

    def hybrid_search(self, query: str, limit: int = 20,
                      vector_weight: float = 0.6,
                      bm25_weight: float = 0.4,
                      use_expansion: bool = True) -> List[Dict]:
        """
        Hybrid search combining vector and BM25.

        Args:
            query: Search query
            limit: Max results
            vector_weight: Weight for vector search (0-1)
            bm25_weight: Weight for BM25 search (0-1)
            use_expansion: Whether to expand query

        Returns:
            Combined search results, best first.
        """
        all_message_scores = {}

        # Get expanded queries
        queries = self.expand_query(query) if use_expansion else [query]

        # Search with each query variation
        for q in queries:
            # Chunk/Vector search: every message in a matching chunk is credited.
            for r in self.search_chunks(q, limit=limit * 2):
                for msg_id in r['message_ids']:
                    entry = all_message_scores.setdefault(
                        msg_id, {'vector': 0, 'bm25': 0, 'chunk_text': None})
                    # Use max score across message appearances
                    entry['vector'] = max(entry['vector'], r['score'] * vector_weight)
                    if entry['chunk_text'] is None:
                        entry['chunk_text'] = r['text']

            # BM25 search
            for r in self.search_bm25(q, limit=limit * 2):
                entry = all_message_scores.setdefault(
                    r['message_id'], {'vector': 0, 'bm25': 0, 'chunk_text': None})
                entry['bm25'] = max(
                    entry['bm25'],
                    r['score'] * bm25_weight / 10  # Normalize BM25 scores
                )

        # Combine scores
        combined = [{
            'message_id': msg_id,
            'score': scores['vector'] + scores['bm25'],
            'vector_score': scores['vector'],
            'bm25_score': scores['bm25'],
            'chunk_text': scores['chunk_text']
        } for msg_id, scores in all_message_scores.items()]

        # Sort by combined score
        combined.sort(key=lambda x: x['score'], reverse=True)

        return combined[:limit]

    def search_with_context(self, query: str, limit: int = 20,
                            context_window: int = 3) -> List[Dict]:
        """
        Search and return results with surrounding context.

        Args:
            query: Search query
            limit: Max results
            context_window: Messages before/after to include

        Returns:
            Results with full context
        """
        # Get hybrid search results
        results = self.hybrid_search(query, limit=limit)

        if not results:
            return []

        # Get full context from DB
        conn = sqlite3.connect(self.messages_db)
        conn.row_factory = sqlite3.Row

        enriched = []
        for r in results:
            msg_id = r['message_id']

            msg = conn.execute(
                "SELECT * FROM messages WHERE id = ?", (msg_id,)
            ).fetchone()

            if not msg:
                # Index can reference messages since pruned from the DB.
                continue

            # Get surrounding messages (nearest by timestamp on each side).
            context_before = conn.execute("""
                SELECT id, date, from_name, text_plain FROM messages
                WHERE date_unixtime < (SELECT date_unixtime FROM messages WHERE id = ?)
                ORDER BY date_unixtime DESC LIMIT ?
            """, (msg_id, context_window)).fetchall()

            context_after = conn.execute("""
                SELECT id, date, from_name, text_plain FROM messages
                WHERE date_unixtime > (SELECT date_unixtime FROM messages WHERE id = ?)
                ORDER BY date_unixtime ASC LIMIT ?
            """, (msg_id, context_window)).fetchall()

            enriched.append({
                'message_id': msg_id,
                'score': r['score'],
                'message': {
                    'id': msg['id'],
                    'date': msg['date'],
                    'from_name': msg['from_name'],
                    'text': msg['text_plain']
                },
                # reversed() restores chronological order for the "before" rows.
                'context_before': [dict(m) for m in reversed(context_before)],
                'context_after': [dict(m) for m in context_after],
                'chunk_text': r.get('chunk_text')
            })

        conn.close()
        return enriched

    def stats(self) -> Dict[str, Any]:
        """Get search index statistics."""
        stats = {
            'chunks_available': os.path.exists(self.chunk_embeddings_db),
            'bm25_available': os.path.exists(self.bm25_index_path),
            'single_embeddings_available': os.path.exists(self.single_embeddings_db),
        }

        if stats['chunks_available']:
            conn = sqlite3.connect(self.chunk_embeddings_db)
            stats['chunk_count'] = conn.execute(
                "SELECT COUNT(*) FROM chunk_embeddings"
            ).fetchone()[0]
            conn.close()

        if stats['single_embeddings_available']:
            conn = sqlite3.connect(self.single_embeddings_db)
            stats['single_embedding_count'] = conn.execute(
                "SELECT COUNT(*) FROM embeddings"
            ).fetchone()[0]
            conn.close()

        return stats
446
+
447
+
448
# Singleton instance
_hybrid_search = None


def get_hybrid_search() -> HybridSearch:
    """Return the shared HybridSearch instance, building it lazily."""
    global _hybrid_search
    if _hybrid_search is None:
        _hybrid_search = HybridSearch()
    return _hybrid_search
458
+
459
+
460
# CLI for testing
if __name__ == '__main__':
    import sys

    if len(sys.argv) < 2:
        # No query: print usage plus index availability and exit.
        print("Usage: python hybrid_search.py 'search query'")
        print("\nStats:")
        print(get_hybrid_search().stats())
        sys.exit(0)

    query = ' '.join(sys.argv[1:])
    searcher = get_hybrid_search()

    print(f"\n=== Searching: {query} ===\n")

    # Show expanded queries
    print(f"Expanded queries: {searcher.expand_query(query)}\n")

    # Search and dump the top hits with their conversation context.
    for i, hit in enumerate(searcher.search_with_context(query, limit=5), 1):
        print(f"--- Result {i} (score: {hit['score']:.3f}) ---")
        print(f"From: {hit['message']['from_name']}")
        print(f"Date: {hit['message']['date']}")
        print(f"Text: {hit['message']['text'][:200]}...")
        if hit['context_before']:
            print(f"\nContext before:")
            for ctx in hit['context_before']:
                print(f" [{ctx['from_name']}] {ctx['text_plain'][:100]}...")
        if hit['context_after']:
            print(f"\nContext after:")
            for ctx in hit['context_after']:
                print(f" [{ctx['from_name']}] {ctx['text_plain'][:100]}...")
        print()
requirements.txt CHANGED
@@ -3,3 +3,5 @@ gunicorn>=21.2
3
  requests>=2.31
4
  ijson>=3.2
5
  huggingface_hub>=0.20
 
 
 
3
  requests>=2.31
4
  ijson>=3.2
5
  huggingface_hub>=0.20
6
+ rank_bm25>=0.2.2
7
+ google-generativeai>=0.3.0
semantic_search.py CHANGED
@@ -43,7 +43,7 @@ class SemanticSearch:
43
  )
44
  if self.model is None:
45
  print("Loading embedding model...")
46
- self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
47
  print("Model loaded!")
48
 
49
  def reload_embeddings(self):
@@ -117,8 +117,8 @@ class SemanticSearch:
117
  if len(self.message_ids) == 0:
118
  return []
119
 
120
- # Encode query
121
- query_emb = self.model.encode([query], convert_to_numpy=True)[0]
122
 
123
  # Compute cosine similarity with all embeddings
124
  # embeddings are already normalized from Colab
@@ -384,7 +384,7 @@ Answer:"""
384
  'available': True,
385
  'count': count,
386
  'size_mb': round(size_mb, 1),
387
- 'model': 'paraphrase-multilingual-MiniLM-L12-v2'
388
  }
389
 
390
 
 
43
  )
44
  if self.model is None:
45
  print("Loading embedding model...")
46
+ self.model = SentenceTransformer('intfloat/multilingual-e5-large')
47
  print("Model loaded!")
48
 
49
  def reload_embeddings(self):
 
117
  if len(self.message_ids) == 0:
118
  return []
119
 
120
+ # Encode query (e5 model requires "query: " prefix)
121
+ query_emb = self.model.encode([f"query: {query}"], convert_to_numpy=True)[0]
122
 
123
  # Compute cosine similarity with all embeddings
124
  # embeddings are already normalized from Colab
 
384
  'available': True,
385
  'count': count,
386
  'size_mb': round(size_mb, 1),
387
+ 'model': 'intfloat/multilingual-e5-large'
388
  }
389
 
390
 
templates/ai_search.html ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="he" dir="rtl">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>AI Search - Telegram Analytics</title>
7
+ <link rel="stylesheet" href="/static/css/style.css">
8
+ <style>
9
+ .ai-container {
10
+ max-width: 900px;
11
+ margin: 0 auto;
12
+ }
13
+ .ai-search-box {
14
+ background: var(--card-bg);
15
+ border-radius: var(--radius-lg);
16
+ padding: var(--spacing-lg);
17
+ margin-bottom: var(--spacing-lg);
18
+ border: 1px solid var(--border-color);
19
+ }
20
+ .ai-search-input {
21
+ width: 100%;
22
+ padding: var(--spacing-md);
23
+ font-size: 1.1rem;
24
+ border: 2px solid var(--border-color);
25
+ border-radius: var(--radius-md);
26
+ background: var(--bg-secondary);
27
+ color: var(--text-primary);
28
+ margin-bottom: var(--spacing-md);
29
+ direction: rtl;
30
+ }
31
+ .ai-search-input:focus {
32
+ outline: none;
33
+ border-color: var(--accent-color);
34
+ }
35
+ .ai-search-btn {
36
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
37
+ color: white;
38
+ border: none;
39
+ padding: var(--spacing-md) var(--spacing-xl);
40
+ font-size: 1rem;
41
+ font-weight: 600;
42
+ border-radius: var(--radius-md);
43
+ cursor: pointer;
44
+ display: flex;
45
+ align-items: center;
46
+ gap: var(--spacing-sm);
47
+ transition: transform 0.2s, box-shadow 0.2s;
48
+ }
49
+ .ai-search-btn:hover {
50
+ transform: translateY(-2px);
51
+ box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
52
+ }
53
+ .ai-search-btn:disabled {
54
+ opacity: 0.6;
55
+ cursor: not-allowed;
56
+ transform: none;
57
+ }
58
+ .ai-answer-box {
59
+ background: var(--card-bg);
60
+ border-radius: var(--radius-lg);
61
+ padding: var(--spacing-lg);
62
+ margin-bottom: var(--spacing-lg);
63
+ border: 1px solid var(--border-color);
64
+ display: none;
65
+ }
66
+ .ai-answer-box.visible {
67
+ display: block;
68
+ }
69
+ .ai-answer-header {
70
+ display: flex;
71
+ align-items: center;
72
+ gap: var(--spacing-sm);
73
+ margin-bottom: var(--spacing-md);
74
+ color: var(--accent-color);
75
+ font-weight: 600;
76
+ }
77
+ .ai-answer-content {
78
+ font-size: 1.1rem;
79
+ line-height: 1.8;
80
+ color: var(--text-primary);
81
+ white-space: pre-wrap;
82
+ }
83
+ .ai-sources {
84
+ margin-top: var(--spacing-lg);
85
+ padding-top: var(--spacing-lg);
86
+ border-top: 1px solid var(--border-color);
87
+ }
88
+ .ai-sources-header {
89
+ font-weight: 600;
90
+ color: var(--text-secondary);
91
+ margin-bottom: var(--spacing-md);
92
+ }
93
+ .source-item {
94
+ background: var(--bg-secondary);
95
+ border-radius: var(--radius-md);
96
+ padding: var(--spacing-md);
97
+ margin-bottom: var(--spacing-sm);
98
+ border-right: 3px solid var(--accent-color);
99
+ }
100
+ .source-meta {
101
+ font-size: 0.85rem;
102
+ color: var(--text-muted);
103
+ margin-bottom: var(--spacing-xs);
104
+ }
105
+ .source-text {
106
+ color: var(--text-secondary);
107
+ font-size: 0.95rem;
108
+ }
109
+ .context-message {
110
+ background: var(--bg-secondary);
111
+ border-radius: var(--radius-md);
112
+ padding: var(--spacing-md);
113
+ margin-bottom: var(--spacing-sm);
114
+ border-right: 3px solid transparent;
115
+ }
116
+ .context-message.main {
117
+ border-right-color: var(--accent-color);
118
+ background: var(--card-bg);
119
+ }
120
+ .status-badge {
121
+ display: inline-flex;
122
+ align-items: center;
123
+ gap: var(--spacing-xs);
124
+ padding: var(--spacing-xs) var(--spacing-sm);
125
+ border-radius: var(--radius-sm);
126
+ font-size: 0.8rem;
127
+ margin-right: var(--spacing-sm);
128
+ }
129
+ .status-badge.available {
130
+ background: rgba(46, 204, 113, 0.2);
131
+ color: #2ecc71;
132
+ }
133
+ .status-badge.unavailable {
134
+ background: rgba(231, 76, 60, 0.2);
135
+ color: #e74c3c;
136
+ }
137
+ .example-queries {
138
+ display: flex;
139
+ flex-wrap: wrap;
140
+ gap: var(--spacing-sm);
141
+ margin-top: var(--spacing-md);
142
+ }
143
+ .example-query {
144
+ background: var(--bg-secondary);
145
+ color: var(--text-secondary);
146
+ border: 1px solid var(--border-color);
147
+ padding: var(--spacing-xs) var(--spacing-sm);
148
+ border-radius: var(--radius-sm);
149
+ font-size: 0.85rem;
150
+ cursor: pointer;
151
+ transition: all 0.2s;
152
+ }
153
+ .example-query:hover {
154
+ background: var(--accent-color);
155
+ color: white;
156
+ border-color: var(--accent-color);
157
+ }
158
+ .loading-animation {
159
+ display: flex;
160
+ align-items: center;
161
+ gap: var(--spacing-sm);
162
+ }
163
+ .loading-dots {
164
+ display: flex;
165
+ gap: 4px;
166
+ }
167
+ .loading-dots span {
168
+ width: 8px;
169
+ height: 8px;
170
+ background: var(--accent-color);
171
+ border-radius: 50%;
172
+ animation: bounce 1.4s infinite ease-in-out both;
173
+ }
174
+ .loading-dots span:nth-child(1) { animation-delay: -0.32s; }
175
+ .loading-dots span:nth-child(2) { animation-delay: -0.16s; }
176
+ @keyframes bounce {
177
+ 0%, 80%, 100% { transform: scale(0); }
178
+ 40% { transform: scale(1); }
179
+ }
180
+ </style>
181
+ </head>
182
+ <body>
183
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
184
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
185
+ <!-- Sidebar -->
186
+ <nav class="sidebar">
187
+ <div class="logo">
188
+ <span class="logo-icon">&#128202;</span>
189
+ <span class="logo-text">TG Analytics</span>
190
+ </div>
191
+ <ul class="nav-menu">
192
+ <li class="nav-item">
193
+ <a href="/" class="nav-link">
194
+ <span class="icon">&#128200;</span>
195
+ <span>Overview</span>
196
+ </a>
197
+ </li>
198
+ <li class="nav-item">
199
+ <a href="/users" class="nav-link">
200
+ <span class="icon">&#128101;</span>
201
+ <span>Users</span>
202
+ </a>
203
+ </li>
204
+ <li class="nav-item">
205
+ <a href="/chat" class="nav-link">
206
+ <span class="icon">&#128172;</span>
207
+ <span>Chat</span>
208
+ </a>
209
+ </li>
210
+ <li class="nav-item">
211
+ <a href="/search" class="nav-link">
212
+ <span class="icon">&#128269;</span>
213
+ <span>Search</span>
214
+ </a>
215
+ </li>
216
+ <li class="nav-item active">
217
+ <a href="/ai-search" class="nav-link">
218
+ <span class="icon">&#129302;</span>
219
+ <span>AI Search</span>
220
+ </a>
221
+ </li>
222
+ <li class="nav-item">
223
+ <a href="/moderation" class="nav-link">
224
+ <span class="icon">&#128737;</span>
225
+ <span>Moderation</span>
226
+ </a>
227
+ </li>
228
+ <li class="nav-item">
229
+ <a href="/settings" class="nav-link">
230
+ <span class="icon">&#9881;</span>
231
+ <span>Settings</span>
232
+ </a>
233
+ </li>
234
+ </ul>
235
+ </nav>
236
+
237
+ <!-- Main Content -->
238
+ <main class="main-content">
239
+ <!-- Header -->
240
+ <header class="header">
241
+ <h1>&#129302; AI Search</h1>
242
+ <div class="header-controls">
243
+ <span id="gemini-status" class="status-badge unavailable">Checking...</span>
244
+ </div>
245
+ </header>
246
+
247
+ <div class="ai-container">
248
+ <!-- Search Box -->
249
+ <div class="ai-search-box">
250
+ <input type="text" id="ai-query" class="ai-search-input"
251
+ placeholder="&#1513;&#1488;&#1500; &#1513;&#1488;&#1500;&#1492; &#1489;&#1513;&#1508;&#1492; &#1496;&#1489;&#1506;&#1497;&#1514;... (&#1488;&#1497;&#1508;&#1492; &#1491;&#1504;&#1497; &#1490;&#1512;?)"
252
+ onkeypress="if(event.key === 'Enter') performAISearch()">
253
+ <div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: var(--spacing-md);">
254
+ <button onclick="performAISearch()" class="ai-search-btn" id="search-btn">
255
+ <span>&#129302;</span> Search with AI
256
+ </button>
257
+ <div class="example-queries">
258
+ <span style="color: var(--text-muted); font-size: 0.85rem;">Examples:</span>
259
+ <button class="example-query" onclick="setQuery('&#1488;&#1497;&#1508;&#1492; &#1491;&#1504;&#1497; &#1490;&#1512;?')">&#1488;&#1497;&#1508;&#1492; &#1491;&#1504;&#1497; &#1490;&#1512;?</button>
260
+ <button class="example-query" onclick="setQuery('&#1502;&#1497; &#1492;&#1499;&#1497; &#1508;&#1506;&#1497;&#1500; &#1489;&#1511;&#1489;&#1493;&#1510;&#1492;?')">&#1502;&#1497; &#1492;&#1499;&#1497; &#1508;&#1506;&#1497;&#1500;?</button>
261
+ <button class="example-query" onclick="setQuery('&#1502;&#1492; &#1491;&#1497;&#1489;&#1512;&#1493; &#1506;&#1500; &#1492;&#1489;&#1495;&#1497;&#1512;&#1493;&#1514;?')">&#1502;&#1492; &#1491;&#1497;&#1489;&#1512;&#1493; &#1506;&#1500;...?</button>
262
+ </div>
263
+ </div>
264
+ </div>
265
+
266
+ <!-- Answer Box -->
267
+ <div class="ai-answer-box" id="answer-box">
268
+ <div class="ai-answer-header">
269
+ <span>&#129302;</span> AI Answer
270
+ </div>
271
+ <div class="ai-answer-content" id="answer-content">
272
+ <!-- Answer will be inserted here -->
273
+ </div>
274
+ <div class="ai-sources" id="sources-section" style="display: none;">
275
+ <div class="ai-sources-header">&#128214; Sources Used</div>
276
+ <div id="sources-list">
277
+ <!-- Sources will be inserted here -->
278
+ </div>
279
+ </div>
280
+ </div>
281
+
282
+ <!-- How it works -->
283
+ <div class="chart-card">
284
+ <div class="chart-header">
285
+ <h3>&#128161; How AI Search Works</h3>
286
+ </div>
287
+ <div style="padding: var(--spacing-md); color: var(--text-secondary); font-size: 0.9rem; direction: rtl;">
288
+ <ol style="line-height: 2;">
289
+ <li><strong>Hybrid Search</strong> - &#1502;&#1495;&#1508;&#1513; &#1489;-BM25 (&#1502;&#1497;&#1500;&#1497;&#1501;) + Vector Search (&#1502;&#1513;&#1502;&#1506;&#1493;&#1514;)</li>
290
+ <li><strong>Thread Chunking</strong> - &#1502;&#1511;&#1489;&#1509; &#1513;&#1488;&#1500;&#1493;&#1514; &#1493;&#1514;&#1513;&#1493;&#1489;&#1493;&#1514; &#1497;&#1495;&#1491;</li>
291
+ <li><strong>Query Expansion</strong> - &#1502;&#1512;&#1495;&#1497;&#1489; &#1488;&#1514; &#1492;&#1513;&#1488;&#1497;&#1500;&#1514;&#1492; &#1506;&#1501; &#1502;&#1497;&#1500;&#1497;&#1501; &#1504;&#1512;&#1491;&#1508;&#1493;&#1514;</li>
292
+ <li><strong>Gemini 1.5 Flash</strong> - &#1502;&#1505;&#1499;&#1501; &#1488;&#1514; &#1492;&#1514;&#1493;&#1510;&#1488;&#1493;&#1514; &#1500;&#1514;&#1513;&#1493;&#1489;&#1492; &#1488;&#1495;&#1514;</li>
293
+ </ol>
294
+ </div>
295
+ </div>
296
+ </div>
297
+ </main>
298
+
299
+ <script>
300
+ // Check Gemini status on load
301
+ async function checkGeminiStatus() {
302
+ try {
303
+ const response = await fetch('/api/gemini/status');
304
+ const data = await response.json();
305
+
306
+ const badge = document.getElementById('gemini-status');
307
+ if (data.available) {
308
+ badge.className = 'status-badge available';
309
+ badge.innerHTML = '&#10003; Gemini Ready';
310
+ } else {
311
+ badge.className = 'status-badge unavailable';
312
+ badge.innerHTML = '&#10007; Gemini Unavailable';
313
+ }
314
+ } catch (e) {
315
+ const badge = document.getElementById('gemini-status');
316
+ badge.className = 'status-badge unavailable';
317
+ badge.innerHTML = '&#10007; Error';
318
+ }
319
+ }
320
+
321
+ function setQuery(query) {
322
+ document.getElementById('ai-query').value = query;
323
+ document.getElementById('ai-query').focus();
324
+ }
325
+
326
+ async function performAISearch() {
327
+ const query = document.getElementById('ai-query').value.trim();
328
+ if (!query) return;
329
+
330
+ const btn = document.getElementById('search-btn');
331
+ const answerBox = document.getElementById('answer-box');
332
+ const answerContent = document.getElementById('answer-content');
333
+ const sourcesSection = document.getElementById('sources-section');
334
+ const sourcesList = document.getElementById('sources-list');
335
+
336
+ // Show loading
337
+ btn.disabled = true;
338
+ btn.innerHTML = '<div class="loading-animation"><div class="loading-dots"><span></span><span></span><span></span></div> Searching...</div>';
339
+
340
+ answerBox.classList.add('visible');
341
+ answerContent.innerHTML = '<div class="loading-animation"><div class="loading-dots"><span></span><span></span><span></span></div> <span>Searching and analyzing...</span></div>';
342
+ sourcesSection.style.display = 'none';
343
+
344
+ try {
345
+ const response = await fetch('/api/gemini/search', {
346
+ method: 'POST',
347
+ headers: { 'Content-Type': 'application/json' },
348
+ body: JSON.stringify({ query, limit: 5 })
349
+ });
350
+
351
+ const data = await response.json();
352
+
353
+ if (data.error && !data.answer) {
354
+ answerContent.innerHTML = `<span style="color: #e74c3c;">&#10060; Error: ${escapeHtml(data.error)}</span>`;
355
+ } else if (data.success === false && data.error) {
356
+ // Gemini not available, show hybrid results
357
+ answerContent.innerHTML = `<span style="color: #f39c12;">&#9888; ${escapeHtml(data.error)}</span><br><br>Showing search results without AI summarization:`;
358
+
359
+ if (data.search_results && data.search_results.length > 0) {
360
+ displaySources(data.search_results);
361
+ }
362
+ } else {
363
+ // Success with AI answer
364
+ answerContent.textContent = data.answer || 'No answer available';
365
+
366
+ // Show sources
367
+ if (data.search_results && data.search_results.length > 0) {
368
+ displaySources(data.search_results);
369
+ } else if (data.sources && data.sources.length > 0) {
370
+ displaySourcesMeta(data.sources);
371
+ }
372
+ }
373
+ } catch (error) {
374
+ answerContent.innerHTML = `<span style="color: #e74c3c;">&#10060; Error: ${escapeHtml(error.message)}</span>`;
375
+ }
376
+
377
+ // Reset button
378
+ btn.disabled = false;
379
+ btn.innerHTML = '<span>&#129302;</span> Search with AI';
380
+ }
381
+
382
+ function displaySources(results) {
383
+ const sourcesSection = document.getElementById('sources-section');
384
+ const sourcesList = document.getElementById('sources-list');
385
+
386
+ sourcesSection.style.display = 'block';
387
+
388
+ sourcesList.innerHTML = results.map(result => {
389
+ const msg = result.message || result;
390
+ const score = result.score ? ` (${(result.score * 100).toFixed(0)}%)` : '';
391
+
392
+ let html = `<div class="source-item">
393
+ <div class="source-meta">
394
+ ${escapeHtml(msg.from_name || 'Unknown')} - ${msg.date || ''}${score}
395
+ </div>
396
+ <div class="source-text">${escapeHtml((msg.text || '').substring(0, 200))}${(msg.text || '').length > 200 ? '...' : ''}</div>`;
397
+
398
+ // Show context if available
399
+ if (result.context_before && result.context_before.length > 0) {
400
+ html += '<div style="margin-top: 0.5rem; padding-top: 0.5rem; border-top: 1px dashed var(--border-color);">';
401
+ result.context_before.forEach(ctx => {
402
+ html += `<div class="context-message"><small>${escapeHtml(ctx.from_name || '?')}</small>: ${escapeHtml((ctx.text_plain || '').substring(0, 100))}</div>`;
403
+ });
404
+ html += '</div>';
405
+ }
406
+
407
+ html += '</div>';
408
+ return html;
409
+ }).join('');
410
+ }
411
+
412
+ function displaySourcesMeta(sources) {
413
+ const sourcesSection = document.getElementById('sources-section');
414
+ const sourcesList = document.getElementById('sources-list');
415
+
416
+ if (sources.length === 0) return;
417
+
418
+ sourcesSection.style.display = 'block';
419
+ sourcesList.innerHTML = sources.map(src => `
420
+ <div class="source-item">
421
+ <div class="source-meta">
422
+ ${escapeHtml(src.from_name || 'Unknown')} - ${src.date || ''}
423
+ </div>
424
+ </div>
425
+ `).join('');
426
+ }
427
+
428
+ function escapeHtml(text) {
429
+ if (!text) return '';
430
+ const div = document.createElement('div');
431
+ div.textContent = text;
432
+ return div.innerHTML;
433
+ }
434
+
435
+ function toggleMobileMenu() {
436
+ var s = document.querySelector('.sidebar');
437
+ var o = document.querySelector('.sidebar-overlay');
438
+ s.classList.toggle('open');
439
+ if (o) o.classList.toggle('active');
440
+ }
441
+
442
+ // Initialize
443
+ document.addEventListener('DOMContentLoaded', () => {
444
+ checkGeminiStatus();
445
+ document.getElementById('ai-query').focus();
446
+ });
447
+ </script>
448
+ </body>
449
+ </html>
templates/index.html CHANGED
@@ -41,6 +41,12 @@
41
  <span>Search</span>
42
  </a>
43
  </li>
 
 
 
 
 
 
44
  <li class="nav-item">
45
  <a href="/moderation" class="nav-link">
46
  <span class="icon">🛡️</span>
 
41
  <span>Search</span>
42
  </a>
43
  </li>
44
+ <li class="nav-item">
45
+ <a href="/ai-search" class="nav-link">
46
+ <span class="icon">🤖</span>
47
+ <span>AI Search</span>
48
+ </a>
49
+ </li>
50
  <li class="nav-item">
51
  <a href="/moderation" class="nav-link">
52
  <span class="icon">🛡️</span>
templates/moderation.html CHANGED
@@ -41,6 +41,12 @@
41
  <span>Search</span>
42
  </a>
43
  </li>
 
 
 
 
 
 
44
  <li class="nav-item active">
45
  <a href="/moderation" class="nav-link">
46
  <span class="icon">🛡️</span>
 
41
  <span>Search</span>
42
  </a>
43
  </li>
44
+ <li class="nav-item">
45
+ <a href="/ai-search" class="nav-link">
46
+ <span class="icon">🤖</span>
47
+ <span>AI Search</span>
48
+ </a>
49
+ </li>
50
  <li class="nav-item active">
51
  <a href="/moderation" class="nav-link">
52
  <span class="icon">🛡️</span>
templates/search.html CHANGED
@@ -40,6 +40,12 @@
40
  <span>Search</span>
41
  </a>
42
  </li>
 
 
 
 
 
 
43
  <li class="nav-item">
44
  <a href="/moderation" class="nav-link">
45
  <span class="icon">🛡️</span>
 
40
  <span>Search</span>
41
  </a>
42
  </li>
43
+ <li class="nav-item">
44
+ <a href="/ai-search" class="nav-link">
45
+ <span class="icon">🤖</span>
46
+ <span>AI Search</span>
47
+ </a>
48
+ </li>
49
  <li class="nav-item">
50
  <a href="/moderation" class="nav-link">
51
  <span class="icon">🛡️</span>
templates/settings.html CHANGED
@@ -200,6 +200,12 @@
200
  <span>Search</span>
201
  </a>
202
  </li>
 
 
 
 
 
 
203
  <li class="nav-item">
204
  <a href="/moderation" class="nav-link">
205
  <span class="icon">🛡️</span>
 
200
  <span>Search</span>
201
  </a>
202
  </li>
203
+ <li class="nav-item">
204
+ <a href="/ai-search" class="nav-link">
205
+ <span class="icon">🤖</span>
206
+ <span>AI Search</span>
207
+ </a>
208
+ </li>
209
  <li class="nav-item">
210
  <a href="/moderation" class="nav-link">
211
  <span class="icon">🛡️</span>
templates/users.html CHANGED
@@ -41,6 +41,12 @@
41
  <span>Search</span>
42
  </a>
43
  </li>
 
 
 
 
 
 
44
  <li class="nav-item">
45
  <a href="/moderation" class="nav-link">
46
  <span class="icon">🛡️</span>
 
41
  <span>Search</span>
42
  </a>
43
  </li>
44
+ <li class="nav-item">
45
+ <a href="/ai-search" class="nav-link">
46
+ <span class="icon">🤖</span>
47
+ <span>AI Search</span>
48
+ </a>
49
+ </li>
50
  <li class="nav-item">
51
  <a href="/moderation" class="nav-link">
52
  <span class="icon">🛡️</span>