# Log Analyzer: a Gradio app that answers questions about SSH authentication
# logs, using evidence retrieved from a SQLite database and an LLM for wording.
import os
import re
import json
import shutil
import sqlite3
import gradio as gr
from huggingface_hub import InferenceClient, hf_hub_download

# ---------------------------------
# Config
# ---------------------------------
# File name of the SQLite database, used both for the download and the local copy.
DB_FILENAME = "auth_llm-v3.sqlite"
# Local path (relative to the working directory) where the database is expected.
DB_PATH = f"./{DB_FILENAME}"
# Hugging Face dataset repo the database is downloaded from when it is missing locally.
DATASET_REPO_ID = "ameyjoshi8198/auth-log-db"

# Read eagerly: a missing HF_TOKEN environment variable raises KeyError at import time.
HF_TOKEN = os.environ["HF_TOKEN"]
# Shared inference client, authenticated with the token above.
client = InferenceClient(token=HF_TOKEN)

# Model identifier passed to client.chat_completion.
# NOTE(review): the ":novita" suffix presumably selects an inference provider - confirm.
MODEL_NAME = "inclusionAI/Ling-2.6-1T:novita"

# ---------------------------------
# DB setup
# ---------------------------------
def ensure_database():
    """Make sure the SQLite database is available at DB_PATH.

    The local file is treated as unusable when it does not exist or is
    smaller than 1024 bytes; in that case it is downloaded from the dataset
    repo and copied to DB_PATH. The path and size are printed either way.
    """
    needs_download = True
    if os.path.exists(DB_PATH):
        needs_download = os.path.getsize(DB_PATH) < 1024

    if needs_download:
        print("Downloading SQLite database from HF dataset repo...")
        cached_copy = hf_hub_download(
            filename=DB_FILENAME,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        # Only copy when the download did not already land on the target path.
        if cached_copy != DB_PATH:
            shutil.copy(cached_copy, DB_PATH)

    print(f"Database ready at {DB_PATH}")
    print(f"Database size: {os.path.getsize(DB_PATH)} bytes")

def debug_database():
    """Print and return the names of the tables in the SQLite database.

    Returns:
        list[str]: table names read from ``sqlite_master``.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = [row[0] for row in cursor.fetchall()]
    finally:
        # Release the handle even when the query raises (e.g. a corrupt file).
        conn.close()
    print("Available tables:", tables)
    return tables

# ---------------------------------
# Helpers
# ---------------------------------
def extract_ip(text):
    """Return the first valid dotted-quad IPv4 address found in *text*.

    Args:
        text: free-form text, e.g. a user question.

    Returns:
        The address as a string, or None when *text* contains no candidate
        whose four octets are all in the range 0-255.
    """
    for match in re.finditer(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", text):
        candidate = match.group(0)
        # The pattern alone also accepts impossible octets such as "999".
        if all(int(octet) <= 255 for octet in candidate.split(".")):
            return candidate
    return None

def extract_hour(text):
    """Extract an hour of day (0-23) mentioned in *text*.

    Candidates are tried from most to least explicit:

    1. a 12-hour time with an am/pm suffix ("3pm", "11 am", "3:30pm"),
       converted to 24-hour form;
    2. a 24-hour clock time ("14:30");
    3. the first bare one- or two-digit number that is a valid hour.
       This fallback is ambiguous: a day of month such as "dec 10" is
       read as hour 10 when no more explicit time is present.

    Args:
        text: free-form text, e.g. a user question.

    Returns:
        The hour as an int in 0-23, or None when no hour is found.
    """
    lowered = text.lower()

    meridiem = re.search(r"\b(\d{1,2})(?::\d{2})?\s*(am|pm)\b", lowered)
    if meridiem:
        hour = int(meridiem.group(1))
        if 1 <= hour <= 12:
            # 12am is midnight (0) and 12pm is noon (12).
            hour %= 12
            if meridiem.group(2) == "pm":
                hour += 12
            return hour

    clock = re.search(r"\b(\d{1,2}):\d{2}\b", lowered)
    if clock and int(clock.group(1)) <= 23:
        return int(clock.group(1))

    for bare in re.finditer(r"\b(\d{1,2})\b", lowered):
        hour = int(bare.group(1))
        # Skip numbers that cannot be an hour, such as "99".
        if hour <= 23:
            return hour
    return None

def extract_date_fragment(text):
    """Return the three-letter abbreviation of the first month named in *text*.

    Only whole words are matched (abbreviated or full month names), so words
    that merely contain a month abbreviation ("summary", "doctor", "decline")
    are ignored. The word "may" is still matched even when it is used as a
    verb.

    Args:
        text: free-form text, e.g. a user question.

    Returns:
        A lowercase abbreviation such as "jan" or "dec", or None when no
        month name is present.
    """
    match = re.search(
        r"\b(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?"
        r"|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?"
        r"|dec(?:ember)?)\b",
        text.lower(),
    )
    return match.group(1)[:3] if match else None

def detect_intent(question):
    """Classify a question into one of the retrieval intents.

    Rules are checked in order and the first hit wins: an IP address in the
    question, then keywords for incidents, top threats, summaries, event
    types and time slices.

    The short keywords "top", "at" and "around" are matched as whole words
    only. As plain substrings they also fire inside unrelated words ("stop",
    "what", "that", "data"), which misroutes most questions.

    Args:
        question: the user's question.

    Returns:
        One of "ip_drilldown", "incidents", "top_threats", "summary",
        "event_types", "time_slice" or "general".
    """
    q = question.lower()

    if extract_ip(q):
        return "ip_drilldown"
    if "incident" in q:
        return "incidents"
    if re.search(r"\btop\b", q) or "suspicious" in q or "threat" in q:
        return "top_threats"
    if "summary" in q or "report" in q:
        return "summary"
    if "event type" in q or "common event" in q:
        return "event_types"
    if "what happened" in q or re.search(r"\b(?:around|at)\b", q):
        return "time_slice"
    return "general"

# ---------------------------------
# SQL retrieval
# ---------------------------------
def query_db(sql, params=()):
    """Run a query against the SQLite database and return all rows as dicts.

    Args:
        sql: SQL statement, using ``?`` placeholders for values.
        params: sequence of values bound to the placeholders.

    Returns:
        list[dict]: one dict per row, keyed by column name.

    Raises:
        sqlite3.Error: if the database cannot be opened or the query fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        # sqlite3.Row lets each row be converted to a column-name keyed dict.
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return [dict(r) for r in cursor.fetchall()]
    finally:
        # Close the connection even when the query raises.
        conn.close()

def retrieve_top_threats():
    """Return the ten rows of ``ip_profiles`` with the highest threat_score."""
    top_threats_sql = """
        SELECT src_ip, threat_score, severity, event_count, session_count,
               failed_password_hits, invalid_user_hits, top_usernames
        FROM ip_profiles
        ORDER BY threat_score DESC
        LIMIT 10
    """
    return query_db(top_threats_sql)

def retrieve_incidents():
    """Return the ten rows of ``incidents`` with the latest start_time."""
    recent_incidents_sql = """
        SELECT incident_id, src_ip, start_time, end_time, event_count,
               session_count, failed_password_hits, invalid_user_hits, top_usernames
        FROM incidents
        ORDER BY start_time DESC
        LIMIT 10
    """
    return query_db(recent_incidents_sql)

def retrieve_summary():
    """Return the ten rows of ``daily_summary`` with the highest daybucket."""
    daily_summary_sql = """
        SELECT *
        FROM daily_summary
        ORDER BY daybucket DESC
        LIMIT 10
    """
    return query_db(daily_summary_sql)

def retrieve_event_types():
    """Return the ten most frequent event types in ``events`` with their counts."""
    event_type_counts_sql = """
        SELECT event_type, COUNT(*) AS hits
        FROM events
        GROUP BY event_type
        ORDER BY hits DESC
        LIMIT 10
    """
    return query_db(event_type_counts_sql)

def retrieve_ip_drilldown(ip):
    """Collect everything stored about a single source IP.

    Args:
        ip: the source IP address to look up.

    Returns:
        dict with four lists of row dicts: "profile" (from ip_profiles),
        "incidents" (latest 10 by start_time), "explanations" (from
        ip_explanations) and "recent_events" (latest 25 by timestamp).
    """
    profile_sql = """
        SELECT *
        FROM ip_profiles
        WHERE src_ip = ?
    """
    incidents_sql = """
        SELECT *
        FROM incidents
        WHERE src_ip = ?
        ORDER BY start_time DESC
        LIMIT 10
    """
    explanations_sql = """
        SELECT *
        FROM ip_explanations
        WHERE src_ip = ?
    """
    recent_events_sql = """
        SELECT *
        FROM events
        WHERE src_ip = ?
        ORDER BY timestamp DESC
        LIMIT 25
    """

    # Every query filters on the same single bound value.
    bound_ip = (ip,)
    return {
        "profile": query_db(profile_sql, bound_ip),
        "incidents": query_db(incidents_sql, bound_ip),
        "explanations": query_db(explanations_sql, bound_ip),
        "recent_events": query_db(recent_events_sql, bound_ip),
    }

def retrieve_time_slice(question):
    """Return up to 50 recent events filtered by the hour and/or month in *question*.

    Args:
        question: the user's question; an hour and a month name are
            extracted from it when present.

    Returns:
        list[dict]: matching rows from ``events``, newest first. With
        neither an hour nor a month in the question, the 50 newest events
        are returned.
    """
    month_numbers = {
        "jan": "01", "feb": "02", "mar": "03", "apr": "04",
        "may": "05", "jun": "06", "jul": "07", "aug": "08",
        "sep": "09", "oct": "10", "nov": "11", "dec": "12",
    }

    hour = extract_hour(question)
    month_fragment = extract_date_fragment(question)

    sql = """
        SELECT timestamp, src_ip, username, event_type, auth_phase, severity_hint
        FROM events
        WHERE 1=1
    """
    params = []

    if hour is not None:
        sql += " AND CAST(strftime('%H', timestamp) AS INTEGER) = ?"
        params.append(hour)

    if month_fragment:
        month_number = month_numbers.get(month_fragment)
        if month_number is None:
            sql += " AND lower(timestamp) LIKE ?"
            params.append(f"%{month_fragment}%")
        else:
            # The hour filter relies on strftime(), which only parses
            # ISO-style timestamps, and those never contain a month name.
            # Accept either a textual or a numeric month match so the filter
            # works for both timestamp styles. strftime() yields NULL for
            # text it cannot parse, so the OR then falls back to LIKE alone.
            # NOTE(review): the actual timestamp format is not visible here - confirm.
            sql += " AND (lower(timestamp) LIKE ? OR strftime('%m', timestamp) = ?)"
            params.append(f"%{month_fragment}%")
            params.append(month_number)

    sql += " ORDER BY timestamp DESC LIMIT 50"

    return query_db(sql, tuple(params))

def retrieve_evidence(question):
    """Fetch the database evidence that matches the question's intent.

    Args:
        question: the user's question.

    Returns:
        dict with the detected "intent" and the retrieved "data". For the
        "general" intent, "data" bundles top threats, recent incidents and
        event type counts.
    """
    intent = detect_intent(question)

    # One retrieval callable per specific intent; "general" is the fallback.
    fetchers = {
        "top_threats": retrieve_top_threats,
        "incidents": retrieve_incidents,
        "summary": retrieve_summary,
        "event_types": retrieve_event_types,
        "ip_drilldown": lambda: retrieve_ip_drilldown(extract_ip(question)),
        "time_slice": lambda: retrieve_time_slice(question),
    }

    fetch = fetchers.get(intent)
    if fetch is not None:
        return {"intent": intent, "data": fetch()}

    overview = {
        "top_threats": retrieve_top_threats(),
        "recent_incidents": retrieve_incidents(),
        "event_types": retrieve_event_types(),
    }
    return {"intent": "general", "data": overview}

# ---------------------------------
# Answer generation
# ---------------------------------
def _has_evidence(data):
    """Return True when *data* contains at least one retrieved row."""
    if isinstance(data, dict):
        # Drill-down and general results are dicts of row lists; a dict whose
        # lists are all empty is truthy but carries no evidence.
        return any(_has_evidence(value) for value in data.values())
    return bool(data)


def answer_question(question):
    """Answer a question about the logs, grounded in database evidence.

    Evidence is retrieved for the question and handed to the chat model with
    instructions to rely on that evidence only.

    Args:
        question: the user's question; may be empty or None.

    Returns:
        str: the model's answer, a prompt to enter a question when the input
        is blank, a "no evidence" message when nothing was retrieved, or
        "Error: ..." when retrieval or generation fails.
    """
    if not question or not question.strip():
        return "Please enter a question about the logs."

    try:
        evidence = retrieve_evidence(question)

        if not evidence or not _has_evidence(evidence.get("data")):
            return "I could not find relevant evidence in the database for that question."

        prompt = f"""
You are a security log analyst.

Use ONLY the evidence below.
Do not invent facts.
If the evidence is incomplete, say so clearly.
Prefer concrete observations over speculation.

Question:
{question}

Retrieved evidence:
{json.dumps(evidence, indent=2, default=str)}
"""

        response = client.chat_completion(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1024
        )

        return response.choices[0].message.content

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error: {str(e)}"

# ---------------------------------
# Startup
# ---------------------------------
# Runs at import time: make sure the database file is present, then print
# the tables it contains.
ensure_database()
debug_database()

# ---------------------------------
# Gradio app
# ---------------------------------
# Single-function UI: one question textbox in, one answer textbox out,
# both wired to answer_question.
demo = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="Ask a question about the logs", lines=2, placeholder="e.g. Why is 173.234.31.186 suspicious?"),
    outputs=gr.Textbox(label="Answer", lines=16),
    title="Log Analyzer",
    description="Ask grounded questions about the open source SSH log dataset."
)

# Serve on all network interfaces, port 7860; this runs at import time.
demo.launch(server_name="0.0.0.0", server_port=7860)