NeerajCodz committed on
Commit
c18dee2
·
verified ·
1 Parent(s): 1cb9d55

Initial commit of RAG Slack bot

Browse files
Files changed (4) hide show
  1. Dockerfile +20 -0
  2. README.md +29 -10
  3. app.py +181 -0
  4. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image.
# requirements.txt pins torch==2.1.0, which publishes no wheels for
# Python 3.12, so build on 3.11 until that pin is raised.
FROM python:3.11-slim

# Hugging Face Spaces runs the container as UID 1000. Create that user up
# front so the app has a writable home directory for model caches.
RUN useradd -m -u 1000 user

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file first so the dependency layer is cached
# independently of application code changes.
COPY requirements.txt .

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the current directory contents into the container, owned by the
# runtime user.
COPY --chown=user . .

# Drop root and point the Hugging Face cache at a writable location.
USER user
ENV HOME=/home/user \
    HF_HOME=/home/user/.cache/huggingface

# Expose port 7860 (default for HF Spaces, or adjust as needed)
EXPOSE 7860

# Run the application with uvicorn
CMD ["uvicorn", "app:api", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,29 @@
1
- ---
2
- title: Rag Slack
3
- emoji: 🚀
4
- colorFrom: yellow
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Slack RAG Bot
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # Slack RAG Bot
11
+
12
+ A production-ready Slack bot that processes documents (PDF/DOCX) and answers questions using RAG (Retrieval-Augmented Generation).
13
+
14
+ ## Features
15
+
16
+ - 📄 Process PDF and DOCX files
17
+ - 🔍 Vector search using Supabase
18
+ - 💬 Answer questions based on uploaded documents
19
+ - 🤖 Powered by sentence-transformers and RoBERTa
20
+
21
+ ## Environment Variables
22
+
23
+ Set these in your HuggingFace Space settings:
24
+
25
+ - `HF_TOKEN`: Your HuggingFace token (optional for public models)
26
+ - `SUPABASE_URL`: Your Supabase project URL
27
+ - `SUPABASE_KEY`: Your Supabase anon key
28
+ - `SLACK_BOT_TOKEN`: Your Slack bot token (xoxb-...)
29
+ - `SLACK_SIGNING_SECRET`: Your Slack signing secret
app.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import re
4
+ from typing import List, Dict, Any
5
+ from fastapi import FastAPI, Request
6
+ from slack_bolt import App
7
+ from slack_bolt.adapter.fastapi import SlackRequestHandler
8
+ from sentence_transformers import SentenceTransformer
9
+ from transformers import pipeline
10
+ from supabase import create_client, Client
11
+ import pypdf
12
+ from docx import Document
13
+ import requests
14
+ import uvicorn
15
+
16
# Load secrets from environment variables
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
SLACK_BOT_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
SLACK_SIGNING_SECRET = os.environ.get("SLACK_SIGNING_SECRET")
# In this module the client id/secret are only read by the OAuth callback
# route, so they are not treated as required at startup.
SLACK_CLIENT_ID = os.environ.get("SLACK_CLIENT_ID")
SLACK_CLIENT_SECRET = os.environ.get("SLACK_CLIENT_SECRET")
HF_TOKEN = os.environ.get("HF_TOKEN")  # Optional for public models, but suppresses warnings

# Fail fast with a single readable message instead of letting the Supabase
# or Slack client constructors raise their own, less specific errors.
_missing_env = [
    name
    for name, value in (
        ("SUPABASE_URL", SUPABASE_URL),
        ("SUPABASE_KEY", SUPABASE_KEY),
        ("SLACK_BOT_TOKEN", SLACK_BOT_TOKEN),
        ("SLACK_SIGNING_SECRET", SLACK_SIGNING_SECRET),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env)
    )

# Set HF_TOKEN if provided (helps with authentication for Hub access)
if HF_TOKEN:
    from huggingface_hub import login

    login(token=HF_TOKEN)

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

app = App(token=SLACK_BOT_TOKEN, signing_secret=SLACK_SIGNING_SECRET)
api = FastAPI()

# Both models are loaded once at import time and shared by every request.
print("Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Loading QA model...")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
print("Models loaded successfully!")
40
+
41
def download_slack_file(url: str, token: str, timeout: float = 30.0) -> bytes:
    """Download a private Slack file and return its raw bytes.

    Args:
        url: The file's download URL.
        token: Token sent as a ``Bearer`` credential in the Authorization header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would pin the
            event handler on a stalled connection.

    Returns:
        The response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(
        url,
        headers={"Authorization": f"Bearer {token}"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.content
46
+
47
def extract_text_from_pdf(file_content: bytes) -> str:
    """Return the text of every page of a PDF, with a newline after each page.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        The concatenated page texts; empty when the document has no pages.
    """
    reader = pypdf.PdfReader(io.BytesIO(file_content))
    return "".join(page.extract_text() + "\n" for page in reader.pages)
53
+
54
def extract_text_from_docx(file_content: bytes) -> str:
    """Return the text of every paragraph of a DOCX, one paragraph per line.

    Args:
        file_content: Raw bytes of the document.

    Returns:
        Each paragraph's text followed by a newline, concatenated in order.
    """
    document = Document(io.BytesIO(file_content))
    return "".join(para.text + "\n" for para in document.paragraphs)
60
+
61
def chunk_text(text: str, chunk_size: int = 300) -> List[str]:
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are whitespace-delimited; runs of whitespace (including newlines)
    collapse to single spaces in the output.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.

    Returns:
        The chunks in document order; an empty list when ``text`` contains
        no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size would
            otherwise silently yield no chunks and drop the whole document.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    words = text.split()
    # str.split() never yields empty words, so every slice joins to a
    # non-blank chunk and needs no further filtering.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
69
+
70
def embed_text(text: str) -> List[float]:
    """Encode ``text`` with the shared embedding model and return it as a list."""
    return embedding_model.encode(text).tolist()
73
+
74
def store_embeddings(chunks: List[str], batch_size: int = 100):
    """Embed each chunk and insert it into the ``documents`` table.

    Rows are sent in batches so a large document costs a handful of requests
    rather than one round trip per chunk.

    Args:
        chunks: Text chunks to embed and store. An empty list sends nothing.
        batch_size: Maximum number of rows per insert request. Must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    for start in range(0, len(chunks), batch_size):
        rows = [
            {"content": chunk, "embedding": embed_text(chunk)}
            for chunk in chunks[start:start + batch_size]
        ]
        supabase.table("documents").insert(rows).execute()
81
+
82
def search_documents(query: str, match_count: int = 5) -> List[Dict[str, Any]]:
    """Embed ``query`` and return the result of the ``match_documents`` RPC.

    Args:
        query: The search text.
        match_count: Value passed to the RPC as ``match_count``.

    Returns:
        The ``data`` attribute of the RPC response.
    """
    rpc_params = {
        "query_embedding": embed_text(query),
        "match_count": match_count,
    }
    response = supabase.rpc("match_documents", rpc_params).execute()
    return response.data
89
+
90
def answer_question(question: str, context: str) -> str:
    """Answer ``question`` from ``context`` using the shared QA pipeline.

    Args:
        question: The user's question.
        context: Text to search for the answer; only its first 4096
            characters are passed to the pipeline.

    Returns:
        The pipeline's ``answer`` field, or a fixed fallback message when
        ``context`` is empty or whitespace-only.
    """
    if context.strip():
        prediction = qa_pipeline(question=question, context=context[:4096])
        return prediction['answer']
    return "No relevant documents found."
95
+
96
@app.event("file_shared")
def handle_file_shared(event, say, client):
    """Ingest a shared PDF/DOCX file into the knowledge base.

    Looks up the file's metadata, rejects unsupported types before
    downloading anything, extracts and chunks the text, stores the chunk
    embeddings, and reports the outcome back via ``say``.

    Args:
        event: The ``file_shared`` event payload; ``file_id`` is read from it.
        say: Callable used to post a message in reply.
        client: Slack Web API client used for ``files_info``.
    """
    file_data = client.files_info(file=event["file_id"])["file"]

    # Guard against a present-but-null mimetype so the substring checks
    # below cannot raise TypeError.
    file_type = file_data.get("mimetype") or ""
    file_url = file_data.get("url_private_download")

    if not file_url:
        return

    # Choose the extractor first so unsupported files are never downloaded.
    if "pdf" in file_type:
        extract = extract_text_from_pdf
    elif "wordprocessingml" in file_type or "msword" in file_type:
        # NOTE(review): "msword" presumably matches legacy .doc uploads, which
        # the DOCX extractor may not be able to parse; such failures are
        # reported through the error branch below. Confirm intended handling.
        extract = extract_text_from_docx
    else:
        say("Unsupported file type. Please upload PDF or DOCX files.")
        return

    try:
        file_content = download_slack_file(file_url, SLACK_BOT_TOKEN)
        chunks = chunk_text(extract(file_content))

        if not chunks:
            # E.g. a document with no extractable text: do not claim success
            # when nothing was stored.
            say("⚠️ No extractable text was found in this file, so nothing was added to the knowledge base.")
            return

        store_embeddings(chunks)
        say(f"✅ File processed successfully! Added {len(chunks)} chunks to knowledge base.")
    except Exception as e:
        # Top-level handler boundary: report the failure to the channel
        # instead of letting the event handler crash.
        say(f"❌ Error processing file: {str(e)}")
126
+
127
@app.event("app_mention")
def handle_mention(event, say):
    """Answer a question asked by mentioning the bot.

    Strips ``<@...>`` mention tokens from the message text, retrieves up to
    five matching document chunks, and replies with the QA model's answer.

    Args:
        event: The ``app_mention`` event payload; ``text`` is read from it.
        say: Callable used to post a message in reply.
    """
    user_query = re.sub(r'<@[A-Z0-9]+>', '', event["text"]).strip()

    if not user_query:
        say("Please ask me a question!")
        return

    try:
        matches = search_documents(user_query, match_count=5)

        if matches:
            # Concatenate the retrieved chunks into one context passage.
            context = " ".join(doc["content"] for doc in matches)
            answer = answer_question(user_query, context)
            say(f"💡 *Answer:* {answer}")
        else:
            say("I couldn't find any relevant information in my knowledge base.")
    except Exception as e:
        say(f"❌ Error answering question: {str(e)}")
149
+
150
# Adapter that lets the FastAPI route hand incoming requests to the Bolt app.
handler = SlackRequestHandler(app)


@api.post("/slack/events")
async def slack_events(request: Request):
    """Forward an incoming Slack request to the Bolt handler and return its response."""
    response = await handler.handle(request)
    return response
155
+
156
@api.get("/")
async def root():
    """Return a static status payload that points callers at the events endpoint."""
    payload = {
        "status": "Slack RAG Bot is running!",
        "message": "Use /slack/events endpoint for Slack events",
    }
    return payload
159
+
160
@api.get("/health")
async def health():
    """Return a constant ``ok`` status for health checks."""
    status = {"status": "ok"}
    return status
163
+
164
@api.get("/slack/oauth/callback")
async def oauth_callback(code: str, state: str = None):
    """Complete a Slack OAuth installation by exchanging ``code`` for a token.

    Args:
        code: Temporary authorization code supplied by Slack on redirect.
        state: Optional state value from the redirect. It is accepted but
            not checked here.

    Returns:
        A dict with ``status`` and the installing workspace's ``team_id``.
    """
    import asyncio

    from slack_sdk.web import WebClient

    client = WebClient()
    # WebClient is synchronous; run the HTTP exchange in a worker thread so
    # this coroutine does not block the event loop while Slack responds.
    oauth_response = await asyncio.to_thread(
        client.oauth_v2_access,
        client_id=SLACK_CLIENT_ID,
        client_secret=SLACK_CLIENT_SECRET,
        code=code,
    )

    # TODO: persist the token for this workspace. The response is currently
    # discarded after the team id is read, so nothing is saved.
    # NOTE(review): `state` is never validated against an issued value;
    # confirm whether CSRF protection is handled elsewhere.
    return {"status": "success", "team_id": oauth_response["team"]["id"]}
179
+
180
if __name__ == "__main__":
    # Honor a PORT override from the environment, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(api, host="0.0.0.0", port=listen_port)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn==0.24.0
3
+ slack-bolt==1.18.0
4
+ sentence-transformers==2.2.2
5
+ transformers==4.35.2
6
+ supabase==2.0.3
7
+ pypdf==3.17.1
8
+ python-docx==1.1.0
9
+ requests==2.31.0
10
+ torch==2.1.0
11
+ huggingface-hub==0.17.3