Luciano665 committed on
Commit
6c1db65
·
verified ·
1 Parent(s): 26ff495

New main files uploaded test-1

Browse files
Files changed (2) hide show
  1. app.py +277 -0
  2. requirements.txt +102 -0
app.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import logging
4
+ from git import Repo
5
+ from dotenv import load_dotenv
6
+ from sentence_transformers import SentenceTransformer
7
+ from langchain_pinecone import PineconeVectorStore
8
+ from langchain.schema import Document
9
+ from tree_sitter_languages import get_parser
10
+ from pinecone import Pinecone
11
+ import openai
12
+ import numpy as np
13
+
14
+ # Load environment variables
15
+ load_dotenv()
16
+
17
+ # Logging Configuration
18
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Environment Variables
22
+ CLONE_DIR = "./cloned_repos"
23
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
24
+ PINECONE_INDEX_KEY = "codebase-app"
25
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
26
+
27
+ # Initialize GROQ API
28
+ client = openai.OpenAI(
29
+ base_url="https://api.groq.com/openai/v1",
30
+ api_key=GROQ_API_KEY
31
+ )
32
+
33
+ # Initialize Pinecone
34
+ pinecone_client = Pinecone(api_key=PINECONE_API_KEY)
35
+ pinecone_index = pinecone_client.Index(PINECONE_INDEX_KEY)
36
+
37
+ # Initialize SentenceTransformer Embedding Model
38
+ embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
39
+
40
+ # Supported Extensions
41
+ SUPPORTED_EXTENSIONS = {".py", ".java", ".js", ".ts", ".cpp", ".h", ".ipynb"}
42
+ IGNORED_DIRS = {"node_modules", "venv", "env", ".git", "__pycache__"}
43
+
44
+ # Backend Logic: Clone Repository
45
def clone_repository(repo_url: str) -> str:
    """Clone the GitHub repository locally and return its checkout path.

    If the repository was cloned previously, the existing checkout is reused.

    Args:
        repo_url: HTTPS/SSH URL of the repository (with or without ".git").

    Returns:
        Path to the local checkout inside CLONE_DIR.
    """
    # Derive the directory name from the last URL segment; rstrip tolerates a
    # trailing "/" that would otherwise produce an empty repo name.
    repo_name = repo_url.rstrip("/").split("/")[-1].replace(".git", "")
    repo_path = os.path.join(CLONE_DIR, repo_name)
    # exist_ok avoids the check-then-create race of the original exists()/makedirs pair.
    os.makedirs(CLONE_DIR, exist_ok=True)
    if os.path.exists(repo_path):
        logger.info(f"Repository already exists: {repo_path}")
        return repo_path
    Repo.clone_from(repo_url, repo_path)
    logger.info(f"Cloned repository to: {repo_path}")
    return repo_path
57
+
58
+ # Backend Logic: Parse Repository
59
class SimpleTreeSitterParser:
    """Thin wrapper around a tree-sitter parser that yields top-level code chunks."""

    def __init__(self, language: str):
        self.language = language
        try:
            # get_parser only needs the language name.
            self.parser = get_parser(language)
        except Exception as exc:
            logger.error(f"Error initializing parser for {language}: {exc}")
            raise ValueError(f"Parser error for {language}: {exc}")

    def parse(self, code: str) -> list:
        """Split *code* into one chunk per direct child of the syntax-tree root.

        Each chunk records the node type, its source text, and 1-based line
        bounds. Returns an empty list when parsing fails.
        """
        try:
            root = self.parser.parse(code.encode("utf-8")).root_node
            return [
                {
                    "type": node.type,
                    "content": code[node.start_byte:node.end_byte],
                    "start_line": node.start_point[0] + 1,
                    "end_line": node.end_point[0] + 1,
                }
                for node in root.children
            ]
        except Exception as exc:
            logger.error(f"Error parsing code: {exc}")
            return []
85
+
86
# Extension → tree-sitter grammar name. ".h" headers parse with the cpp grammar;
# ".ipynb" notebooks are JSON (no grammar here) and are skipped with a warning.
_EXT_LANGUAGE = {
    ".py": "python",
    ".ts": "typescript",
    ".js": "javascript",
    ".java": "java",
    ".cpp": "cpp",
    ".h": "cpp",
}


def parse_repository(repo_path: str) -> list:
    """Parse repository files into meaningful chunks.

    Walks *repo_path*, skipping IGNORED_DIRS, and parses every file with a
    supported extension into top-level syntax chunks.

    Args:
        repo_path: Root directory of the cloned repository.

    Returns:
        A list of chunk dicts: {"type", "content", "start_line", "end_line"}.
    """
    chunks = []
    parsers = {}  # cache one SimpleTreeSitterParser per language
    for root, _, files in os.walk(repo_path):
        if any(ignored_dir in root for ignored_dir in IGNORED_DIRS):
            continue
        for file in files:
            ext = os.path.splitext(file)[1]
            if ext not in SUPPORTED_EXTENSIONS:
                logger.warning(f"Skipping unsupported file: {file}")
                continue
            # BUG FIX: the original mapped ".h"/".ipynb" to "unknown", so every
            # such file raised a parser error; map headers to cpp and skip the rest.
            language = _EXT_LANGUAGE.get(ext)
            if language is None:
                logger.warning(f"No parser for extension {ext}; skipping {file}")
                continue
            file_path = os.path.join(root, file)
            try:
                logger.info(f"Processing file: {file_path}")
                code = get_file_content(file_path)
                if not code:
                    logger.warning(f"No content found in {file_path}")
                    continue
                if language not in parsers:
                    parsers[language] = SimpleTreeSitterParser(language)
                chunks.extend(parsers[language].parse(code))
            except ValueError as ve:
                logger.error(f"Skipping file {file_path} due to parser error: {ve}")
            except Exception as e:
                logger.error(f"Unexpected error processing {file_path}: {e}")
    return chunks
119
+
120
+ # Helper: Read File Content
121
def get_file_content(file_path: str) -> str:
    """Read and return the content of a file as UTF-8 text.

    Returns an empty string (after logging the error) if the file cannot
    be read or decoded.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as e:
        logger.error(f"Error reading file {file_path}: {e}")
        return ""
    return contents
129
+
130
+ # Backend Logic: Store Embeddings
131
def store_embeddings(documents, namespace="default"):
    """Embed *documents* and upsert them into the Pinecone index.

    Each vector's metadata carries the chunk text plus the document metadata,
    and its id is the document's position in the list. Logs and re-raises on
    failure.
    """
    try:
        embeddings = embedding_model.encode(
            [doc.page_content for doc in documents], show_progress_bar=True
        )
        vectors = []
        for position, doc in enumerate(documents):
            vectors.append(
                {
                    "id": str(position),
                    "values": embeddings[position].tolist(),
                    "metadata": {"text": doc.page_content, **doc.metadata},
                }
            )
        pinecone_index.upsert(vectors=vectors, namespace=namespace)
        logger.info(f"Stored {len(vectors)} embeddings in Pinecone namespace '{namespace}'.")
    except Exception as e:
        logger.error(f"Error storing embeddings: {e}")
        raise
149
+
150
+ # Backend Logic: Perform RAG
151
def perform_rag(query: str, namespace="default") -> str:
    """Answer *query* with the LLM, grounded in the top-10 Pinecone matches.

    Returns the model's answer, a no-context notice when nothing matches,
    or an error string when any step fails.
    """
    try:
        embedded_query = embedding_model.encode(query).tolist()
        results = pinecone_index.query(
            vector=embedded_query,
            top_k=10,
            include_metadata=True,
            namespace=namespace,
        )
        matches = results.get('matches')
        if not matches:
            return "No relevant context found."
        # Join retrieved chunk texts into a delimited context header for the prompt.
        contexts = [m['metadata'].get('text', '') for m in matches]
        augmented_query = (
            "<CONTEXT>\n"
            + "\n\n-------\n\n".join(contexts)
            + "\n-------\n</CONTEXT>\n\n"
            + query
        )
        completion = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {"role": "system", "content": "Answer concisely."},
                {"role": "user", "content": augmented_query},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        logger.error(f"Error performing RAG: {e}")
        return f"Error: {e}"
176
+
177
+ # Process Repository
178
def process_repo(repo_url: str) -> str:
    """Clone, parse, and store embeddings for a repository.

    The Pinecone namespace is derived from the repository name, so each
    repository is isolated in its own namespace.

    Args:
        repo_url: HTTPS/SSH URL of the repository to index.

    Returns:
        A human-readable status message (success or error).
    """
    try:
        # Same name derivation as clone_repository; rstrip tolerates a trailing "/".
        namespace = repo_url.rstrip("/").split("/")[-1].replace(".git", "")
        repo_path = clone_repository(repo_url)
        chunks = parse_repository(repo_path)
        if not chunks:
            return "No valid chunks found in the repository."
        # Keep the per-chunk location info parse_repository produced (the
        # original dropped it), so retrieved context can be traced to source.
        documents = [
            Document(
                page_content=chunk["content"],
                metadata={
                    "repo_url": repo_url,
                    "type": chunk.get("type", ""),
                    "start_line": chunk.get("start_line", 0),
                    "end_line": chunk.get("end_line", 0),
                },
            )
            for chunk in chunks
        ]
        store_embeddings(documents, namespace=namespace)
        return f"Repository processed successfully in namespace '{namespace}'!"
    except Exception as e:
        logger.error(f"Error processing repository: {e}")
        return f"Error: {e}"
192
+
193
+ # Fetch Namespaces
194
def fetch_namespaces():
    """Return the list of namespace names in the Pinecone index ([] on error)."""
    try:
        stats = pinecone_index.describe_index_stats()
        return [name for name in stats.get("namespaces", {})]
    except Exception as e:
        logger.error(f"Error fetching namespaces: {e}")
        return []
202
+
203
+ # Gradio UI
204
def create_ui():
    """Build the Gradio Blocks app: repo cloning, namespace selection, chat."""
    namespaces = fetch_namespaces()

    with gr.Blocks() as demo:
        # Per-session state: currently selected namespace and the chat transcript.
        namespace_state = gr.State(value=None)
        chat_history = gr.State(value=[])

        with gr.Column():
            gr.Markdown("## Codebase Chat App with Repository Management")
            gr.Markdown("""
            **Instructions:**
            1. Enter the GitHub repository URL you wish to clone and click **Git Clone 😺**.
            2. Select a namespace and interact with the chatbot below.
            """)

            with gr.Row():
                repo_url_input = gr.Textbox(label="GitHub Repository URL", placeholder="Enter repo URL to clone")
                clone_button = gr.Button("Git Clone 😺")
                clone_status = gr.Textbox(label="Clone Status", interactive=False)

            namespace_dropdown = gr.Dropdown(choices=namespaces, label="Namespace", interactive=True)

            chatbot = gr.Chatbot(label="Codebase Chatbot", type="messages")
            message_input = gr.Textbox(placeholder="Enter your message here...")
            send_button = gr.Button("Send")

        def update_namespace_or_clone(repo_url, current_namespace):
            """Clone repository and update namespaces."""
            if repo_url:
                message = process_repo(repo_url)
                updated_namespaces = fetch_namespaces()
                return (
                    gr.update(choices=updated_namespaces, value=None),
                    message,
                    [],  # Clear chat history
                    None,
                )
            # BUG FIX: the original returned `current_namespace` (a string/None)
            # as the chat_history output; leave the history untouched instead.
            return gr.update(), "Please provide a repository URL.", gr.update(), current_namespace

        def handle_query(message, history, namespace):
            """Handle chatbot queries."""
            if not namespace:
                new_history = history + [{"role": "assistant", "content": "Please select a namespace first!"}]
                return new_history, new_history, gr.update(value="")

            response = perform_rag(message, namespace)

            # Append the user turn and the assistant answer in "messages" format.
            formatted_history = history + [
                {"role": "user", "content": message},
                {"role": "assistant", "content": response},
            ]
            return formatted_history, formatted_history, gr.update(value="")

        # Bind clone button: updates dropdown choices, status text, history, state.
        clone_button.click(
            update_namespace_or_clone,
            inputs=[repo_url_input, namespace_state],
            outputs=[namespace_dropdown, clone_status, chat_history, namespace_state],
        )

        # Bind query button: runs RAG and refreshes the chat transcript.
        send_button.click(
            handle_query,
            inputs=[message_input, chat_history, namespace_dropdown],
            outputs=[chatbot, chat_history, message_input],
        )

    return demo
273
+
274
+
275
if __name__ == "__main__":
    # Build and launch the Gradio app only when executed as a script.
    create_ui().launch()
requirements.txt ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.5
3
+ aiosignal==1.3.1
4
+ annotated-types==0.7.0
5
+ anyio==4.6.2.post1
6
+ attrs==24.2.0
7
+ certifi==2024.8.30
8
+ charset-normalizer==3.4.0
9
+ click==8.1.7
10
+ dataclasses-json==0.6.7
11
+ distro==1.9.0
12
+ fastapi==0.115.5
13
+ ffmpy==0.4.0
14
+ filelock==3.16.1
15
+ frozenlist==1.5.0
16
+ fsspec==2024.10.0
17
+ gitdb==4.0.11
18
+ GitPython==3.1.43
19
+ gradio==5.7.1
20
+ gradio_client==1.5.0
21
+ h11==0.14.0
22
+ httpcore==1.0.7
23
+ httpx==0.28.0
24
+ httpx-sse==0.4.0
25
+ huggingface-hub==0.26.3
26
+ idna==3.10
27
+ Jinja2==3.1.4
28
+ jiter==0.8.0
29
+ joblib==1.4.2
30
+ jsonpatch==1.33
31
+ jsonpointer==3.0.0
32
+ langchain==0.3.9
33
+ langchain-community==0.3.8
34
+ langchain-core==0.3.21
35
+ langchain-pinecone==0.2.0
36
+ langchain-text-splitters==0.3.2
37
+ langsmith==0.1.147
38
+ markdown-it-py==3.0.0
39
+ MarkupSafe==2.1.5
40
+ marshmallow==3.23.1
41
+ mdurl==0.1.2
42
+ mpmath==1.3.0
43
+ multidict==6.1.0
44
+ mypy-extensions==1.0.0
45
+ networkx==3.4.2
46
+ numpy==1.26.4
47
+ openai==1.55.3
48
+ orjson==3.10.12
49
+ packaging==24.2
50
+ pandas==2.2.3
51
+ pillow==11.0.0
52
+ pinecone-client==5.0.1
53
+ pinecone-plugin-inference==1.1.0
54
+ pinecone-plugin-interface==0.0.7
55
+ pip==23.2.1
56
+ propcache==0.2.0
57
+ pydantic==2.10.2
58
+ pydantic_core==2.27.1
59
+ pydantic-settings==2.6.1
60
+ pydub==0.25.1
61
+ Pygments==2.18.0
62
+ python-dateutil==2.9.0.post0
63
+ python-dotenv==1.0.1
64
+ python-multipart==0.0.12
65
+ pytz==2024.2
66
+ PyYAML==6.0.2
67
+ regex==2024.11.6
68
+ requests==2.32.3
69
+ requests-toolbelt==1.0.0
70
+ rich==13.9.4
71
+ ruff==0.8.1
72
+ safehttpx==0.1.1
73
+ safetensors==0.4.5
74
+ scikit-learn==1.5.2
75
+ scipy==1.14.1
76
+ semantic-version==2.10.0
77
+ sentence-transformers==3.3.1
78
+ setuptools==65.5.0
79
+ shellingham==1.5.4
80
+ six==1.16.0
81
+ smmap==5.0.1
82
+ sniffio==1.3.1
83
+ SQLAlchemy==2.0.35
84
+ starlette==0.41.3
85
+ sympy==1.13.1
86
+ tenacity==9.0.0
87
+ threadpoolctl==3.5.0
88
+ tokenizers==0.20.3
89
+ tomlkit==0.12.0
90
+ torch==2.5.1
91
+ tqdm==4.67.1
92
+ transformers==4.46.3
93
+ tree_sitter==0.20.1
94
+ tree-sitter-languages==1.10.2
95
+ typer==0.14.0
96
+ typing_extensions==4.12.2
97
+ typing-inspect==0.9.0
98
+ tzdata==2024.2
99
+ urllib3==2.2.3
100
+ uvicorn==0.32.1
101
+ websockets==12.0
102
+ yarl==1.18.0