Luciano665 committed on
Commit
6c1db65
·
verified ·
1 Parent(s): 26ff495

New main files uploaded test-1

Browse files
Files changed (2) hide show
  1. app.py +277 -0
  2. requirements.txt +102 -0
app.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import logging
4
+ from git import Repo
5
+ from dotenv import load_dotenv
6
+ from sentence_transformers import SentenceTransformer
7
+ from langchain_pinecone import PineconeVectorStore
8
+ from langchain.schema import Document
9
+ from tree_sitter_languages import get_parser
10
+ from pinecone import Pinecone
11
+ import openai
12
+ import numpy as np
13
+
14
+ # Load environment variables
15
+ load_dotenv()
16
+
17
+ # Logging Configuration
18
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Environment Variables
22
+ CLONE_DIR = "./cloned_repos"
23
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
24
+ PINECONE_INDEX_KEY = "codebase-app"
25
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
26
+
27
+ # Initialize GROQ API
28
+ client = openai.OpenAI(
29
+ base_url="https://api.groq.com/openai/v1",
30
+ api_key=GROQ_API_KEY
31
+ )
32
+
33
+ # Initialize Pinecone
34
+ pinecone_client = Pinecone(api_key=PINECONE_API_KEY)
35
+ pinecone_index = pinecone_client.Index(PINECONE_INDEX_KEY)
36
+
37
+ # Initialize SentenceTransformer Embedding Model
38
+ embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
39
+
40
+ # Supported Extensions
41
+ SUPPORTED_EXTENSIONS = {".py", ".java", ".js", ".ts", ".cpp", ".h", ".ipynb"}
42
+ IGNORED_DIRS = {"node_modules", "venv", "env", ".git", "__pycache__"}
43
+
44
+ # Backend Logic: Clone Repository
45
def clone_repository(repo_url: str) -> str:
    """Clone the GitHub repository locally and return its checkout path.

    If the repository was cloned previously, the existing checkout is reused.

    Args:
        repo_url: HTTPS/SSH URL of the repository (with or without ".git").

    Returns:
        Path to the local checkout inside CLONE_DIR.
    """
    # Derive the directory name from the last URL segment; rstrip tolerates a
    # trailing "/" that would otherwise produce an empty repo name.
    repo_name = repo_url.rstrip("/").split("/")[-1].replace(".git", "")
    repo_path = os.path.join(CLONE_DIR, repo_name)
    # exist_ok avoids the check-then-create race of the original exists()/makedirs pair.
    os.makedirs(CLONE_DIR, exist_ok=True)
    if os.path.exists(repo_path):
        logger.info(f"Repository already exists: {repo_path}")
        return repo_path
    Repo.clone_from(repo_url, repo_path)
    logger.info(f"Cloned repository to: {repo_path}")
    return repo_path
57
+
58
+ # Backend Logic: Parse Repository
59
class SimpleTreeSitterParser:
    """Thin wrapper around a tree-sitter parser that yields top-level code chunks."""

    def __init__(self, language: str):
        self.language = language
        try:
            # get_parser only needs the language name.
            self.parser = get_parser(language)
        except Exception as exc:
            logger.error(f"Error initializing parser for {language}: {exc}")
            raise ValueError(f"Parser error for {language}: {exc}")

    def parse(self, code: str) -> list:
        """Split *code* into one chunk per direct child of the syntax-tree root.

        Each chunk records the node type, its source text, and 1-based line
        bounds. Returns an empty list when parsing fails.
        """
        try:
            root = self.parser.parse(code.encode("utf-8")).root_node
            return [
                {
                    "type": node.type,
                    "content": code[node.start_byte:node.end_byte],
                    "start_line": node.start_point[0] + 1,
                    "end_line": node.end_point[0] + 1,
                }
                for node in root.children
            ]
        except Exception as exc:
            logger.error(f"Error parsing code: {exc}")
            return []
85
+
86
# Extension → tree-sitter grammar name. ".h" headers parse with the cpp grammar;
# ".ipynb" notebooks are JSON (no grammar here) and are skipped with a warning.
_EXT_LANGUAGE = {
    ".py": "python",
    ".ts": "typescript",
    ".js": "javascript",
    ".java": "java",
    ".cpp": "cpp",
    ".h": "cpp",
}


def parse_repository(repo_path: str) -> list:
    """Parse repository files into meaningful chunks.

    Walks *repo_path*, skipping IGNORED_DIRS, and parses every file with a
    supported extension into top-level syntax chunks.

    Args:
        repo_path: Root directory of the cloned repository.

    Returns:
        A list of chunk dicts: {"type", "content", "start_line", "end_line"}.
    """
    chunks = []
    parsers = {}  # cache one SimpleTreeSitterParser per language
    for root, _, files in os.walk(repo_path):
        if any(ignored_dir in root for ignored_dir in IGNORED_DIRS):
            continue
        for file in files:
            ext = os.path.splitext(file)[1]
            if ext not in SUPPORTED_EXTENSIONS:
                logger.warning(f"Skipping unsupported file: {file}")
                continue
            # BUG FIX: the original mapped ".h"/".ipynb" to "unknown", so every
            # such file raised a parser error; map headers to cpp and skip the rest.
            language = _EXT_LANGUAGE.get(ext)
            if language is None:
                logger.warning(f"No parser for extension {ext}; skipping {file}")
                continue
            file_path = os.path.join(root, file)
            try:
                logger.info(f"Processing file: {file_path}")
                code = get_file_content(file_path)
                if not code:
                    logger.warning(f"No content found in {file_path}")
                    continue
                if language not in parsers:
                    parsers[language] = SimpleTreeSitterParser(language)
                chunks.extend(parsers[language].parse(code))
            except ValueError as ve:
                logger.error(f"Skipping file {file_path} due to parser error: {ve}")
            except Exception as e:
                logger.error(f"Unexpected error processing {file_path}: {e}")
    return chunks
119
+
120
+ # Helper: Read File Content
121
def get_file_content(file_path: str) -> str:
    """Read and return the content of a file as UTF-8 text.

    Returns an empty string (after logging the error) if the file cannot
    be read or decoded.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as e:
        logger.error(f"Error reading file {file_path}: {e}")
        return ""
    return contents
129
+
130
+ # Backend Logic: Store Embeddings
131
def store_embeddings(documents, namespace="default"):
    """Embed *documents* and upsert them into the Pinecone index.

    Each vector's metadata carries the chunk text plus the document metadata,
    and its id is the document's position in the list. Logs and re-raises on
    failure.
    """
    try:
        embeddings = embedding_model.encode(
            [doc.page_content for doc in documents], show_progress_bar=True
        )
        vectors = []
        for position, doc in enumerate(documents):
            vectors.append(
                {
                    "id": str(position),
                    "values": embeddings[position].tolist(),
                    "metadata": {"text": doc.page_content, **doc.metadata},
                }
            )
        pinecone_index.upsert(vectors=vectors, namespace=namespace)
        logger.info(f"Stored {len(vectors)} embeddings in Pinecone namespace '{namespace}'.")
    except Exception as e:
        logger.error(f"Error storing embeddings: {e}")
        raise
149
+
150
+ # Backend Logic: Perform RAG
151
def perform_rag(query: str, namespace="default") -> str:
    """Answer *query* with the LLM, grounded in the top-10 Pinecone matches.

    Returns the model's answer, a no-context notice when nothing matches,
    or an error string when any step fails.
    """
    try:
        embedded_query = embedding_model.encode(query).tolist()
        results = pinecone_index.query(
            vector=embedded_query,
            top_k=10,
            include_metadata=True,
            namespace=namespace,
        )
        matches = results.get('matches')
        if not matches:
            return "No relevant context found."
        # Join retrieved chunk texts into a delimited context header for the prompt.
        contexts = [m['metadata'].get('text', '') for m in matches]
        augmented_query = (
            "<CONTEXT>\n"
            + "\n\n-------\n\n".join(contexts)
            + "\n-------\n</CONTEXT>\n\n"
            + query
        )
        completion = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {"role": "system", "content": "Answer concisely."},
                {"role": "user", "content": augmented_query},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        logger.error(f"Error performing RAG: {e}")
        return f"Error: {e}"
176
+
177
+ # Process Repository
178
def process_repo(repo_url: str) -> str:
    """Clone, parse, and store embeddings for a repository.

    The Pinecone namespace is derived from the repository name, so each
    repository is isolated in its own namespace.

    Args:
        repo_url: HTTPS/SSH URL of the repository to index.

    Returns:
        A human-readable status message (success or error).
    """
    try:
        # Same name derivation as clone_repository; rstrip tolerates a trailing "/".
        namespace = repo_url.rstrip("/").split("/")[-1].replace(".git", "")
        repo_path = clone_repository(repo_url)
        chunks = parse_repository(repo_path)
        if not chunks:
            return "No valid chunks found in the repository."
        # Keep the per-chunk location info parse_repository produced (the
        # original dropped it), so retrieved context can be traced to source.
        documents = [
            Document(
                page_content=chunk["content"],
                metadata={
                    "repo_url": repo_url,
                    "type": chunk.get("type", ""),
                    "start_line": chunk.get("start_line", 0),
                    "end_line": chunk.get("end_line", 0),
                },
            )
            for chunk in chunks
        ]
        store_embeddings(documents, namespace=namespace)
        return f"Repository processed successfully in namespace '{namespace}'!"
    except Exception as e:
        logger.error(f"Error processing repository: {e}")
        return f"Error: {e}"
192
+
193
+ # Fetch Namespaces
194
def fetch_namespaces():
    """Return the list of namespace names in the Pinecone index ([] on error)."""
    try:
        stats = pinecone_index.describe_index_stats()
        return [name for name in stats.get("namespaces", {})]
    except Exception as e:
        logger.error(f"Error fetching namespaces: {e}")
        return []
202
+
203
+ # Gradio UI
204
def create_ui():
    """Build the Gradio Blocks app: repo cloning, namespace selection, chat."""
    namespaces = fetch_namespaces()

    with gr.Blocks() as demo:
        # Per-session state: currently selected namespace and the chat transcript.
        namespace_state = gr.State(value=None)
        chat_history = gr.State(value=[])

        with gr.Column():
            gr.Markdown("## Codebase Chat App with Repository Management")
            gr.Markdown("""
            **Instructions:**
            1. Enter the GitHub repository URL you wish to clone and click **Git Clone 😺**.
            2. Select a namespace and interact with the chatbot below.
            """)

            with gr.Row():
                repo_url_input = gr.Textbox(label="GitHub Repository URL", placeholder="Enter repo URL to clone")
                clone_button = gr.Button("Git Clone 😺")
                clone_status = gr.Textbox(label="Clone Status", interactive=False)

            namespace_dropdown = gr.Dropdown(choices=namespaces, label="Namespace", interactive=True)

            chatbot = gr.Chatbot(label="Codebase Chatbot", type="messages")
            message_input = gr.Textbox(placeholder="Enter your message here...")
            send_button = gr.Button("Send")

        def update_namespace_or_clone(repo_url, current_namespace):
            """Clone repository and update namespaces."""
            if repo_url:
                message = process_repo(repo_url)
                updated_namespaces = fetch_namespaces()
                return (
                    gr.update(choices=updated_namespaces, value=None),
                    message,
                    [],  # Clear chat history
                    None,
                )
            # BUG FIX: the original returned `current_namespace` (a string/None)
            # as the chat_history output; leave the history untouched instead.
            return gr.update(), "Please provide a repository URL.", gr.update(), current_namespace

        def handle_query(message, history, namespace):
            """Handle chatbot queries."""
            if not namespace:
                new_history = history + [{"role": "assistant", "content": "Please select a namespace first!"}]
                return new_history, new_history, gr.update(value="")

            response = perform_rag(message, namespace)

            # Append the user turn and the assistant answer in "messages" format.
            formatted_history = history + [
                {"role": "user", "content": message},
                {"role": "assistant", "content": response},
            ]
            return formatted_history, formatted_history, gr.update(value="")

        # Bind clone button: updates dropdown choices, status text, history, state.
        clone_button.click(
            update_namespace_or_clone,
            inputs=[repo_url_input, namespace_state],
            outputs=[namespace_dropdown, clone_status, chat_history, namespace_state],
        )

        # Bind query button: runs RAG and refreshes the chat transcript.
        send_button.click(
            handle_query,
            inputs=[message_input, chat_history, namespace_dropdown],
            outputs=[chatbot, chat_history, message_input],
        )

    return demo
273
+
274
+
275
if __name__ == "__main__":
    # Build and launch the Gradio app only when executed as a script.
    create_ui().launch()
requirements.txt ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.5
3
+ aiosignal==1.3.1
4
+ annotated-types==0.7.0
5
+ anyio==4.6.2.post1
6
+ attrs==24.2.0
7
+ certifi==2024.8.30
8
+ charset-normalizer==3.4.0
9
+ click==8.1.7
10
+ dataclasses-json==0.6.7
11
+ distro==1.9.0
12
+ fastapi==0.115.5
13
+ ffmpy==0.4.0
14
+ filelock==3.16.1
15
+ frozenlist==1.5.0
16
+ fsspec==2024.10.0
17
+ gitdb==4.0.11
18
+ GitPython==3.1.43
19
+ gradio==5.7.1
20
+ gradio_client==1.5.0
21
+ h11==0.14.0
22
+ httpcore==1.0.7
23
+ httpx==0.28.0
24
+ httpx-sse==0.4.0
25
+ huggingface-hub==0.26.3
26
+ idna==3.10
27
+ Jinja2==3.1.4
28
+ jiter==0.8.0
29
+ joblib==1.4.2
30
+ jsonpatch==1.33
31
+ jsonpointer==3.0.0
32
+ langchain==0.3.9
33
+ langchain-community==0.3.8
34
+ langchain-core==0.3.21
35
+ langchain-pinecone==0.2.0
36
+ langchain-text-splitters==0.3.2
37
+ langsmith==0.1.147
38
+ markdown-it-py==3.0.0
39
+ MarkupSafe==2.1.5
40
+ marshmallow==3.23.1
41
+ mdurl==0.1.2
42
+ mpmath==1.3.0
43
+ multidict==6.1.0
44
+ mypy-extensions==1.0.0
45
+ networkx==3.4.2
46
+ numpy==1.26.4
47
+ openai==1.55.3
48
+ orjson==3.10.12
49
+ packaging==24.2
50
+ pandas==2.2.3
51
+ pillow==11.0.0
52
+ pinecone-client==5.0.1
53
+ pinecone-plugin-inference==1.1.0
54
+ pinecone-plugin-interface==0.0.7
55
+ pip==23.2.1
56
+ propcache==0.2.0
57
+ pydantic==2.10.2
58
+ pydantic_core==2.27.1
59
+ pydantic-settings==2.6.1
60
+ pydub==0.25.1
61
+ Pygments==2.18.0
62
+ python-dateutil==2.9.0.post0
63
+ python-dotenv==1.0.1
64
+ python-multipart==0.0.12
65
+ pytz==2024.2
66
+ PyYAML==6.0.2
67
+ regex==2024.11.6
68
+ requests==2.32.3
69
+ requests-toolbelt==1.0.0
70
+ rich==13.9.4
71
+ ruff==0.8.1
72
+ safehttpx==0.1.1
73
+ safetensors==0.4.5
74
+ scikit-learn==1.5.2
75
+ scipy==1.14.1
76
+ semantic-version==2.10.0
77
+ sentence-transformers==3.3.1
78
+ setuptools==65.5.0
79
+ shellingham==1.5.4
80
+ six==1.16.0
81
+ smmap==5.0.1
82
+ sniffio==1.3.1
83
+ SQLAlchemy==2.0.35
84
+ starlette==0.41.3
85
+ sympy==1.13.1
86
+ tenacity==9.0.0
87
+ threadpoolctl==3.5.0
88
+ tokenizers==0.20.3
89
+ tomlkit==0.12.0
90
+ torch==2.5.1
91
+ tqdm==4.67.1
92
+ transformers==4.46.3
93
+ tree_sitter==0.20.1
94
+ tree-sitter-languages==1.10.2
95
+ typer==0.14.0
96
+ typing_extensions==4.12.2
97
+ typing-inspect==0.9.0
98
+ tzdata==2024.2
99
+ urllib3==2.2.3
100
+ uvicorn==0.32.1
101
+ websockets==12.0
102
+ yarl==1.18.0