Pranoy Mukherjee committed on
Commit ·
7a376ec
1
Parent(s): 0e9cb33
Fix Windows repo cloning and add docs agent
Browse files
- .env.example +1 -0
- .gitignore +1 -0
- app/config.py +1 -0
- app/services/repo_crawler.py +75 -14
- tests/test_repo_crawler.py +36 -0
.env.example
CHANGED
|
@@ -7,3 +7,4 @@ MAX_FILES=200
|
|
| 7 |
MAX_FILE_SIZE_KB=250
|
| 8 |
MAX_CHARS_PER_CHUNK=12000
|
| 9 |
CLONE_TIMEOUT_SECONDS=60
|
|
|
|
|
|
| 7 |
MAX_FILE_SIZE_KB=250
|
| 8 |
MAX_CHARS_PER_CHUNK=12000
|
| 9 |
CLONE_TIMEOUT_SECONDS=60
|
| 10 |
+
CLONE_BASE_DIR=.swarm_audit_tmp
|
.gitignore
CHANGED
|
@@ -209,6 +209,7 @@ tempCodeRunnerFile.py
|
|
| 209 |
# SwarmAudit local test artifacts
|
| 210 |
.tmp_pytest/
|
| 211 |
pytest-cache-files-*
|
|
|
|
| 212 |
|
| 213 |
# PyPI configuration file
|
| 214 |
.pypirc
|
|
|
|
| 209 |
# SwarmAudit local test artifacts
|
| 210 |
.tmp_pytest/
|
| 211 |
pytest-cache-files-*
|
| 212 |
+
.swarm_audit_tmp/
|
| 213 |
|
| 214 |
# PyPI configuration file
|
| 215 |
.pypirc
|
app/config.py
CHANGED
|
@@ -13,6 +13,7 @@ class Settings(BaseSettings):
|
|
| 13 |
max_file_size_kb: int = 250
|
| 14 |
max_chars_per_chunk: int = 12000
|
| 15 |
clone_timeout_seconds: int = 60
|
|
|
|
| 16 |
|
| 17 |
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
|
| 18 |
|
|
|
|
| 13 |
max_file_size_kb: int = 250
|
| 14 |
max_chars_per_chunk: int = 12000
|
| 15 |
clone_timeout_seconds: int = 60
|
| 16 |
+
clone_base_dir: str = ".swarm_audit_tmp"
|
| 17 |
|
| 18 |
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
|
| 19 |
|
app/services/repo_crawler.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
|
|
| 1 |
import shutil
|
| 2 |
-
import tempfile
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
from urllib.parse import urlparse
|
| 5 |
|
| 6 |
-
from git import Repo
|
| 7 |
|
| 8 |
from app.config import Settings
|
| 9 |
from app.schemas import RepoScanResult, SourceFile
|
|
@@ -61,19 +63,11 @@ class RepoCrawler:
|
|
| 61 |
|
| 62 |
def clone_and_scan(self, repo_url: str) -> RepoScanResult:
|
| 63 |
clone_url = validate_github_url(repo_url)
|
| 64 |
-
temp_root =
|
| 65 |
repo_path = temp_root / "repo"
|
| 66 |
|
| 67 |
try:
|
| 68 |
-
|
| 69 |
-
clone_url,
|
| 70 |
-
repo_path,
|
| 71 |
-
depth=1,
|
| 72 |
-
single_branch=True,
|
| 73 |
-
kill_after_timeout=self.settings.clone_timeout_seconds,
|
| 74 |
-
env={"GIT_TERMINAL_PROMPT": "0"},
|
| 75 |
-
multi_options=["--filter=blob:none"],
|
| 76 |
-
)
|
| 77 |
return self.scan_local_repo(repo_url=repo_url, repo_path=repo_path)
|
| 78 |
except Exception:
|
| 79 |
shutil.rmtree(temp_root, ignore_errors=True)
|
|
@@ -137,12 +131,79 @@ class RepoCrawler:
|
|
| 137 |
|
| 138 |
repo_path = Path(scan_result.local_path)
|
| 139 |
temp_root = repo_path.parent
|
| 140 |
-
temp_dir = Path(tempfile.gettempdir()).resolve()
|
| 141 |
|
| 142 |
try:
|
| 143 |
resolved_temp_root = temp_root.resolve()
|
|
|
|
| 144 |
except FileNotFoundError:
|
| 145 |
return
|
| 146 |
|
| 147 |
-
if
|
| 148 |
shutil.rmtree(temp_root, ignore_errors=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
import shutil
|
| 3 |
+
import subprocess
|
| 4 |
+
import uuid
|
| 5 |
from pathlib import Path
|
| 6 |
from urllib.parse import urlparse
|
| 7 |
|
| 8 |
+
from git import GitCommandError, Repo
|
| 9 |
|
| 10 |
from app.config import Settings
|
| 11 |
from app.schemas import RepoScanResult, SourceFile
|
|
|
|
| 63 |
|
| 64 |
def clone_and_scan(self, repo_url: str) -> RepoScanResult:
|
| 65 |
clone_url = validate_github_url(repo_url)
|
| 66 |
+
temp_root = self._create_clone_root()
|
| 67 |
repo_path = temp_root / "repo"
|
| 68 |
|
| 69 |
try:
|
| 70 |
+
self._clone_repo(clone_url, repo_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
return self.scan_local_repo(repo_url=repo_url, repo_path=repo_path)
|
| 72 |
except Exception:
|
| 73 |
shutil.rmtree(temp_root, ignore_errors=True)
|
|
|
|
| 131 |
|
| 132 |
repo_path = Path(scan_result.local_path)
|
| 133 |
temp_root = repo_path.parent
|
|
|
|
| 134 |
|
| 135 |
try:
|
| 136 |
resolved_temp_root = temp_root.resolve()
|
| 137 |
+
resolved_base_dir = Path(self.settings.clone_base_dir).resolve()
|
| 138 |
except FileNotFoundError:
|
| 139 |
return
|
| 140 |
|
| 141 |
+
if resolved_base_dir in resolved_temp_root.parents and temp_root.name.startswith("swarm_audit_"):
|
| 142 |
shutil.rmtree(temp_root, ignore_errors=True)
|
| 143 |
+
|
| 144 |
+
def _create_clone_root(self) -> Path:
|
| 145 |
+
base_dir = Path(self.settings.clone_base_dir)
|
| 146 |
+
base_dir.mkdir(parents=True, exist_ok=True)
|
| 147 |
+
clone_root = base_dir / f"swarm_audit_{uuid.uuid4().hex}"
|
| 148 |
+
clone_root.mkdir(parents=False, exist_ok=False)
|
| 149 |
+
return clone_root
|
| 150 |
+
|
| 151 |
+
def _clone_repo(self, clone_url: str, repo_path: Path) -> None:
|
| 152 |
+
clone_kwargs = {
|
| 153 |
+
"depth": 1,
|
| 154 |
+
"single_branch": True,
|
| 155 |
+
"env": self._git_env(),
|
| 156 |
+
"multi_options": ["--filter=blob:none"],
|
| 157 |
+
}
|
| 158 |
+
if os.name != "nt":
|
| 159 |
+
clone_kwargs["kill_after_timeout"] = self.settings.clone_timeout_seconds
|
| 160 |
+
|
| 161 |
+
try:
|
| 162 |
+
Repo.clone_from(clone_url, repo_path, **clone_kwargs)
|
| 163 |
+
except GitCommandError as exc:
|
| 164 |
+
if not self._should_retry_with_openssl(exc):
|
| 165 |
+
raise
|
| 166 |
+
shutil.rmtree(repo_path, ignore_errors=True)
|
| 167 |
+
self._clone_repo_with_openssl(clone_url, repo_path)
|
| 168 |
+
|
| 169 |
+
def _should_retry_with_openssl(self, exc: GitCommandError) -> bool:
|
| 170 |
+
if os.name != "nt":
|
| 171 |
+
return False
|
| 172 |
+
return "schannel" in str(exc).lower()
|
| 173 |
+
|
| 174 |
+
def _clone_repo_with_openssl(self, clone_url: str, repo_path: Path) -> None:
|
| 175 |
+
command = [
|
| 176 |
+
"git",
|
| 177 |
+
"-c",
|
| 178 |
+
"http.sslBackend=openssl",
|
| 179 |
+
"clone",
|
| 180 |
+
"-v",
|
| 181 |
+
"--depth=1",
|
| 182 |
+
"--single-branch",
|
| 183 |
+
"--filter=blob:none",
|
| 184 |
+
"--",
|
| 185 |
+
clone_url,
|
| 186 |
+
str(repo_path),
|
| 187 |
+
]
|
| 188 |
+
result = subprocess.run(
|
| 189 |
+
command,
|
| 190 |
+
cwd=Path.cwd(),
|
| 191 |
+
env={**os.environ, **self._git_env()},
|
| 192 |
+
text=True,
|
| 193 |
+
capture_output=True,
|
| 194 |
+
timeout=self.settings.clone_timeout_seconds,
|
| 195 |
+
check=False,
|
| 196 |
+
)
|
| 197 |
+
if result.returncode != 0:
|
| 198 |
+
raise RuntimeError(f"Git clone failed with OpenSSL fallback: {result.stderr.strip()}")
|
| 199 |
+
|
| 200 |
+
def _git_env(self) -> dict[str, str]:
|
| 201 |
+
return {
|
| 202 |
+
"GIT_TERMINAL_PROMPT": "0",
|
| 203 |
+
"HTTP_PROXY": "",
|
| 204 |
+
"HTTPS_PROXY": "",
|
| 205 |
+
"ALL_PROXY": "",
|
| 206 |
+
"http_proxy": "",
|
| 207 |
+
"https_proxy": "",
|
| 208 |
+
"all_proxy": "",
|
| 209 |
+
}
|
tests/test_repo_crawler.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
from pathlib import Path
|
|
|
|
| 2 |
|
| 3 |
import pytest
|
|
|
|
| 4 |
|
| 5 |
from app.config import Settings
|
| 6 |
from app.services.repo_crawler import RepoCrawler, validate_github_url
|
|
@@ -36,3 +38,37 @@ def test_scan_local_repo_includes_readme_for_docs_agent(tmp_path: Path):
|
|
| 36 |
|
| 37 |
assert result.files[0].path == "README.md"
|
| 38 |
assert result.files[0].language == "Markdown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
+
from unittest.mock import patch
|
| 3 |
|
| 4 |
import pytest
|
| 5 |
+
from git import GitCommandError
|
| 6 |
|
| 7 |
from app.config import Settings
|
| 8 |
from app.services.repo_crawler import RepoCrawler, validate_github_url
|
|
|
|
| 38 |
|
| 39 |
assert result.files[0].path == "README.md"
|
| 40 |
assert result.files[0].language == "Markdown"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_clone_and_scan_omits_gitpython_timeout_on_windows(tmp_path: Path):
|
| 44 |
+
crawler = RepoCrawler(Settings(max_files=10, max_file_size_kb=1, clone_base_dir=str(tmp_path / "clones")))
|
| 45 |
+
|
| 46 |
+
with patch("app.services.repo_crawler.os.name", "nt"), patch(
|
| 47 |
+
"app.services.repo_crawler.Repo.clone_from"
|
| 48 |
+
) as clone_from, patch.object(
|
| 49 |
+
crawler,
|
| 50 |
+
"scan_local_repo",
|
| 51 |
+
return_value=crawler.scan_local_repo("https://github.com/example/project", tmp_path),
|
| 52 |
+
):
|
| 53 |
+
crawler.clone_and_scan("https://github.com/example/project")
|
| 54 |
+
|
| 55 |
+
assert "kill_after_timeout" not in clone_from.call_args.kwargs
|
| 56 |
+
assert clone_from.call_args.kwargs["env"]["HTTPS_PROXY"] == ""
|
| 57 |
+
assert clone_from.call_args.kwargs["env"]["ALL_PROXY"] == ""
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_clone_and_scan_retries_schannel_failure_with_openssl(tmp_path: Path):
|
| 61 |
+
crawler = RepoCrawler(Settings(max_files=10, max_file_size_kb=1, clone_base_dir=str(tmp_path / "clones")))
|
| 62 |
+
schannel_error = GitCommandError("git clone", 128, stderr="schannel: AcquireCredentialsHandle failed")
|
| 63 |
+
|
| 64 |
+
with patch("app.services.repo_crawler.os.name", "nt"), patch(
|
| 65 |
+
"app.services.repo_crawler.Repo.clone_from",
|
| 66 |
+
side_effect=schannel_error,
|
| 67 |
+
), patch.object(crawler, "_clone_repo_with_openssl") as clone_with_openssl, patch.object(
|
| 68 |
+
crawler,
|
| 69 |
+
"scan_local_repo",
|
| 70 |
+
return_value=crawler.scan_local_repo("https://github.com/example/project", tmp_path),
|
| 71 |
+
):
|
| 72 |
+
crawler.clone_and_scan("https://github.com/example/project")
|
| 73 |
+
|
| 74 |
+
clone_with_openssl.assert_called_once()
|