Pranoy Mukherjee committed on
Commit
7a376ec
·
1 Parent(s): 0e9cb33

Fix Windows repo cloning and add docs agent

Browse files
.env.example CHANGED
@@ -7,3 +7,4 @@ MAX_FILES=200
7
  MAX_FILE_SIZE_KB=250
8
  MAX_CHARS_PER_CHUNK=12000
9
  CLONE_TIMEOUT_SECONDS=60
 
 
7
  MAX_FILE_SIZE_KB=250
8
  MAX_CHARS_PER_CHUNK=12000
9
  CLONE_TIMEOUT_SECONDS=60
10
+ CLONE_BASE_DIR=.swarm_audit_tmp
.gitignore CHANGED
@@ -209,6 +209,7 @@ tempCodeRunnerFile.py
209
  # SwarmAudit local test artifacts
210
  .tmp_pytest/
211
  pytest-cache-files-*
 
212
 
213
  # PyPI configuration file
214
  .pypirc
 
209
  # SwarmAudit local test artifacts
210
  .tmp_pytest/
211
  pytest-cache-files-*
212
+ .swarm_audit_tmp/
213
 
214
  # PyPI configuration file
215
  .pypirc
app/config.py CHANGED
@@ -13,6 +13,7 @@ class Settings(BaseSettings):
13
  max_file_size_kb: int = 250
14
  max_chars_per_chunk: int = 12000
15
  clone_timeout_seconds: int = 60
 
16
 
17
  model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
18
 
 
13
  max_file_size_kb: int = 250
14
  max_chars_per_chunk: int = 12000
15
  clone_timeout_seconds: int = 60
16
+ clone_base_dir: str = ".swarm_audit_tmp"
17
 
18
  model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
19
 
app/services/repo_crawler.py CHANGED
@@ -1,9 +1,11 @@
 
1
  import shutil
2
- import tempfile
 
3
  from pathlib import Path
4
  from urllib.parse import urlparse
5
 
6
- from git import Repo
7
 
8
  from app.config import Settings
9
  from app.schemas import RepoScanResult, SourceFile
@@ -61,19 +63,11 @@ class RepoCrawler:
61
 
62
  def clone_and_scan(self, repo_url: str) -> RepoScanResult:
63
  clone_url = validate_github_url(repo_url)
64
- temp_root = Path(tempfile.mkdtemp(prefix="swarm_audit_"))
65
  repo_path = temp_root / "repo"
66
 
67
  try:
68
- Repo.clone_from(
69
- clone_url,
70
- repo_path,
71
- depth=1,
72
- single_branch=True,
73
- kill_after_timeout=self.settings.clone_timeout_seconds,
74
- env={"GIT_TERMINAL_PROMPT": "0"},
75
- multi_options=["--filter=blob:none"],
76
- )
77
  return self.scan_local_repo(repo_url=repo_url, repo_path=repo_path)
78
  except Exception:
79
  shutil.rmtree(temp_root, ignore_errors=True)
@@ -137,12 +131,79 @@ class RepoCrawler:
137
 
138
  repo_path = Path(scan_result.local_path)
139
  temp_root = repo_path.parent
140
- temp_dir = Path(tempfile.gettempdir()).resolve()
141
 
142
  try:
143
  resolved_temp_root = temp_root.resolve()
 
144
  except FileNotFoundError:
145
  return
146
 
147
- if temp_dir in resolved_temp_root.parents and temp_root.name.startswith("swarm_audit_"):
148
  shutil.rmtree(temp_root, ignore_errors=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import shutil
3
+ import subprocess
4
+ import uuid
5
  from pathlib import Path
6
  from urllib.parse import urlparse
7
 
8
+ from git import GitCommandError, Repo
9
 
10
  from app.config import Settings
11
  from app.schemas import RepoScanResult, SourceFile
 
63
 
64
  def clone_and_scan(self, repo_url: str) -> RepoScanResult:
65
  clone_url = validate_github_url(repo_url)
66
+ temp_root = self._create_clone_root()
67
  repo_path = temp_root / "repo"
68
 
69
  try:
70
+ self._clone_repo(clone_url, repo_path)
 
 
 
 
 
 
 
 
71
  return self.scan_local_repo(repo_url=repo_url, repo_path=repo_path)
72
  except Exception:
73
  shutil.rmtree(temp_root, ignore_errors=True)
 
131
 
132
  repo_path = Path(scan_result.local_path)
133
  temp_root = repo_path.parent
 
134
 
135
  try:
136
  resolved_temp_root = temp_root.resolve()
137
+ resolved_base_dir = Path(self.settings.clone_base_dir).resolve()
138
  except FileNotFoundError:
139
  return
140
 
141
+ if resolved_base_dir in resolved_temp_root.parents and temp_root.name.startswith("swarm_audit_"):
142
  shutil.rmtree(temp_root, ignore_errors=True)
143
+
144
+ def _create_clone_root(self) -> Path:
145
+ base_dir = Path(self.settings.clone_base_dir)
146
+ base_dir.mkdir(parents=True, exist_ok=True)
147
+ clone_root = base_dir / f"swarm_audit_{uuid.uuid4().hex}"
148
+ clone_root.mkdir(parents=False, exist_ok=False)
149
+ return clone_root
150
+
151
def _clone_repo(self, clone_url: str, repo_path: Path) -> None:
    """Shallow-clone *clone_url* into *repo_path* via GitPython.

    A ``schannel`` TLS failure on Windows triggers one retry through the
    OpenSSL-backed command-line fallback; any other error propagates.
    """
    options = dict(
        depth=1,
        single_branch=True,
        env=self._git_env(),
        multi_options=["--filter=blob:none"],
    )
    # kill_after_timeout is only passed off-Windows (os.name == "nt" means
    # Windows, where this GitPython option is omitted).
    if os.name != "nt":
        options["kill_after_timeout"] = self.settings.clone_timeout_seconds

    try:
        Repo.clone_from(clone_url, repo_path, **options)
    except GitCommandError as exc:
        if self._should_retry_with_openssl(exc):
            # Drop the partial clone before retrying into the same path.
            shutil.rmtree(repo_path, ignore_errors=True)
            self._clone_repo_with_openssl(clone_url, repo_path)
        else:
            raise
168
+
169
def _should_retry_with_openssl(self, exc: GitCommandError) -> bool:
    """Return True when *exc* looks like a Windows schannel TLS failure.

    Only Windows uses the schannel backend, so the OpenSSL retry is
    pointless elsewhere.
    """
    return os.name == "nt" and "schannel" in str(exc).lower()
173
+
174
def _clone_repo_with_openssl(self, clone_url: str, repo_path: Path) -> None:
    """Clone with the git CLI forced onto the OpenSSL TLS backend.

    Fallback for Windows hosts where the default schannel backend fails.

    Raises:
        RuntimeError: when the git process exits non-zero.
        subprocess.TimeoutExpired: when the clone exceeds the configured timeout.
    """
    git_command = [
        "git",
        # Override the TLS backend just for this invocation.
        "-c", "http.sslBackend=openssl",
        "clone", "-v",
        "--depth=1", "--single-branch", "--filter=blob:none",
        "--",  # stop option parsing so the URL can't be taken as a flag
        clone_url,
        str(repo_path),
    ]
    completed = subprocess.run(
        git_command,
        cwd=Path.cwd(),
        # Keep the host environment but disable prompts and proxies.
        env={**os.environ, **self._git_env()},
        text=True,
        capture_output=True,
        timeout=self.settings.clone_timeout_seconds,
        check=False,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"Git clone failed with OpenSSL fallback: {completed.stderr.strip()}")
199
+
200
+ def _git_env(self) -> dict[str, str]:
201
+ return {
202
+ "GIT_TERMINAL_PROMPT": "0",
203
+ "HTTP_PROXY": "",
204
+ "HTTPS_PROXY": "",
205
+ "ALL_PROXY": "",
206
+ "http_proxy": "",
207
+ "https_proxy": "",
208
+ "all_proxy": "",
209
+ }
tests/test_repo_crawler.py CHANGED
@@ -1,6 +1,8 @@
1
  from pathlib import Path
 
2
 
3
  import pytest
 
4
 
5
  from app.config import Settings
6
  from app.services.repo_crawler import RepoCrawler, validate_github_url
@@ -36,3 +38,37 @@ def test_scan_local_repo_includes_readme_for_docs_agent(tmp_path: Path):
36
 
37
  assert result.files[0].path == "README.md"
38
  assert result.files[0].language == "Markdown"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from pathlib import Path
2
+ from unittest.mock import patch
3
 
4
  import pytest
5
+ from git import GitCommandError
6
 
7
  from app.config import Settings
8
  from app.services.repo_crawler import RepoCrawler, validate_github_url
 
38
 
39
  assert result.files[0].path == "README.md"
40
  assert result.files[0].language == "Markdown"
41
+
42
+
43
def test_clone_and_scan_omits_gitpython_timeout_on_windows(tmp_path: Path):
    """On Windows, kill_after_timeout must be omitted and proxies blanked."""
    settings = Settings(max_files=10, max_file_size_kb=1, clone_base_dir=str(tmp_path / "clones"))
    crawler = RepoCrawler(settings)
    # Computed before patching, exactly as the inline call expression would be.
    fake_scan = crawler.scan_local_repo("https://github.com/example/project", tmp_path)

    with patch("app.services.repo_crawler.os.name", "nt"):
        with patch("app.services.repo_crawler.Repo.clone_from") as mocked_clone:
            with patch.object(crawler, "scan_local_repo", return_value=fake_scan):
                crawler.clone_and_scan("https://github.com/example/project")

    passed_kwargs = mocked_clone.call_args.kwargs
    assert "kill_after_timeout" not in passed_kwargs
    assert passed_kwargs["env"]["HTTPS_PROXY"] == ""
    assert passed_kwargs["env"]["ALL_PROXY"] == ""
58
+
59
+
60
def test_clone_and_scan_retries_schannel_failure_with_openssl(tmp_path: Path):
    """A schannel TLS error on Windows must trigger the OpenSSL CLI fallback."""
    settings = Settings(max_files=10, max_file_size_kb=1, clone_base_dir=str(tmp_path / "clones"))
    crawler = RepoCrawler(settings)
    # Computed before patching, exactly as the inline call expression would be.
    fake_scan = crawler.scan_local_repo("https://github.com/example/project", tmp_path)
    schannel_error = GitCommandError("git clone", 128, stderr="schannel: AcquireCredentialsHandle failed")

    with patch("app.services.repo_crawler.os.name", "nt"):
        with patch("app.services.repo_crawler.Repo.clone_from", side_effect=schannel_error):
            with patch.object(crawler, "_clone_repo_with_openssl") as openssl_fallback:
                with patch.object(crawler, "scan_local_repo", return_value=fake_scan):
                    crawler.clone_and_scan("https://github.com/example/project")

    openssl_fallback.assert_called_once()