File size: 3,020 Bytes
"""Clone a git repository into a local cache directory.
Shells out to the `git` executable. Shallow-clones (depth=1) by default —
for retrieval we don't need history, and shallow makes the Linux kernel
ingest in seconds instead of minutes.
"""
from __future__ import annotations
import os
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
# Matches remote URLs written as ``http://…``, ``https://…`` or ``git@…``.
# Group 1 is that prefix; group 2 is the remainder with an optional trailing
# ``.git`` (group 3) and an optional final ``/`` left out. ``slugify`` uses
# group 2 as the basis for the cache directory name.
URL_RE = re.compile(r"^(https?://|git@)([\w./:-]+?)(\.git)?/?$")
@dataclass
class CloneResult:
    """Outcome of a ``clone`` call: where the checkout lives and what it holds."""

    # Normalized remote URL, or the directory path when a local path was given.
    url: str
    # Directory containing the working tree.
    local_path: Path
    # Output of ``git rev-parse HEAD`` for the checkout; ``clone`` stores the
    # placeholder "no-git" when a local directory is not a git repository.
    sha: str
    # True when no clone was performed (local path or reused cache entry).
    cached: bool
def normalize_url(url_or_path: str) -> str:
    """Turn user input into a clone source: a remote URL or a local directory.

    Forms are checked in this order:

    * an existing directory -> its absolute path;
    * the scp-like ``git@host:path`` form, or anything with an explicit
      scheme (``https://``, ``ssh://``, ``git://``, ``file://`` ...) ->
      returned unchanged;
    * ``owner/repo`` shorthand -> ``https://github.com/owner/repo.git``;
    * anything else -> returned unchanged.

    Surrounding whitespace is stripped before any check.

    Args:
        url_or_path: URL, shorthand or filesystem path supplied by the user.

    Returns:
        The normalized URL or absolute directory path.
    """
    s = url_or_path.strip()
    if os.path.isdir(s):
        return os.path.abspath(s)
    # Test for a real scheme separator rather than the bare prefix "http":
    # a shorthand such as "httpie/cli" must still expand to a GitHub URL, and
    # non-HTTP schemes must not be mistaken for a shorthand.
    if s.startswith("git@") or "://" in s:
        return s
    # Path-like input ("/x", "./x", "../x", "~/x") that is not an existing
    # directory is passed through untouched so git reports it verbatim;
    # GitHub owner names cannot begin with any of these characters.
    if "/" in s and not s.startswith(("/", ".", "~")):
        repo = s.rstrip("/")
        # Tolerate "owner/repo.git" so the suffix is not doubled.
        if repo.endswith(".git"):
            repo = repo[: -len(".git")]
        return f"https://github.com/{repo}.git"
    return s
def slugify(url: str) -> str:
    """Derive a stable, filesystem-friendly directory name from *url*.

    For an existing directory the name is the final component of its absolute
    path. For a string matched by ``URL_RE`` the prefix, a trailing ``.git``
    and a trailing ``/`` are dropped first. In every non-directory case, each
    run of characters outside ``[a-zA-Z0-9._-]`` becomes a single ``_``.

    Args:
        url: Remote URL or local directory path.

    Returns:
        A single path component. It is never empty, ``"."`` or ``".."``;
        those degenerate results are replaced by ``"_"``.
    """
    if os.path.isdir(url):
        # Resolve "." / ".." style inputs to a real name first; on their own
        # they have an empty or dot-only final component.
        slug = Path(os.path.abspath(url)).name
    else:
        m = URL_RE.match(url)
        body = m.group(2) if m else url
        slug = re.sub(r"[^a-zA-Z0-9._-]+", "_", body)
    # "", "." and ".." are not usable names: joined onto a parent directory
    # they resolve to that directory itself or to its parent.
    if slug in ("", ".", ".."):
        return "_"
    return slug
def _git(*args: str, cwd: Optional[Path] = None) -> str:
    """Run ``git`` with *args* and return its stripped standard output.

    Args:
        *args: Arguments placed after the ``git`` executable name.
        cwd: Working directory for the command; when falsy, no working
            directory is passed to the subprocess.

    Returns:
        The command's standard output with surrounding whitespace removed.

    Raises:
        RuntimeError: If git exits with a non-zero status. The message
            carries the attempted arguments and git's stripped stderr.
    """
    command = ["git"]
    command.extend(args)
    workdir = None if not cwd else str(cwd)
    completed = subprocess.run(
        command,
        cwd=workdir,
        capture_output=True,
        text=True,
    )
    if completed.returncode == 0:
        return completed.stdout.strip()
    raise RuntimeError(f"git {' '.join(args)} failed: {completed.stderr.strip()}")
def clone(
    url_or_path: str,
    cache_dir: str | Path = ".repomind_cache/repos",
    depth: int = 1,
    force: bool = False,
) -> CloneResult:
    """Return a local checkout of *url_or_path*, cloning into the cache if needed.

    Args:
        url_or_path: Remote URL, ``owner/repo`` shorthand or local directory;
            passed through ``normalize_url`` first.
        cache_dir: Directory holding one sub-directory per cloned repository.
            It is created if missing.
        depth: History depth given to ``git clone --depth``. Zero or a
            negative value omits the option.
        force: Delete an existing cache entry and clone again.

    Returns:
        A ``CloneResult``. ``cached`` is True when nothing was cloned: the
        input was a local directory (returned as-is, with ``sha`` set to
        "no-git" if ``git rev-parse HEAD`` fails there) or a usable cache
        entry already existed.

    Raises:
        ValueError: If no usable cache directory name can be derived from
            the input (for example an empty string).
        RuntimeError: If ``git clone`` or the final ``git rev-parse`` fails.
    """
    url = normalize_url(url_or_path)
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Local-path mode: no clone, just record the SHA if it's a git repo.
    if os.path.isdir(url):
        local = Path(url)
        try:
            sha = _git("rev-parse", "HEAD", cwd=local)
        except Exception:
            # Deliberately best-effort: a plain directory (or a missing git
            # binary) must not prevent using the local files.
            sha = "no-git"
        return CloneResult(url=str(local), local_path=local, sha=sha, cached=True)

    slug = slugify(url)
    # An empty or dot-only name would make ``target`` the cache directory
    # itself or its parent, and the cleanup below would then delete it.
    if slug in ("", ".", ".."):
        raise ValueError(
            f"cannot derive a cache directory name from {url_or_path!r}"
        )
    target = cache_dir / slug

    if target.exists():
        # Trust an entry only if it is a repository in its own right. Run
        # inside a leftover plain directory, ``git rev-parse`` walks upwards
        # and would report the HEAD of whatever repository encloses the cache.
        if not force and (target / ".git").exists():
            try:
                sha = _git("rev-parse", "HEAD", cwd=target)
            except Exception:
                sha = None  # broken entry (e.g. interrupted clone): re-clone
            if sha is not None:
                return CloneResult(url=url, local_path=target, sha=sha, cached=True)
        shutil.rmtree(target, ignore_errors=True)

    args = ["clone", "--filter=blob:none"]
    if depth > 0:
        args += ["--depth", str(depth)]
    # "--" ends option parsing so an input beginning with "-" is always
    # treated as the repository argument, never as a git option.
    args += ["--", url, str(target)]
    _git(*args)
    sha = _git("rev-parse", "HEAD", cwd=target)
    return CloneResult(url=url, local_path=target, sha=sha, cached=False)
|