File size: 3,020 Bytes
e3a472a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Clone a git repository into a local cache directory.

Shells out to the `git` command-line tool via `subprocess`. Shallow-clones
(depth=1) by default — for retrieval we don't need history, and shallow
makes the Linux kernel ingest in seconds instead of minutes.
"""
from __future__ import annotations
import os
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


# Matches remote URLs that begin with "http://", "https://" or "git@".
# Group 2 captures the remainder (host and repository path) without an
# optional trailing ".git" suffix or final "/"; slugify() derives its slug
# from that group.
URL_RE = re.compile(r"^(https?://|git@)([\w./:-]+?)(\.git)?/?$")


@dataclass
class CloneResult:
    """Outcome of a clone() call: where the repository lives and its HEAD."""

    # Normalized remote URL, or the local directory path rendered as a string.
    url: str
    # Directory holding the checkout: cache_dir/<slug> for a remote, or the
    # local directory itself when a path was given.
    local_path: Path
    # Output of `git rev-parse HEAD`; "no-git" when that command failed for a
    # local directory.
    sha: str
    # False only when a fresh `git clone` was run; True when an existing cache
    # entry or a local directory was returned.
    cached: bool


def normalize_url(url_or_path: str) -> str:
    """Turn user input into something ``git clone`` (or the local FS) accepts.

    Accepted forms, checked in this order:

    * an existing local directory -> its absolute path;
    * a remote URL with a scheme (``https://``, ``ssh://``, ``git://`` ...)
      or in scp-like form (``git@host:owner/repo``) -> returned unchanged;
    * ``owner/repo`` shorthand -> ``https://github.com/owner/repo.git``;
    * anything else (absolute path that does not exist, bare word) ->
      returned unchanged after stripping surrounding whitespace.

    Args:
        url_or_path: Raw user input; leading/trailing whitespace is ignored.

    Returns:
        The normalized URL or path string.
    """
    s = url_or_path.strip()
    if os.path.isdir(s):
        return os.path.abspath(s)
    if s.startswith("git@"):
        return s

    # Same rule git uses to recognise remote syntax: a ":" that appears before
    # any "/" means a URL scheme ("https://...") or scp-like "host:path".
    # Testing for this instead of a bare "http" prefix keeps shorthands such
    # as "httpie/cli" working and leaves "ssh://..." URLs untouched.
    colon = s.find(":")
    slash = s.find("/")
    if colon != -1 and (slash == -1 or colon < slash):
        return s

    if slash > 0:
        # owner/repo shorthand -> github. Drop a trailing "/" or ".git" first
        # so the suffix appended below is not duplicated.
        repo = s.rstrip("/")
        if repo.endswith(".git"):
            repo = repo[: -len(".git")]
        if "/" in repo:
            return f"https://github.com/{repo}.git"
    return s


def slugify(url: str) -> str:
    """Return a stable, filesystem-friendly single path component for *url*.

    For an existing local directory the slug is the directory's own name.
    For a remote URL matched by ``URL_RE`` it is the host-and-path part with
    every run of characters outside ``[a-zA-Z0-9._-]`` collapsed to ``_``;
    any other string is sanitized the same way in full.

    The result is never ``""``, ``"."`` or ``".."``: joined onto a cache
    directory those would resolve to the cache directory itself or its
    parent, so ``"_"`` is returned instead.

    Args:
        url: A normalized URL or a local directory path.

    Returns:
        A non-empty name containing no path separators for URL input.
    """
    if os.path.isdir(url):
        # Resolve first so that "." and ".." yield the real directory name
        # instead of "" or "..".
        slug = Path(os.path.abspath(url)).name
    else:
        m = URL_RE.match(url)
        body = m.group(2) if m else url
        slug = re.sub(r"[^a-zA-Z0-9._-]+", "_", body)

    if slug in ("", ".", ".."):
        return "_"
    return slug


def _git(*args: str, cwd: Optional[Path] = None) -> str:
    proc = subprocess.run(
        ["git", *args],
        cwd=str(cwd) if cwd else None,
        capture_output=True, text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"git {' '.join(args)} failed: {proc.stderr.strip()}")
    return proc.stdout.strip()


def clone(
    url_or_path: str,
    cache_dir: str | Path = ".repomind_cache/repos",
    depth: int = 1,
    force: bool = False,
) -> CloneResult:
    """Clone a repository into ``cache_dir/<slug>`` and report its HEAD.

    If *url_or_path* names an existing local directory nothing is cloned;
    the directory is returned as-is together with its HEAD commit (or
    ``"no-git"`` when that cannot be determined).

    Args:
        url_or_path: Remote URL, ``owner/repo`` shorthand, or local directory.
        cache_dir: Directory under which clones are stored; created if missing.
        depth: History depth passed to ``git clone --depth``; a value
            ``<= 0`` omits the option.
        force: Discard any existing cache entry and clone again.

    Returns:
        A ``CloneResult``; ``cached`` is False only when a fresh clone ran.

    Raises:
        RuntimeError: If ``git clone`` or the final ``git rev-parse`` fails.
    """
    url = normalize_url(url_or_path)
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Local-path mode — no clone, just record the SHA if it's a git repo.
    if os.path.isdir(url):
        local = Path(url)
        try:
            sha = _git("rev-parse", "HEAD", cwd=local)
        except Exception:
            # Best effort: not a repository, or git is unavailable.
            sha = "no-git"
        return CloneResult(url=str(local), local_path=local, sha=sha, cached=True)

    target = cache_dir / slugify(url)
    if target.exists():
        # Require the entry's own .git before trusting it: `git rev-parse`
        # walks up the directory tree, so a leftover non-repo directory
        # inside an enclosing repository would otherwise report the outer
        # repository's HEAD and be mistaken for a valid cache hit.
        if not force and (target / ".git").exists():
            try:
                sha = _git("rev-parse", "HEAD", cwd=target)
            except Exception:
                # Unusable cache entry: fall through, wipe it and re-clone.
                pass
            else:
                return CloneResult(url=url, local_path=target, sha=sha, cached=True)
        shutil.rmtree(target, ignore_errors=True)

    args = ["clone", "--filter=blob:none"]
    if depth > 0:
        args += ["--depth", str(depth)]
    # "--" ends option parsing so a URL starting with "-" cannot be
    # interpreted by git as a command-line option.
    args += ["--", url, str(target)]
    _git(*args)
    sha = _git("rev-parse", "HEAD", cwd=target)
    return CloneResult(url=url, local_path=target, sha=sha, cached=False)