File size: 4,060 Bytes
03815d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Chakravyuh — reproducibility + dev convenience targets
#
# Designed so a fresh clone can produce every README claim with one command.
# All targets assume Python 3.11+ and a fresh virtualenv.

# User-tunable knobs — all use `?=` so they can be overridden from the
# environment or the command line, e.g. `make SEED=7 ADAPTER=... eval-v2`.
PYTHON ?= python
PIP    ?= pip
# Hugging Face adapter id evaluated by `eval-v2`.
ADAPTER ?= ujjwalpardeshi/chakravyuh-analyzer-lora-v2
# Benchmark scenario file (JSONL) the evaluation runs against.
BENCH  ?= data/chakravyuh-bench-v0/scenarios.jsonl
# RNG seed for reproducible evaluation runs.
SEED   ?= 42

# Every target here is a command, not a file — declare them all phony so a
# stray file with the same name can never make one appear "up to date".
.PHONY: help install test lint format reproduce \
        eval-v2 bootstrap smoke-test link-check link-check-http clean

# Print a one-line summary of each public target. Maintained by hand — keep
# this list in sync when adding or renaming targets below.
help:
	@echo "Chakravyuh make targets:"
	@echo "  install          Install repo + LLM + eval extras"
	@echo "  test             Run pytest (341 collected; 338 pass + 3 skip)"
	@echo "  smoke-test       In-process env reset+step smoke test (~5s, no GPU)"
	@echo "  link-check       Check every local README link / asset path resolves"
	@echo "  link-check-http  HEAD-probe every http(s) README link (allowed-fail; external)"
	@echo "  lint             Run ruff lint (no auto-fix)"
	@echo "  format           Run black + ruff --fix"
	@echo "  eval-v2          Re-run v2 evaluation against the bench (~10 min CPU cached, ~2-4h fresh GPU)"
	@echo "  bootstrap        Compute 10k-iteration bootstrap CIs from the eval (~1 min CPU)"
	@echo "  reproduce        eval-v2 + bootstrap (numbers within 0.5pp of README; set CHAKRAVYUH_SKIP_INFERENCE=1 to use cached scores)"
	@echo "  clean            Remove pyc / pycache / build artifacts"

# Editable install of this repo with the `llm` and `eval` extras groups
# (the extras names come from the project's pyproject — TODO confirm).
install:
	$(PIP) install -e '.[llm,eval]'

# Run the full pytest suite verbosely; fails the make run on any test failure.
test:
	$(PYTHON) -m pytest tests/ -v

# Fast in-process sanity check of the environment (reset + step); per the
# help text above it takes ~5s and needs no GPU.
smoke-test:
	$(PYTHON) scripts/smoke_test.py

# Verify every local README link target exists on disk. Pipeline: extract
# markdown `](target)` spans, keep only the target, drop http(s) links, pure
# `#anchor` links and mailto: links, then strip any `#anchor` suffix before
# testing with `[ -e ]`. Fails (exit 1) with a count if anything is missing.
# NOTE(review): paths containing spaces would be split by the shell for-loop —
# assumed absent from README.md; confirm if spaced filenames are ever added.
link-check:
	@echo "Checking local file references in README.md..."
	@missing=0; \
	for path in $$(grep -oE '\]\([^)]+\)' README.md | sed -E 's/\]\(([^)]+)\).*/\1/' | grep -v '^http' | grep -v '^#' | grep -v '^mailto:' | cut -d'#' -f1); do \
	    [ -e "$$path" ] || { echo "BROKEN: $$path"; missing=$$((missing+1)); }; \
	done; \
	if [ "$$missing" -eq 0 ]; then echo "All local README links resolve."; else echo "$$missing broken link(s)."; exit 1; fi

# Best-effort external link probe. HEAD requests with a short timeout; we accept
# 2xx, redirects (301/302/307/308), and bot-blocking / rate-limit responses
# (401/403/405/406/429 — some hosts block bots: newsmeter, pib, etc.), but flag
# 404/410, server errors, and connection failures (reported as code 000). Skips
# localhost URLs and URLs containing shell glue (` ` or `&&`), which are
# illustrative command snippets, not real links. This target is allowed to fail
# in CI because external availability is flaky.
# Acceptance policy is documented in the comment above. Implementation notes:
# `sed` strips trailing backtick/quote/comma punctuation that markdown leaves
# glued to URLs; `sort -u` dedupes; curl's `-L` follows redirects so `code`
# is the final status, and `|| echo 000` maps connection failures/timeouts to
# a code the `case` statement flags as broken.
link-check-http:
	@echo "HEAD-probing http(s) links in README.md..."
	@bad=0; checked=0; \
	urls=$$(grep -oE 'https?://[^)[:space:]]+' README.md | sed -E 's/[`",]+$$//' | sort -u | grep -v 'localhost' | grep -v ' ' | grep -v '&&'); \
	for url in $$urls; do \
	    checked=$$((checked+1)); \
	    code=$$(curl -sS -L --max-time 8 -o /dev/null -w '%{http_code}' -A 'Chakravyuh-LinkCheck/1.0' -I "$$url" 2>/dev/null || echo 000); \
	    case "$$code" in \
	        2*|301|302|307|308|401|403|405|406|429) ;; \
	        *) echo "BROKEN[$$code]: $$url"; bad=$$((bad+1)) ;; \
	    esac; \
	done; \
	echo "Probed $$checked URL(s); $$bad failure(s)."; \
	if [ "$$bad" -ne 0 ]; then exit 1; fi

# Static lint only — never rewrites files (use `format` for auto-fixes).
lint:
	$(PYTHON) -m ruff check chakravyuh_env/ training/ eval/ server/ tests/

# Reformat with black, then apply ruff's safe auto-fixes over the same trees.
# Order matters: black settles layout first so ruff fixes don't get reflowed.
format:
	$(PYTHON) -m black chakravyuh_env/ training/ eval/ server/ tests/
	$(PYTHON) -m ruff check --fix chakravyuh_env/ training/ eval/ server/ tests/

# Re-run the v2 evaluation of $(ADAPTER) against $(BENCH) with a fixed seed,
# writing the scores to logs/eval_v2_reproduce.json. Per the help text: ~10 min
# on CPU with cached inference, ~2-4h on a fresh GPU run.
# `mkdir -p logs` guards the fresh-clone case — logs/ is presumably not
# checked in (TODO confirm), and the eval script would otherwise fail on open.
eval-v2:
	@mkdir -p logs
	$(PYTHON) eval/mode_c_real_cases.py \
	  --model-id $(ADAPTER) \
	  --bench $(BENCH) \
	  --seed $(SEED) \
	  --output logs/eval_v2_reproduce.json

# Compute 10,000-iteration bootstrap confidence intervals from the eval
# output (~1 min CPU). Depends on eval-v2 so the input file is fresh.
# `mkdir -p logs` is defensive and idempotent: it keeps this rule correct on a
# fresh clone even if the eval-v2 recipe is changed independently.
bootstrap: eval-v2
	@mkdir -p logs
	$(PYTHON) eval/bootstrap_ci.py \
	  --eval-file logs/eval_v2_reproduce.json \
	  --iterations 10000 \
	  --output logs/bootstrap_v2_reproduce.json

# One-shot reproduction for a fresh clone: install deps, run the eval, compute
# bootstrap CIs (via the prerequisite chain), then point at the two output
# files to compare against the README's claims (target: within 0.5pp).
reproduce: install bootstrap
	@echo ""
	@echo "Reproduction complete. Compare these against README claims (target: within 0.5pp):"
	@echo "  - logs/eval_v2_reproduce.json     # detection, FPR, F1, per-difficulty"
	@echo "  - logs/bootstrap_v2_reproduce.json # 95% CI bands"

# Remove Python bytecode, tool caches, and packaging artifacts generated by
# the targets above. The `2>/dev/null || true` on the find commands tolerates
# directories disappearing mid-traversal (find complains after `-exec rm -rf`
# deletes a directory it was about to descend into).
# Also clears .pytest_cache/ and .ruff_cache/, which the `test`, `lint`, and
# `format` targets leave behind.
clean:
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
	find . -type f -name '*.pyc' -delete 2>/dev/null || true
	rm -rf .pytest_cache/ .ruff_cache/
	rm -rf build/ dist/ *.egg-info/