# civicsetu / Makefile
# adeshboudh16
# feat(eval): RAGAS evaluation framework + RAG pipeline improvements
# f8b04c3
# Every target in this file is a command name rather than a file to build.
# Declaring them phony keeps a stray file of the same name from making the
# target look up to date. eval-reset-all is included alongside eval-reset.
.PHONY: help install dev serve ingest lint format typecheck test test-cov e2e \
    eval eval-p1 eval-p2 eval-smoke eval-smoke-p1 eval-smoke-p2 \
    eval-collect eval-score eval-score-smoke eval-large eval-reset eval-reset-all \
    docker-up docker-down clean \
    frontend-install frontend-dev frontend-build frontend-start frontend-lint frontend-typecheck
# Print a summary of the available targets and the override variables that
# the eval targets accept. Each line is silenced with @ so only the text shows.
help:
	@echo "CivicSetu — available commands:"
	@echo ""
	@echo " Setup:"
	@echo " make install Install all dependencies"
	@echo " make dev Install with dev + eval extras"
	@echo ""
	@echo " Run:"
	@echo " make docker-up Start PostgreSQL + pgvector + Neo4j"
	@echo " make docker-down Stop all containers"
	@echo " make serve Start FastAPI server (hot reload)"
	@echo ""
	@echo " Data:"
	@echo " make ingest Ingest all 5 jurisdictions (Central, MH, UP, KA, TN)"
	@echo ""
	@echo " Quality:"
	@echo " make lint Run ruff linter"
	@echo " make format Run ruff formatter"
	@echo " make typecheck Run mypy"
	@echo " make test Run unit test suite"
	@echo " make test-cov Run tests with coverage report"
	@echo " make e2e Run 12-case E2E query benchmark"
	@echo " make eval Full eval — both phases, default RAG LLMs + Groq judge"
	@echo " make eval-p1 Phase 1 only: graph invocation → eval_phase1_results.json"
	@echo " make eval-p2 Phase 2 only: RAGAS scoring from cached phase 1"
	@echo " make eval-smoke 5-row smoke test — both phases"
	@echo " make eval-smoke-p1 5-row smoke test — phase 1 only"
	@echo " make eval-smoke-p2 5-row smoke test — phase 2 only (needs eval-smoke-p1 first)"
	@echo " make eval-large Full eval with osmapi qwen3.5-397b graph model"
	@echo " make eval-reset Delete phase 1 cache to force fresh graph invocation"
	@echo " make eval-reset-all Delete phase 1 cache and eval_results.json"
	@echo ""
	@echo " Judge override: make eval-p2 JUDGE_PROVIDER=gemini JUDGE_MODEL=gemini/gemini-2.5-flash-lite"
	@echo " Judge override: make eval-p2 JUDGE_PROVIDER=osmapi JUDGE_MODEL=qwen3.5-397b-a17b"
	@echo " Graph model override: make eval-p1 EVAL_PRIMARY_MODEL=neysa/qwen3.5-122b-a10b"
	@echo ""
	@echo " Frontend:"
	@echo " make frontend-install Run npm install in frontend/"
	@echo " make frontend-dev Run npm run dev in frontend/"
	@echo " make frontend-build Run npm run build in frontend/"
	@echo " make frontend-start Run npm run start in frontend/"
	@echo " make frontend-lint Run npm run lint in frontend/"
	@echo " make frontend-typecheck Run npm run typecheck in frontend/"
	@echo ""
	@echo " make clean Remove __pycache__ and .pyc files"
# Sync the environment with uv, without any optional extras.
install:
	uv sync
# Sync the environment with uv, additionally enabling the "dev" and "eval"
# extras.
dev:
	uv sync --extra dev --extra eval
# Launch uvicorn for civicsetu.api.main:app, bound to 0.0.0.0:8000, with
# auto-reload on and info-level logging.
serve:
	uv run uvicorn civicsetu.api.main:app \
		--host 0.0.0.0 --port 8000 \
		--reload --log-level info
# Run the ingestion script (scripts/ingest.py) inside the uv environment.
ingest:
	uv run python scripts/ingest.py
# Lint src/ and tests/ with `ruff check`.
lint:
	uv run ruff check src/ tests/
# Reformat src/ and tests/ with `ruff format`.
format:
	uv run ruff format src/ tests/
# Type-check src/ with mypy.
typecheck:
	uv run mypy src/
# Run pytest over tests/ in verbose mode with short tracebacks.
test:
	uv run pytest tests/ -v --tb=short
# Run pytest over tests/ quietly, measuring coverage of the civicsetu package
# and printing a terminal report that lists the missing lines.
test-cov:
	uv run pytest tests/ --cov=civicsetu --cov-report=term-missing -q
# Run scripts/test_e2e_queries.py with Python's UTF-8 mode enabled
# (PYTHONUTF8=1).
e2e:
	PYTHONUTF8=1 uv run python scripts/test_e2e_queries.py
# ── Eval ──────────────────────────────────────────────────────────────────────
# Phase 1 uses default RAG app LLMs from .env (gemini-2.5-flash-lite chain).
# Override graph model: make eval-p1 EVAL_PRIMARY_MODEL=qwen3.5-122b-a10b
# Default judge comes from scripts/run_eval.py / .env.
# Override judge provider/model:
# make eval-p2 JUDGE_PROVIDER=gemini JUDGE_MODEL=gemini/gemini-2.5-flash-lite
# make eval-p2 JUDGE_PROVIDER=osmapi JUDGE_MODEL=qwen3.5-397b-a17b
# make eval-p2 JUDGE_PROVIDER=groq JUDGE_MODEL=llama-3.3-70b-versatile
# Gemini judge uses GEMINI_API_KEY_2 for both LLM and embeddings.
# Groq judge uses GROQ_API_KEY_2 (or GROQ_API_KEY / JUDGE_GROQ_API_KEY) for LLM, GEMINI_API_KEY_2 for embeddings.
# osmapi judge uses OSMAPI_API_KEY for LLM, GEMINI_API_KEY_2 for embeddings.
# Shell environment prefix for the judge overrides. Each $(if ...) emits
# "NAME=value " only when the corresponding make variable is non-empty, so the
# whole thing expands to nothing when no override is given.
# The trailing space inside each $(if ...) is significant: recipes write
# $(MAKE_JUDGE_ENV)uv with no separator, so that space is what splits the
# assignment from the command that follows.
# Recursive (=) assignment: expanded each time a recipe references it.
MAKE_JUDGE_ENV = $(if $(JUDGE_PROVIDER),JUDGE_PROVIDER=$(JUDGE_PROVIDER) )$(if $(JUDGE_MODEL),JUDGE_MODEL=$(JUDGE_MODEL) )
# Full eval (both phases): run scripts/run_eval.py without setting EVAL_PHASE.
# Any JUDGE_PROVIDER / JUDGE_MODEL override is forwarded via MAKE_JUDGE_ENV.
eval:
	PYTHONUTF8=1 $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py
# Phase 1 only (EVAL_PHASE=1): graph invocation, saved to
# eval_phase1_results.json. No judge variables are passed on this target.
eval-p1:
	PYTHONUTF8=1 EVAL_PHASE=1 uv run python scripts/run_eval.py
# Phase 2 only (EVAL_PHASE=2): RAGAS scoring from the cached phase 1 output.
# The first recipe line aborts with a message on stderr when
# eval_phase1_results.json is missing, before the scoring run starts.
# Any JUDGE_PROVIDER / JUDGE_MODEL override is forwarded via MAKE_JUDGE_ENV.
eval-p2:
	@test -f eval_phase1_results.json || { echo "ERROR: eval_phase1_results.json not found — run 'make eval-p1' first" >&2; exit 1; }
	PYTHONUTF8=1 EVAL_PHASE=2 $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py
# Smoke test, both phases: same command as `eval`, capped with EVAL_LIMIT=5.
eval-smoke:
	PYTHONUTF8=1 EVAL_LIMIT=5 $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py
# Smoke test, phase 1 only: EVAL_PHASE=1 with EVAL_LIMIT=5.
eval-smoke-p1:
	PYTHONUTF8=1 EVAL_PHASE=1 EVAL_LIMIT=5 uv run python scripts/run_eval.py
# Smoke test, phase 2 only: EVAL_PHASE=2 with EVAL_LIMIT=5.
# The first recipe line aborts with a message on stderr when
# eval_phase1_results.json is missing, before the scoring run starts.
# Any JUDGE_PROVIDER / JUDGE_MODEL override is forwarded via MAKE_JUDGE_ENV.
eval-smoke-p2:
	@test -f eval_phase1_results.json || { echo "ERROR: eval_phase1_results.json not found — run 'make eval-smoke-p1' first" >&2; exit 1; }
	PYTHONUTF8=1 EVAL_PHASE=2 EVAL_LIMIT=5 $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py
# Full eval with the graph model pinned to qwen3.5-397b-a17b through
# EVAL_PRIMARY_MODEL (the large model served via osmapi).
# Any JUDGE_PROVIDER / JUDGE_MODEL override is forwarded via MAKE_JUDGE_ENV.
eval-large:
	PYTHONUTF8=1 EVAL_PRIMARY_MODEL=qwen3.5-397b-a17b $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py
# Delete the phase 1 cache file. rm -f keeps this a no-op when the file is
# already absent.
eval-reset:
	rm -f eval_phase1_results.json
	@echo "Phase 1 cache cleared — next eval-p1 will re-invoke the graph"
# Delete both eval output files: the phase 1 cache and eval_results.json.
# rm -f keeps this a no-op when either file is already absent.
eval-reset-all:
	rm -f eval_phase1_results.json eval_results.json
	@echo "Both caches cleared — next eval will run fully fresh"
# Backward-compatible aliases: each old target name has no recipe of its own
# and simply depends on the target that replaced it.
eval-collect:     eval-p1
eval-score:       eval-p2
eval-score-smoke: eval-smoke-p2
# Start the compose stack in infra/ detached, pause five seconds, then list
# container status. The cd is chained with && because each recipe line runs in
# its own shell; the status line names the compose file explicitly for the
# same reason.
docker-up:
	cd infra && docker compose up -d
	@echo "Waiting for services to be healthy..."
	@sleep 5
	@docker compose -f infra/docker-compose.yml ps
# Bring down the compose stack defined in infra/.
docker-down:
	cd infra && docker compose down
# Remove Python bytecode and pytest cache artefacts under the current
# directory. Each find discards stderr and is followed by `|| true`, so a
# failing find never fails the target.
clean:
	find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
	find . -name "*.pyc" -delete 2>/dev/null || true
	find . -name ".pytest_cache" -exec rm -rf {} + 2>/dev/null || true
# Frontend
# Install the frontend's npm dependencies (runs inside frontend/).
frontend-install:
	cd frontend && npm install
# Run the frontend's "dev" npm script (inside frontend/).
frontend-dev:
	cd frontend && npm run dev
# Run the frontend's "build" npm script (inside frontend/).
frontend-build:
	cd frontend && npm run build
# Run the frontend's "start" npm script (inside frontend/).
frontend-start:
	cd frontend && npm run start
# Run the frontend's "lint" npm script (inside frontend/).
frontend-lint:
	cd frontend && npm run lint
# Run the frontend's "typecheck" npm script (inside frontend/).
frontend-typecheck:
	cd frontend && npm run typecheck