Spaces:

adesh01
/

civicsetu

Running

adeshboudh16

feat(eval): RAGAS evaluation framework + RAG pipeline improvements

f8b04c3 9 days ago

6.02 kB

	# Every target in this file is a command rather than a file on disk. Declaring
	# them phony stops a stray file named e.g. "test" or "clean" from masking the
	# rule. eval-reset-all is listed here alongside eval-reset (it is a defined
	# target and must be phony as well).
	.PHONY: help install dev serve ingest lint format typecheck test test-cov e2e \
		eval eval-p1 eval-p2 eval-smoke eval-smoke-p1 eval-smoke-p2 \
		eval-collect eval-score eval-score-smoke eval-large eval-reset eval-reset-all \
		docker-up docker-down clean \
		frontend-install frontend-dev frontend-build frontend-start frontend-lint frontend-typecheck

	# Print a grouped overview of the available targets.
	# Keep this list in sync with the rules below: every public target should have
	# one line here. Descriptions are aligned in a fixed column for readability.
	help:
		@echo "CivicSetu — available commands:"
		@echo ""
		@echo "  Setup:"
		@echo "    make install             Install all dependencies"
		@echo "    make dev                 Install with dev + eval extras"
		@echo ""
		@echo "  Run:"
		@echo "    make docker-up           Start PostgreSQL + pgvector + Neo4j"
		@echo "    make docker-down         Stop all containers"
		@echo "    make serve               Start FastAPI server (hot reload)"
		@echo ""
		@echo "  Data:"
		@echo "    make ingest              Ingest all 5 jurisdictions (Central, MH, UP, KA, TN)"
		@echo ""
		@echo "  Quality:"
		@echo "    make lint                Run ruff linter"
		@echo "    make format              Run ruff formatter"
		@echo "    make typecheck           Run mypy"
		@echo "    make test                Run unit test suite"
		@echo "    make test-cov            Run tests with coverage report"
		@echo "    make e2e                 Run 12-case E2E query benchmark"
		@echo "    make eval                Full eval — both phases, default RAG LLMs + Groq judge"
		@echo "    make eval-p1             Phase 1 only: graph invocation → eval_phase1_results.json"
		@echo "    make eval-p2             Phase 2 only: RAGAS scoring from cached phase 1"
		@echo "    make eval-smoke          5-row smoke test — both phases"
		@echo "    make eval-smoke-p1       5-row smoke test — phase 1 only"
		@echo "    make eval-smoke-p2       5-row smoke test — phase 2 only (needs eval-smoke-p1 first)"
		@echo "    make eval-large          Full eval with osmapi qwen3.5-397b graph model"
		@echo "    make eval-reset          Delete phase 1 cache to force fresh graph invocation"
		@echo "    make eval-reset-all      Delete phase 1 cache and eval_results.json for a fully fresh eval"
		@echo ""
		@echo "  Judge override:       make eval-p2 JUDGE_PROVIDER=gemini JUDGE_MODEL=gemini/gemini-2.5-flash-lite"
		@echo "  Judge override:       make eval-p2 JUDGE_PROVIDER=osmapi JUDGE_MODEL=qwen3.5-397b-a17b"
		@echo "  Graph model override: make eval-p1 EVAL_PRIMARY_MODEL=neysa/qwen3.5-122b-a10b"
		@echo ""
		@echo "  Frontend:"
		@echo "    make frontend-install    Install frontend dependencies (npm install)"
		@echo "    make frontend-dev        Run the frontend dev script (npm run dev)"
		@echo "    make frontend-build      Run the frontend build script (npm run build)"
		@echo "    make frontend-start      Run the frontend start script (npm run start)"
		@echo "    make frontend-lint       Run the frontend lint script (npm run lint)"
		@echo "    make frontend-typecheck  Run the frontend typecheck script (npm run typecheck)"
		@echo ""
		@echo "    make clean               Remove __pycache__, .pyc files and .pytest_cache"

	# ── Setup ─────────────────────────────────────────────────────────────────────
	# install: sync the project environment with uv (no optional extras requested).
	install:
		uv sync

	# dev: same sync, additionally requesting the "dev" and "eval" extras.
	dev:
		uv sync --extra dev --extra eval

	# serve: run the civicsetu.api.main:app ASGI app under uvicorn, listening on
	# all interfaces at port 8000 with auto-reload and info-level logging.
	serve:
		uv run uvicorn civicsetu.api.main:app \
			--host 0.0.0.0 --port 8000 \
			--reload --log-level info

	# ingest: run the ingestion script inside the uv-managed environment.
	ingest:
		uv run python scripts/ingest.py

	# ── Static checks ─────────────────────────────────────────────────────────────
	# lint: ruff lint pass over src/ and tests/.
	lint:
		uv run ruff check src/ tests/

	# format: ruff formatter over src/ and tests/ (rewrites files in place).
	format:
		uv run ruff format src/ tests/

	# typecheck: mypy over src/ only.
	typecheck:
		uv run mypy src/

	# ── Tests ─────────────────────────────────────────────────────────────────────
	# test: verbose pytest run over tests/ with short tracebacks.
	test:
		uv run pytest tests/ -v --tb=short

	# test-cov: quiet pytest run that reports coverage for the civicsetu package,
	# listing missing lines in the terminal.
	test-cov:
		uv run pytest tests/ --cov=civicsetu --cov-report=term-missing -q

	# e2e: run the end-to-end query script; PYTHONUTF8=1 puts Python in UTF-8 mode.
	e2e:
		PYTHONUTF8=1 uv run python scripts/test_e2e_queries.py

	# ── Eval ──────────────────────────────────────────────────────────────────────
	# Phase 1 uses the default RAG app LLMs from .env (gemini-2.5-flash-lite chain).
	# Override the graph model: make eval-p1 EVAL_PRIMARY_MODEL=qwen3.5-122b-a10b
	# (make exports command-line variables to recipe environments, so the eval-p1
	# recipe does not need to mention EVAL_PRIMARY_MODEL explicitly).
	# NOTE(review): the help text shows this override as neysa/qwen3.5-122b-a10b —
	# confirm which model id form scripts/run_eval.py expects.
	# The default judge comes from scripts/run_eval.py / .env.
	# Override the judge provider/model:
	# make eval-p2 JUDGE_PROVIDER=gemini JUDGE_MODEL=gemini/gemini-2.5-flash-lite
	# make eval-p2 JUDGE_PROVIDER=osmapi JUDGE_MODEL=qwen3.5-397b-a17b
	# make eval-p2 JUDGE_PROVIDER=groq JUDGE_MODEL=llama-3.3-70b-versatile
	# Gemini judge uses GEMINI_API_KEY_2 for both LLM and embeddings.
	# Groq judge uses GROQ_API_KEY_2 (or GROQ_API_KEY / JUDGE_GROQ_API_KEY) for LLM, GEMINI_API_KEY_2 for embeddings.
	# osmapi judge uses OSMAPI_API_KEY for LLM, GEMINI_API_KEY_2 for embeddings.
	#
	# MAKE_JUDGE_ENV expands to a shell environment prefix: "JUDGE_PROVIDER=<v> "
	# and/or "JUDGE_MODEL=<v> " for whichever of the two variables is non-empty, and
	# to nothing when neither is set. Each piece carries its own trailing space, so
	# recipes write $(MAKE_JUDGE_ENV)uv with no separator in between. It is a
	# recursive (=) variable, so it is expanded at the point each recipe uses it.
	MAKE_JUDGE_ENV = $(if $(JUDGE_PROVIDER),JUDGE_PROVIDER=$(JUDGE_PROVIDER) )$(if $(JUDGE_MODEL),JUDGE_MODEL=$(JUDGE_MODEL) )

	# eval: run scripts/run_eval.py with no EVAL_PHASE set (full run, both phases).
	# Any JUDGE_PROVIDER / JUDGE_MODEL override is passed through MAKE_JUDGE_ENV.
	eval:
		PYTHONUTF8=1 $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py

	# eval-p1: phase 1 only (EVAL_PHASE=1) — graph invocation, whose results are
	# saved to eval_phase1_results.json. No judge variables are passed here.
	eval-p1:
		PYTHONUTF8=1 EVAL_PHASE=1 uv run python scripts/run_eval.py

	# eval-p2: phase 2 only (EVAL_PHASE=2) — RAGAS scoring from the cached phase 1
	# results. The guard fails fast with a readable message when the cache file is
	# missing. It uses the shell's || operator: if `test -f` fails, the subshell
	# prints the error and exits 1, which aborts the recipe.
	eval-p2:
		@test -f eval_phase1_results.json || (echo "ERROR: eval_phase1_results.json not found — run 'make eval-p1' first" && exit 1)
		PYTHONUTF8=1 EVAL_PHASE=2 $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py

	# ── Smoke tests (EVAL_LIMIT=5) ────────────────────────────────────────────────
	# eval-smoke: both phases on a 5-row limit, honouring any judge override.
	eval-smoke:
		PYTHONUTF8=1 EVAL_LIMIT=5 $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py

	# eval-smoke-p1: phase 1 only on a 5-row limit; no judge variables are passed.
	eval-smoke-p1:
		PYTHONUTF8=1 EVAL_PHASE=1 EVAL_LIMIT=5 uv run python scripts/run_eval.py

	# eval-smoke-p2: phase 2 only on a 5-row limit. Requires the phase 1 cache file;
	# the guard uses the shell's || operator so a missing file prints the error and
	# aborts the recipe with exit status 1.
	eval-smoke-p2:
		@test -f eval_phase1_results.json || (echo "ERROR: eval_phase1_results.json not found — run 'make eval-smoke-p1' first" && exit 1)
		PYTHONUTF8=1 EVAL_PHASE=2 EVAL_LIMIT=5 $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py

	# eval-large: full eval with EVAL_PRIMARY_MODEL pinned to qwen3.5-397b-a17b
	# (the large graph model via osmapi); judge overrides still apply.
	eval-large:
		PYTHONUTF8=1 EVAL_PRIMARY_MODEL=qwen3.5-397b-a17b $(MAKE_JUDGE_ENV)uv run python scripts/run_eval.py

	# ── Cache resets ──────────────────────────────────────────────────────────────
	# eval-reset: drop only the phase 1 results file. $(RM) is make's built-in
	# "rm -f", so a missing file is not an error.
	eval-reset:
		$(RM) eval_phase1_results.json
		@echo "Phase 1 cache cleared — next eval-p1 will re-invoke the graph"

	# eval-reset-all: drop the phase 1 results file and eval_results.json.
	eval-reset-all:
		$(RM) eval_phase1_results.json eval_results.json
		@echo "Both caches cleared — next eval will run fully fresh"

	# Backward-compatible aliases: each old name has no recipe of its own and simply
	# depends on its replacement target.
	eval-collect:     eval-p1
	eval-score:       eval-p2
	eval-score-smoke: eval-smoke-p2

	# ── Containers ────────────────────────────────────────────────────────────────
	# docker-up: start the compose stack in infra/ detached, pause five seconds,
	# then list container status. Each recipe line runs in its own shell, so the
	# `cd infra` on the first line does not carry over — hence the explicit
	# -f infra/docker-compose.yml on the status line.
	docker-up:
		cd infra && docker compose up -d
		@echo "Waiting for services to be healthy..."
		@sleep 5
		@docker compose -f infra/docker-compose.yml ps

	# docker-down: stop and remove the compose stack defined in infra/.
	docker-down:
		cd infra && docker compose down

	# clean: remove Python bytecode caches and pytest caches below the current
	# directory.
	#  * -prune stops find from descending into a directory it is about to delete,
	#    which avoids "No such file or directory" noise.
	#  * stderr is discarded and `|| true` keeps the target best-effort: a failed
	#    find must not abort the remaining clean-up steps.
	clean:
		find . -type d -name "__pycache__" -prune -exec rm -rf {} + 2>/dev/null || true
		find . -name "*.pyc" -delete 2>/dev/null || true
		find . -name ".pytest_cache" -prune -exec rm -rf {} + 2>/dev/null || true

	# ── Frontend ──────────────────────────────────────────────────────────────────
	# Each target changes into frontend/ and runs one npm command there; the cd and
	# the npm call share a single recipe line so they run in the same shell.

	# frontend-install: install the npm dependencies.
	frontend-install:
		cd frontend && npm install

	# frontend-dev: run the "dev" npm script.
	frontend-dev:
		cd frontend && npm run dev

	# frontend-build: run the "build" npm script.
	frontend-build:
		cd frontend && npm run build

	# frontend-start: run the "start" npm script.
	frontend-start:
		cd frontend && npm run start

	# frontend-lint: run the "lint" npm script.
	frontend-lint:
		cd frontend && npm run lint

	# frontend-typecheck: run the "typecheck" npm script.
	frontend-typecheck:
		cd frontend && npm run typecheck