Ashira Pitchayapakayakul committed on
Commit
c9adfa2
·
1 Parent(s): dd483c7

feat: role-driven discoverer - 250+ queries from real SDLC job descriptions


USER CORRECTION: 'It must have no end point — keep going on and on.'
'Each SDLC role has its own core topics it must know, plus different adjacent topics it should know.'

ARCHITECTURE:

agents/role-knowledge-map.json β€” declarative knowledge per SDLC role:
- 17 roles: solution-architect, backend-engineer, frontend-engineer,
mobile-engineer, devops, sre, cloud-engineer, devsecops, cloud-security,
data-engineer, ml-engineer, database-engineer, qa-engineer,
performance-engineer, tech-writer, platform-engineer, ai-agent-engineer
- Each role has CORE skills (must-master) + ADJACENT (should-know)
- Sourced from real FAANG/unicorn/OSS job descriptions 2024-2026
- Plus 21 cross-cutting topics (CAP theorem, 12-factor, DORA, etc.)

TOTAL: 245 role-specific queries + 21 cross-cutting + 9 baseline = ~275 queries

DISCOVERER CHANGES:
- get_queries() reloads role-knowledge-map.json on every cycle
→ user can edit JSON → next cycle picks up new topics
- Each query tagged with role_tag (e.g., 'backend-engineer-core')
- DB schema: dataset_seen.role_tag column tracks which role found each dataset
- New table: query_history tracks per-query results + freshness
- log shows role tag: '✅ [sre-engineer-core] dataset-id | license | schema'
- by_role stat in cycle summary
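As a hedged illustration of the by_role tracking (table and column names from the schema in this commit; the dataset rows and role counts below are invented), a coverage summary can be pulled straight from SQLite:

```python
import sqlite3

# Toy in-memory copy of the dataset_seen columns relevant to role tracking;
# the real table carries more columns (license, downloads, quality_score, ...).
con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE dataset_seen (ds_id TEXT PRIMARY KEY, verdict TEXT, role_tag TEXT)")
con.executemany(
    "INSERT INTO dataset_seen VALUES (?,?,?)",
    [("org/a", "integrated", "backend-engineer-core"),
     ("org/b", "integrated", "backend-engineer-core"),
     ("org/c", "integrated", "sre-engineer-core"),
     ("org/d", "rejected:license", "sre-engineer-core")],
)
# Which role's queries are actually finding usable datasets?
coverage = dict(con.execute(
    "SELECT role_tag, COUNT(*) FROM dataset_seen "
    "WHERE verdict = 'integrated' GROUP BY role_tag"
))
print(coverage["backend-engineer-core"], coverage["sre-engineer-core"])  # 2 1
```

The same GROUP BY over role_tag is what the cycle summary's by_role stat amounts to.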

DOWNSTREAM IMPACT:
- Future: dataset-enrich.sh can read role_tag to balance training mix
(e.g., ensure 5-10% of pairs come from each role's domain)
- /status report can show role coverage matrix
- LoRA training data can be balanced per-role (not biased toward most popular)
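A minimal sketch of what that per-role balancing could look like downstream (the function name, the 10% cap, and the pair format are assumptions for illustration — none of this is in the commit):

```python
import random
from collections import defaultdict

def balance_by_role(pairs, cap=0.10, seed=0):
    """Hypothetical rebalancer: downsample so no single role_tag exceeds
    `cap` of the final training mix. Each pair is a dict carrying the
    'role_tag' the discoverer stamped on its source dataset."""
    rng = random.Random(seed)
    by_role = defaultdict(list)
    for p in pairs:
        by_role[p["role_tag"]].append(p)
    limit = max(1, int(cap * len(pairs)))  # per-role ceiling
    balanced = []
    for items in by_role.values():
        balanced.extend(items if len(items) <= limit else rng.sample(items, limit))
    return balanced

# 90 backend pairs vs 10 SRE pairs -> backend capped at 10, SRE kept whole
pairs = ([{"role_tag": "backend-engineer-core"}] * 90
         + [{"role_tag": "sre-engineer-core"}] * 10)
print(len(balance_by_role(pairs)))  # 20
```

Capping rather than upsampling keeps the mix honest: over-represented roles shrink instead of rare roles being duplicated.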

NEVER-ENDING:
- 30-min cycle, no upper bound
- New datasets uploaded daily to HF → discoverer catches within 30 min
- Queries auto-load from JSON → add new topics by editing JSON, no code change
- Stamp DB prevents re-evaluation → each cycle only processes new datasets
- Role coverage grows as new roles emerge (e.g., 'rust-systems-engineer' can be added)
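For example, adding that hypothetical 'rust-systems-engineer' role would be a plain JSON edit to role-knowledge-map.json (the topic strings below are illustrative only, not part of this commit):

```json
"rust-systems-engineer": {
  "core": [
    "rust ownership borrow checker", "tokio async runtime",
    "unsafe rust ffi", "cargo workspace"
  ],
  "adjacent": [
    "llvm codegen", "wasm wasi"
  ]
}
```

The next 30-min cycle picks the new queries up automatically via get_queries().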

Per-role query coverage:

agents/role-knowledge-map.json ADDED
@@ -0,0 +1,355 @@
+{
+  "version": "1.0",
+  "description": "Per-role SDLC knowledge map. Drives hf-dataset-discoverer.py search queries. Each role has CORE skills (must master) and ADJACENT skills (should know). Source: real job descriptions from FAANG / unicorns / OSS projects 2024-2026.",
+  "roles": {
+    "solution-architect": {
+      "core": [
+        "system design", "domain driven design", "bounded context", "aggregate root",
+        "hexagonal architecture", "clean architecture", "onion architecture",
+        "adr architecture decision record", "trade-off analysis",
+        "scalability pattern", "consistency model", "cap theorem", "saga pattern",
+        "event sourcing", "cqrs", "microservices", "monolith first",
+        "integration pattern", "anti-corruption layer", "bff backend for frontend"
+      ],
+      "adjacent": [
+        "threat modeling stride", "data modeling", "api design rest graphql",
+        "monitoring strategy", "capacity planning", "cost optimization",
+        "non-functional requirements", "service level objectives"
+      ]
+    },
+    "backend-engineer": {
+      "core": [
+        "rest api design", "graphql schema", "grpc protobuf",
+        "postgresql", "mysql", "mongodb", "redis cache",
+        "authentication oauth jwt", "authorization rbac abac",
+        "message queue kafka rabbitmq", "background jobs celery sidekiq",
+        "error handling exception", "structured logging traceid",
+        "unit test integration test", "fastapi flask express spring",
+        "go fiber gin", "rust axum actix", "node nestjs", "kotlin ktor"
+      ],
+      "adjacent": [
+        "frontend api consumption", "ci cd pipeline", "docker container",
+        "owasp top 10", "performance profiling pprof", "n+1 query",
+        "circuit breaker", "rate limiting", "idempotency"
+      ]
+    },
+    "frontend-engineer": {
+      "core": [
+        "react hooks suspense", "next.js app router", "vue composition api",
+        "svelte sveltekit", "solid solidstart", "typescript strict",
+        "tailwind css", "css-in-js styled-components", "shadcn radix headless",
+        "state management zustand jotai redux", "tanstack react query",
+        "react testing library", "playwright cypress e2e",
+        "wcag accessibility aria", "web vitals lcp cls inp",
+        "bundle optimization code splitting tree shaking"
+      ],
+      "adjacent": [
+        "backend api integration", "design system tokens",
+        "seo meta tags structured data", "a/b testing experiment",
+        "analytics events", "feature flag", "i18n localization",
+        "progressive web app", "service worker"
+      ]
+    },
+    "mobile-engineer": {
+      "core": [
+        "swiftui combine", "uikit lifecycle",
+        "kotlin jetpack compose", "android navigation",
+        "react native expo", "flutter dart",
+        "offline first cache realm room",
+        "push notification fcm apns", "deep linking universal",
+        "app store optimization aso", "app review guidelines",
+        "crash reporting sentry crashlytics",
+        "memory profiling", "battery optimization"
+      ],
+      "adjacent": [
+        "backend api consumption", "graphql apollo",
+        "analytics amplitude mixpanel", "feature flag",
+        "accessibility voiceover talkback",
+        "in app purchase store kit billing"
+      ]
+    },
+    "devops-engineer": {
+      "core": [
+        "github actions workflow", "gitlab ci", "jenkins pipeline",
+        "circleci", "argocd flux gitops",
+        "docker multi-stage", "buildkit buildx",
+        "kubernetes deployment service ingress",
+        "helm chart kustomize",
+        "terraform module", "pulumi cdk",
+        "ansible chef puppet", "configuration management",
+        "vault secrets sealed-secrets sops",
+        "blue green canary deployment", "feature flag rollout"
+      ],
+      "adjacent": [
+        "networking vpc subnet", "security scanning trivy snyk",
+        "observability prometheus grafana", "finops cost",
+        "container runtime containerd cri-o"
+      ]
+    },
+    "sre-engineer": {
+      "core": [
+        "service level indicator sli", "service level objective slo",
+        "error budget", "burn rate alert", "multi window alerting",
+        "prometheus metrics", "grafana dashboard",
+        "loki promtail fluentd logging", "opentelemetry tracing",
+        "incident response runbook", "blameless postmortem",
+        "five whys root cause", "chaos engineering chaos-mesh litmus",
+        "capacity planning", "load testing k6 locust",
+        "on-call rotation pagerduty"
+      ],
+      "adjacent": [
+        "kubernetes deep", "linux performance ebpf",
+        "networking troubleshooting", "security incident",
+        "cost optimization", "disaster recovery"
+      ]
+    },
+    "cloud-engineer": {
+      "core": [
+        "aws ec2 s3 lambda dynamodb rds vpc iam",
+        "gcp gce gcs cloud-run firestore bigquery",
+        "azure aks cosmos blob",
+        "aws well-architected framework",
+        "multi-region active active passive",
+        "disaster recovery rpo rto",
+        "cost optimization spot reserved savings-plan",
+        "tagging strategy cost allocation",
+        "private subnet nat gateway vpc endpoint",
+        "iam policy least privilege", "kms encryption"
+      ],
+      "adjacent": [
+        "kubernetes eks gke aks", "service mesh istio linkerd",
+        "compliance soc2 hipaa pci", "cdn cloudfront cloudflare",
+        "edge computing lambda-edge"
+      ]
+    },
+    "devsecops-engineer": {
+      "core": [
+        "sast semgrep codeql bandit", "dast zap burp",
+        "sca trivy grype snyk dependabot",
+        "secret scanning gitleaks trufflehog",
+        "container image scan trivy clair",
+        "sbom syft cyclonedx spdx",
+        "sigstore cosign attestation",
+        "slsa supply chain framework",
+        "opa rego conftest policy as code",
+        "shift left security",
+        "vulnerability management cvss epss kev"
+      ],
+      "adjacent": [
+        "incident response forensics", "compliance automation",
+        "threat intelligence mitre att&ck",
+        "zero trust mtls service mesh"
+      ]
+    },
+    "cloud-security": {
+      "core": [
+        "cspm prowler scoutsuite steampipe",
+        "cis benchmark aws azure gcp",
+        "iam audit access analyzer", "least privilege",
+        "encryption at rest in transit kms", "envelope encryption",
+        "key rotation",
+        "waf shield guardduty",
+        "compliance pci-dss hipaa soc2 fedramp gdpr iso27001",
+        "siem splunk elastic security qradar",
+        "cloud detection response cdr",
+        "incident response cloud trail"
+      ],
+      "adjacent": [
+        "network security firewall nsg",
+        "application security owasp",
+        "threat detection lateral movement",
+        "data loss prevention dlp"
+      ]
+    },
+    "data-engineer": {
+      "core": [
+        "airflow dag", "dagster prefect",
+        "spark pyspark", "flink streaming",
+        "kafka connect schema registry",
+        "delta lake iceberg hudi", "lakehouse",
+        "snowflake bigquery redshift",
+        "dbt model test snapshot",
+        "schema evolution avro protobuf",
+        "great expectations dbt-tests data quality",
+        "openlineage data lineage",
+        "cdc debezium"
+      ],
+      "adjacent": [
+        "ml pipeline kubeflow tfx",
+        "observability for data datadog metaflow",
+        "cost optimization warehouse"
+      ]
+    },
+    "ml-engineer": {
+      "core": [
+        "pytorch lightning", "huggingface transformers",
+        "lora qlora peft fine-tuning",
+        "unsloth axolotl", "deepspeed fsdp accelerate",
+        "vllm tgi triton inference", "model serving",
+        "mlflow weights and biases",
+        "feature store feast tecton",
+        "data version control dvc",
+        "rag retrieval augmented generation",
+        "vector database faiss chroma weaviate qdrant",
+        "embedding nomic bge instructor",
+        "reranker cross-encoder bge mxbai",
+        "evaluation lm-eval-harness",
+        "prompt engineering chain of thought"
+      ],
+      "adjacent": [
+        "devops kubernetes",
+        "data engineering",
+        "statistics distribution",
+        "rlhf dpo orpo preference learning",
+        "agent framework langgraph crew autogen"
+      ]
+    },
+    "database-engineer": {
+      "core": [
+        "postgresql tuning", "mysql innodb",
+        "explain analyze query plan",
+        "btree gin gist hash index",
+        "partial index expression index",
+        "table partition list range hash",
+        "vacuum autovacuum bloat",
+        "replication streaming logical",
+        "connection pool pgbouncer",
+        "schema migration sqitch flyway alembic liquibase",
+        "online ddl pt-online-schema-change",
+        "backup pitr point-in-time recovery"
+      ],
+      "adjacent": [
+        "redis caching pattern",
+        "timescaledb timeseries",
+        "clickhouse olap",
+        "graph neo4j",
+        "search elasticsearch opensearch"
+      ]
+    },
+    "qa-engineer": {
+      "core": [
+        "test pyramid", "unit test mock stub",
+        "integration test database",
+        "playwright cypress webdriver e2e",
+        "rest assured supertest api test",
+        "property based testing fast-check hypothesis",
+        "fuzzing afl libfuzzer",
+        "mutation testing pitest stryker",
+        "snapshot test storybook",
+        "contract testing pact",
+        "test factory fixture builder pattern",
+        "coverage line branch path"
+      ],
+      "adjacent": [
+        "ci cd pipeline test stage",
+        "performance testing k6 jmeter",
+        "security testing zap",
+        "accessibility testing axe pa11y",
+        "visual regression chromatic"
+      ]
+    },
+    "performance-engineer": {
+      "core": [
+        "profiling pprof py-spy async-profiler",
+        "load testing k6 locust gatling",
+        "stress testing", "soak test",
+        "p50 p95 p99 latency", "throughput rps",
+        "flame graph",
+        "n+1 query elimination",
+        "caching l1 l2 cdn",
+        "connection pool sizing",
+        "async io goroutine coroutine",
+        "memory profiling heap allocation",
+        "garbage collection tuning"
+      ],
+      "adjacent": [
+        "observability tracing",
+        "database tuning",
+        "kernel ebpf bpftrace",
+        "network latency rtt"
+      ]
+    },
+    "tech-writer": {
+      "core": [
+        "diataxis framework tutorial how-to reference explanation",
+        "openapi swagger redoc",
+        "api design google api guidelines",
+        "readme structure",
+        "adr template",
+        "runbook structure",
+        "code documentation jsdoc rustdoc",
+        "tutorial step by step",
+        "style guide google microsoft chicago"
+      ],
+      "adjacent": [
+        "code reading comprehension",
+        "user research empathy",
+        "i18n internationalization",
+        "screencast tutorial recording"
+      ]
+    },
+    "platform-engineer": {
+      "core": [
+        "internal developer platform idp",
+        "backstage developer portal",
+        "golden path opinionated workflow",
+        "service catalog",
+        "shared ci cd template",
+        "observability infrastructure",
+        "secrets distribution",
+        "feature flag platform",
+        "multi-tenancy"
+      ],
+      "adjacent": [
+        "all sre devops topics",
+        "developer experience dx",
+        "documentation platform"
+      ]
+    },
+    "ai-agent-engineer": {
+      "core": [
+        "react agent reasoning acting",
+        "tool calling function calling",
+        "mcp model context protocol",
+        "agent orchestration langgraph crew autogen",
+        "agentic workflow",
+        "multi agent collaboration",
+        "self critique reflexion",
+        "voyager skill library",
+        "memory episodic procedural",
+        "rag retrieval augmented",
+        "vector search embedding rerank",
+        "prompt engineering xml structured",
+        "constitutional ai safety"
+      ],
+      "adjacent": [
+        "all backend topics",
+        "ml engineering fine-tuning",
+        "evaluation benchmark swe-bench",
+        "cost optimization token budget"
+      ]
+    }
+  },
+  "cross_cutting_topics": [
+    "system design interview", "high scalability",
+    "distributed system consensus raft paxos",
+    "consistency strong eventual causal",
+    "caching strategy ttl invalidation",
+    "rate limiting algorithm token bucket leaky",
+    "load balancer round robin least connection",
+    "circuit breaker hystrix resilience4j",
+    "12 factor app", "twelve factor",
+    "domain language ubiquitous",
+    "code review checklist",
+    "git workflow gitflow trunk-based",
+    "semver conventional commits",
+    "testing best practice",
+    "logging structured json",
+    "tracing opentelemetry w3c",
+    "metrics red use sli",
+    "observability three pillars",
+    "incident severity sev1 sev2",
+    "blameless culture",
+    "engineering excellence dora metrics"
+  ]
+}
bin/hf-dataset-discoverer.py CHANGED
@@ -32,51 +32,38 @@ ALLOWED = {
 DENY_KEYWORDS = ("noncommercial", "non-commercial", "nc-", "-nc", "nc4.0",
                  "llama2", "llama3", "llama-3", "research-only", "personal-use")
 
-# 70+ search queries — broad SDLC + niche coverage
-QUERIES = [
-    # Code generation/instruction
-    "code instruction", "code completion", "code generation", "python instruction",
-    "code review", "code refactoring", "code translation", "code explanation",
-    # Bug-fix / test
-    "bug fix", "test generation", "unit test", "pull request", "diff review",
-    "vulnerability fix", "security patch",
-    # Reasoning / CoT
-    "chain of thought", "math reasoning", "step by step", "reasoning trace",
-    "deepseek r1", "qwq", "o1 reasoning",
-    # Agent / tool
-    "agent trajectory", "tool calling", "function calling", "react agent",
-    "swe-bench", "agentic", "smolagents",
-    # DevSecOps / IR
-    "incident response", "postmortem", "cybersecurity", "vulnerability",
-    "cve", "exploit", "owasp", "threat intelligence", "security audit",
-    "penetration testing", "red team",
-    # SRE / Cloud
-    "system reliability", "sre", "observability", "kubernetes", "terraform",
-    "cloudformation", "aws", "gcp", "azure", "devops",
-    # Data / ML
-    "dbt", "airflow", "spark", "kafka", "etl", "mlops", "model serving",
-    "embedding dataset", "rag dataset",
-    # SQL / DBA
-    "text-to-sql", "sql query", "database query", "schema",
-    # Architecture
-    "software architecture", "design pattern", "domain driven design",
-    "microservices", "event sourcing", "cqrs", "hexagonal",
-    # Frontend
-    "react", "nextjs", "tailwind", "vue", "svelte", "ui component",
-    # Mobile
-    "ios swift", "android kotlin", "react native", "flutter",
-    # Multilingual
-    "multilingual code", "multilingual instruction", "thai instruction",
-    # Domain niches
-    "compiler", "embedded", "rust systems", "go concurrency",
-    "performance optimization", "concurrency",
-    # Doc / API
-    "openapi", "api design", "technical writing", "documentation dataset",
-    # Constitutional / safety
-    "constitutional ai", "safety dataset", "preference dataset", "dpo",
-    # Recent mega-mixes
-    "instruction tuning 2025", "post-training dataset", "sft mixture",
-]
+# Load role-driven query map (auto-rebuilds when role-knowledge-map.json updated)
+def _load_role_queries() -> list[tuple[str, str]]:
+    """Returns list of (query, role) tuples. Each role contributes core + adjacent
+    topics. Plus cross-cutting general queries. Total ~250+ queries auto-generated."""
+    role_map_path = HOME / ".surrogate/agents/role-knowledge-map.json"
+    queries: list[tuple[str, str]] = []
+    if role_map_path.exists():
+        try:
+            data = json.loads(role_map_path.read_text())
+        except Exception:
+            data = {"roles": {}, "cross_cutting_topics": []}
+        for role, skills in data.get("roles", {}).items():
+            for q in (skills.get("core") or []):
+                queries.append((q, f"{role}-core"))
+            for q in (skills.get("adjacent") or []):
+                queries.append((q, f"{role}-adj"))
+        for q in data.get("cross_cutting_topics") or []:
+            queries.append((q, "cross-cutting"))
+    # Plus baseline queries (NEVER static — discoverer must keep finding)
+    queries.extend([(q, "general") for q in [
+        "instruction tuning 2025", "instruction tuning 2026",
+        "post-training dataset", "sft mixture",
+        "preference dataset dpo orpo",
+        "dataset 2026", "code dataset 2026",
+        "agentic dataset 2026", "reasoning dataset 2026",
+    ]])
+    return queries
+
+
+def get_queries() -> list[tuple[str, str]]:
+    """Reload on each call so role-knowledge-map.json edits take effect immediately."""
+    return _load_role_queries()
 
 
 def log(msg: str):
@@ -100,11 +87,26 @@ def init_db():
             schema_branch TEXT,
             cap INTEGER,
             slug TEXT,
-            verdict TEXT
+            verdict TEXT,
+            role_tag TEXT  -- which role's query found this
         );
         CREATE INDEX IF NOT EXISTS idx_verdict ON dataset_seen(verdict);
         CREATE INDEX IF NOT EXISTS idx_score ON dataset_seen(quality_score DESC);
+        CREATE INDEX IF NOT EXISTS idx_role ON dataset_seen(role_tag);
+
+        CREATE TABLE IF NOT EXISTS query_history (
+            query TEXT PRIMARY KEY,
+            role_tag TEXT,
+            last_run_ts INTEGER NOT NULL,
+            results_count INTEGER DEFAULT 0,
+            new_finds INTEGER DEFAULT 0
+        );
     """)
+    # Migration: add role_tag column if upgrading from v1 schema
+    try:
+        c.execute("ALTER TABLE dataset_seen ADD COLUMN role_tag TEXT")
+    except sqlite3.OperationalError:
+        pass  # already exists
 
 
 def hf_get(url: str, timeout: int = 15):
@@ -254,11 +256,14 @@ def evaluate_one(ds_id: str) -> tuple[str, dict | None]:
 
 
 def stamp(ds_id: str, verdict: str, lic: str = "", dl: int = 0,
-          score: float = 0.0, schema: str = "", cap: int = 0, slug: str = ""):
+          score: float = 0.0, schema: str = "", cap: int = 0, slug: str = "",
+          role_tag: str = ""):
     with sqlite3.connect(DB) as c:
         c.execute(
-            "INSERT OR IGNORE INTO dataset_seen VALUES (?,?,?,?,?,?,?,?,?)",
-            (ds_id, int(time.time()), lic, dl, score, schema, cap, slug, verdict)
+            "INSERT OR IGNORE INTO dataset_seen "
+            "(ds_id, evaluated_ts, license, downloads, quality_score, schema_branch, cap, slug, verdict, role_tag) "
+            "VALUES (?,?,?,?,?,?,?,?,?,?)",
+            (ds_id, int(time.time()), lic, dl, score, schema, cap, slug, verdict, role_tag)
        )
 
 
@@ -272,8 +277,13 @@ def discover_cycle() -> dict:
     new_queued = 0
     new_rejected = 0
     seen_this_cycle = 0
-    for q in QUERIES:
-        url = f"https://huggingface.co/api/datasets?search={urllib.parse.quote(q)}&limit=50&sort=downloads&direction=-1"
+    role_finds: dict[str, int] = {}
+
+    queries = get_queries()
+    log(f"  loaded {len(queries)} role-driven queries (covering {len(set(r for _, r in queries))} role tags)")
+
+    for q, role_tag in queries:
+        url = f"https://huggingface.co/api/datasets?search={urllib.parse.quote(q)}&limit=30&sort=downloads&direction=-1"
         results = hf_get(url, timeout=15) or []
         for ds in results:
            ds_id = ds.get("id", "")
@@ -287,19 +297,35 @@ def discover_cycle() -> dict:
                   score=entry.get("score", 0.0) if entry else 0.0,
                   schema=entry.get("schema", "") if entry else "",
                   cap=entry.get("cap", 0) if entry else 0,
-                  slug=entry.get("slug", "") if entry else "")
+                  slug=entry.get("slug", "") if entry else "",
+                  role_tag=role_tag)
            if verdict == "integrated":
+                # Tag the entry with role for downstream training-mix balance
+                if entry: entry["role_tag"] = role_tag
                append_dynamic(entry)
                new_integrated += 1
-                log(f"  ✅ {ds_id} | {entry['license']} | {entry['schema']} | cap={entry['cap']:,} | score={entry['score']}")
+                role_finds[role_tag] = role_finds.get(role_tag, 0) + 1
+                log(f"  ✅ [{role_tag}] {ds_id} | {entry['license']} | {entry['schema']} | cap={entry['cap']:,}")
            elif verdict.startswith("queued"):
                new_queued += 1
            else:
                new_rejected += 1
-        time.sleep(0.5)  # gentle on HF API
+        time.sleep(0.4)  # gentle on HF API
+
+        # Update query history for this query
+        try:
+            with sqlite3.connect(DB) as c:
+                c.execute(
+                    "INSERT OR REPLACE INTO query_history (query, role_tag, last_run_ts, results_count, new_finds) "
+                    "VALUES (?,?,?,?, COALESCE((SELECT new_finds FROM query_history WHERE query=?),0) + ?)",
+                    (q, role_tag, int(time.time()), len(results), q, new_integrated)
+                )
+        except Exception:
+            pass
 
     return {"evaluated": seen_this_cycle, "integrated": new_integrated,
-            "queued": new_queued, "rejected": new_rejected}
+            "queued": new_queued, "rejected": new_rejected,
+            "by_role": role_finds}
 
 
 def main():