OSINT1

Paused

OSINT1 / scripts /build_fixed_levels_dataset.py

siddeshwar-kagatikar

Deploy clean snapshot to Hugging Face Space.

db4fa53 13 days ago

7.05 kB

	from __future__ import annotations

	import argparse
	import json
	from collections import Counter
	from dataclasses import asdict
	from pathlib import Path
	from typing import Any

	from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
	from osint_env.data.generator import DatasetGenerator
	from osint_env.domain.models import Edge, TaskInstance
	from osint_env.llm import build_llm_client


	def edge_to_dict(edge: Edge) -> dict[str, Any]:
	    """Serialize a graph edge into a JSON-friendly mapping.

	    The ``src``, ``rel`` and ``dst`` attributes are copied through unchanged;
	    ``confidence`` is coerced to ``float``.
	    """
	    payload: dict[str, Any] = {name: getattr(edge, name) for name in ("src", "rel", "dst")}
	    payload["confidence"] = float(edge.confidence)
	    return payload


	def task_to_dict(task: TaskInstance) -> dict[str, Any]:
	    """Serialize a task into a JSON-friendly mapping.

	    Scalar fields are copied through, each supporting edge is converted with
	    ``edge_to_dict``, and ``metadata`` is shallow-copied so the result does
	    not alias the task's own mapping.
	    """
	    record: dict[str, Any] = {
	        "task_id": task.task_id,
	        "task_type": task.task_type,
	        "question": task.question,
	        "answer": task.answer,
	    }
	    serialized_edges = []
	    for supporting_edge in task.supporting_edges:
	        serialized_edges.append(edge_to_dict(supporting_edge))
	    record["supporting_edges"] = serialized_edges
	    record["metadata"] = dict(task.metadata)
	    return record


	def build_fixed_snapshot(seed_path: Path) -> dict[str, Any]:
	    """Build the snapshot of the seeded graph and questions from a seed file.

	    Args:
	        seed_path: Path to the seeding JSON; it is passed to
	            ``load_seeding_config``.

	    Returns:
	        A mapping with ``dataset_name``, ``source_seed``, a ``graph`` section
	        (serialized nodes and edges plus their counts), the serialized
	        ``questions`` with ``question_count``, and ``difficulty_counts``
	        tallied from each question's ``metadata["difficulty"]``
	        (``"unknown"`` when the key is absent).
	    """
	    seeding = load_seeding_config(seed_path)

	    fixed_nodes = [
	        {
	            "node_id": node.node_id,
	            # Take ``.value`` when node_type has one (e.g. an enum member);
	            # otherwise fall back to node_type itself.
	            "node_type": str(getattr(node.node_type, "value", node.node_type)),
	            "attrs": dict(node.attrs),
	        }
	        for node in seeding.seeded_nodes
	    ]

	    # Edges are serialized through the shared helper so this snapshot and the
	    # complete snapshot cannot drift apart in edge format.
	    fixed_edges = [edge_to_dict(edge) for edge in seeding.seeded_edges]

	    fixed_questions = [
	        {
	            # Task ids follow the question's position, zero-padded to two digits.
	            "task_id": f"fixed_task_{idx:02d}",
	            "task_type": q.task_type,
	            "question": q.question,
	            "answer": q.answer,
	            "supporting_edges": [edge_to_dict(edge) for edge in q.supporting_edges],
	            "metadata": dict(q.metadata),
	        }
	        for idx, q in enumerate(seeding.seeded_questions)
	    ]

	    difficulty_counts = Counter(
	        str(question["metadata"].get("difficulty", "unknown")) for question in fixed_questions
	    )

	    return {
	        "dataset_name": "fixed_levels_submission_set",
	        "source_seed": str(seed_path),
	        "graph": {
	            "nodes": fixed_nodes,
	            "edges": fixed_edges,
	            "node_count": len(fixed_nodes),
	            "edge_count": len(fixed_edges),
	        },
	        "questions": fixed_questions,
	        "question_count": len(fixed_questions),
	        "difficulty_counts": dict(difficulty_counts),
	    }


	def build_complete_snapshot(
	    shared_config_path: Path,
	    seed_path: Path,
	    *,
	    min_task_count: int = 15,
	) -> dict[str, Any]:
	    """Build the generator-expanded snapshot: graph, platform views and tasks.

	    Args:
	        shared_config_path: Path passed to ``load_shared_config``; its
	            ``environment`` section is cloned and used as the generator config.
	        seed_path: Path passed to ``load_seeding_config``; the result replaces
	            the cloned config's ``seeding``.
	        min_task_count: Lower bound on the number of tasks requested from the
	            generator. The requested count is the larger of this value and the
	            number of seeded questions.

	    Returns:
	        A mapping describing the run (config paths, LLM settings, environment
	        parameters) together with the canonical graph, the platform views and
	        their counts, the serialized tasks, ``task_count`` and
	        ``difficulty_counts``.
	    """
	    shared = load_shared_config(shared_config_path)
	    env_cfg = clone_environment_config(shared.environment)
	    env_cfg.seeding = load_seeding_config(seed_path)

	    llm_client = build_llm_client(env_cfg.llm)
	    generator = DatasetGenerator(config=env_cfg, llm=llm_client)

	    graph = generator.build_canonical_graph()
	    views = generator.build_platform_views(graph)
	    requested_tasks = max(min_task_count, len(env_cfg.seeding.seeded_questions))
	    tasks = generator.generate_tasks(graph, views, count=requested_tasks)

	    difficulty_counts = Counter(str(task.metadata.get("difficulty", "unknown")) for task in tasks)

	    return {
	        "dataset_name": "fixed_levels_submission_set",
	        "generation_mode": "llm_expanded",
	        "shared_config": str(shared_config_path),
	        "seed_file": str(seed_path),
	        # NOTE(review): asdict() copies every field of the LLM config into the
	        # written artifact - confirm that config carries no credentials.
	        "llm": asdict(env_cfg.llm),
	        "environment": {
	            "n_users": env_cfg.n_users,
	            "alias_density": env_cfg.alias_density,
	            "noise_level": env_cfg.noise_level,
	            "red_herring_rate": env_cfg.red_herring_rate,
	            "seed": env_cfg.seed,
	        },
	        "canonical_graph": {
	            "node_count": len(graph.nodes),
	            "edge_count": len(graph.edges),
	            # Nodes are emitted in node_id order.
	            "nodes": [
	                {
	                    "node_id": node.node_id,
	                    # Same tolerance as build_fixed_snapshot: use ``.value``
	                    # when present, otherwise node_type itself, so a node
	                    # whose type is a plain string does not raise.
	                    "node_type": getattr(node.node_type, "value", node.node_type),
	                    "attrs": dict(node.attrs),
	                }
	                for node in sorted(graph.nodes.values(), key=lambda n: n.node_id)
	            ],
	            "edges": [edge_to_dict(edge) for edge in graph.edges],
	        },
	        "platform_views": {
	            "microblog_posts": views.microblog_posts,
	            "forum_threads": views.forum_threads,
	            "profiles": views.profiles,
	            "counts": {
	                "microblog_posts": len(views.microblog_posts),
	                "forum_threads": len(views.forum_threads),
	                "profiles": len(views.profiles),
	            },
	        },
	        "tasks": [task_to_dict(task) for task in tasks],
	        "task_count": len(tasks),
	        "difficulty_counts": dict(difficulty_counts),
	    }


	def main() -> None:
	    """Command-line entry point: build both dataset artifacts and print a summary.

	    Parses ``--seed-file``, ``--shared-config`` and ``--output-dir``, writes
	    ``fixed_graph_questions.json`` and ``complete_dataset_qwen_generated.json``
	    into the output directory (created if missing), then prints a JSON summary
	    of the written paths and counts to stdout.
	    """
	    cli = argparse.ArgumentParser(description="Build fixed difficulty dataset artifacts.")
	    option_table = (
	        (
	            "--seed-file",
	            "datasets/fixed_levels/seed_fixed_levels.json",
	            "Path to seeding JSON with fixed graph/questions.",
	        ),
	        (
	            "--shared-config",
	            "datasets/fixed_levels/shared_config_fixed_levels.json",
	            "Path to shared config used for LLM-expanded generation.",
	        ),
	        (
	            "--output-dir",
	            "datasets/fixed_levels",
	            "Directory where dataset artifacts are written.",
	        ),
	    )
	    for flag, default_value, help_text in option_table:
	        cli.add_argument(flag, default=default_value, help=help_text)
	    opts = cli.parse_args()

	    target_dir = Path(opts.output_dir)
	    target_dir.mkdir(parents=True, exist_ok=True)

	    def write_json(destination: Path, payload: dict[str, Any]) -> None:
	        # Both artifacts share one on-disk format: indented, key-sorted, UTF-8.
	        destination.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")

	    seed_file = Path(opts.seed_file)
	    shared_config = Path(opts.shared_config)

	    # The fixed snapshot is written before the complete one is built, so it is
	    # already on disk if the later build raises.
	    fixed = build_fixed_snapshot(seed_file)
	    fixed_path = target_dir / "fixed_graph_questions.json"
	    write_json(fixed_path, fixed)

	    complete = build_complete_snapshot(shared_config, seed_file)
	    complete_path = target_dir / "complete_dataset_qwen_generated.json"
	    write_json(complete_path, complete)

	    fixed_graph = fixed["graph"]
	    complete_graph = complete["canonical_graph"]
	    summary = {
	        "fixed_dataset": str(fixed_path),
	        "complete_dataset": str(complete_path),
	        "fixed_nodes": fixed_graph["node_count"],
	        "fixed_edges": fixed_graph["edge_count"],
	        "fixed_questions": fixed["question_count"],
	        "complete_nodes": complete_graph["node_count"],
	        "complete_edges": complete_graph["edge_count"],
	        "complete_tasks": complete["task_count"],
	        "difficulty_counts": complete["difficulty_counts"],
	    }
	    print(json.dumps(summary, indent=2, sort_keys=True))


	# Run the builder only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()