Spaces:

Siddeshwar1625
/

OSINT

Paused

File size: 2,436 Bytes

d814291

from pathlib import Path

from osint_env.data.generator import DatasetGenerator
from osint_env.domain.models import EnvironmentConfig


def _write_metaqa_fixture(root: Path) -> None:
    """Materialise a tiny MetaQA-style directory tree under ``root``.

    The layout written is:

    * ``root/kb.txt`` -- five ``subject|relation|object`` lines joined by
      newlines (no trailing newline).
    * ``root/<hop>/vanilla/qa_train.txt`` -- one tab-separated
      question/answer line for each of ``1-hop``, ``2-hop`` and ``3-hop``.
    * ``root/<hop>/qa_train_qtype.txt`` -- the question-type label for that
      hop's single question.

    Missing directories (including ``root`` and its parents) are created;
    existing ones are reused and files are overwritten.
    """
    kb_triples = (
        "Movie A|starred_actors|Actor X",
        "Movie B|starred_actors|Actor X",
        "Movie A|directed_by|Director D",
        "Movie C|directed_by|Director D",
        "Movie C|release_year|2002",
    )

    # One record per hop depth: (directory name, QA line, question-type line).
    hop_samples = (
        (
            "1-hop",
            "what movies did [Actor X] act in\tMovie A|Movie B\n",
            "actor_to_movie\n",
        ),
        (
            "2-hop",
            "which films share the director of [Movie A]\tMovie C\n",
            "movie_to_director_to_movie\n",
        ),
        (
            "3-hop",
            "which release year corresponds to films with same director as [Movie A]\t2002\n",
            "movie_to_director_to_movie_to_year\n",
        ),
    )

    root.mkdir(parents=True, exist_ok=True)
    root.joinpath("kb.txt").write_text("\n".join(kb_triples), encoding="utf-8")

    for hop_name, question_row, qtype_row in hop_samples:
        hop_dir = root / hop_name
        variant_dir = hop_dir / "vanilla"
        # Creating the variant directory also creates the hop directory that
        # holds the question-type file.
        variant_dir.mkdir(parents=True, exist_ok=True)
        variant_dir.joinpath("qa_train.txt").write_text(question_row, encoding="utf-8")
        hop_dir.joinpath("qa_train_qtype.txt").write_text(qtype_row, encoding="utf-8")


def test_metaqa_mode_builds_graph_and_hop_tasks(tmp_path: Path):
    """Run the generator in ``metaqa`` mode against the on-disk fixture.

    Writes the fixture under ``tmp_path / "metaQA"``, builds the canonical
    graph, the platform views and 24 tasks, then checks that:

    * the graph has at least five nodes and a ``directed_by`` edge,
    * the views contain a ``post_metaqa_``-prefixed microblog post and a
      profile whose ``user_id`` is ``"Actor X"``,
    * the tasks' ``hop`` / ``difficulty`` metadata cover exactly the three
      hop labels and the three difficulty labels,
    * every task has non-empty supporting edges.
    """
    dataset_dir = tmp_path / "metaQA"
    _write_metaqa_fixture(dataset_dir)

    generator = DatasetGenerator(
        EnvironmentConfig(
            seed=5,
            dataset_mode="metaqa",
            metaqa_root=str(dataset_dir),
            metaqa_variant="vanilla",
            metaqa_hops=["1-hop", "2-hop", "3-hop"],
            metaqa_splits=["train"],
        )
    )
    canonical = generator.build_canonical_graph()
    platform_views = generator.build_platform_views(canonical)
    generated = generator.generate_tasks(canonical, platform_views, count=24)

    # Graph-level expectations.
    assert len(canonical.nodes) >= 5
    assert any(link.rel == "directed_by" for link in canonical.edges)

    # Platform-view expectations.
    assert any(
        entry["post_id"].startswith("post_metaqa_")
        for entry in platform_views.microblog_posts
    )
    assert any(
        account["user_id"] == "Actor X" for account in platform_views.profiles
    )

    def distinct_metadata(key: str) -> set:
        # Stringified values of one metadata key across all generated tasks;
        # a missing key contributes the empty string.
        return {str(item.metadata.get(key, "")) for item in generated}

    observed_hops = distinct_metadata("hop")
    observed_difficulties = distinct_metadata("difficulty")

    # Task-level expectations.
    assert observed_hops == {"1-hop", "2-hop", "3-hop"}
    assert observed_difficulties == {"easy", "medium", "hard"}
    assert all(item.supporting_edges for item in generated)