File size: 3,050 Bytes
2b6a536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import sys
import time
from pathlib import Path

import requests
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

# Add parent directory to path to import models
sys.path.insert(0, str(Path(__file__).parent.parent))
from models import Discussion, QuestionAndSolution

BASE_URL = "https://discuss.huggingface.co"


# Retry policy for GET requests: up to 5 attempts, with exponential backoff
# between 1 and 60 seconds, whenever the call raises requests.HTTPError.
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=1, max=60),
    retry=retry_if_exception_type(requests.HTTPError),
)
def safe_get(url, **kwargs):
    """Perform a GET request, pausing on rate-limit responses before retrying.

    Args:
        url: Address to fetch.
        **kwargs: Passed through unchanged to ``requests.get`` (for example
            ``params``, ``headers`` or ``timeout``).

    Returns:
        The ``requests.Response`` of the first attempt that did not produce
        an HTTP error status.

    Raises:
        tenacity.RetryError: When every attempt ended in
            ``requests.HTTPError``.
    """
    # 429 (Too Many Requests) is the rate-limit status that the log message
    # and the Retry-After handling are about. 422 is still handled because the
    # previous revision paused on it, so existing behavior is kept.
    throttled_statuses = (429, 422)
    fallback_delay = 30

    resp = requests.get(url, **kwargs)
    if resp.status_code in throttled_statuses:
        # Honor the server-provided delay when it is a number of seconds.
        retry_after = resp.headers.get("Retry-After")
        if retry_after:
            try:
                delay = max(float(retry_after), 0.0)
            except ValueError:
                # Retry-After may also be an HTTP-date; use the fallback
                # instead of crashing with an exception that is not retried.
                delay = fallback_delay
        else:
            # No header: fall back to a guess.
            delay = fallback_delay
        print(f"{resp.status_code} hit — waiting {delay} seconds...")
        time.sleep(delay)
    # Raises requests.HTTPError for any 4xx/5xx status, which triggers the
    # retry policy above.
    resp.raise_for_status()
    return resp


def get_solved_discussions(n_posts: int = 50):
    """Collect the most recent solved discussions from the forum search.

    Pages through ``search.json`` with the ``status:solved order:latest``
    query until ``n_posts`` discussions were gathered or a page comes back
    without topics.

    Args:
        n_posts: Maximum number of discussions to return.

    Returns:
        A list of ``Discussion`` objects, at most ``n_posts`` long.
    """
    discussions = []
    page = 1
    while len(discussions) < n_posts:
        url = f"{BASE_URL}/search.json?q=status:solved+order:latest&page={page}"
        payload = safe_get(url).json()
        # A page without results may carry an empty list or no "topics" key
        # at all; both mean there is nothing more to fetch.
        topics = payload.get("topics") or []
        if not topics:
            break
        # Take only as many topics as are still needed to reach n_posts.
        still_needed = n_posts - len(discussions)
        discussions.extend(
            Discussion(
                title=post["fancy_title"],
                url=f"{BASE_URL}/t/{post['slug']}/{post['id']}",
                topic_id=post["id"],
                category=post["category_id"],
                created_at=post["created_at"],
            )
            for post in topics[:still_needed]
        )
        page += 1
        time.sleep(0.5)  # simple pacing to avoid bursts
    return discussions


def get_qa_pair(discussions, start_idx: int = 0):
    """Yield a question/solution pair for each discussion.

    For every discussion from ``start_idx`` onwards, the topic JSON
    (``<discussion.url>.json``) is fetched. The first post of the stream is
    used as the question and the accepted answer as the solution.

    Args:
        discussions: Sequence of ``Discussion`` objects to process.
        start_idx: Index of the first discussion to process.

    Yields:
        One ``QuestionAndSolution`` per discussion; its ``thread`` field holds
        all posts returned in the topic's post stream.

    Raises:
        KeyError: If the topic JSON has no ``accepted_answer`` or
            ``post_stream`` entry.
    """
    for discussion in discussions[start_idx:]:
        data = safe_get(discussion.url + ".json").json()
        posts = data["post_stream"]["posts"]
        accepted_post_number = data["accepted_answer"]["post_number"]

        # post_number is an identifier, not a list position: gaps in the
        # numbering or a partially loaded stream shift the positions, so
        # match on the post's own number first.
        solution_post = next(
            (
                post
                for post in posts
                if post.get("post_number") == accepted_post_number
            ),
            None,
        )
        if solution_post is None:
            # Accepted post is not in the returned stream: keep the previous
            # positional lookup, clamped to the available posts.
            fallback_idx = min(max(accepted_post_number - 1, 0), len(posts) - 1)
            solution_post = posts[fallback_idx]

        yield QuestionAndSolution(
            discussion_title=discussion.title,
            discussion_url=discussion.url,
            discussion_topic_id=discussion.topic_id,
            discussion_category=discussion.category,
            discussion_created_at=discussion.created_at,
            question=posts[0]["cooked"],
            solution=solution_post["cooked"],
            thread=posts,
        )
        time.sleep(0.5)  # pace requests between topics


if __name__ == "__main__":
    # Fetch up to 300 solved discussions, then append one JSON line per
    # question/solution pair to qa_pairs.jsonl.
    discussions = get_solved_discussions(n_posts=300)
    print(f"Fetched {len(discussions)} discussions")
    # Explicit UTF-8 so the output does not depend on the platform's default
    # text encoding.
    with open("qa_pairs.jsonl", "a", encoding="utf-8") as f:
        for qa_pair in get_qa_pair(discussions):
            f.write(qa_pair.model_dump_json() + "\n")