import os
import time
from datetime import datetime, timedelta
from typing import Iterator, Tuple

import requests
from dotenv import load_dotenv
from tqdm import tqdm

# Load variables (such as GITHUB_PAT) from a local .env file into the environment.
load_dotenv()

# GitHub repository search endpoint used for every query.
url = "https://api.github.com/search/repositories"

# Authorization header sent with each request; a missing GITHUB_PAT
# environment variable raises KeyError right here, at import time.
headers = {"Authorization": f"token {os.environ['GITHUB_PAT']}"}

# Seconds a single HTTP request may take before it is treated as timed out.
timeout_duration = 10

# Repository URLs are appended to this file, one per line.
output_file = "repositories.txt"

# Stores the end date of the most recently processed date range so that a
# later run can resume from it.
last_date_file = "last_date.txt"


def fetch_repositories_by_date_range(start_date, end_date):
    """Append the URLs of repositories created in a date range to ``output_file``.

    Runs the repository search
    ``language:Python size:5..5000 stars:>=10 created:{start_date}..{end_date}``
    sorted by stars, walks the result pages 100 items at a time, and appends
    each item's ``html_url`` to ``output_file`` as one line.

    Paging stops when a page holds fewer than 100 items, when the search
    API's 1000-result ceiling is reached, or when the server answers with an
    unexpected status (which is reported and ends the range early). Timeouts
    are retried after 5 seconds; rate-limit responses are retried after the
    delay the server asks for.

    Args:
        start_date: Lower bound of the creation date, as a ``YYYY-MM-DD`` string.
        end_date: Upper bound of the creation date, as a ``YYYY-MM-DD`` string.
    """
    per_page = 100
    # The search API serves at most this many results for one query, so
    # asking for a page beyond it only produces an error response.
    max_results = 1000
    params = {
        "q": f"language:Python size:5..5000 stars:>=10 created:{start_date}..{end_date}",
        "per_page": per_page,
        "sort": "stars",
    }
    page = 1

    with open(output_file, "a", encoding="utf-8") as file, tqdm(
        desc=f"Fetching {start_date} to {end_date}", unit="page"
    ) as pbar:
        while True:
            params["page"] = page
            # Keep the try body down to the one call that can time out.
            try:
                response = requests.get(
                    url, headers=headers, params=params, timeout=timeout_duration
                )
            except requests.exceptions.Timeout:
                # tqdm.write keeps messages from garbling the active bars.
                tqdm.write("Request timed out. Retrying...")
                time.sleep(5)
                continue

            if _is_rate_limited(response):
                wait_time = _rate_limit_delay(response)
                tqdm.write(f"Rate limit exceeded. Waiting for {wait_time} seconds...")
                time.sleep(wait_time)
                continue  # retry the same page

            if response.status_code != 200:
                # Use the raw text: an error body is not guaranteed to be JSON.
                tqdm.write(f"Error: {response.status_code} {response.text}")
                break

            repositories = response.json().get("items", [])
            file.writelines(f"{repo['html_url']}\n" for repo in repositories)
            pbar.update(1)

            if len(repositories) < per_page:
                break  # a short page is the last page
            if page * per_page >= max_results:
                tqdm.write(
                    f"Reached the {max_results}-result search limit for "
                    f"{start_date}..{end_date}; narrow the date range to "
                    "retrieve the remaining repositories."
                )
                break

            page += 1
            time.sleep(1)  # stay polite between consecutive pages


def _is_rate_limited(response):
    """Return True when the response signals a primary or secondary rate limit."""
    if response.status_code == 429:
        return True
    if response.status_code == 403:
        # A plain 403 (e.g. bad credentials) is not a rate limit; only treat
        # it as one when the rate-limit headers say so.
        return (
            response.headers.get("x-ratelimit-remaining") == "0"
            or "retry-after" in response.headers
        )
    return False


def _rate_limit_delay(response):
    """Return the number of seconds to wait before retrying a rate-limited request."""
    retry_after = response.headers.get("retry-after", "")
    if retry_after.isdigit():
        return max(int(retry_after), 1)
    reset_time = response.headers.get("x-ratelimit-reset", "")
    if reset_time.isdigit():
        # One extra second avoids retrying at the exact reset instant.
        return max(int(reset_time) - int(time.time()), 0) + 1
    # No usable header: wait a full minute rather than retrying in a tight loop.
    return 60


def generate_date_ranges(
    start_year: int = 2015, window_days: int = 30
) -> Iterator[Tuple[str, str]]:
    """Yield consecutive, non-overlapping ``(start, end)`` date windows up to today.

    Windows begin on 1 January of ``start_year``. If ``last_date_file``
    exists and contains a ``YYYY-MM-DD`` date, they begin on that date
    instead, so the saved day is queried again on resume.

    Each window spans ``window_days`` calendar days with both bounds meant
    to be used inclusively: it ends on the day before the next window starts,
    so no date belongs to two windows. The last window is clipped to today.

    Args:
        start_year: Year whose 1 January is the default starting point.
        window_days: Number of days covered by each window; must be >= 1.

    Yields:
        ``(start, end)`` tuples of ``YYYY-MM-DD`` strings.

    Raises:
        ValueError: If ``window_days`` is smaller than 1, or if the saved
            date cannot be parsed as ``YYYY-MM-DD``.
    """
    if window_days < 1:
        raise ValueError("window_days must be at least 1")

    now = datetime.now()
    window_start = datetime(start_year, 1, 1)

    if os.path.exists(last_date_file):
        with open(last_date_file, "r", encoding="utf-8") as f:
            saved_date = f.read().strip()
        if saved_date:
            window_start = datetime.strptime(saved_date, "%Y-%m-%d")

    step = timedelta(days=window_days)
    one_day = timedelta(days=1)

    while window_start < now:
        # End one day before the next window starts so boundary dates are
        # not covered twice.
        window_end = min(window_start + step - one_day, now)
        yield window_start.strftime("%Y-%m-%d"), window_end.strftime("%Y-%m-%d")
        window_start += step


def main():
    """Fetch every pending date range and checkpoint progress after each one.

    The ranges are materialized up front so the outer progress bar knows its
    total. After each range has been processed, its end date is written to
    ``last_date_file`` so a later run can resume from there.
    """
    date_ranges = list(generate_date_ranges())
    with tqdm(total=len(date_ranges), desc="Total Date Ranges") as date_pbar:
        for start_date, end_date in date_ranges:
            # tqdm.write prints above the active bars instead of through them.
            tqdm.write(
                f"Fetching repositories created between {start_date} and {end_date}"
            )
            fetch_repositories_by_date_range(start_date, end_date)

            # NOTE(review): the fetcher returns nothing, so this checkpoint is
            # written even if the range stopped early on an error.
            with open(last_date_file, "w", encoding="utf-8") as f:
                f.write(end_date)

            date_pbar.update(1)


# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|