| import requests |
| import os |
| from urllib.parse import urlparse |
| from tqdm import tqdm |
| from tenacity import retry, stop_after_attempt, wait_exponential |
|
|
|
|
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
def download_file(url, session, download_folder, timeout=30):
    """Download ``url`` via ``session`` and save it into ``download_folder``.

    Retried up to 5 times with exponential backoff (tenacity) on any
    exception except an HTTP 404, which is reported and skipped.

    Args:
        url: URL to fetch. The path is assumed to have at least 4 segments
            (e.g. ``/user/repo/branch/file``) — TODO confirm against the
            URL list; shorter paths raise IndexError.
        session: a ``requests.Session`` (reused for connection pooling).
        download_folder: existing directory to write the file into.
        timeout: per-request timeout in seconds; without one a stalled
            connection would hang the whole run forever.

    Returns:
        A human-readable status string ("Downloaded ..." or "404 Not Found ...").

    Raises:
        requests.exceptions.HTTPError: for non-404 HTTP failures (after retries).
    """
    try:
        # Timeout prevents a single dead connection from blocking the run.
        response = session.get(url, timeout=timeout)
        response.raise_for_status()

        # Build a local name from URL path segments:
        # /user/repo/branch/path... -> user_repo_path.py
        parsed_url = urlparse(url).path.strip("/").split("/")
        file_name = f"{parsed_url[0]}_{parsed_url[1]}_{parsed_url[3]}.py"

        # De-duplicate: append _1, _2, ... until the name is unused.
        # splitext is loop-invariant, so compute it once up front.
        base, ext = os.path.splitext(file_name)
        counter = 1
        while os.path.exists(os.path.join(download_folder, file_name)):
            file_name = f"{base}_{counter}{ext}"
            counter += 1

        with open(os.path.join(download_folder, file_name), "wb") as file:
            file.write(response.content)
        return f"Downloaded {file_name}"
    except requests.exceptions.HTTPError as e:
        # A 404 is permanent — retrying it 5 times would be pointless,
        # so report it as a result instead of re-raising.
        if e.response.status_code == 404:
            return f"404 Not Found: {url}"
        else:
            raise
|
|
|
def download_files_linearly(urls, download_folder, start_line=0):
    """Download ``urls[start_line:]`` one at a time into ``download_folder``.

    Args:
        urls: full list of URLs (the slice start only affects where we resume;
            ``total=len(urls)`` keeps the progress bar scaled to the full list).
        download_folder: target directory, created if missing.
        start_line: index of the first URL to download (resume point).
    """
    os.makedirs(download_folder, exist_ok=True)

    # One Session for the whole run so HTTP connections are reused.
    with requests.Session() as session:
        # No index is needed here — the original enumerate() produced an
        # unused counter, so iterate the URLs directly.
        for url in tqdm(urls[start_line:], initial=start_line, total=len(urls)):
            result = download_file(url, session, download_folder)
            print(result)
|
|
|
|
def read_urls_from_file(file_name):
    """Return every line of ``file_name`` as a stripped string, in order.

    Blank lines are kept (as empty strings), matching the original behavior.
    UTF-8 is used explicitly so the result does not depend on the platform's
    default encoding.
    """
    with open(file_name, "r", encoding="utf-8") as file:
        # Iterate the file object directly — readlines() would needlessly
        # materialize an intermediate list before the comprehension.
        return [line.strip() for line in file]
|
|
|
|
| |
| urls_file = "python_files.txt" |
|
|
| |
| download_folder = "downloaded_files" |
|
|
| |
| urls = read_urls_from_file(urls_file) |
|
|
| |
| start_line = 123791 |
|
|
| |
| download_files_linearly(urls, download_folder, start_line) |
|
|