import argparse
import csv
import json
import os
import time

import fitz
import numpy as np
import pandas as pd
import requests

from src.retrieval.html2lines import url2lines, line_correction

# Scraped pages can produce very long fields; raise the csv field size limit
# so oversized rows are not rejected.
csv.field_size_limit(100000000)

MAX_RETRIES = 3
TIMEOUT = 5

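# Note: fitz is the import name of PyMuPDF (installed as pymupdf) and is used
# below for PDF text extraction; url2lines and line_correction are this
# repository's own HTML-to-lines helpers.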
def scrape_text_from_url(url, temp_name):
    # Fetch the URL, retrying a few times on connection errors.
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            break  # success; stop retrying
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                time.sleep(3)  # brief backoff before the next attempt

    # Give up if every attempt failed or the server is unavailable.
    if response is None or response.status_code == 503:
        return []

    if url.endswith(".pdf"):
        # Save the PDF locally, then extract its text page by page.
        os.makedirs("pdf_dir", exist_ok=True)
        pdf_path = f"pdf_dir/{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        extracted_text = ""
        with fitz.open(pdf_path) as doc:
            for page in doc:
                extracted_text += page.get_text()

        return line_correction(extracted_text.split("\n"))

    # For HTML pages, delegate to the shared line-extraction helpers.
    return line_correction(url2lines(url))

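# Minimal usage sketch for scrape_text_from_url (the URL and temp name here
# are hypothetical, shown for illustration only):
#
#     lines = scrape_text_from_url("https://example.com/report.pdf", "claim_0")
#     if lines:
#         print(lines[0])
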
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scraping text from URLs.")
    parser.add_argument(
        "-i",
        "--tsv_input_file",
        type=str,
        help="The path of the input file containing URLs from Google search.",
    )
    parser.add_argument(
        "-o",
        "--json_output_dir",
        type=str,
        default="output",
        help="The output directory for the JSON file with the scraped data.",
    )
    parser.add_argument(
        "--overwrite_out_file",
        action="store_true",
        help="Overwrite an existing output file instead of resuming from it.",
    )

    args = parser.parse_args()

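    # Example invocation (script name and paths are illustrative):
    #   python scraper.py -i search_results/123.tsv -o output --overwrite_out_file
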
    assert (
        os.path.splitext(args.tsv_input_file)[-1] == ".tsv"
    ), "The input should be a tsv file."

    os.makedirs(args.json_output_dir, exist_ok=True)

    total_scraped, empty, total_failed = 0, 0, 0

    print(f"Processing file {args.tsv_input_file}")

    st = time.time()

    # The claim id is taken from the input file name; the output is a JSON
    # Lines file with one scraped URL per line.
    claim_id = os.path.splitext(os.path.basename(args.tsv_input_file))[0]
    json_output_path = os.path.join(args.json_output_dir, f"{claim_id}.json")

    # If the output already exists, either overwrite it or resume by skipping
    # the rows that were already written.
    lines_skipped = 0
    if os.path.exists(json_output_path):
        if args.overwrite_out_file:
            os.remove(json_output_path)
        else:
            with open(json_output_path, "r", encoding="utf-8") as json_file:
                existing_data = json_file.readlines()
                lines_skipped = len(existing_data)
                print(f"  Skipping {lines_skipped} lines in {json_output_path}")

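    # Expected TSV layout, inferred from the column accesses below: column 1
    # holds the search-result type, column 2 the URL, and column 3 the query
    # that produced it.
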
    # Load the TSV, falling back from pandas to NumPy to the csv module.
    try:
        df = pd.read_csv(args.tsv_input_file, sep="\t", header=None)
        data = df.values
        print("Data loaded successfully with Pandas.")
    except Exception as e:
        print("Error loading with Pandas:", e)
        try:
            data = np.genfromtxt(
                args.tsv_input_file, delimiter="\t", dtype=None, encoding=None
            )
            print("Data loaded successfully with NumPy.")
        except Exception as e:
            print("Error loading with NumPy:", e)
            try:
                data = []
                with open(args.tsv_input_file, "r", newline="") as tsvfile:
                    reader = csv.reader(tsvfile, delimiter="\t")
                    for row in reader:
                        data.append(row)
                print("Data loaded successfully with csv.")
            except Exception as e:
                print("Error loading with csv:", e)
                data = None

    if data is None:
        print("  Could not load the input file; nothing to process.")
    elif len(data) == lines_skipped:
        print("  No more lines need to be processed!")
    else:
        with open(json_output_path, "a", encoding="utf-8") as json_file:
            for index, row in enumerate(data):
                if index < lines_skipped:
                    continue
                url = row[2]
                json_data = {
                    "claim_id": claim_id,
                    "type": row[1],
                    "query": row[3],
                    "url": url,
                    "url2text": [],
                }
                print(f"Scraping text for url_{index}: {url}")
                try:
                    json_data["url2text"] = scrape_text_from_url(url, claim_id)

                    if len(json_data["url2text"]) > 0:
                        total_scraped += 1
                    else:
                        empty += 1
                except Exception as e:
                    total_failed += 1
                    print(f"  Failed to scrape {url}: {e}")

                # Write one JSON object per line and flush immediately so an
                # interrupted run can be resumed.
                json_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                json_file.flush()

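    # Each output line is a single JSON object; illustrative example:
    #   {"claim_id": "123", "type": "search result", "query": "...",
    #    "url": "https://example.com/page", "url2text": ["line 1", "line 2"]}
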
    print(f"Output for {args.tsv_input_file} saved to {json_output_path}")
    elapsed_time = time.time() - st
    elapsed_minutes = int(elapsed_time // 60)
    elapsed_seconds = int(elapsed_time % 60)
    print(f"Time elapsed: {elapsed_minutes}min {elapsed_seconds}sec")
    print(f"{total_scraped} scraped, {empty} empty, {total_failed} failed")