Spaces:
Sleeping
Sleeping
File size: 1,459 Bytes
d38101e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import pandas as pd
from googlesearch import search
import time
import random
# Interactive repair pass over "search_progress.csv".
#
# Rows whose URL is missing (or mentions neither "amazon" nor "google") are
# searched again; the operator picks the correct hit from the printed
# candidates, and the merged table is written to "search_progress1.csv".
df = pd.read_csv("search_progress.csv")
df1 = df.drop("query_index", axis=1)
print("Initial DataFrame:")
print(df1.head())
# Positional rename: this requires exactly two columns to remain after the
# drop, otherwise pandas raises a length-mismatch error.
df1.columns = ["title", "url"]

# A row still needs work when any of its cells is missing, or when its URL
# contains neither "amazon" nor "google".
has_known_site = (
    df1["url"].str.contains("amazon", na=False)
    | df1["url"].str.contains("google", na=False)
)
# .copy() gives an independent frame, so assigning the "url" column further
# down is not a chained assignment on a slice of df1.
unfinished = df1[df1.isnull().any(axis=1) | ~has_known_site].copy()
unfinished_list = unfinished["title"].tolist()
unfinished_urls = [None] * len(unfinished_list)

for idx, title in enumerate(unfinished_list):
    print()
    print(f"Processing title {idx + 1}/{len(unfinished_list)}: {title}")
    try:
        results1 = search(title, num_results=3, lang="en")
        results2 = search(title.replace("google", "amazon"), num_results=3, lang="en")
        # Candidates from both queries are offered in a single numbered list.
        url = list(results1) + list(results2)
        print("\n")
        print(f"Searching for: {title}")
        if not url:
            # Nothing to choose from: skip the prompt and record the failure.
            raise LookupError("no search results returned")
        for count, candidate in enumerate(url, start=1):
            print(count, candidate)
        # The prompt reports the real number of candidates (two searches are
        # combined, so there can be more than three).
        index = int(input(f"Enter the index of the correct URL (1-{len(url)}): ")) - 1
        # Reject 0 and negative numbers explicitly; a negative index would
        # otherwise wrap around and silently select from the end of the list.
        if not 0 <= index < len(url):
            raise IndexError(f"choice {index + 1} is outside 1-{len(url)}")
        unfinished_urls[idx] = url[index]
    except Exception as e:
        # Best effort: any failure (search error, non-numeric or out-of-range
        # answer) leaves this title without a URL and moves on to the next.
        print(f"Error occurred while searching for {title}: {e}")
        unfinished_urls[idx] = None
    time.sleep(random.randint(1, 5))  # Sleep to avoid hitting the search API too quickly

unfinished["url"] = unfinished_urls
print("Updated DataFrame with URLs:")
print(unfinished.head())
# DataFrame.update aligns on the index and only copies non-missing values, so
# titles that ended up with None keep whatever df1 already held for them.
df1.update(unfinished)
df1.to_csv("search_progress1.csv", index=False)
|