Spaces:
Sleeping
Sleeping
import random
import time

import pandas as pd
from googlesearch import search
INPUT_CSV = "search_progress.csv"
OUTPUT_CSV = "search_progress1.csv"
RESULTS_PER_QUERY = 3


def select_unfinished(frame):
    """Return a copy of the rows of *frame* that still need a URL.

    A row is unfinished when any of its cells is missing, or when its
    ``url`` value contains neither "amazon" nor "google".

    Args:
        frame: DataFrame with at least a ``url`` column.

    Returns:
        An independent copy of the matching rows, keeping their original
        index so the result can be fed back through ``DataFrame.update``.
    """
    has_missing = frame.isnull().any(axis=1)
    # Cast first: an all-empty column is parsed by read_csv as float64, and
    # the .str accessor raises AttributeError on non-string dtypes.
    url_text = frame["url"].astype(str)
    has_known_host = url_text.str.contains("amazon|google", na=False)
    # .copy() so that assigning a column later does not write into a slice
    # of the caller's frame.
    return frame[has_missing | ~has_known_host].copy()


def parse_choice(raw, count):
    """Convert a 1-based menu answer into a 0-based list index.

    Args:
        raw: Text typed by the user.
        count: Number of entries that were offered.

    Returns:
        The zero-based index of the chosen entry.

    Raises:
        ValueError: If *raw* is not an integer or lies outside 1..count.
            Without the range check, "0" or a negative number would wrap
            around through negative indexing and pick the wrong entry.
    """
    choice = int(raw)
    if not 1 <= choice <= count:
        raise ValueError(f"choice {choice} is outside 1-{count}")
    return choice - 1


def find_candidates(title):
    """Search for *title* and for its "google" -> "amazon" variant.

    Returns:
        The results of the first query followed by those of the second.
    """
    candidates = []
    for query in (title, title.replace("google", "amazon")):
        candidates.extend(
            search(query, num_results=RESULTS_PER_QUERY, lang="en")
        )
    return candidates


def main():
    """Interactively fill in missing URLs and write the updated CSV."""
    progress = pd.read_csv(INPUT_CSV).drop("query_index", axis=1)
    print("Initial DataFrame:")
    print(progress.head())
    progress.columns = ["title", "url"]
    # Keep the column as object so string URLs can be written back even when
    # the input file had no URL at all (which would make it float64).
    progress["url"] = progress["url"].astype(object)

    pending = select_unfinished(progress)
    titles = pending["title"].tolist()
    chosen_urls = [None] * len(titles)

    for position, title in enumerate(titles):
        print()
        print(f"Processing title {position + 1}/{len(titles)}: {title}")
        if not isinstance(title, str):
            # A missing title cannot be searched for; leave its URL empty.
            print(f"Skipping row without a usable title: {title!r}")
            continue
        try:
            candidates = find_candidates(title)
            print("\n")
            print(f"Searching for: {title}")
            for number, candidate in enumerate(candidates, start=1):
                print(number, candidate)
            if candidates:
                answer = input(
                    "Enter the index of the correct URL "
                    f"(1-{len(candidates)}): "
                )
                picked = parse_choice(answer, len(candidates))
                chosen_urls[position] = candidates[picked]
            else:
                print("No search results found.")
        except Exception as error:
            # Deliberate best-effort boundary: one failed lookup or invalid
            # answer must not abort the remaining titles.
            print(f"Error occurred while searching for {title}: {error}")
        # Sleep to avoid hitting the search API too quickly.
        time.sleep(random.randint(1, 5))

    pending["url"] = chosen_urls
    print("Updated DataFrame with URLs:")
    print(pending.head())
    # update() only copies non-missing values, so rows that were skipped or
    # failed keep whatever URL they had before.
    progress.update(pending)
    progress.to_csv(OUTPUT_CSV, index=False)


if __name__ == "__main__":
    main()