# 🔧 Install dependencies (uncomment if running locally)
# !pip install gradio pandas sentence-transformers

import os
import zipfile

import requests
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer, util

### STEP 1: Download and unzip the influencer dataset from Hugging Face

# Replace this with your actual dataset ZIP URL
url = "https://huggingface.co/datasets/your-username/influencer-dataset-merged/resolve/main/top_100_influencers.zip"
zip_path = "top_100_influencers.zip"

# Download zip file if not already present
if not os.path.exists(zip_path):
    print("📥 Downloading influencer dataset...")
    r = requests.get(url, timeout=60)
    # Fail fast on HTTP errors — otherwise a 404/HTML error page would be
    # silently written to disk as a "zip" and crash later at extraction time.
    r.raise_for_status()
    with open(zip_path, "wb") as f:
        f.write(r.content)

# Unzip the file into a folder
unzip_dir = "influencer_data"
if not os.path.exists(unzip_dir):
    print("📦 Unzipping dataset...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(unzip_dir)

### STEP 2: Merge all CSVs into one
print("🔗 Merging influencer files...")
all_dfs = []
# NOTE: loop variable renamed from `df` so it no longer shadows the
# combined DataFrame name used below and by the recommender functions.
for fname in os.listdir(unzip_dir):
    if fname.endswith(".csv"):
        part = pd.read_csv(os.path.join(unzip_dir, fname))
        part["Source File"] = fname  # Optional: keep track of file origin
        all_dfs.append(part)

# pd.concat([]) raises an opaque ValueError; give a clear diagnostic instead.
if not all_dfs:
    raise FileNotFoundError(f"No CSV files found in {unzip_dir!r}")

df = pd.concat(all_dfs, ignore_index=True)

# Basic cleanup: drop exact duplicates, require the two key columns,
# and blank out remaining NaNs so string concatenation below is safe.
df.drop_duplicates(inplace=True)
df.dropna(subset=["Name", "Niche"], inplace=True)
df.fillna("", inplace=True)

# Save combined dataset (optional)
df.to_csv("top_100_influencers_combined.csv", index=False)
print("✅ Combined dataset ready!")

### STEP 3: Build the recommender engine

# Combine fields into one text per influencer for semantic embedding.
df["profile_text"] = (
    df["Name"] + " - " + df["Platform"] + " - " + df["Niche"] + " - " + df["Country"]
)

# Load sentence embedding model
print("🧠 Loading embedding model...")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Precompute influencer embeddings once so each query only encodes itself.
print("🔢 Encoding influencer profiles...")
influencer_embeddings = model.encode(df["profile_text"].tolist(), convert_to_tensor=True)
### STEP 4: Define similarity search + UI

def recommend_influencers(brand_description, top_k=3):
    """Return the influencers whose profiles best match a brand description.

    Args:
        brand_description: Free-text description of the brand or campaign.
        top_k: Maximum number of matches to return (default 3; capped at the
            dataset size so small datasets cannot make ``topk`` raise).

    Returns:
        A list of dicts with keys Influencer, Platform, Niche, Country,
        Engagement Rate and Followers, ordered by descending cosine similarity.
    """
    query_embedding = model.encode(brand_description, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, influencer_embeddings)[0]

    # Cap k at the number of rows — topk(k) raises when k exceeds tensor size.
    k = min(top_k, len(df))
    top_indices = cosine_scores.topk(k).indices.tolist()

    recommendations = []
    for idx in top_indices:
        row = df.iloc[idx]
        recommendations.append({
            "Influencer": row["Name"],
            "Platform": row["Platform"],
            "Niche": row["Niche"],
            "Country": row["Country"],
            # These columns may be missing from some source CSVs; degrade gracefully.
            "Engagement Rate": row.get("Engagement Rate", "N/A"),
            "Followers": row.get("Followers", "N/A"),
        })
    return recommendations


def format_output(brand_input):
    """Render the top recommendations as a Markdown string for the Gradio UI."""
    recs = recommend_influencers(brand_input)
    output = ""
    for i, rec in enumerate(recs, 1):
        output += f"### {i}. {rec['Influencer']} ({rec['Platform']})\n"
        output += f"- Niche: {rec['Niche']}\n"
        output += f"- Country: {rec['Country']}\n"
        output += f"- Engagement Rate: {rec['Engagement Rate']}\n"
        output += f"- Followers: {rec['Followers']}\n\n"
    return output


demo = gr.Interface(
    fn=format_output,
    inputs=gr.Textbox(
        label="Enter your brand or campaign description",
        placeholder="e.g. Sustainable fashion for Gen Z",
    ),
    outputs=gr.Markdown(label="Top 3 Influencer Matches"),
    title="InfluMatch: Influencer Recommender",
    description="Describe your brand or campaign and get 3 matching influencer suggestions.",
    examples=[
        ["Tech gadgets for millennial men"],
        ["Skincare brand for Gen Z in the US"],
        ["Luxury travel experiences for couples"],
        ["Eco-friendly fashion accessories"],
    ],
)

if __name__ == "__main__":
    demo.launch()