# Captured from a Hugging Face Space page (Space status at capture time: Sleeping).
import html
import os
import re

import gradio as gr
import numpy as np
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
# Directory used for downloaded model files.
HF_CACHE_DIR = "/tmp/hf_cache"

# Ensure model caching.
# NOTE(review): these variables are assigned after the Hugging Face related
# imports have already run, so libraries that resolve their cache paths at
# import time may not see them. The cache folder is therefore also passed
# explicitly to the embeddings constructor below.
os.environ["HF_HOME"] = HF_CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR

# Initialize embeddings with caching (CPU only, normalized vectors).
print("Loading embeddings model...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder=HF_CACHE_DIR,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)
# Initialize ChromaDB: build the "books" collection on first run, otherwise
# reopen the persisted one.
CHROMA_DIR = "chroma_books"
DESCRIPTIONS_PATH = "tagged_description.txt"

print("Initializing ChromaDB...")
if not os.path.exists(CHROMA_DIR):
    print("Creating new ChromaDB from tagged_description.txt...")

    # Check for the source file explicitly instead of catching
    # FileNotFoundError around the loader.
    # NOTE(review): TextLoader appears to wrap open() failures in RuntimeError,
    # in which case an `except FileNotFoundError` around it never fires --
    # confirm against the installed langchain_community version.
    if not os.path.isfile(DESCRIPTIONS_PATH):
        print("ERROR: tagged_description.txt not found!")
        raise FileNotFoundError(DESCRIPTIONS_PATH)

    raw_docs = TextLoader(DESCRIPTIONS_PATH, encoding="utf-8").load()

    # Split on newlines with a zero chunk size so lines are not merged into
    # larger chunks.
    # NOTE(review): some langchain text-splitter releases may reject
    # chunk_size <= 0 -- confirm against the pinned version.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=0,
        chunk_overlap=0,
        length_function=len,
    )
    documents = text_splitter.split_documents(raw_docs)
    print(f"Loaded {len(documents)} documents")

    db_books = Chroma.from_documents(
        documents,
        embedding=embeddings,
        collection_name="books",
        persist_directory=CHROMA_DIR,
    )
    print("ChromaDB created successfully!")
else:
    print("Loading existing ChromaDB...")
    db_books = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
        collection_name="books",
    )
# Load books data.
print("Loading books data...")
try:
    # Only the read itself can raise FileNotFoundError, so keep the try minimal.
    books = pd.read_csv("final_book_df.csv")
except FileNotFoundError:
    print("ERROR: final_book_df.csv not found!")
    raise

# Larger cover image: append the size parameter to each thumbnail URL and fall
# back to a local placeholder where no thumbnail is present. The cast to str
# keeps the concatenation valid even when the column holds no strings at all;
# the resulting "nan..." values are discarded by the isna() mask.
books["large_thumbnail"] = np.where(
    books["thumbnail"].isna(),
    "cover-not-found.jpg",
    books["thumbnail"].astype(str) + "&fife=w800",
)

# Ensure the text columns used by literal search are string typed.
# Missing values become the literal string "nan".
for _text_column in ("authors", "categories", "title_and_subtitle"):
    books[_text_column] = books[_text_column].astype(str)

print(f"Loaded {len(books)} books")
def retrieve_semantic_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    initial_top_k: int = 50,
    final_top_k: int = 8,
) -> pd.DataFrame:
    """Retrieve semantic recommendations based on query, category, and tone.

    Runs a similarity search against ``db_books``, maps the hits back to rows
    of ``books`` through the leading integer of each hit's text, then applies
    the optional category filter and tone ordering.

    Args:
        query: Free-text description passed to the vector store.
        category: Exact value of the ``categories`` column to keep; ``None``,
            empty, or ``"All"`` disables the filter.
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad" to
            sort by the matching emotion column (descending); any other value
            keeps the similarity order.
        initial_top_k: Number of hits requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        At most ``final_top_k`` rows of ``books``.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # The first whitespace-separated token of each hit is parsed as the
    # book's isbn13. Hits that do not start with an integer are skipped
    # instead of aborting the whole search.
    ranked_isbns = []
    for rec in recs:
        tokens = rec.page_content.strip('"').split()
        if not tokens:
            continue
        try:
            ranked_isbns.append(int(tokens[0]))
        except ValueError:
            continue

    # Position of each ISBN in the search results (first occurrence wins).
    rank = {isbn: pos for pos, isbn in enumerate(dict.fromkeys(ranked_isbns))}

    # isin() alone would return rows in CSV order and throw away the
    # similarity ranking, so re-sort the matches by their search position.
    book_recs = books[books["isbn13"].isin(list(rank))]
    book_recs = book_recs.sort_values(
        by="isbn13",
        key=lambda column: column.map(rank),
        kind="mergesort",
    ).head(initial_top_k)

    # Filter by category.
    if category and category != "All":
        book_recs = book_recs[book_recs["categories"] == category]

    # Sort by emotional tone.
    tone_columns = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    sort_column = tone_columns.get(tone)
    if sort_column is not None:
        book_recs = book_recs.sort_values(by=sort_column, ascending=False)

    return book_recs.head(final_top_k)
def retrieve_literal_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    final_top_k: int = 8,
) -> pd.DataFrame:
    """Retrieve literal recommendations by case-insensitive substring matching.

    A book matches when any whitespace-separated word of ``query`` occurs
    anywhere in its ``title_and_subtitle`` or ``authors`` value.

    Args:
        query: Words to look for; ``None`` or blank yields an empty frame.
        category: Exact value of the ``categories`` column to keep; ``None``,
            empty, or ``"All"`` disables the filter.
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad" to
            sort by the matching emotion column (descending); any other value
            leaves the order unchanged.
        final_top_k: Maximum number of rows returned.

    Returns:
        At most ``final_top_k`` matching rows of ``books`` (a copy), or an
        empty ``DataFrame`` for a blank query.
    """
    if not query or not query.strip():
        return pd.DataFrame()

    # One alternation of escaped words: a row matches if ANY word occurs as a
    # substring. Escaping means user input can never form an invalid or
    # unintended regex, so no regex-error fallback is needed, and a per-word
    # retry could not find anything this alternation missed.
    query_words = query.lower().split()
    combined_pattern = "|".join(re.escape(word) for word in query_words)

    title_matches = books["title_and_subtitle"].str.contains(
        combined_pattern, case=False, na=False, regex=True
    )
    author_matches = books["authors"].str.contains(
        combined_pattern, case=False, na=False, regex=True
    )
    literal_recs = books[title_matches | author_matches].copy()

    # Filter by category.
    if category and category != "All":
        literal_recs = literal_recs[literal_recs["categories"] == category]

    # Sort by emotional tone.
    tone_columns = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    sort_column = tone_columns.get(tone)
    if sort_column is not None:
        literal_recs = literal_recs.sort_values(by=sort_column, ascending=False)

    return literal_recs.head(final_top_k)
# Placeholder cover used when a row has no thumbnail or the image fails to load.
_PLACEHOLDER_COVER = "https://via.placeholder.com/120x180/333333/cccccc?text=No+Cover"


def _format_authors(authors) -> str:
    """Turn a ';'-separated author field into a readable list."""
    if pd.isna(authors) or authors == "nan":
        return "Unknown Author"
    names = str(authors).split(";")
    if len(names) == 2:
        return f"{names[0]} and {names[1]}"
    if len(names) > 2:
        return f"{', '.join(names[:-1])}, and {names[-1]}"
    return str(authors)


def _star_rating(rating: float) -> str:
    """Render a rating as five star glyphs, clamping the rating to 0-5."""
    rating = min(max(rating, 0.0), 5.0)
    full_stars = int(rating)
    half_star = 1 if (rating - full_stars) >= 0.5 else 0
    empty_stars = 5 - full_stars - half_star
    # NOTE(review): the original glyphs were lost to an encoding error; a
    # filled star for whole steps and a hollow star for half/empty steps is
    # the assumed reconstruction.
    return "★" * full_stars + "☆" * half_star + "☆" * empty_stars


def create_book_card_html(row):
    """Create an HTML card for a single book with description, ratings, and link.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) read through ``.get``.
            Keys used: ``description``, ``authors``, ``title_and_subtitle``,
            ``large_thumbnail``, ``url``, ``categories``, ``average_rating``,
            ``ratings_count``. Every key is optional.

    Returns:
        An HTML fragment (string) for the card. All text and attribute values
        taken from ``row`` are HTML-escaped before interpolation.
    """
    # Handle missing description.
    description = row.get("description", "No description available")
    if pd.isna(description):
        description = "No description available"

    authors_str = _format_authors(row.get("authors", "Unknown Author"))

    # Get other info.
    title = row.get("title_and_subtitle", "Unknown Title")
    thumbnail = row.get("large_thumbnail", _PLACEHOLDER_COVER)
    download_url = row.get("url", "")
    category = row.get("categories", "Unknown")

    # Convert ratings to numeric values; anything unparsable counts as unrated.
    average_rating = row.get("average_rating", 0)
    ratings_count = row.get("ratings_count", 0)
    try:
        avg_rating = float(average_rating) if not pd.isna(average_rating) else 0
        rating_count = int(ratings_count) if not pd.isna(ratings_count) else 0
    except (ValueError, TypeError):
        avg_rating = 0
        rating_count = 0

    # Format rating display.
    if avg_rating > 0:
        stars = _star_rating(avg_rating)
        rating_display = f"""
        <div style="margin: 2px 0; display: flex; align-items: center; gap: 6px; flex-wrap: wrap;">
            <span style="color: #ffd700; font-size: 12px; letter-spacing: 1px;">{stars}</span>
            <span style="color: #cccccc; font-size: 10px;">
                {avg_rating:.1f} ({rating_count:,})
            </span>
        </div>
        """
    else:
        rating_display = """
        <div style="margin: 2px 0;">
            <span style="color: #888888; font-size: 10px;">No ratings</span>
        </div>
        """

    # Create download button if URL exists.
    download_button = ""
    if download_url and not pd.isna(download_url) and str(download_url).strip():
        safe_url = html.escape(str(download_url).strip(), quote=True)
        download_button = f"""
        <div style="margin-top: 6px;">
            <a href="{safe_url}" target="_blank"
               style="background-color: #4CAF50; color: white; padding: 6px 12px;
                      text-decoration: none; border-radius: 4px; font-size: 10px;
                      display: inline-block; text-align: center;">
                📖 Get Book
            </a>
        </div>
        """

    # Escape everything that originates from the data set so stray markup in
    # titles, descriptions, or URLs cannot break (or inject into) the card.
    safe_title = html.escape(str(title))
    safe_authors = html.escape(str(authors_str))
    safe_category = html.escape(str(category))
    safe_description = html.escape(str(description))
    safe_thumbnail = html.escape(str(thumbnail), quote=True)

    # Card layout: cover image on the left, text column on the right.
    card_html = f"""
    <div style="border: 1px solid #444; border-radius: 8px; padding: 12px; margin: 10px 0;
                background-color: #2b2b2b; box-shadow: 0 2px 4px rgba(0,0,0,0.3);">
        <div style="display: flex; gap: 12px; flex-direction: row;">
            <div style="flex-shrink: 0;">
                <img src="{safe_thumbnail}" alt="Book cover"
                     style="width: 80px; height: 120px; object-fit: cover; border-radius: 4px;
                            background-color: #333; border: 1px solid #555;"
                     onerror="this.src='{_PLACEHOLDER_COVER}';">
            </div>
            <div style="flex-grow: 1; min-width: 0; display: flex; flex-direction: column;">
                <h3 style="margin: 0 0 6px 0; color: #ffffff; font-size: 14px; line-height: 1.2;
                           word-wrap: break-word; overflow-wrap: break-word;">
                    {safe_title}
                </h3>
                <p style="margin: 0 0 4px 0; color: #cccccc; font-size: 11px; font-style: italic;">
                    {safe_authors}
                </p>
                <p style="margin: 0 0 4px 0; color: #aaaaaa; font-size: 10px;">
                    {safe_category}
                </p>
                {rating_display}
                <div style="flex-grow: 1; margin: 6px 0;">
                    <p style="margin: 0; color: #dddddd; font-size: 11px; line-height: 1.3;
                              display: -webkit-box; -webkit-line-clamp: 4; -webkit-box-orient: vertical;
                              overflow: hidden; text-overflow: ellipsis;">
                        {safe_description}
                    </p>
                </div>
                {download_button}
            </div>
        </div>
    </div>
    """
    return card_html
def recommend_books(query: str, category: str, tone: str, search_type: str):
    """Main recommendation function for the Gradio interface.

    Args:
        query: Text typed by the user; ``None`` or blank returns a prompt to
            enter a query.
        category: Category filter forwarded to the retrieval function.
        tone: Tone filter forwarded to the retrieval function.
        search_type: "Semantic Search" or "Literal Search"; anything else
            returns an "invalid search type" message.

    Returns:
        An HTML string: the rendered book cards, or a short ``<p>`` message
        when there is nothing to show or an error occurred.
    """
    if not query or not query.strip():
        return "<p>Please enter a search query to get book recommendations.</p>"

    # UI boundary: any failure is reported to the user instead of propagating.
    try:
        if search_type == "Semantic Search":
            recommendations = retrieve_semantic_recommendations(query, category, tone)
        elif search_type == "Literal Search":
            recommendations = retrieve_literal_recommendations(query, category, tone)
        else:
            return "<p>Invalid search type selected.</p>"

        if recommendations.empty:
            return "<p>No books found matching your criteria. Try adjusting your search terms or filters.</p>"

        # Create HTML for all book cards.
        cards_html = "".join(
            create_book_card_html(row) for _, row in recommendations.iterrows()
        )

        # The query is user input and is echoed back into the page, so it
        # must be escaped before interpolation.
        safe_query = html.escape(query)
        safe_search_type = html.escape(str(search_type))

        # Combine all cards with a header.
        full_html = f"""
        <div style="font-family: Arial, sans-serif; background-color: #1a1a1a; padding: 20px; border-radius: 8px;">
            <h2 style="color: #ffffff; margin-bottom: 20px;">
                📚 Found {len(recommendations)} recommendations for: "{safe_query}" ({safe_search_type})
            </h2>
            {cards_html}
        </div>
        """
        return full_html
    except Exception as e:
        print(f"Error in recommend_books: {e}")
        return f"<p>An error occurred while searching for books: {html.escape(str(e))}</p>"
def update_search_interface(search_type):
    """Update the instructions text to match the selected search type.

    Args:
        search_type: Value of the search-type radio; "Literal Search" selects
            the literal-mode text, anything else the semantic-mode text.

    Returns:
        A dict mapping the ``search_instructions`` component to a
        ``gr.update`` carrying the new text and ``visible=True``.
    """
    if search_type == "Literal Search":
        instructions = "**Literal Search Mode:** Type book titles or author names directly. Supports partial matching - e.g., 'harry' will find 'Harry Potter', 'tolkien' will find J.R.R. Tolkien books."
    else:
        instructions = "**Semantic Search Mode:** Describe what kind of book you're looking for using natural language - e.g., 'fantasy adventure with magic'."

    # `search_instructions` is a module-level component; it only has to exist
    # by the time this callback runs.
    return {search_instructions: gr.update(value=instructions, visible=True)}
# Prepare dropdown options.
categories = ["All"] + sorted(books["categories"].unique().tolist())
tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
search_types = ["Semantic Search", "Literal Search"]

# Markdown blocks are kept at column 0 so their rendering does not depend on
# how the surrounding code is indented.
# NOTE(review): the emoji and the arrow below were lost to an encoding error in
# the captured source; the glyphs used here are the assumed originals.
INTRO_MARKDOWN = """
# 📚 Smart Book Recommender
## Find your next favorite book using AI-powered semantic search or flexible keyword matching!
**Semantic Search:** Describe what you want (e.g., "romantic comedy in Paris")
**Literal Search:** Type exact titles or authors (e.g., "harry" → Harry Potter books)
"""

TIPS_MARKDOWN = """
### 💡 Tips for better results:
- **Semantic Search:** Be descriptive (e.g., "dark fantasy with dragons", "romance set in medieval times")
- **Literal Search:** Use partial names (e.g., "tolkien", "stephen king", "harry", "game thrones")
- **Flexible Matching:** Literal search finds books even with partial words - "potter" finds "Harry Potter"
- **Combine filters:** Use category and tone filters to narrow down results
- **Try variations:** If you don't find what you want, try different keywords or switch search modes
"""

# Create Gradio interface.
with gr.Blocks(theme=gr.themes.Soft()) as dashboard:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        # Left column: search mode, mode-specific instructions, and the query.
        with gr.Column(scale=2):
            search_type_radio = gr.Radio(
                choices=search_types,
                value="Semantic Search",
                label="Search Type",
                interactive=True,
            )
            search_instructions = gr.Markdown(
                "**Semantic Search Mode:** Describe what kind of book you're looking for using natural language - e.g., 'fantasy adventure with magic'.",
                visible=True,
            )
            # Single search input for both modes.
            user_query = gr.Textbox(
                label="Search for books:",
                placeholder="e.g., 'harry potter' or 'thrilling mystery in Victorian London'",
                lines=2,
                max_lines=4,
            )

        # Right column: optional filters and the submit button.
        # NOTE(review): the captured source lost its indentation, so placing
        # the button inside this column (rather than below the row) is an
        # assumption -- confirm against the intended layout.
        with gr.Column(scale=1):
            category_dropdown = gr.Dropdown(
                label="Filter by category (optional)",
                choices=categories,
                value="All",
            )
            tone_dropdown = gr.Dropdown(
                label="Filter by emotional tone (optional)",
                choices=tones,
                value="All",
            )
            submit_button = gr.Button("🔍 Find Books", variant="primary", size="lg")

    gr.Markdown("---")

    # Use HTML component for book display.
    output = gr.HTML(
        label="Book Recommendations",
        value="<p>Select a search type and enter your preferences to get personalized book recommendations!</p>",
    )

    # Event handlers.
    search_type_radio.change(
        fn=update_search_interface,
        inputs=[search_type_radio],
        outputs=[search_instructions],
    )
    submit_button.click(
        fn=recommend_books,
        inputs=[user_query, category_dropdown, tone_dropdown, search_type_radio],
        outputs=output,
    )
    # Allow Enter key to submit.
    user_query.submit(
        fn=recommend_books,
        inputs=[user_query, category_dropdown, tone_dropdown, search_type_radio],
        outputs=output,
    )

    # Usage tips at the bottom of the page.
    gr.Markdown(TIPS_MARKDOWN)

print("Enhanced app with flexible regex search initialized successfully! 🚀")

if __name__ == "__main__":
    dashboard.launch()