"""Book recommender dashboard: module setup (vector store and book table)."""

import os
import re

# Ensure model caching.
# NOTE: these variables are set *before* the Hugging Face-backed imports
# below; the Hugging Face libraries resolve their cache directory from the
# environment when they are first imported, so assigning the variables after
# the imports (as the code previously did) can be silently ignored.
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"

import gradio as gr  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
from langchain.text_splitter import CharacterTextSplitter  # noqa: E402
from langchain_chroma import Chroma  # noqa: E402
from langchain_community.document_loaders import TextLoader  # noqa: E402
from langchain_huggingface import HuggingFaceEmbeddings  # noqa: E402

# Initialize embeddings with caching
print("Loading embeddings model...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Initialize ChromaDB: build the persisted collection on first run, reuse it
# on every later run.
print("Initializing ChromaDB...")
if not os.path.exists("chroma_books"):
    print("Creating new ChromaDB from tagged_description.txt...")
    try:
        raw_docs = TextLoader("tagged_description.txt", encoding="utf-8").load()
        # NOTE(review): chunk_size=0 with a newline separator presumably makes
        # every line of the file its own document - confirm against the
        # format of tagged_description.txt.
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=0,
            chunk_overlap=0,
            length_function=len,
        )
        documents = text_splitter.split_documents(raw_docs)
        print(f"Loaded {len(documents)} documents")
        db_books = Chroma.from_documents(
            documents,
            embedding=embeddings,
            collection_name="books",
            persist_directory="chroma_books",
        )
        print("ChromaDB created successfully!")
    except FileNotFoundError:
        print("ERROR: tagged_description.txt not found!")
        raise
else:
    print("Loading existing ChromaDB...")
    db_books = Chroma(
        persist_directory="chroma_books",
        embedding_function=embeddings,
        collection_name="books",
    )

# Load books data
print("Loading books data...")
try:
    # Only the read itself can raise FileNotFoundError, so only it is guarded.
    books = pd.read_csv("final_book_df.csv")
except FileNotFoundError:
    print("ERROR: final_book_df.csv not found!")
    raise

# Request a larger cover image where a thumbnail URL exists; otherwise fall
# back to the bundled placeholder image.
books["large_thumbnail"] = np.where(
    books["thumbnail"].isna(),
    "cover-not-found.jpg",
    books["thumbnail"] + "&fife=w800",
)

# Ensure the text columns are string type for literal search
books["authors"] = books["authors"].astype(str)
books["categories"] = books["categories"].astype(str)
books["title_and_subtitle"] = books["title_and_subtitle"].astype(str)
print(f"Loaded {len(books)} books")
def retrieve_semantic_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    initial_top_k: int = 50,
    final_top_k: int = 8,
) -> pd.DataFrame:
    """Retrieve semantic recommendations based on query, category, and tone.

    Args:
        query: Free-text description passed to the vector store.
        category: Exact value of the ``categories`` column to keep; ``None``
            or ``"All"`` disables the filter.
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad".
            Any other value leaves the similarity ordering untouched.
        initial_top_k: Number of hits requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        Up to ``final_top_k`` rows of ``books``. Without a tone the rows are
        ordered from most to least similar; with a tone they are ordered by
        that emotion score (descending), ties keeping similarity order.
    """
    tone_columns = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }

    recs = db_books.similarity_search(query, k=initial_top_k)

    # The first whitespace-separated token of each hit (after stripping the
    # surrounding double quotes) is parsed as the book's ISBN-13. Remember the
    # position of each ISBN so the similarity ranking survives the lookup.
    rank = {}
    for rec in recs:
        tokens = rec.page_content.strip('"').split()
        try:
            isbn = int(tokens[0])
        except (IndexError, ValueError):
            # Blank or malformed chunk: skip it instead of failing the search.
            continue
        rank.setdefault(isbn, len(rank))

    # A plain isin() filter returns rows in CSV order, which would make the
    # final head() pick arbitrary hits; re-sort by similarity rank first.
    book_recs = books[books["isbn13"].isin(list(rank))]
    book_recs = book_recs.sort_values(
        by="isbn13",
        key=lambda isbns: isbns.map(rank),
        kind="stable",
    ).head(initial_top_k)

    # Filter by category
    if category and category != "All":
        book_recs = book_recs[book_recs["categories"] == category]

    # Sort by emotional tone (stable, so ties keep their similarity order)
    tone_column = tone_columns.get(tone)
    if tone_column is not None:
        book_recs = book_recs.sort_values(
            by=tone_column, ascending=False, kind="stable"
        )

    return book_recs.head(final_top_k)
def retrieve_literal_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    final_top_k: int = 8,
) -> pd.DataFrame:
    """Retrieve literal recommendations by case-insensitive keyword matching.

    A book matches when at least one whitespace-separated word of ``query``
    occurs anywhere in its ``title_and_subtitle`` or ``authors`` value.

    Args:
        query: Keywords typed by the user. ``None``, empty or
            whitespace-only input yields an empty frame.
        category: Exact value of the ``categories`` column to keep; ``None``
            or ``"All"`` disables the filter.
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad".
            Any other value leaves the match ordering untouched.
        final_top_k: Maximum number of rows returned.

    Returns:
        Up to ``final_top_k`` rows of ``books``. Without a tone, rows matching
        more of the query words come first (ties keep file order); with a
        tone, rows are ordered by that emotion score, descending.
    """
    tone_columns = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }

    # dict.fromkeys de-duplicates while keeping order, so a repeated word is
    # not counted twice below.
    query_words = list(dict.fromkeys(query.lower().split())) if query else []
    if not query_words:
        return pd.DataFrame()

    titles = books["title_and_subtitle"]
    authors = books["authors"]

    # Count, per book, how many distinct query words it contains.
    # str.contains() already searches anywhere in the text, so the escaped
    # word is the whole pattern; escaping means user input can never be an
    # invalid regex.
    hit_counts = np.zeros(len(books), dtype=int)
    for word in query_words:
        pattern = re.escape(word)
        in_title = titles.str.contains(pattern, case=False, na=False, regex=True)
        in_author = authors.str.contains(pattern, case=False, na=False, regex=True)
        hit_counts += (in_title | in_author).to_numpy()

    # Keep every book with at least one hit, best matches first. The stable
    # sort keeps file order among equally good matches, so single-word
    # queries return rows in the same order as a plain boolean filter.
    matched = np.flatnonzero(hit_counts)
    by_relevance = matched[np.argsort(-hit_counts[matched], kind="stable")]
    literal_recs = books.iloc[by_relevance].copy()

    # Filter by category
    if category and category != "All":
        literal_recs = literal_recs[literal_recs["categories"] == category]

    # Sort by emotional tone (stable, so ties keep their relevance order)
    tone_column = tone_columns.get(tone)
    if tone_column is not None:
        literal_recs = literal_recs.sort_values(
            by=tone_column, ascending=False, kind="stable"
        )

    return literal_recs.head(final_top_k)
description, ratings, and download link.""" # Handle missing description description = row.get("description", "No description available") if pd.isna(description): description = "No description available" # Format authors authors = row.get("authors", "Unknown Author") if pd.isna(authors) or authors == "nan": authors_str = "Unknown Author" else: authors_split = str(authors).split(";") if len(authors_split) == 2: authors_str = f"{authors_split[0]} and {authors_split[1]}" elif len(authors_split) > 2: authors_str = f"{', '.join(authors_split[:-1])}, and {authors_split[-1]}" else: authors_str = authors # Get other info title = row.get("title_and_subtitle", "Unknown Title") thumbnail = row.get("large_thumbnail", "https://via.placeholder.com/120x180/333333/cccccc?text=No+Cover") download_url = row.get("url", "") category = row.get("categories", "Unknown") # Handle ratings average_rating = row.get("average_rating", 0) ratings_count = row.get("ratings_count", 0) # Convert to proper numeric values try: avg_rating = float(average_rating) if not pd.isna(average_rating) else 0 rating_count = int(ratings_count) if not pd.isna(ratings_count) else 0 except (ValueError, TypeError): avg_rating = 0 rating_count = 0 # Create star rating display def create_star_rating(rating): """Create HTML for star rating display.""" full_stars = int(rating) half_star = 1 if (rating - full_stars) >= 0.5 else 0 empty_stars = 5 - full_stars - half_star stars_html = "" # Full stars stars_html += "★" * full_stars # Half star if half_star: stars_html += "☆" # Empty stars stars_html += "☆" * empty_stars return stars_html # Format rating display if avg_rating > 0: stars = create_star_rating(avg_rating) rating_display = f"""
{authors_str}
{category}
{rating_display}Please enter a search query to get book recommendations.
" try: if search_type == "Semantic Search": recommendations = retrieve_semantic_recommendations(query, category, tone) elif search_type == "Literal Search": recommendations = retrieve_literal_recommendations(query, category, tone) else: return "Invalid search type selected.
" if recommendations.empty: return "No books found matching your criteria. Try adjusting your search terms or filters.
" # Create HTML for all book cards html_cards = [] for _, row in recommendations.iterrows(): card_html = create_book_card_html(row) html_cards.append(card_html) # Combine all cards with a header full_html = f"""An error occurred while searching for books: {str(e)}
def update_search_interface(search_type):
    """Update the interface based on search type selection.

    Args:
        search_type: Label of the selected search mode. "Literal Search"
            selects the literal-mode help text; any other value selects the
            semantic-mode help text.

    Returns:
        A dict mapping the ``search_instructions`` component to a
        ``gr.update`` that sets its text and keeps it visible.
    """
    if search_type == "Literal Search":
        instructions = "**Literal Search Mode:** Type book titles or author names directly. Supports partial matching - e.g., 'harry' will find 'Harry Potter', 'tolkien' will find J.R.R. Tolkien books."
    else:
        instructions = "**Semantic Search Mode:** Describe what kind of book you're looking for using natural language - e.g., 'fantasy adventure with magic'."

    return {search_instructions: gr.update(value=instructions, visible=True)}
# Prepare dropdown options
categories = ["All"] + sorted(books["categories"].unique().tolist())
tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
search_types = ["Semantic Search", "Literal Search"]

# Markdown needs real line breaks for headings and list items, so the texts
# are assembled from one literal per line.
header_markdown = (
    "# 📚 Smart Book Recommender\n"
    "## Find your next favorite book using AI-powered semantic search or flexible keyword matching!\n"
    "\n"
    '**Semantic Search:** Describe what you want (e.g., "romantic comedy in Paris")\n'
    "\n"
    '**Literal Search:** Type exact titles or authors (e.g., "harry" → Harry Potter books)\n'
)
tips_markdown = (
    "### 💡 Tips for better results:\n"
    '- **Semantic Search:** Be descriptive (e.g., "dark fantasy with dragons", "romance set in medieval times")\n'
    '- **Literal Search:** Use partial names (e.g., "tolkien", "stephen king", "harry", "game thrones")\n'
    '- **Flexible Matching:** Literal search finds books even with partial words - "potter" finds "Harry Potter"\n'
    "- **Combine filters:** Use category and tone filters to narrow down results\n"
    "- **Try variations:** If you don't find what you want, try different keywords or switch search modes\n"
)

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as dashboard:
    gr.Markdown(header_markdown)

    with gr.Row():
        with gr.Column(scale=2):
            search_type_radio = gr.Radio(
                choices=search_types,
                value="Semantic Search",
                label="Search Type",
                interactive=True,
            )
            search_instructions = gr.Markdown(
                "**Semantic Search Mode:** Describe what kind of book you're looking for using natural language - e.g., 'fantasy adventure with magic'.",
                visible=True,
            )
            # Single search input for both modes
            user_query = gr.Textbox(
                label="Search for books:",
                placeholder="e.g., 'harry potter' or 'thrilling mystery in Victorian London'",
                lines=2,
                max_lines=4,
            )

        # NOTE(review): the nesting of the widgets below is inferred from
        # statement order only - confirm the button belongs in this column.
        with gr.Column(scale=1):
            category_dropdown = gr.Dropdown(
                label="Filter by category (optional)",
                choices=categories,
                value="All",
            )
            tone_dropdown = gr.Dropdown(
                label="Filter by emotional tone (optional)",
                choices=tones,
                value="All",
            )
            submit_button = gr.Button("🔍 Find Books", variant="primary", size="lg")

    gr.Markdown("---")

    # Use HTML component for book display
    # NOTE(review): only the visible text of this placeholder was available;
    # any HTML markup that originally wrapped it should be restored here.
    output = gr.HTML(
        label="Book Recommendations",
        value="Select a search type and enter your preferences to get personalized book recommendations!",
    )

    # Event handlers
    search_type_radio.change(
        fn=update_search_interface,
        inputs=[search_type_radio],
        outputs=[search_instructions],
    )

    # The button click and the Enter key in the textbox run the same search.
    search_inputs = [user_query, category_dropdown, tone_dropdown, search_type_radio]
    for register_search in (submit_button.click, user_query.submit):
        register_search(
            fn=recommend_books,
            inputs=search_inputs,
            outputs=output,
        )

    # Add some usage tips at the bottom
    gr.Markdown(tips_markdown)

print("Enhanced app with flexible regex search initialized successfully! 🚀")

if __name__ == "__main__":
    dashboard.launch()