Spaces:

nonsodev
/

semantic-book-recommender

Sleeping

App Files Files Community

semantic-book-recommender / app.py

nonsodev

fallback image

0c35dcc 10 months ago

raw

history blame contribute delete

18 kB

	import html
	import os
	import re

	import gradio as gr
	import numpy as np
	import pandas as pd
	from langchain.text_splitter import CharacterTextSplitter
	from langchain_chroma import Chroma
	from langchain_community.document_loaders import TextLoader
	from langchain_huggingface import HuggingFaceEmbeddings

	# Keep Hugging Face downloads in one known, writable cache directory.
	HF_CACHE_DIR = "/tmp/hf_cache"
	os.environ["HF_HOME"] = HF_CACHE_DIR
	os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR

	# Initialize embeddings with caching
	print("Loading embeddings model...")
	embeddings = HuggingFaceEmbeddings(
	    model_name="sentence-transformers/all-MiniLM-L6-v2",
	    # NOTE(review): the env vars above are set after the imports at the top
	    # of the file, so libraries that read them at import time may not see
	    # them. Passing the directory explicitly does not depend on that.
	    cache_folder=HF_CACHE_DIR,
	    model_kwargs={"device": "cpu"},
	    encode_kwargs={"normalize_embeddings": True},
	)

	# Initialize ChromaDB: build the persisted collection on first run,
	# otherwise reopen the one already on disk.
	print("Initializing ChromaDB...")
	if not os.path.exists("chroma_books"):
	    print("Creating new ChromaDB from tagged_description.txt...")
	    # Check up front: the loader may surface a missing file as a wrapped
	    # error rather than FileNotFoundError, which would skip this message.
	    if not os.path.exists("tagged_description.txt"):
	        print("ERROR: tagged_description.txt not found!")
	        raise FileNotFoundError("tagged_description.txt")

	    raw_docs = TextLoader("tagged_description.txt", encoding="utf-8").load()
	    # One chunk per line. chunk_size=1 splits exactly like the previous
	    # chunk_size=0 (every line exceeds it) but is also accepted by splitter
	    # versions that reject non-positive sizes.
	    text_splitter = CharacterTextSplitter(
	        separator="\n",
	        chunk_size=1,
	        chunk_overlap=0,
	        length_function=len,
	    )
	    documents = text_splitter.split_documents(raw_docs)
	    print(f"Loaded {len(documents)} documents")

	    db_books = Chroma.from_documents(
	        documents,
	        embedding=embeddings,
	        collection_name="books",
	        persist_directory="chroma_books",
	    )
	    print("ChromaDB created successfully!")
	else:
	    print("Loading existing ChromaDB...")
	    db_books = Chroma(
	        persist_directory="chroma_books",
	        embedding_function=embeddings,
	        collection_name="books",
	    )

	# Load books data
	print("Loading books data...")
	try:
	    books = pd.read_csv("final_book_df.csv")
	except FileNotFoundError:
	    print("ERROR: final_book_df.csv not found!")
	    raise

	# Ask for a larger cover image; rows without a thumbnail fall back to the
	# bundled placeholder instead of carrying NaN into the HTML.
	books["large_thumbnail"] = np.where(
	    books["thumbnail"].isna(),
	    "cover-not-found.jpg",
	    books["thumbnail"] + "&fife=w800",
	)

	# Ensure the columns used for literal search and display are strings
	# (missing values become the text "nan").
	for text_column in ("authors", "categories", "title_and_subtitle"):
	    books[text_column] = books[text_column].astype(str)

	print(f"Loaded {len(books)} books")

	def retrieve_semantic_recommendations(
	    query: str,
	    category: str = None,
	    tone: str = None,
	    initial_top_k: int = 50,
	    final_top_k: int = 8,
	) -> pd.DataFrame:
	    """Retrieve semantic recommendations based on query, category, and tone.

	    Args:
	        query: Free-text description passed to the vector store.
	        category: Exact ``categories`` value to keep; ``None``/"All" keeps all.
	        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad" to
	            sort by the matching emotion column; anything else keeps the
	            similarity order.
	        initial_top_k: Number of candidates requested from the vector store.
	        final_top_k: Maximum number of rows returned.

	    Returns:
	        Rows of the module-level ``books`` frame, best match first unless a
	        tone sort was requested.
	    """
	    recs = db_books.similarity_search(query, k=initial_top_k)

	    # The first whitespace-separated token of each hit (optionally quoted) is
	    # parsed as the isbn13; remember the best rank at which each one appears.
	    rank_by_isbn = {}
	    for position, rec in enumerate(recs):
	        isbn = int(rec.page_content.strip('"').split()[0])
	        rank_by_isbn.setdefault(isbn, position)

	    # isin() returns rows in CSV order, so restore the similarity ranking
	    # before truncating; otherwise the final cut is arbitrary.
	    book_recs = books[books["isbn13"].isin(list(rank_by_isbn))]
	    order = book_recs["isbn13"].map(rank_by_isbn).to_numpy().argsort(kind="stable")
	    book_recs = book_recs.iloc[order].head(initial_top_k)

	    # Filter by category
	    if category and category != "All":
	        book_recs = book_recs[book_recs["categories"] == category]

	    # Sort by emotional tone; a stable sort keeps similarity order among ties.
	    tone_columns = {
	        "Happy": "joy",
	        "Surprising": "surprise",
	        "Angry": "anger",
	        "Suspenseful": "fear",
	        "Sad": "sadness",
	    }
	    sort_column = tone_columns.get(tone)
	    if sort_column is not None:
	        book_recs = book_recs.sort_values(
	            by=sort_column, ascending=False, kind="stable"
	        )

	    return book_recs.head(final_top_k)

	def retrieve_literal_recommendations(
	    query: str,
	    category: str = None,
	    tone: str = None,
	    final_top_k: int = 8,
	) -> pd.DataFrame:
	    """Retrieve literal recommendations by case-insensitive substring match.

	    A book matches when ANY word of ``query`` occurs anywhere in its title/
	    subtitle or its authors, so "harry" finds "Harry Potter".

	    Args:
	        query: Words to look for; blank input yields an empty frame.
	        category: Exact ``categories`` value to keep; ``None``/"All" keeps all.
	        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad" to
	            sort by the matching emotion column.
	        final_top_k: Maximum number of rows returned.

	    Returns:
	        A copy of the matching rows of the module-level ``books`` frame.
	    """
	    query_words = query.split()
	    if not query_words:
	        return pd.DataFrame()

	    # Escaped words joined with "|" give a safe "any of these words" regex.
	    # str.contains already searches anywhere in the text, so no ".*" padding
	    # is needed, and matches at the very start or end of a field still count.
	    combined_pattern = "|".join(re.escape(word) for word in query_words)

	    title_matches = books["title_and_subtitle"].str.contains(
	        combined_pattern, case=False, na=False, regex=True
	    )
	    author_matches = books["authors"].str.contains(
	        combined_pattern, case=False, na=False, regex=True
	    )
	    literal_recs = books[title_matches | author_matches].copy()

	    # Filter by category
	    if category and category != "All":
	        literal_recs = literal_recs[literal_recs["categories"] == category]

	    # Sort by emotional tone
	    tone_columns = {
	        "Happy": "joy",
	        "Surprising": "surprise",
	        "Angry": "anger",
	        "Suspenseful": "fear",
	        "Sad": "sadness",
	    }
	    sort_column = tone_columns.get(tone)
	    if sort_column is not None:
	        literal_recs = literal_recs.sort_values(by=sort_column, ascending=False)

	    return literal_recs.head(final_top_k)

	def _text_or_default(value, default):
	    """Return ``value`` as stripped text, or ``default`` if missing/NaN/"nan"."""
	    if value is None or (not isinstance(value, str) and pd.isna(value)):
	        return default
	    text = str(value).strip()
	    return default if not text or text == "nan" else text


	def _to_number(value, cast):
	    """Convert ``value`` with ``cast`` (int/float); missing or bad input is 0."""
	    try:
	        return 0 if pd.isna(value) else cast(value)
	    except (ValueError, TypeError, OverflowError):
	        return 0


	def _star_rating(rating):
	    """Return five star glyphs for a rating, clamped to the 0-5 scale."""
	    full_stars = int(min(max(rating, 0), 5))
	    # No half-star glyph is used: a fractional part shows as an empty star.
	    return "★" * full_stars + "☆" * (5 - full_stars)


	def create_book_card_html(row):
	    """Create an HTML card for a single book.

	    Args:
	        row: Mapping-like record (anything with ``.get``) that may provide
	            ``title_and_subtitle``, ``authors`` (";"-separated),
	            ``categories``, ``description``, ``large_thumbnail``, ``url``,
	            ``average_rating`` and ``ratings_count``.

	    Returns:
	        An HTML fragment. All text and URLs taken from ``row`` are escaped
	        so characters such as ``&`` or ``<`` cannot break the markup.
	    """
	    placeholder_cover = "https://via.placeholder.com/120x180/333333/cccccc?text=No+Cover"

	    description = _text_or_default(row.get("description"), "No description available")
	    title = _text_or_default(row.get("title_and_subtitle"), "Unknown Title")
	    category = _text_or_default(row.get("categories"), "Unknown")
	    thumbnail = _text_or_default(row.get("large_thumbnail"), placeholder_cover)
	    download_url = _text_or_default(row.get("url"), "")

	    # Format authors: "A", "A and B", or "A, B, and C".
	    raw_authors = _text_or_default(row.get("authors"), "")
	    names = [name.strip() for name in raw_authors.split(";") if name.strip()]
	    if not names:
	        authors_str = "Unknown Author"
	    elif len(names) <= 2:
	        authors_str = " and ".join(names)
	    else:
	        authors_str = f"{', '.join(names[:-1])}, and {names[-1]}"

	    # Ratings are converted independently so one bad field cannot zero both.
	    avg_rating = _to_number(row.get("average_rating", 0), float)
	    rating_count = _to_number(row.get("ratings_count", 0), int)

	    # Format rating display
	    if avg_rating > 0:
	        stars = _star_rating(avg_rating)
	        rating_display = f"""
	        <div style="margin: 2px 0; display: flex; align-items: center; gap: 6px; flex-wrap: wrap;">
	        <span style="color: #ffd700; font-size: 12px; letter-spacing: 1px;">{stars}</span>
	        <span style="color: #cccccc; font-size: 10px;">
	        {avg_rating:.1f} ({rating_count:,})
	        </span>
	        </div>
	        """
	    else:
	        rating_display = """
	        <div style="margin: 2px 0;">
	        <span style="color: #888888; font-size: 10px;">No ratings</span>
	        </div>
	        """

	    # Create download button if URL exists
	    download_button = ""
	    if download_url:
	        download_button = f"""
	        <div style="margin-top: 6px;">
	        <a href="{html.escape(download_url)}" target="_blank"
	        style="background-color: #4CAF50; color: white; padding: 6px 12px;
	        text-decoration: none; border-radius: 4px; font-size: 10px;
	        display: inline-block; text-align: center;">
	        📖 Get Book
	        </a>
	        </div>
	        """

	    # Create the card HTML with responsive design and an image fallback
	    card_html = f"""
	    <div style="border: 1px solid #444; border-radius: 8px; padding: 12px; margin: 10px 0;
	    background-color: #2b2b2b; box-shadow: 0 2px 4px rgba(0,0,0,0.3);">

	    <div style="display: flex; gap: 12px; flex-direction: row;">
	    <div style="flex-shrink: 0;">
	    <img src="{html.escape(thumbnail)}" alt="Book cover"
	    style="width: 80px; height: 120px; object-fit: cover; border-radius: 4px;
	    background-color: #333; border: 1px solid #555;"
	    onerror="this.src='https://via.placeholder.com/120x180/333333/cccccc?text=No+Cover';">
	    </div>

	    <div style="flex-grow: 1; min-width: 0; display: flex; flex-direction: column;">
	    <h3 style="margin: 0 0 6px 0; color: #ffffff; font-size: 14px; line-height: 1.2;
	    word-wrap: break-word; overflow-wrap: break-word;">
	    {html.escape(title)}
	    </h3>

	    <p style="margin: 0 0 4px 0; color: #cccccc; font-size: 11px; font-style: italic;">
	    {html.escape(authors_str)}
	    </p>

	    <p style="margin: 0 0 4px 0; color: #aaaaaa; font-size: 10px;">
	    {html.escape(category)}
	    </p>

	    {rating_display}

	    <div style="flex-grow: 1; margin: 6px 0;">
	    <p style="margin: 0; color: #dddddd; font-size: 11px; line-height: 1.3;
	    display: -webkit-box; -webkit-line-clamp: 4; -webkit-box-orient: vertical;
	    overflow: hidden; text-overflow: ellipsis;">
	    {html.escape(description)}
	    </p>
	    </div>

	    {download_button}
	    </div>
	    </div>
	    </div>
	    """
	    return card_html

	def recommend_books(query: str, category: str, tone: str, search_type: str):
	    """Main recommendation function for the Gradio interface.

	    Args:
	        query: Text typed by the user; blank or missing input returns a prompt.
	        category: Category filter forwarded to the retriever.
	        tone: Tone filter forwarded to the retriever.
	        search_type: "Semantic Search" or "Literal Search".

	    Returns:
	        An HTML string: the result cards, or a short message when there is
	        nothing to show or the search failed. User-supplied text and error
	        details are escaped before being placed in the markup.
	    """
	    if not query or not query.strip():
	        return "<p>Please enter a search query to get book recommendations.</p>"

	    try:
	        if search_type == "Semantic Search":
	            recommendations = retrieve_semantic_recommendations(query, category, tone)
	        elif search_type == "Literal Search":
	            recommendations = retrieve_literal_recommendations(query, category, tone)
	        else:
	            return "<p>Invalid search type selected.</p>"

	        if recommendations.empty:
	            return "<p>No books found matching your criteria. Try adjusting your search terms or filters.</p>"

	        # Create HTML for all book cards
	        html_cards = [
	            create_book_card_html(row) for _, row in recommendations.iterrows()
	        ]

	        # Combine all cards with a header; the query is user input, so escape it.
	        full_html = f"""
	        <div style="font-family: Arial, sans-serif; background-color: #1a1a1a; padding: 20px; border-radius: 8px;">
	        <h2 style="color: #ffffff; margin-bottom: 20px;">
	        📚 Found {len(recommendations)} recommendations for: "{html.escape(query)}" ({search_type})
	        </h2>
	        {''.join(html_cards)}
	        </div>
	        """

	        return full_html

	    except Exception as e:
	        # Top-level UI boundary: log, then show an escaped message, not a traceback.
	        print(f"Error in recommend_books: {e}")
	        return f"<p>An error occurred while searching for books: {html.escape(str(e))}</p>"

	def update_search_interface(search_type):
	    """Return the instructions-panel update for the selected search type.

	    "Literal Search" shows the literal-mode hint; any other value shows the
	    semantic-mode hint. The result is keyed by the ``search_instructions``
	    component.
	    """
	    if search_type == "Literal Search":
	        message = "Literal Search Mode: Type book titles or author names directly. Supports partial matching - e.g., 'harry' will find 'Harry Potter', 'tolkien' will find J.R.R. Tolkien books."
	    else:
	        message = "Semantic Search Mode: Describe what kind of book you're looking for using natural language - e.g., 'fantasy adventure with magic'."

	    return {search_instructions: gr.update(value=message, visible=True)}

	# Choices offered by the filter and search-type controls; "All" means no filter.
	categories = ["All", *sorted(books["categories"].unique().tolist())]
	tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
	search_types = ["Semantic Search", "Literal Search"]

	# Create Gradio interface
	with gr.Blocks(theme=gr.themes.Soft()) as dashboard:
	    gr.Markdown("""
	    # 📚 Smart Book Recommender
	    ## Find your next favorite book using AI-powered semantic search or flexible keyword matching!

	    Semantic Search: Describe what you want (e.g., "romantic comedy in Paris")
	    Literal Search: Type exact titles or authors (e.g., "harry" → Harry Potter books)
	    """)

	    with gr.Row():
	        # Left column: search mode, its instructions, and the query box.
	        with gr.Column(scale=2):
	            search_type_radio = gr.Radio(
	                choices=search_types,
	                value="Semantic Search",
	                label="Search Type",
	                interactive=True,
	            )

	            search_instructions = gr.Markdown(
	                "Semantic Search Mode: Describe what kind of book you're looking for using natural language - e.g., 'fantasy adventure with magic'.",
	                visible=True,
	            )

	            # Single search input for both modes
	            user_query = gr.Textbox(
	                label="Search for books:",
	                placeholder="e.g., 'harry potter' or 'thrilling mystery in Victorian London'",
	                lines=2,
	                max_lines=4,
	            )

	        # Right column: optional filters and the submit button.
	        with gr.Column(scale=1):
	            category_dropdown = gr.Dropdown(
	                label="Filter by category (optional)",
	                choices=categories,
	                value="All",
	            )
	            tone_dropdown = gr.Dropdown(
	                label="Filter by emotional tone (optional)",
	                choices=tones,
	                value="All",
	            )
	            submit_button = gr.Button("🔍 Find Books", variant="primary", size="lg")

	    gr.Markdown("---")

	    # Use HTML component for book display
	    output = gr.HTML(
	        label="Book Recommendations",
	        value="<p>Select a search type and enter your preferences to get personalized book recommendations!</p>",
	    )

	    # Event handlers
	    search_type_radio.change(
	        fn=update_search_interface,
	        inputs=[search_type_radio],
	        outputs=[search_instructions],
	    )

	    # Clicking the button and pressing Enter in the textbox run the same search.
	    for trigger in (submit_button.click, user_query.submit):
	        trigger(
	            fn=recommend_books,
	            inputs=[user_query, category_dropdown, tone_dropdown, search_type_radio],
	            outputs=output,
	        )

	    # Add some usage tips at the bottom
	    gr.Markdown("""
	    ### 💡 Tips for better results:
	    - Semantic Search: Be descriptive (e.g., "dark fantasy with dragons", "romance set in medieval times")
	    - Literal Search: Use partial names (e.g., "tolkien", "stephen king", "harry", "game thrones")
	    - Flexible Matching: Literal search finds books even with partial words - "potter" finds "Harry Potter"
	    - Combine filters: Use category and tone filters to narrow down results
	    - Try variations: If you don't find what you want, try different keywords or switch search modes
	    """)

	print("Enhanced app with flexible regex search initialized successfully! 🚀")

	# Start the web UI only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    dashboard.launch()