nonsodev committed on
Commit
99ba469
·
1 Parent(s): 02aef4b

Add chroma_books to gitignore

Browse files
Files changed (2) hide show
  1. .gitignore +0 -0
  2. app.py +149 -69
.gitignore CHANGED
Binary files a/.gitignore and b/.gitignore differ
 
app.py CHANGED
@@ -1,24 +1,71 @@
1
  import pandas as pd
2
  import gradio as gr
3
  import numpy as np
4
-
5
-
6
  from langchain_chroma import Chroma
7
  from langchain_huggingface import HuggingFaceEmbeddings
 
 
8
 
 
 
 
9
 
 
 
10
  embeddings = HuggingFaceEmbeddings(
11
- model_name="sentence-transformers/all-MiniLM-L6-v2" # Fast and good quality
12
- # or "sentence-transformers/all-mpnet-base-v2" # Higher quality, slower
 
13
  )
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- books = pd.read_csv("final_book_df.csv")
17
- books["large_thumbnail"] = books["thumbnail"] + "&fife=w800"
18
- books["large_thumbnail"] = np.where(books["large_thumbnail"].isna(), "cover-not-found.jpg", books["large_thumbnail"])
19
-
20
- db_books = Chroma(persist_directory="chroma_books", embedding_function=embeddings, collection_name="books")
21
-
 
 
 
 
 
 
 
 
22
 
23
  def retrieve_semantic_recommendations(
24
  query: str,
@@ -27,96 +74,129 @@ def retrieve_semantic_recommendations(
27
  initial_top_k: int = 50,
28
  final_top_k: int = 16,
29
  ) -> pd.DataFrame:
30
-
 
31
  recs = db_books.similarity_search(query, k=initial_top_k)
32
  books_list = [int(rec.page_content.strip('"').split()[0]) for rec in recs]
33
  book_recs = books[books["isbn13"].isin(books_list)].head(initial_top_k)
34
 
35
- if category != "All":
 
36
  book_recs = book_recs[book_recs["categories"] == category].head(final_top_k)
37
  else:
38
  book_recs = book_recs.head(final_top_k)
39
 
 
40
  if tone == "Happy":
41
- book_recs.sort_values(by="joy", ascending=False, inplace=True)
42
  elif tone == "Surprising":
43
- book_recs.sort_values(by="surprise", ascending=False, inplace=True)
44
  elif tone == "Angry":
45
- book_recs.sort_values(by="anger", ascending=False, inplace=True)
46
  elif tone == "Suspenseful":
47
- book_recs.sort_values(by="fear", ascending=False, inplace=True)
48
  elif tone == "Sad":
49
- book_recs.sort_values(by="sadness", ascending=False, inplace=True)
50
 
51
  return book_recs
52
 
53
-
54
- def recommend_books(
55
- query: str,
56
- category: str,
57
- tone: str
58
- ):
59
- recommendations = retrieve_semantic_recommendations(query, category, tone)
60
- results = []
61
-
62
- for _, row in recommendations.iterrows():
63
- description = row["description"]
64
- truncated_desc_split = description.split()
65
- truncated_description = " ".join(truncated_desc_split[:30]) + "..."
66
-
67
- authors_split = row["authors"].split(";")
68
- if len(authors_split) == 2:
69
- authors_str = f"{authors_split[0]} and {authors_split[1]}"
70
- elif len(authors_split) > 2:
71
- authors_str = f"{', '.join(authors_split[:-1])}, and {authors_split[-1]}"
72
- else:
73
- authors_str = row["authors"]
74
-
75
- caption = f"{row['title_and_subtitle']} by {authors_str}: {truncated_description}"
76
- results.append((row["large_thumbnail"], caption))
77
- return results
78
-
79
-
80
-
81
- categories = ["All"] + sorted(books["categories"].unique())
82
- tones = ["All"] + ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
83
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
85
- gr.Markdown("# Semantic Book Recommender")
86
- gr.Markdown("## Find your next favorite book!")
87
 
88
  with gr.Row():
89
  user_query = gr.Textbox(
90
- label="please enter a description of your book:",
91
- placeholder="Enter your query here...",
92
- lines=1,
93
- max_lines=1,
94
  )
95
 
96
- category_dropdown = gr.Dropdown(
97
- label="Select a category",
98
- choices=categories,
99
- value="All",
100
- )
101
- tone_dropdown = gr.Dropdown(
102
- label="Select an emotional tone",
103
- choices=tones,
104
- value="All",
105
- )
106
- submit_button = gr.Button("Submit", variant="primary")
107
-
108
- gr.Markdown("## Recommendations")
 
109
  output = gr.Gallery(
110
  label="Recommended Books",
111
- columns=8,
112
- rows=2,
 
113
  )
114
 
 
115
  submit_button.click(
116
  fn=recommend_books,
117
  inputs=[user_query, category_dropdown, tone_dropdown],
118
  outputs=output,
119
  )
 
 
 
 
 
 
 
 
 
120
 
121
  if __name__ == "__main__":
122
  dashboard.launch()
 
1
  import pandas as pd
2
  import gradio as gr
3
  import numpy as np
4
+ import os
 
5
  from langchain_chroma import Chroma
6
  from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_community.document_loaders import TextLoader
8
+ from langchain.text_splitter import CharacterTextSplitter
9
 
10
+ # Ensure model caching
11
+ os.environ["HF_HOME"] = "/tmp/hf_cache"
12
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
13
 
14
+ # Initialize embeddings with caching
15
+ print("Loading embeddings model...")
16
  embeddings = HuggingFaceEmbeddings(
17
+ model_name="sentence-transformers/all-MiniLM-L6-v2",
18
+ model_kwargs={'device': 'cpu'},
19
+ encode_kwargs={'normalize_embeddings': True}
20
  )
21
 
22
+ # Initialize ChromaDB
23
+ print("Initializing ChromaDB...")
24
+ if not os.path.exists("chroma_books"):
25
+ print("Creating new ChromaDB from tagged_description.txt...")
26
+ try:
27
+ raw_docs = TextLoader("tagged_description.txt", encoding="utf-8").load()
28
+ text_splitter = CharacterTextSplitter(
29
+ separator="\n",
30
+ chunk_size=0,
31
+ chunk_overlap=0,
32
+ length_function=len,
33
+ )
34
+ documents = text_splitter.split_documents(raw_docs)
35
+ print(f"Loaded {len(documents)} documents")
36
+
37
+ db_books = Chroma.from_documents(
38
+ documents,
39
+ embedding=embeddings,
40
+ collection_name="books",
41
+ persist_directory="chroma_books",
42
+ )
43
+ print("ChromaDB created successfully!")
44
+ except FileNotFoundError:
45
+ print("ERROR: tagged_description.txt not found!")
46
+ raise
47
+ else:
48
+ print("Loading existing ChromaDB...")
49
+ db_books = Chroma(
50
+ persist_directory="chroma_books",
51
+ embedding_function=embeddings,
52
+ collection_name="books"
53
+ )
54
 
55
+ # Load books data
56
+ print("Loading books data...")
57
+ try:
58
+ books = pd.read_csv("final_book_df.csv")
59
+ books["large_thumbnail"] = books["thumbnail"] + "&fife=w800"
60
+ books["large_thumbnail"] = np.where(
61
+ books["large_thumbnail"].isna(),
62
+ "cover-not-found.jpg",
63
+ books["large_thumbnail"]
64
+ )
65
+ print(f"Loaded {len(books)} books")
66
+ except FileNotFoundError:
67
+ print("ERROR: final_book_df.csv not found!")
68
+ raise
69
 
70
  def retrieve_semantic_recommendations(
71
  query: str,
 
74
  initial_top_k: int = 50,
75
  final_top_k: int = 16,
76
  ) -> pd.DataFrame:
77
+ """Retrieve semantic recommendations based on query, category, and tone."""
78
+
79
  recs = db_books.similarity_search(query, k=initial_top_k)
80
  books_list = [int(rec.page_content.strip('"').split()[0]) for rec in recs]
81
  book_recs = books[books["isbn13"].isin(books_list)].head(initial_top_k)
82
 
83
+ # Filter by category
84
+ if category and category != "All":
85
  book_recs = book_recs[book_recs["categories"] == category].head(final_top_k)
86
  else:
87
  book_recs = book_recs.head(final_top_k)
88
 
89
+ # Sort by emotional tone
90
  if tone == "Happy":
91
+ book_recs = book_recs.sort_values(by="joy", ascending=False)
92
  elif tone == "Surprising":
93
+ book_recs = book_recs.sort_values(by="surprise", ascending=False)
94
  elif tone == "Angry":
95
+ book_recs = book_recs.sort_values(by="anger", ascending=False)
96
  elif tone == "Suspenseful":
97
+ book_recs = book_recs.sort_values(by="fear", ascending=False)
98
  elif tone == "Sad":
99
+ book_recs = book_recs.sort_values(by="sadness", ascending=False)
100
 
101
  return book_recs
102
 
103
def _format_authors(authors) -> str:
    """Turn a ';'-separated author cell into a readable English list."""
    if pd.isna(authors):
        return "Unknown Author"

    # Drop empty fragments so stray or trailing ';' do not create blank names.
    names = [name.strip() for name in str(authors).split(";") if name.strip()]
    if not names:
        return "Unknown Author"
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        return f"{names[0]} and {names[1]}"
    return f"{', '.join(names[:-1])}, and {names[-1]}"


def _truncate_words(text: str, max_words: int = 30) -> str:
    """Keep the first ``max_words`` words, adding '...' only if text was cut."""
    words = text.split()
    if len(words) <= max_words:
        return " ".join(words)
    return " ".join(words[:max_words]) + "..."


def recommend_books(query: str, category: str, tone: str):
    """Build the gallery entries shown for a user's search.

    Calls ``retrieve_semantic_recommendations(query, category, tone)`` and
    converts every returned row into an ``(image, caption)`` tuple, where the
    caption reads ``"<title> by <authors>: <first 30 words of description>"``.

    Args:
        query: Free-text description typed by the user. ``None`` or a
            whitespace-only string short-circuits to an empty result.
        category: Category filter forwarded to the retrieval step.
        tone: Emotional tone forwarded to the retrieval step.

    Returns:
        A list of ``(large_thumbnail, caption)`` tuples. The list is empty
        when the query is blank or when building the results fails.
    """
    # Reject None as well as blank text before touching the vector store.
    if not query or not query.strip():
        return []

    try:
        recommendations = retrieve_semantic_recommendations(query, category, tone)
        results = []

        for _, row in recommendations.iterrows():
            # Missing column and NaN cell both fall back to the placeholder.
            description = row.get("description", "No description available")
            if pd.isna(description):
                description = "No description available"
            truncated_description = _truncate_words(str(description))

            authors_str = _format_authors(row.get("authors", "Unknown Author"))

            # A NaN title would otherwise be rendered as the literal "nan".
            title = row.get("title_and_subtitle", "Unknown Title")
            if pd.isna(title):
                title = "Unknown Title"

            caption = f"{title} by {authors_str}: {truncated_description}"
            results.append((row["large_thumbnail"], caption))

        return results

    except Exception as e:
        # UI boundary: report the failure and show an empty gallery instead of
        # surfacing a traceback to the user.
        print(f"Error in recommend_books: {e}")
        return []
146
+
147
+ # Prepare dropdown options
148
+ categories = ["All"] + sorted(books["categories"].unique().tolist())
149
+ tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
150
+
151
+ # Create Gradio interface
152
  with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
153
+ gr.Markdown("# 📚 Semantic Book Recommender")
154
+ gr.Markdown("## Find your next favorite book using AI-powered semantic search!")
155
 
156
  with gr.Row():
157
  user_query = gr.Textbox(
158
+ label="Describe your ideal book:",
159
+ placeholder="e.g., 'A thrilling mystery set in Victorian London'",
160
+ lines=2,
161
+ max_lines=3,
162
  )
163
 
164
+ with gr.Column():
165
+ category_dropdown = gr.Dropdown(
166
+ label="Select a category (optional)",
167
+ choices=categories,
168
+ value="All",
169
+ )
170
+ tone_dropdown = gr.Dropdown(
171
+ label="Select an emotional tone (optional)",
172
+ choices=tones,
173
+ value="All",
174
+ )
175
+ submit_button = gr.Button("🔍 Find Books", variant="primary")
176
+
177
+ gr.Markdown("## 📖 Recommendations")
178
  output = gr.Gallery(
179
  label="Recommended Books",
180
+ columns=4, # Reduced for better mobile experience
181
+ rows=4,
182
+ height="auto",
183
  )
184
 
185
+ # Event handlers
186
  submit_button.click(
187
  fn=recommend_books,
188
  inputs=[user_query, category_dropdown, tone_dropdown],
189
  outputs=output,
190
  )
191
+
192
+ # Allow Enter key to submit
193
+ user_query.submit(
194
+ fn=recommend_books,
195
+ inputs=[user_query, category_dropdown, tone_dropdown],
196
+ outputs=output,
197
+ )
198
+
199
+ print("App initialized successfully! 🚀")
200
 
201
  if __name__ == "__main__":
202
  dashboard.launch()