Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| from fuzzywuzzy import process | |
def match_books(user_input: str, df: pd.DataFrame, min_score: float = 0.8):
    """Return the book title that best fuzzy-matches *user_input*.

    Args:
        user_input: Free-text title typed by the user.
        df: Frame with a 'Book-Title' column holding the candidate titles.
        min_score: Minimum similarity required to accept a match. Values in
            the range [0, 1] are treated as fractions and rescaled to the
            0-100 scale used by fuzzywuzzy (so the default 0.8 means 80);
            larger values are taken as already being on the 0-100 scale.

    Returns:
        The best-matching title, or None when the query is blank, there are
        no candidate titles, or the best score is below the minimum.
    """
    # A blank query cannot match anything meaningful.
    if not user_input or not user_input.strip():
        return None

    # Missing titles are dropped: the matcher's string processing expects
    # every choice to be a string.
    book_titles = df['Book-Title'].dropna().unique()
    if len(book_titles) == 0:
        return None

    # fuzzywuzzy reports scores as 0-100, so a fractional threshold such as
    # 0.8 must be rescaled; compared as-is it would accept almost any title.
    cutoff = min_score * 100 if 0 <= min_score <= 1 else min_score

    # Use process.extractOne to get the best (title, score) pair
    best_match = process.extractOne(user_input, book_titles)
    if best_match and best_match[1] >= cutoff:
        return best_match[0]
    return None
def _select_books_to_compare(ratings_per_book: pd.Series, min_books: int = 12) -> list:
    """Return the most-rated titles, keeping at least *min_books* when available."""
    if len(ratings_per_book) < min_books:
        return ratings_per_book.index.tolist()
    # The count of the min_books-th most rated title is the highest cutoff
    # that still keeps at least min_books titles (ties are all kept).
    cutoff = ratings_per_book.nlargest(min_books).iloc[-1]
    return ratings_per_book.index[ratings_per_book >= cutoff].tolist()


def recommend_books(df: pd.DataFrame, book_to_be_recommended: str) -> pd.DataFrame:
    """Recommend books whose ratings correlate with those of a given book.

    Identifies the users who rated *book_to_be_recommended*, keeps the books
    most often rated by those users, computes the Pearson correlation between
    the given book's ratings and every kept book's ratings, and attaches each
    book's average rating among those users.

    Args:
        df: Ratings frame with 'User-ID', 'Book-Title' and 'Book-Rating'
            columns.
        book_to_be_recommended: Exact title to base the recommendations on.

    Returns:
        A frame with columns 'Book-Title', 'Correlation [%]' (rounded to two
        decimals) and 'Average ratings', sorted by correlation in descending
        order, excluding the given book and limited to 10 rows. The frame is
        empty when nobody in *df* has rated the given book.
    """
    result_columns = ['Book-Title', 'Correlation [%]', 'Average ratings']

    # Get relevant dataset of book's readers
    book_readers = df.loc[df['Book-Title'] == book_to_be_recommended, 'User-ID'].unique()
    if len(book_readers) == 0:
        # Unknown title: nothing to correlate against.
        return pd.DataFrame(columns=result_columns)

    # Every rating given by those readers
    books_of_book_readers = df[df['User-ID'].isin(book_readers)]
    ratings_per_book = books_of_book_readers.groupby('Book-Title')['User-ID'].count()

    # Keep the most-rated books while leaving more than 10 candidates, and
    # make sure the requested book itself is never filtered out, otherwise
    # there would be no column to correlate against.
    books_to_compare = _select_books_to_compare(ratings_per_book)
    if book_to_be_recommended not in books_to_compare:
        books_to_compare.append(book_to_be_recommended)

    is_compared = books_of_book_readers['Book-Title'].isin(books_to_compare)
    ratings_data_raw = books_of_book_readers.loc[
        is_compared, ['User-ID', 'Book-Rating', 'Book-Title']
    ]

    # Average duplicate (user, book) ratings so the pivot has unique cells
    ratings_data_raw_nodup = (
        ratings_data_raw.groupby(['User-ID', 'Book-Title'])['Book-Rating']
        .mean()
        .reset_index()
    )
    dataset_for_corr = ratings_data_raw_nodup.pivot(
        index='User-ID', columns='Book-Title', values='Book-Rating'
    )
    if book_to_be_recommended not in dataset_for_corr.columns:
        return pd.DataFrame(columns=result_columns)

    # Pearson correlation of every book's ratings with the requested book's
    correlations = dataset_for_corr.corrwith(
        dataset_for_corr[book_to_be_recommended], method='pearson'
    )

    # Average rating of each compared book among the book's readers
    average_ratings = (
        ratings_data_raw_nodup.groupby('Book-Title')['Book-Rating'].mean().reset_index()
    )

    correlations_df = pd.DataFrame({
        'Book-Title': correlations.index,
        'Correlation [%]': correlations.values,
    })
    correlations_df = pd.merge(correlations_df, average_ratings, on='Book-Title')
    correlations_df = correlations_df.rename(columns={'Book-Rating': 'Average ratings'})

    # Sort by correlation value; undefined (NaN) correlations sort last
    correlations_df = correlations_df.sort_values('Correlation [%]', ascending=False)

    # Convert correlation to a percentage limited to two decimals
    correlations_df['Correlation [%]'] = (correlations_df['Correlation [%]'] * 100).round(2)

    # Remove the book being recommended from the list
    correlations_df = correlations_df[correlations_df['Book-Title'] != book_to_be_recommended]
    return correlations_df.head(10)