{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "290dff84", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "BOOKS_CSV = \"books_cleaned.csv\"\n", "\n", "# Load books_cleaned.csv, decoding it as UTF-8.\n", "df = pd.read_csv(BOOKS_CSV, encoding=\"utf-8\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "2e2d9604", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6397 entries, 0 to 6396\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 isbn13 6397 non-null int64 \n", " 1 authors 6397 non-null object \n", " 2 categories 6364 non-null object \n", " 3 thumbnail 6190 non-null object \n", " 4 description 6397 non-null object \n", " 5 published_year 6397 non-null float64\n", " 6 average_rating 6397 non-null float64\n", " 7 num_pages 6397 non-null float64\n", " 8 ratings_count 6397 non-null float64\n", " 9 title_and_subtitle 6397 non-null object \n", " 10 tagged_description 6397 non-null object \n", "dtypes: float64(4), int64(1), object(6)\n", "memory usage: 549.9+ KB\n" ] } ], "source": [ "# Column names, dtypes and non-null counts of the loaded frame.\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 4, "id": "06585b26", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
categoriescount
0Fiction2491
1Juvenile Fiction519
2Biography & Autobiography388
3History255
4Literary Criticism163
.........
520Humorous stories1
521Ballets1
522Aged women1
523Imperialism1
524Illinois1
\n", "

525 rows × 2 columns

\n", "
" ], "text/plain": [ " categories count\n", "0 Fiction 2491\n", "1 Juvenile Fiction 519\n", "2 Biography & Autobiography 388\n", "3 History 255\n", "4 Literary Criticism 163\n", ".. ... ...\n", "520 Humorous stories 1\n", "521 Ballets 1\n", "522 Aged women 1\n", "523 Imperialism 1\n", "524 Illinois 1\n", "\n", "[525 rows x 2 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Frequency of every raw category label: far too many distinct values to use as-is.\n", "category_counts = df[\"categories\"].value_counts()\n", "category_counts.reset_index()" ] }, { "cell_type": "code", "execution_count": 5, "id": "1976240c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13authorscategoriesthumbnaildescriptionpublished_yearaverage_ratingnum_pagesratings_counttitle_and_subtitletagged_description
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [isbn13, authors, categories, thumbnail, description, published_year, average_rating, num_pages, ratings_count, title_and_subtitle, tagged_description]\n", "Index: []" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sanity check: list any rows whose description is shorter than 25 characters.\n", "too_short = df[\"description\"].str.len() < 25\n", "df[too_short]" ] }, { "cell_type": "code", "execution_count": null, "id": "8effbaa7", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "id": "7a11c3d3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\NonsoDev\\Documents\\Allcodes\\Projects_DL_for resume\\Recommender systems\\book reccomender - llm\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "Device set to use cpu\n" ] } ], "source": [ "# Reduce the categories to a more manageable number (fiction vs. non-fiction)\n", "# with a zero-shot classifier.\n", "from transformers import pipeline\n", "\n", "classifier = pipeline(\n", "    \"zero-shot-classification\",\n", "    model=\"facebook/bart-large-mnli\",\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "id": "3cc8882a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequencecategoriesscores
0A NOVEL THAT READERS and critics have been eag...[fiction, history, biography, fantasy, mystery...[0.8558421730995178, 0.6128803491592407, 0.296...
1A new 'Christie for Christmas' -- a full-lengt...[mystery, fiction, fantasy, scifi, biography, ...[0.9339157342910767, 0.5139176249504089, 0.155...
2Volume Two of Stephen Donaldson's acclaimed se...[fiction, fantasy, history, scifi, biography, ...[0.5638813972473145, 0.2660749554634094, 0.249...
3A memorable, mesmerizing heroine Jennifer -- b...[scifi, biography, fiction, history, romance, ...[0.19755955040454865, 0.09938773512840271, 0.0...
4Lewis' work on the nature of love divides love...[mystery, romance, history, biography, scifi, ...[0.16078977286815643, 0.06188512220978737, 0.0...
5\"In The Problem of Pain, C.S. Lewis, one of th...[mystery, history, biography, scifi, romance, ...[0.6848734021186829, 0.11091233044862747, 0.08...
6Until Vasco da Gama discovered the sea-route t...[history, mystery, biography, scifi, fiction, ...[0.9738430976867676, 0.19697055220603943, 0.18...
7A new-cover reissue of the fourth book in the ...[scifi, fantasy, fiction, mystery, history, ro...[0.9945376515388489, 0.9806752800941467, 0.934...
8Kate Blackwell is an enigma and one of the mos...[mystery, biography, fiction, scifi, history, ...[0.9990025162696838, 0.43301281332969666, 0.04...
9One of Sidney Sheldon's most popular and bests...[romance, mystery, biography, fantasy, scifi, ...[0.6518456935882568, 0.4315004348754883, 0.367...
\n", "
" ], "text/plain": [ " sequence \\\n", "0 A NOVEL THAT READERS and critics have been eag... \n", "1 A new 'Christie for Christmas' -- a full-lengt... \n", "2 Volume Two of Stephen Donaldson's acclaimed se... \n", "3 A memorable, mesmerizing heroine Jennifer -- b... \n", "4 Lewis' work on the nature of love divides love... \n", "5 \"In The Problem of Pain, C.S. Lewis, one of th... \n", "6 Until Vasco da Gama discovered the sea-route t... \n", "7 A new-cover reissue of the fourth book in the ... \n", "8 Kate Blackwell is an enigma and one of the mos... \n", "9 One of Sidney Sheldon's most popular and bests... \n", "\n", " categories \\\n", "0 [fiction, history, biography, fantasy, mystery... \n", "1 [mystery, fiction, fantasy, scifi, biography, ... \n", "2 [fiction, fantasy, history, scifi, biography, ... \n", "3 [scifi, biography, fiction, history, romance, ... \n", "4 [mystery, romance, history, biography, scifi, ... \n", "5 [mystery, history, biography, scifi, romance, ... \n", "6 [history, mystery, biography, scifi, fiction, ... \n", "7 [scifi, fantasy, fiction, mystery, history, ro... \n", "8 [mystery, biography, fiction, scifi, history, ... \n", "9 [romance, mystery, biography, fantasy, scifi, ... \n", "\n", " scores \n", "0 [0.8558421730995178, 0.6128803491592407, 0.296... \n", "1 [0.9339157342910767, 0.5139176249504089, 0.155... \n", "2 [0.5638813972473145, 0.2660749554634094, 0.249... \n", "3 [0.19755955040454865, 0.09938773512840271, 0.0... \n", "4 [0.16078977286815643, 0.06188512220978737, 0.0... \n", "5 [0.6848734021186829, 0.11091233044862747, 0.08... \n", "6 [0.9738430976867676, 0.19697055220603943, 0.18... \n", "7 [0.9945376515388489, 0.9806752800941467, 0.934... \n", "8 [0.9990025162696838, 0.43301281332969666, 0.04... \n", "9 [0.6518456935882568, 0.4315004348754883, 0.367... 
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Candidate labels offered to the zero-shot classifier.\n", "fiction_categories = [\n", "    \"fiction\",\n", "    \"mystery\",\n", "    \"romance\",\n", "    \"scifi\",\n", "    \"fantasy\",\n", "    \"biography\",\n", "    \"history\",\n", "]\n", "\n", "\n", "def classify_description(row):\n", "    \"\"\"Score one row's description against every candidate label.\"\"\"\n", "    return classifier(\n", "        row[\"description\"],\n", "        candidate_labels=fiction_categories,\n", "        multi_label=True,\n", "    )\n", "\n", "\n", "# Trial run on the first 10 rows only; result_type=\"expand\" spreads each\n", "# classifier result across columns.\n", "sample_scores = df.head(10).apply(classify_description, axis=1, result_type=\"expand\")\n", "# Only \"labels\" needs a new name; mapping \"scores\" to itself would be a no-op.\n", "sample_scores.rename(columns={\"labels\": \"categories\"})" ] }, { "cell_type": "code", "execution_count": 8, "id": "365964c7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13authorscategoriesthumbnaildescriptionpublished_yearaverage_ratingnum_pagesratings_counttitle_and_subtitletagged_description
09780002005883Marilynne RobinsonFictionhttp://books.google.com/books/content?id=KQZCP...A NOVEL THAT READERS and critics have been eag...2004.03.85247.0361.0Gilead9780002005883 A NOVEL THAT READERS and critics...
19780002261982Charles Osborne;Agatha ChristieDetective and mystery storieshttp://books.google.com/books/content?id=gA5GP...A new 'Christie for Christmas' -- a full-lengt...2000.03.83241.05164.0Spider's Web A Novel9780002261982 A new 'Christie for Christmas' -...
29780006163831Stephen R. DonaldsonAmerican fictionhttp://books.google.com/books/content?id=OmQaw...Volume Two of Stephen Donaldson's acclaimed se...1982.03.97479.0172.0The One Tree9780006163831 Volume Two of Stephen Donaldson'...
39780006178736Sidney SheldonFictionhttp://books.google.com/books/content?id=FKo2T...A memorable, mesmerizing heroine Jennifer -- b...1993.03.93512.029532.0Rage of angels9780006178736 A memorable, mesmerizing heroine...
49780006280897Clive Staples LewisChristian lifehttp://books.google.com/books/content?id=XhQ5X...Lewis' work on the nature of love divides love...2002.04.15170.033684.0The Four Loves9780006280897 Lewis' work on the nature of lov...
\n", "
" ], "text/plain": [ " isbn13 authors \\\n", "0 9780002005883 Marilynne Robinson \n", "1 9780002261982 Charles Osborne;Agatha Christie \n", "2 9780006163831 Stephen R. Donaldson \n", "3 9780006178736 Sidney Sheldon \n", "4 9780006280897 Clive Staples Lewis \n", "\n", " categories \\\n", "0 Fiction \n", "1 Detective and mystery stories \n", "2 American fiction \n", "3 Fiction \n", "4 Christian life \n", "\n", " thumbnail \\\n", "0 http://books.google.com/books/content?id=KQZCP... \n", "1 http://books.google.com/books/content?id=gA5GP... \n", "2 http://books.google.com/books/content?id=OmQaw... \n", "3 http://books.google.com/books/content?id=FKo2T... \n", "4 http://books.google.com/books/content?id=XhQ5X... \n", "\n", " description published_year \\\n", "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n", "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n", "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n", "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n", "4 Lewis' work on the nature of love divides love... 2002.0 \n", "\n", " average_rating num_pages ratings_count title_and_subtitle \\\n", "0 3.85 247.0 361.0 Gilead \n", "1 3.83 241.0 5164.0 Spider's Web A Novel \n", "2 3.97 479.0 172.0 The One Tree \n", "3 3.93 512.0 29532.0 Rage of angels \n", "4 4.15 170.0 33684.0 The Four Loves \n", "\n", " tagged_description \n", "0 9780002005883 A NOVEL THAT READERS and critics... \n", "1 9780002261982 A new 'Christie for Christmas' -... \n", "2 9780006163831 Volume Two of Stephen Donaldson'... \n", "3 9780006178736 A memorable, mesmerizing heroine... \n", "4 9780006280897 Lewis' work on the nature of lov... 
" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "96c4d3c9", "metadata": {}, "outputs": [], "source": [ "from googlesearch import search\n", "\n", "\n", "def fetch_first_google_link(query):\n", "    \"\"\"Run a Google search for ``query`` and return the links it yields.\n", "\n", "    Args:\n", "        query: Search string handed to ``googlesearch.search``.\n", "\n", "    Returns:\n", "        A list holding the result URL(s) (a single result is requested),\n", "        or ``None`` when the search produced nothing.\n", "    \"\"\"\n", "    # Materialise before testing for emptiness: search() may hand back a lazy\n", "    # iterator, and an iterator object is truthy even when it yields nothing,\n", "    # so checking it directly can never reach the ``None`` branch.\n", "    links = list(search(query, num_results=1, lang=\"en\"))\n", "    return links or None\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "402a59f3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['https://books.google.com/books/about/The_One_Tree.html?id=dXzwAAAAQBAJ&source=kp_cover']\n" ] } ], "source": [ "print(fetch_first_google_link(\"The One Tree by Stephen R. Donaldson -google books\"))" ] }, { "cell_type": "code", "execution_count": null, "id": "be3349dc", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7da59931", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }