{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\NonsoDev\\Documents\\Allcodes\\Projects_DL_for resume\\Recommender systems\\book reccomender - llm\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Path to dataset files: C:\\Users\\NonsoDev\\.cache\\kagglehub\\datasets\\dylanjcastillo\\7k-books-with-metadata\\versions\\3\n" ] } ], "source": [ "import os\n", "\n", "import kagglehub\n", "import pandas as pd\n", "\n", "# Download the latest version of the dataset; `path` is the local directory holding its files\n", "path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n", "\n", "print(\"Path to dataset files:\", path)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of books: 6810\n" ] } ], "source": [ "# Join the path components with os.path.join instead of concatenating strings by hand\n", "data = pd.read_csv(os.path.join(path, \"books.csv\"))\n", "print(\"Number of books:\", len(data))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13isbn10titlesubtitleauthorscategoriesthumbnaildescriptionpublished_yearaverage_ratingnum_pagesratings_count
097800020058830002005883GileadNaNMarilynne RobinsonFictionhttp://books.google.com/books/content?id=KQZCP...A NOVEL THAT READERS and critics have been eag...2004.03.85247.0361.0
197800022619820002261987Spider's WebA NovelCharles Osborne;Agatha ChristieDetective and mystery storieshttp://books.google.com/books/content?id=gA5GP...A new 'Christie for Christmas' -- a full-lengt...2000.03.83241.05164.0
297800061638310006163831The One TreeNaNStephen R. DonaldsonAmerican fictionhttp://books.google.com/books/content?id=OmQaw...Volume Two of Stephen Donaldson's acclaimed se...1982.03.97479.0172.0
397800061787360006178731Rage of angelsNaNSidney SheldonFictionhttp://books.google.com/books/content?id=FKo2T...A memorable, mesmerizing heroine Jennifer -- b...1993.03.93512.029532.0
497800062808970006280897The Four LovesNaNClive Staples LewisChristian lifehttp://books.google.com/books/content?id=XhQ5X...Lewis' work on the nature of love divides love...2002.04.15170.033684.0
\n", "
" ], "text/plain": [ " isbn13 isbn10 title subtitle \\\n", "0 9780002005883 0002005883 Gilead NaN \n", "1 9780002261982 0002261987 Spider's Web A Novel \n", "2 9780006163831 0006163831 The One Tree NaN \n", "3 9780006178736 0006178731 Rage of angels NaN \n", "4 9780006280897 0006280897 The Four Loves NaN \n", "\n", " authors categories \\\n", "0 Marilynne Robinson Fiction \n", "1 Charles Osborne;Agatha Christie Detective and mystery stories \n", "2 Stephen R. Donaldson American fiction \n", "3 Sidney Sheldon Fiction \n", "4 Clive Staples Lewis Christian life \n", "\n", " thumbnail \\\n", "0 http://books.google.com/books/content?id=KQZCP... \n", "1 http://books.google.com/books/content?id=gA5GP... \n", "2 http://books.google.com/books/content?id=OmQaw... \n", "3 http://books.google.com/books/content?id=FKo2T... \n", "4 http://books.google.com/books/content?id=XhQ5X... \n", "\n", " description published_year \\\n", "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n", "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n", "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n", "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n", "4 Lewis' work on the nature of love divides love... 
2002.0 \n", "\n", " average_rating num_pages ratings_count \n", "0 3.85 247.0 361.0 \n", "1 3.83 241.0 5164.0 \n", "2 3.97 479.0 172.0 \n", "3 3.93 512.0 29532.0 \n", "4 4.15 170.0 33684.0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6810 entries, 0 to 6809\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 isbn13 6810 non-null int64 \n", " 1 isbn10 6810 non-null object \n", " 2 title 6810 non-null object \n", " 3 subtitle 2381 non-null object \n", " 4 authors 6738 non-null object \n", " 5 categories 6711 non-null object \n", " 6 thumbnail 6481 non-null object \n", " 7 description 6548 non-null object \n", " 8 published_year 6804 non-null float64\n", " 9 average_rating 6767 non-null float64\n", " 10 num_pages 6767 non-null float64\n", " 11 ratings_count 6767 non-null float64\n", "dtypes: float64(4), int64(1), object(7)\n", "memory usage: 638.6+ KB\n" ] } ], "source": [ "data.info()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "isbn13 0.000000\n", "isbn10 0.000000\n", "title 0.000000\n", "subtitle 65.036711\n", "authors 1.057269\n", "categories 1.453744\n", "thumbnail 4.831131\n", "description 3.847283\n", "published_year 0.088106\n", "average_rating 0.631424\n", "num_pages 0.631424\n", "ratings_count 0.631424\n", "dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.isnull().sum() / len(data) * 100" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
"# A missing value in any of these columns disqualifies the row\n", "required_columns = [\n", "    \"description\",\n", "    \"authors\",\n", "    \"published_year\",\n", "    \"average_rating\",\n", "    \"num_pages\",\n", "    \"ratings_count\",\n", "]\n", "\n", "# Rows with at least one missing value among the required columns\n", "rows_to_remove = data[data[required_columns].isnull().any(axis=1)]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5.3744493392070485" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(rows_to_remove) / len(data) * 100  # ~5.4% of the rows are incomplete" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# dropna(subset=...) removes exactly the rows selected in rows_to_remove. Unlike\n", "# drop(index=rows_to_remove.index) it does not raise a KeyError when the cell is re-run.\n", "data = data.dropna(subset=required_columns)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "subtitle 64.959652\n", "thumbnail 3.227809\n", "categories 0.512104\n", "isbn13 0.000000\n", "title 0.000000\n", "isbn10 0.000000\n", "authors 0.000000\n", "description 0.000000\n", "published_year 0.000000\n", "average_rating 0.000000\n", "num_pages 0.000000\n", "ratings_count 0.000000\n", "dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(data.isnull().sum() / len(data) * 100).sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "categories\n", "Fiction 2510\n", "Juvenile Fiction 521\n", "Biography & Autobiography 390\n", "History 256\n", "Literary Criticism 163\n", " ... 
\n", "Humorous stories 1\n", "Ballets 1\n", "Aged women 1\n", "Catholic women 1\n", "Christian fiction 1\n", "Name: count, Length: 530, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 530 distinct categories is too many -- something is wrong with this column\n", "data[\"categories\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5024 Violence erupts in the poor town of Milagro wh...\n", "3235 FBI Special Agent Dillon Savich teams up with ...\n", "5235 Seventeen-year-old Manhattan society girl Grad...\n", "4516 This is the story of the Tuck family, who are ...\n", "3204 Prejudice, the intricacies of Mediterranean po...\n", "Name: description, dtype: object" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Some descriptions are too short to be useful and some are too long.\n", "# Descriptions longer than 25 characters seem better suited for understanding the context.\n", "data[\"description\"].sample(5)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "MIN_DESCRIPTION_CHARS = 25  # descriptions with this many characters or fewer are discarded\n", "\n", "# Vectorised character count; a missing description counts as length 0\n", "data[\"description_chars\"] = data[\"description\"].str.len().fillna(0).astype(\"int64\")\n", "data = data[data[\"description_chars\"] > MIN_DESCRIPTION_CHARS]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6397" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(data)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Many subtitles are missing, so one combined column replaces both title and subtitle.\n", "def text_or_empty(value):\n", "    \"\"\"Return value unchanged when it is a string, otherwise an empty string.\"\"\"\n", "    return value if isinstance(value, str) else \"\"\n", "\n", "\n", "# strip() removes the trailing space the separator leaves behind when a book has no subtitle\n", "data[\"title_and_subtitle\"] = (\n", "    data[\"title\"].apply(text_or_empty) + \" \" + data[\"subtitle\"].apply(text_or_empty)\n", ").str.strip()" ] }, { "cell_type": "code", "execution_count": 
16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13isbn10titlesubtitleauthorscategoriesthumbnaildescriptionpublished_yearaverage_ratingnum_pagesratings_countdescription_charstitle_and_subtitle
097800020058830002005883GileadNaNMarilynne RobinsonFictionhttp://books.google.com/books/content?id=KQZCP...A NOVEL THAT READERS and critics have been eag...2004.03.85247.0361.01154Gilead
197800022619820002261987Spider's WebA NovelCharles Osborne;Agatha ChristieDetective and mystery storieshttp://books.google.com/books/content?id=gA5GP...A new 'Christie for Christmas' -- a full-lengt...2000.03.83241.05164.01200Spider's Web A Novel
297800061638310006163831The One TreeNaNStephen R. DonaldsonAmerican fictionhttp://books.google.com/books/content?id=OmQaw...Volume Two of Stephen Donaldson's acclaimed se...1982.03.97479.0172.0109The One Tree
397800061787360006178731Rage of angelsNaNSidney SheldonFictionhttp://books.google.com/books/content?id=FKo2T...A memorable, mesmerizing heroine Jennifer -- b...1993.03.93512.029532.0359Rage of angels
497800062808970006280897The Four LovesNaNClive Staples LewisChristian lifehttp://books.google.com/books/content?id=XhQ5X...Lewis' work on the nature of love divides love...2002.04.15170.033684.0295The Four Loves
\n", "
" ], "text/plain": [ " isbn13 isbn10 title subtitle \\\n", "0 9780002005883 0002005883 Gilead NaN \n", "1 9780002261982 0002261987 Spider's Web A Novel \n", "2 9780006163831 0006163831 The One Tree NaN \n", "3 9780006178736 0006178731 Rage of angels NaN \n", "4 9780006280897 0006280897 The Four Loves NaN \n", "\n", " authors categories \\\n", "0 Marilynne Robinson Fiction \n", "1 Charles Osborne;Agatha Christie Detective and mystery stories \n", "2 Stephen R. Donaldson American fiction \n", "3 Sidney Sheldon Fiction \n", "4 Clive Staples Lewis Christian life \n", "\n", " thumbnail \\\n", "0 http://books.google.com/books/content?id=KQZCP... \n", "1 http://books.google.com/books/content?id=gA5GP... \n", "2 http://books.google.com/books/content?id=OmQaw... \n", "3 http://books.google.com/books/content?id=FKo2T... \n", "4 http://books.google.com/books/content?id=XhQ5X... \n", "\n", " description published_year \\\n", "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n", "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n", "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n", "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n", "4 Lewis' work on the nature of love divides love... 
2002.0 \n", "\n", " average_rating num_pages ratings_count description_chars \\\n", "0 3.85 247.0 361.0 1154 \n", "1 3.83 241.0 5164.0 1200 \n", "2 3.97 479.0 172.0 109 \n", "3 3.93 512.0 29532.0 359 \n", "4 4.15 170.0 33684.0 295 \n", "\n", " title_and_subtitle \n", "0 Gilead \n", "1 Spider's Web A Novel \n", "2 The One Tree \n", "3 Rage of angels \n", "4 The Four Loves " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Prefix every description with the book's isbn13 value, separated by a single space\n", "data[\"tagged_description\"] = (\n", "    data[\"isbn13\"].map(str).str.cat(data[\"description\"].map(str), sep=\" \")\n", ")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13isbn10titlesubtitleauthorscategoriesthumbnaildescriptionpublished_yearaverage_ratingnum_pagesratings_countdescription_charstitle_and_subtitletagged_description
097800020058830002005883GileadNaNMarilynne RobinsonFictionhttp://books.google.com/books/content?id=KQZCP...A NOVEL THAT READERS and critics have been eag...2004.03.85247.0361.01154Gilead9780002005883 A NOVEL THAT READERS and critics...
197800022619820002261987Spider's WebA NovelCharles Osborne;Agatha ChristieDetective and mystery storieshttp://books.google.com/books/content?id=gA5GP...A new 'Christie for Christmas' -- a full-lengt...2000.03.83241.05164.01200Spider's Web A Novel9780002261982 A new 'Christie for Christmas' -...
297800061638310006163831The One TreeNaNStephen R. DonaldsonAmerican fictionhttp://books.google.com/books/content?id=OmQaw...Volume Two of Stephen Donaldson's acclaimed se...1982.03.97479.0172.0109The One Tree9780006163831 Volume Two of Stephen Donaldson'...
397800061787360006178731Rage of angelsNaNSidney SheldonFictionhttp://books.google.com/books/content?id=FKo2T...A memorable, mesmerizing heroine Jennifer -- b...1993.03.93512.029532.0359Rage of angels9780006178736 A memorable, mesmerizing heroine...
497800062808970006280897The Four LovesNaNClive Staples LewisChristian lifehttp://books.google.com/books/content?id=XhQ5X...Lewis' work on the nature of love divides love...2002.04.15170.033684.0295The Four Loves9780006280897 Lewis' work on the nature of lov...
\n", "
" ], "text/plain": [ " isbn13 isbn10 title subtitle \\\n", "0 9780002005883 0002005883 Gilead NaN \n", "1 9780002261982 0002261987 Spider's Web A Novel \n", "2 9780006163831 0006163831 The One Tree NaN \n", "3 9780006178736 0006178731 Rage of angels NaN \n", "4 9780006280897 0006280897 The Four Loves NaN \n", "\n", " authors categories \\\n", "0 Marilynne Robinson Fiction \n", "1 Charles Osborne;Agatha Christie Detective and mystery stories \n", "2 Stephen R. Donaldson American fiction \n", "3 Sidney Sheldon Fiction \n", "4 Clive Staples Lewis Christian life \n", "\n", " thumbnail \\\n", "0 http://books.google.com/books/content?id=KQZCP... \n", "1 http://books.google.com/books/content?id=gA5GP... \n", "2 http://books.google.com/books/content?id=OmQaw... \n", "3 http://books.google.com/books/content?id=FKo2T... \n", "4 http://books.google.com/books/content?id=XhQ5X... \n", "\n", " description published_year \\\n", "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n", "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n", "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n", "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n", "4 Lewis' work on the nature of love divides love... 2002.0 \n", "\n", " average_rating num_pages ratings_count description_chars \\\n", "0 3.85 247.0 361.0 1154 \n", "1 3.83 241.0 5164.0 1200 \n", "2 3.97 479.0 172.0 109 \n", "3 3.93 512.0 29532.0 359 \n", "4 4.15 170.0 33684.0 295 \n", "\n", " title_and_subtitle tagged_description \n", "0 Gilead 9780002005883 A NOVEL THAT READERS and critics... \n", "1 Spider's Web A Novel 9780002261982 A new 'Christie for Christmas' -... \n", "2 The One Tree 9780006163831 Volume Two of Stephen Donaldson'... \n", "3 Rage of angels 9780006178736 A memorable, mesmerizing heroine... \n", "4 The Four Loves 9780006280897 Lewis' work on the nature of lov... 
" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# title and subtitle are already combined in title_and_subtitle, and description_chars was only\n", "# needed for the length filter; isbn10 is left out of the export as well.\n", "# errors=\"ignore\" keeps the cell re-runnable: without it a second run raises a KeyError\n", "# because the columns are already gone.\n", "data = data.drop(columns=[\"title\", \"subtitle\", \"description_chars\", \"isbn10\"], errors=\"ignore\")\n", "data.to_csv(\"books_cleaned.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 6397 entries, 0 to 6809\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 isbn13 6397 non-null int64 \n", " 1 authors 6397 non-null object \n", " 2 categories 6364 non-null object \n", " 3 thumbnail 6190 non-null object \n", " 4 description 6397 non-null object \n", " 5 published_year 6397 non-null float64\n", " 6 average_rating 6397 non-null float64\n", " 7 num_pages 6397 non-null float64\n", " 8 ratings_count 6397 non-null float64\n", " 9 title_and_subtitle 6397 non-null object \n", " 10 tagged_description 6397 non-null object \n", "dtypes: float64(4), int64(1), object(6)\n", "memory usage: 599.7+ KB\n" ] } ], "source": [ "data.info()" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 0 }