{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\NonsoDev\\Documents\\Allcodes\\Projects_DL_for resume\\Recommender systems\\book reccomender - llm\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Path to dataset files: C:\\Users\\NonsoDev\\.cache\\kagglehub\\datasets\\dylanjcastillo\\7k-books-with-metadata\\versions\\3\n"
]
}
],
"source": [
"import kagglehub\n",
"\n",
"# Kaggle handle (owner/dataset) of the books dataset used throughout this notebook\n",
"DATASET_HANDLE = \"dylanjcastillo/7k-books-with-metadata\"\n",
"\n",
"# Download the latest version; `path` is the local directory holding the dataset files\n",
"path = kagglehub.dataset_download(DATASET_HANDLE)\n",
"\n",
"print(\"Path to dataset files:\", path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of books: 6810\n"
]
}
],
"source": [
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"# Build the file path with pathlib rather than gluing on a hard-coded \"/\",\n",
"# so the separator is always the right one for the current operating system.\n",
"data = pd.read_csv(Path(path) / \"books.csv\")\n",
"print(\"Number of books:\", len(data))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" isbn13 | \n",
" isbn10 | \n",
" title | \n",
" subtitle | \n",
" authors | \n",
" categories | \n",
" thumbnail | \n",
" description | \n",
" published_year | \n",
" average_rating | \n",
" num_pages | \n",
" ratings_count | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 9780002005883 | \n",
" 0002005883 | \n",
" Gilead | \n",
" NaN | \n",
" Marilynne Robinson | \n",
" Fiction | \n",
" http://books.google.com/books/content?id=KQZCP... | \n",
" A NOVEL THAT READERS and critics have been eag... | \n",
" 2004.0 | \n",
" 3.85 | \n",
" 247.0 | \n",
" 361.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 9780002261982 | \n",
" 0002261987 | \n",
" Spider's Web | \n",
" A Novel | \n",
" Charles Osborne;Agatha Christie | \n",
" Detective and mystery stories | \n",
" http://books.google.com/books/content?id=gA5GP... | \n",
" A new 'Christie for Christmas' -- a full-lengt... | \n",
" 2000.0 | \n",
" 3.83 | \n",
" 241.0 | \n",
" 5164.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 9780006163831 | \n",
" 0006163831 | \n",
" The One Tree | \n",
" NaN | \n",
" Stephen R. Donaldson | \n",
" American fiction | \n",
" http://books.google.com/books/content?id=OmQaw... | \n",
" Volume Two of Stephen Donaldson's acclaimed se... | \n",
" 1982.0 | \n",
" 3.97 | \n",
" 479.0 | \n",
" 172.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 9780006178736 | \n",
" 0006178731 | \n",
" Rage of angels | \n",
" NaN | \n",
" Sidney Sheldon | \n",
" Fiction | \n",
" http://books.google.com/books/content?id=FKo2T... | \n",
" A memorable, mesmerizing heroine Jennifer -- b... | \n",
" 1993.0 | \n",
" 3.93 | \n",
" 512.0 | \n",
" 29532.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 9780006280897 | \n",
" 0006280897 | \n",
" The Four Loves | \n",
" NaN | \n",
" Clive Staples Lewis | \n",
" Christian life | \n",
" http://books.google.com/books/content?id=XhQ5X... | \n",
" Lewis' work on the nature of love divides love... | \n",
" 2002.0 | \n",
" 4.15 | \n",
" 170.0 | \n",
" 33684.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" isbn13 isbn10 title subtitle \\\n",
"0 9780002005883 0002005883 Gilead NaN \n",
"1 9780002261982 0002261987 Spider's Web A Novel \n",
"2 9780006163831 0006163831 The One Tree NaN \n",
"3 9780006178736 0006178731 Rage of angels NaN \n",
"4 9780006280897 0006280897 The Four Loves NaN \n",
"\n",
" authors categories \\\n",
"0 Marilynne Robinson Fiction \n",
"1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
"2 Stephen R. Donaldson American fiction \n",
"3 Sidney Sheldon Fiction \n",
"4 Clive Staples Lewis Christian life \n",
"\n",
" thumbnail \\\n",
"0 http://books.google.com/books/content?id=KQZCP... \n",
"1 http://books.google.com/books/content?id=gA5GP... \n",
"2 http://books.google.com/books/content?id=OmQaw... \n",
"3 http://books.google.com/books/content?id=FKo2T... \n",
"4 http://books.google.com/books/content?id=XhQ5X... \n",
"\n",
" description published_year \\\n",
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
"\n",
" average_rating num_pages ratings_count \n",
"0 3.85 247.0 361.0 \n",
"1 3.83 241.0 5164.0 \n",
"2 3.97 479.0 172.0 \n",
"3 3.93 512.0 29532.0 \n",
"4 4.15 170.0 33684.0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 6810 entries, 0 to 6809\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 isbn13 6810 non-null int64 \n",
" 1 isbn10 6810 non-null object \n",
" 2 title 6810 non-null object \n",
" 3 subtitle 2381 non-null object \n",
" 4 authors 6738 non-null object \n",
" 5 categories 6711 non-null object \n",
" 6 thumbnail 6481 non-null object \n",
" 7 description 6548 non-null object \n",
" 8 published_year 6804 non-null float64\n",
" 9 average_rating 6767 non-null float64\n",
" 10 num_pages 6767 non-null float64\n",
" 11 ratings_count 6767 non-null float64\n",
"dtypes: float64(4), int64(1), object(7)\n",
"memory usage: 638.6+ KB\n"
]
}
],
"source": [
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"isbn13 0.000000\n",
"isbn10 0.000000\n",
"title 0.000000\n",
"subtitle 65.036711\n",
"authors 1.057269\n",
"categories 1.453744\n",
"thumbnail 4.831131\n",
"description 3.847283\n",
"published_year 0.088106\n",
"average_rating 0.631424\n",
"num_pages 0.631424\n",
"ratings_count 0.631424\n",
"dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Percentage of missing values in each column\n",
"data.isna().sum().div(len(data)).mul(100)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Collect every row that is missing a value in at least one of these columns\n",
"required_columns = [\"description\", \"authors\", \"published_year\", \"average_rating\", \"num_pages\", \"ratings_count\"]\n",
"has_missing_required = data[required_columns].isna().any(axis=1)\n",
"rows_to_remove = data[has_missing_required]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.3744493392070485"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(rows_to_remove) / len(data) * 100  # roughly 5.4% of the rows"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Remove the rows collected in rows_to_remove, matching them by index label\n",
"data = data.drop(rows_to_remove.index, axis=\"index\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"subtitle 64.959652\n",
"thumbnail 3.227809\n",
"categories 0.512104\n",
"isbn13 0.000000\n",
"title 0.000000\n",
"isbn10 0.000000\n",
"authors 0.000000\n",
"description 0.000000\n",
"published_year 0.000000\n",
"average_rating 0.000000\n",
"num_pages 0.000000\n",
"ratings_count 0.000000\n",
"dtype: float64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Percentage of missing values per column, largest first\n",
"data.isna().sum().div(len(data)).mul(100).sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"categories\n",
"Fiction 2510\n",
"Juvenile Fiction 521\n",
"Biography & Autobiography 390\n",
"History 256\n",
"Literary Criticism 163\n",
" ... \n",
"Humorous stories 1\n",
"Ballets 1\n",
"Aged women 1\n",
"Catholic women 1\n",
"Christian fiction 1\n",
"Name: count, Length: 530, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[\"categories\"].value_counts()  # 530 distinct categories is far too many; something is wrong with this column"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5024 Violence erupts in the poor town of Milagro wh...\n",
"3235 FBI Special Agent Dillon Savich teams up with ...\n",
"5235 Seventeen-year-old Manhattan society girl Grad...\n",
"4516 This is the story of the Tuck family, who are ...\n",
"3204 Prejudice, the intricacies of Mediterranean po...\n",
"Name: description, dtype: object"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[\"description\"].sample(5)  # some descriptions are too short to be useful and some are too long\n",
"# I think descriptions longer than 25 characters are better suited for understanding the context"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Descriptions with this many characters or fewer are discarded as too short\n",
"MIN_DESCRIPTION_CHARS = 25\n",
"\n",
"# Vectorised character count; a missing description counts as 0 characters\n",
"data[\"description_chars\"] = data[\"description\"].str.len().fillna(0).astype(\"int64\")\n",
"# .copy() yields an independent frame, so columns added in later cells are never\n",
"# written into a filtered slice of the previous frame\n",
"data = data[data[\"description_chars\"] > MIN_DESCRIPTION_CHARS].copy()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6397"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Many subtitles are missing, so build a single combined column that stands in for\n",
"# both the title and the subtitle.\n",
"# Missing parts become \"\" and the result is stripped, so a book without a subtitle\n",
"# keeps just its title instead of the title followed by a dangling space.\n",
"data[\"title_and_subtitle\"] = (\n",
"    data[\"title\"].fillna(\"\") + \" \" + data[\"subtitle\"].fillna(\"\")\n",
").str.strip()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" isbn13 | \n",
" isbn10 | \n",
" title | \n",
" subtitle | \n",
" authors | \n",
" categories | \n",
" thumbnail | \n",
" description | \n",
" published_year | \n",
" average_rating | \n",
" num_pages | \n",
" ratings_count | \n",
" description_chars | \n",
" title_and_subtitle | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 9780002005883 | \n",
" 0002005883 | \n",
" Gilead | \n",
" NaN | \n",
" Marilynne Robinson | \n",
" Fiction | \n",
" http://books.google.com/books/content?id=KQZCP... | \n",
" A NOVEL THAT READERS and critics have been eag... | \n",
" 2004.0 | \n",
" 3.85 | \n",
" 247.0 | \n",
" 361.0 | \n",
" 1154 | \n",
" Gilead | \n",
"
\n",
" \n",
" | 1 | \n",
" 9780002261982 | \n",
" 0002261987 | \n",
" Spider's Web | \n",
" A Novel | \n",
" Charles Osborne;Agatha Christie | \n",
" Detective and mystery stories | \n",
" http://books.google.com/books/content?id=gA5GP... | \n",
" A new 'Christie for Christmas' -- a full-lengt... | \n",
" 2000.0 | \n",
" 3.83 | \n",
" 241.0 | \n",
" 5164.0 | \n",
" 1200 | \n",
" Spider's Web A Novel | \n",
"
\n",
" \n",
" | 2 | \n",
" 9780006163831 | \n",
" 0006163831 | \n",
" The One Tree | \n",
" NaN | \n",
" Stephen R. Donaldson | \n",
" American fiction | \n",
" http://books.google.com/books/content?id=OmQaw... | \n",
" Volume Two of Stephen Donaldson's acclaimed se... | \n",
" 1982.0 | \n",
" 3.97 | \n",
" 479.0 | \n",
" 172.0 | \n",
" 109 | \n",
" The One Tree | \n",
"
\n",
" \n",
" | 3 | \n",
" 9780006178736 | \n",
" 0006178731 | \n",
" Rage of angels | \n",
" NaN | \n",
" Sidney Sheldon | \n",
" Fiction | \n",
" http://books.google.com/books/content?id=FKo2T... | \n",
" A memorable, mesmerizing heroine Jennifer -- b... | \n",
" 1993.0 | \n",
" 3.93 | \n",
" 512.0 | \n",
" 29532.0 | \n",
" 359 | \n",
" Rage of angels | \n",
"
\n",
" \n",
" | 4 | \n",
" 9780006280897 | \n",
" 0006280897 | \n",
" The Four Loves | \n",
" NaN | \n",
" Clive Staples Lewis | \n",
" Christian life | \n",
" http://books.google.com/books/content?id=XhQ5X... | \n",
" Lewis' work on the nature of love divides love... | \n",
" 2002.0 | \n",
" 4.15 | \n",
" 170.0 | \n",
" 33684.0 | \n",
" 295 | \n",
" The Four Loves | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" isbn13 isbn10 title subtitle \\\n",
"0 9780002005883 0002005883 Gilead NaN \n",
"1 9780002261982 0002261987 Spider's Web A Novel \n",
"2 9780006163831 0006163831 The One Tree NaN \n",
"3 9780006178736 0006178731 Rage of angels NaN \n",
"4 9780006280897 0006280897 The Four Loves NaN \n",
"\n",
" authors categories \\\n",
"0 Marilynne Robinson Fiction \n",
"1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
"2 Stephen R. Donaldson American fiction \n",
"3 Sidney Sheldon Fiction \n",
"4 Clive Staples Lewis Christian life \n",
"\n",
" thumbnail \\\n",
"0 http://books.google.com/books/content?id=KQZCP... \n",
"1 http://books.google.com/books/content?id=gA5GP... \n",
"2 http://books.google.com/books/content?id=OmQaw... \n",
"3 http://books.google.com/books/content?id=FKo2T... \n",
"4 http://books.google.com/books/content?id=XhQ5X... \n",
"\n",
" description published_year \\\n",
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
"\n",
" average_rating num_pages ratings_count description_chars \\\n",
"0 3.85 247.0 361.0 1154 \n",
"1 3.83 241.0 5164.0 1200 \n",
"2 3.97 479.0 172.0 109 \n",
"3 3.93 512.0 29532.0 359 \n",
"4 4.15 170.0 33684.0 295 \n",
"\n",
" title_and_subtitle \n",
"0 Gilead \n",
"1 Spider's Web A Novel \n",
"2 The One Tree \n",
"3 Rage of angels \n",
"4 The Four Loves "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Prefix every description with the book's ISBN-13, separated by a single space\n",
"isbn_text = data[\"isbn13\"].map(str)\n",
"description_text = data[\"description\"].map(str)\n",
"data[\"tagged_description\"] = isbn_text + \" \" + description_text"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" isbn13 | \n",
" isbn10 | \n",
" title | \n",
" subtitle | \n",
" authors | \n",
" categories | \n",
" thumbnail | \n",
" description | \n",
" published_year | \n",
" average_rating | \n",
" num_pages | \n",
" ratings_count | \n",
" description_chars | \n",
" title_and_subtitle | \n",
" tagged_description | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 9780002005883 | \n",
" 0002005883 | \n",
" Gilead | \n",
" NaN | \n",
" Marilynne Robinson | \n",
" Fiction | \n",
" http://books.google.com/books/content?id=KQZCP... | \n",
" A NOVEL THAT READERS and critics have been eag... | \n",
" 2004.0 | \n",
" 3.85 | \n",
" 247.0 | \n",
" 361.0 | \n",
" 1154 | \n",
" Gilead | \n",
" 9780002005883 A NOVEL THAT READERS and critics... | \n",
"
\n",
" \n",
" | 1 | \n",
" 9780002261982 | \n",
" 0002261987 | \n",
" Spider's Web | \n",
" A Novel | \n",
" Charles Osborne;Agatha Christie | \n",
" Detective and mystery stories | \n",
" http://books.google.com/books/content?id=gA5GP... | \n",
" A new 'Christie for Christmas' -- a full-lengt... | \n",
" 2000.0 | \n",
" 3.83 | \n",
" 241.0 | \n",
" 5164.0 | \n",
" 1200 | \n",
" Spider's Web A Novel | \n",
" 9780002261982 A new 'Christie for Christmas' -... | \n",
"
\n",
" \n",
" | 2 | \n",
" 9780006163831 | \n",
" 0006163831 | \n",
" The One Tree | \n",
" NaN | \n",
" Stephen R. Donaldson | \n",
" American fiction | \n",
" http://books.google.com/books/content?id=OmQaw... | \n",
" Volume Two of Stephen Donaldson's acclaimed se... | \n",
" 1982.0 | \n",
" 3.97 | \n",
" 479.0 | \n",
" 172.0 | \n",
" 109 | \n",
" The One Tree | \n",
" 9780006163831 Volume Two of Stephen Donaldson'... | \n",
"
\n",
" \n",
" | 3 | \n",
" 9780006178736 | \n",
" 0006178731 | \n",
" Rage of angels | \n",
" NaN | \n",
" Sidney Sheldon | \n",
" Fiction | \n",
" http://books.google.com/books/content?id=FKo2T... | \n",
" A memorable, mesmerizing heroine Jennifer -- b... | \n",
" 1993.0 | \n",
" 3.93 | \n",
" 512.0 | \n",
" 29532.0 | \n",
" 359 | \n",
" Rage of angels | \n",
" 9780006178736 A memorable, mesmerizing heroine... | \n",
"
\n",
" \n",
" | 4 | \n",
" 9780006280897 | \n",
" 0006280897 | \n",
" The Four Loves | \n",
" NaN | \n",
" Clive Staples Lewis | \n",
" Christian life | \n",
" http://books.google.com/books/content?id=XhQ5X... | \n",
" Lewis' work on the nature of love divides love... | \n",
" 2002.0 | \n",
" 4.15 | \n",
" 170.0 | \n",
" 33684.0 | \n",
" 295 | \n",
" The Four Loves | \n",
" 9780006280897 Lewis' work on the nature of lov... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" isbn13 isbn10 title subtitle \\\n",
"0 9780002005883 0002005883 Gilead NaN \n",
"1 9780002261982 0002261987 Spider's Web A Novel \n",
"2 9780006163831 0006163831 The One Tree NaN \n",
"3 9780006178736 0006178731 Rage of angels NaN \n",
"4 9780006280897 0006280897 The Four Loves NaN \n",
"\n",
" authors categories \\\n",
"0 Marilynne Robinson Fiction \n",
"1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
"2 Stephen R. Donaldson American fiction \n",
"3 Sidney Sheldon Fiction \n",
"4 Clive Staples Lewis Christian life \n",
"\n",
" thumbnail \\\n",
"0 http://books.google.com/books/content?id=KQZCP... \n",
"1 http://books.google.com/books/content?id=gA5GP... \n",
"2 http://books.google.com/books/content?id=OmQaw... \n",
"3 http://books.google.com/books/content?id=FKo2T... \n",
"4 http://books.google.com/books/content?id=XhQ5X... \n",
"\n",
" description published_year \\\n",
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
"\n",
" average_rating num_pages ratings_count description_chars \\\n",
"0 3.85 247.0 361.0 1154 \n",
"1 3.83 241.0 5164.0 1200 \n",
"2 3.97 479.0 172.0 109 \n",
"3 3.93 512.0 29532.0 359 \n",
"4 4.15 170.0 33684.0 295 \n",
"\n",
" title_and_subtitle tagged_description \n",
"0 Gilead 9780002005883 A NOVEL THAT READERS and critics... \n",
"1 Spider's Web A Novel 9780002261982 A new 'Christie for Christmas' -... \n",
"2 The One Tree 9780006163831 Volume Two of Stephen Donaldson'... \n",
"3 Rage of angels 9780006178736 A memorable, mesmerizing heroine... \n",
"4 The Four Loves 9780006280897 Lewis' work on the nature of lov... "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" isbn13 | \n",
" isbn10 | \n",
" title | \n",
" subtitle | \n",
" authors | \n",
" categories | \n",
" thumbnail | \n",
" description | \n",
" published_year | \n",
" average_rating | \n",
" num_pages | \n",
" ratings_count | \n",
" description_chars | \n",
" title_and_subtitle | \n",
" tagged_description | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 9780002005883 | \n",
" 0002005883 | \n",
" Gilead | \n",
" NaN | \n",
" Marilynne Robinson | \n",
" Fiction | \n",
" http://books.google.com/books/content?id=KQZCP... | \n",
" A NOVEL THAT READERS and critics have been eag... | \n",
" 2004.0 | \n",
" 3.85 | \n",
" 247.0 | \n",
" 361.0 | \n",
" 1154 | \n",
" Gilead | \n",
" 9780002005883 A NOVEL THAT READERS and critics... | \n",
"
\n",
" \n",
" | 1 | \n",
" 9780002261982 | \n",
" 0002261987 | \n",
" Spider's Web | \n",
" A Novel | \n",
" Charles Osborne;Agatha Christie | \n",
" Detective and mystery stories | \n",
" http://books.google.com/books/content?id=gA5GP... | \n",
" A new 'Christie for Christmas' -- a full-lengt... | \n",
" 2000.0 | \n",
" 3.83 | \n",
" 241.0 | \n",
" 5164.0 | \n",
" 1200 | \n",
" Spider's Web A Novel | \n",
" 9780002261982 A new 'Christie for Christmas' -... | \n",
"
\n",
" \n",
" | 2 | \n",
" 9780006163831 | \n",
" 0006163831 | \n",
" The One Tree | \n",
" NaN | \n",
" Stephen R. Donaldson | \n",
" American fiction | \n",
" http://books.google.com/books/content?id=OmQaw... | \n",
" Volume Two of Stephen Donaldson's acclaimed se... | \n",
" 1982.0 | \n",
" 3.97 | \n",
" 479.0 | \n",
" 172.0 | \n",
" 109 | \n",
" The One Tree | \n",
" 9780006163831 Volume Two of Stephen Donaldson'... | \n",
"
\n",
" \n",
" | 3 | \n",
" 9780006178736 | \n",
" 0006178731 | \n",
" Rage of angels | \n",
" NaN | \n",
" Sidney Sheldon | \n",
" Fiction | \n",
" http://books.google.com/books/content?id=FKo2T... | \n",
" A memorable, mesmerizing heroine Jennifer -- b... | \n",
" 1993.0 | \n",
" 3.93 | \n",
" 512.0 | \n",
" 29532.0 | \n",
" 359 | \n",
" Rage of angels | \n",
" 9780006178736 A memorable, mesmerizing heroine... | \n",
"
\n",
" \n",
" | 4 | \n",
" 9780006280897 | \n",
" 0006280897 | \n",
" The Four Loves | \n",
" NaN | \n",
" Clive Staples Lewis | \n",
" Christian life | \n",
" http://books.google.com/books/content?id=XhQ5X... | \n",
" Lewis' work on the nature of love divides love... | \n",
" 2002.0 | \n",
" 4.15 | \n",
" 170.0 | \n",
" 33684.0 | \n",
" 295 | \n",
" The Four Loves | \n",
" 9780006280897 Lewis' work on the nature of lov... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" isbn13 isbn10 title subtitle \\\n",
"0 9780002005883 0002005883 Gilead NaN \n",
"1 9780002261982 0002261987 Spider's Web A Novel \n",
"2 9780006163831 0006163831 The One Tree NaN \n",
"3 9780006178736 0006178731 Rage of angels NaN \n",
"4 9780006280897 0006280897 The Four Loves NaN \n",
"\n",
" authors categories \\\n",
"0 Marilynne Robinson Fiction \n",
"1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
"2 Stephen R. Donaldson American fiction \n",
"3 Sidney Sheldon Fiction \n",
"4 Clive Staples Lewis Christian life \n",
"\n",
" thumbnail \\\n",
"0 http://books.google.com/books/content?id=KQZCP... \n",
"1 http://books.google.com/books/content?id=gA5GP... \n",
"2 http://books.google.com/books/content?id=OmQaw... \n",
"3 http://books.google.com/books/content?id=FKo2T... \n",
"4 http://books.google.com/books/content?id=XhQ5X... \n",
"\n",
" description published_year \\\n",
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
"\n",
" average_rating num_pages ratings_count description_chars \\\n",
"0 3.85 247.0 361.0 1154 \n",
"1 3.83 241.0 5164.0 1200 \n",
"2 3.97 479.0 172.0 109 \n",
"3 3.93 512.0 29532.0 359 \n",
"4 4.15 170.0 33684.0 295 \n",
"\n",
" title_and_subtitle tagged_description \n",
"0 Gilead 9780002005883 A NOVEL THAT READERS and critics... \n",
"1 Spider's Web A Novel 9780002261982 A new 'Christie for Christmas' -... \n",
"2 The One Tree 9780006163831 Volume Two of Stephen Donaldson'... \n",
"3 Rage of angels 9780006178736 A memorable, mesmerizing heroine... \n",
"4 The Four Loves 9780006280897 Lewis' work on the nature of lov... "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Columns left out of the exported file\n",
"columns_to_discard = [\"title\", \"subtitle\", \"description_chars\", \"isbn10\"]\n",
"data = data.drop(columns=columns_to_discard)\n",
"\n",
"# Write the cleaned table next to the notebook, without the index column\n",
"data.to_csv(\"books_cleaned.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Index: 6397 entries, 0 to 6809\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 isbn13 6397 non-null int64 \n",
" 1 authors 6397 non-null object \n",
" 2 categories 6364 non-null object \n",
" 3 thumbnail 6190 non-null object \n",
" 4 description 6397 non-null object \n",
" 5 published_year 6397 non-null float64\n",
" 6 average_rating 6397 non-null float64\n",
" 7 num_pages 6397 non-null float64\n",
" 8 ratings_count 6397 non-null float64\n",
" 9 title_and_subtitle 6397 non-null object \n",
" 10 tagged_description 6397 non-null object \n",
"dtypes: float64(4), int64(1), object(6)\n",
"memory usage: 599.7+ KB\n"
]
}
],
"source": [
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}