{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7cbe0a72",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63c42422",
"metadata": {},
"outputs": [],
"source": [
"# Presumably the intended column layout of the final dataset.\n",
"# NOTE(review): no other cell visible in this notebook reads this list, and\n",
"# 'id', 'title' and 'download_url' do not match the columns final_df ends up\n",
"# with (isbn13, title_and_subtitle, url) -- confirm whether a rename step is missing.\n",
"final_data_cols = [\n",
"    # per-book fields\n",
"    \"id\", \"title\", \"authors\", \"description\", \"categories\",\n",
"    \"thumbnail\", \"published_year\", \"average_rating\", \"num_pages\",\n",
"    \"download_url\",\n",
"    # sentiment columns (the same seven selected from df_sentiments)\n",
"    \"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "19fa8ab2",
"metadata": {},
"outputs": [],
"source": [
"# Load the four input tables in one pass; read_csv is called without index_col,\n",
"# so each frame gets pandas' default integer row index.\n",
"df_base, categories_df, df_sentiments, df_download_url = (\n",
"    pd.read_csv(csv_name)\n",
"    for csv_name in (\n",
"        \"books_cleaned.csv\",\n",
"        \"books_with_categories.csv\",\n",
"        \"books_with_sentiment.csv\",\n",
"        \"books_with_urls.csv\",\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a9ebdc7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6397, 11)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5e81abf9",
"metadata": {},
"outputs": [],
"source": [
"# Read the row labels to discard: one entry per line, kept as strings here.\n",
"with open(\"to_drop.txt\") as drop_file:\n",
"    to_drop = drop_file.read().splitlines()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "66e10c4c",
"metadata": {},
"outputs": [],
"source": [
"# Cast the labels read from to_drop.txt to int so they can match the integer\n",
"# row index that read_csv assigns by default.\n",
"to_drop = list(map(int, to_drop))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "919ed91b",
"metadata": {},
"outputs": [],
"source": [
"# Remove the flagged rows by index label; labels missing from a frame are skipped.\n",
"# NOTE(review): df_download_url is not filtered here -- its displayed shape already\n",
"# has 6381 rows, so it was presumably filtered before being saved; confirm.\n",
"df_base = df_base.drop(index=to_drop, errors=\"ignore\")\n",
"categories_df = categories_df.drop(index=to_drop, errors=\"ignore\")\n",
"df_sentiments = df_sentiments.drop(index=to_drop, errors=\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2b140195",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6381, 11)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Row/column count of the base table after the drop step.\n",
"df_base.shape"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "4d1c9d6a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6381, 2)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Row count should equal df_base's: the frames are later joined side by side.\n",
"df_download_url.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "32427a77",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6381, 11)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Row count should equal df_base's before the column-wise concat.\n",
"categories_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "cbb04023",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the link column; selecting with a list keeps a one-column DataFrame.\n",
"df_download_url = df_download_url.loc[:, [\"url\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18edb501",
"metadata": {},
"outputs": [],
"source": [
"# Narrow the sentiment table to its seven sentiment columns, then preview it.\n",
"df_sentiments = df_sentiments.loc[\n",
"    :, [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
"]\n",
"df_sentiments.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e72f2e81",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" url | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Gilead by Marilynne Robinson- google books | \n",
" https://books.google.com/books/about/Gilead.ht... | \n",
"
\n",
" \n",
" | 1 | \n",
" Spider's Web A Novel by Charles Osborne;Agatha... | \n",
" https://books.google.com/books/about/Spider_s_... | \n",
"
\n",
" \n",
" | 2 | \n",
" The One Tree by Stephen R. Donaldson- google ... | \n",
" https://books.google.com/books/about/The_One_T... | \n",
"
\n",
" \n",
" | 3 | \n",
" Rage of angels by Sidney Sheldon- google books | \n",
" https://books.google.com/books/about/Rage_of_A... | \n",
"
\n",
" \n",
" | 4 | \n",
" The Four Loves by Clive Staples Lewis- google... | \n",
" https://books.google.com/books/about/The_Four_... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title \\\n",
"0 Gilead by Marilynne Robinson- google books \n",
"1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
"2 The One Tree by Stephen R. Donaldson- google ... \n",
"3 Rage of angels by Sidney Sheldon- google books \n",
"4 The Four Loves by Clive Staples Lewis- google... \n",
"\n",
" url \n",
"0 https://books.google.com/books/about/Gilead.ht... \n",
"1 https://books.google.com/books/about/Spider_s_... \n",
"2 https://books.google.com/books/about/The_One_T... \n",
"3 https://books.google.com/books/about/Rage_of_A... \n",
"4 https://books.google.com/books/about/The_Four_... "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the base-table columns that are carried into final_df.\n",
"df_base = df_base.loc[\n",
"    :,\n",
"    [\n",
"        \"isbn13\",\n",
"        \"authors\",\n",
"        \"thumbnail\",\n",
"        \"description\",\n",
"        \"published_year\",\n",
"        \"average_rating\",\n",
"        \"num_pages\",\n",
"        \"ratings_count\",\n",
"        \"title_and_subtitle\",\n",
"        \"tagged_description\",\n",
"    ],\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e3f317bc",
"metadata": {},
"outputs": [],
"source": [
"# Only the category label is taken from this table.\n",
"categories_df = categories_df.loc[:, [\"categories\"]]"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "d7126b63",
"metadata": {},
"outputs": [],
"source": [
"# Renumber every frame 0..n-1 so the column-wise concat lines rows up by position.\n",
"# drop=True discards the old index directly instead of materialising it as an\n",
"# \"index\" column and dropping it afterwards (which would also remove a real\n",
"# column named \"index\" if one ever existed).\n",
"df_base = df_base.reset_index(drop=True)\n",
"categories_df = categories_df.reset_index(drop=True)\n",
"df_download_url = df_download_url.reset_index(drop=True)\n",
"df_sentiments = df_sentiments.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "cc1e8c55",
"metadata": {},
"outputs": [],
"source": [
"# pd.concat(axis=1) aligns on the index and silently pads with NaN when the\n",
"# frames differ in length, so verify the row counts agree before joining.\n",
"row_counts = [len(df_base), len(categories_df), len(df_sentiments), len(df_download_url)]\n",
"assert len(set(row_counts)) == 1, f\"row counts differ across inputs: {row_counts}\"\n",
"final_df = pd.concat([df_base, categories_df, df_sentiments, df_download_url], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "9ba30e30",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" isbn13 | \n",
" authors | \n",
" thumbnail | \n",
" description | \n",
" published_year | \n",
" average_rating | \n",
" num_pages | \n",
" ratings_count | \n",
" title_and_subtitle | \n",
" tagged_description | \n",
" categories | \n",
" anger | \n",
" disgust | \n",
" fear | \n",
" joy | \n",
" sadness | \n",
" surprise | \n",
" neutral | \n",
" url | \n",
"
\n",
" \n",
" \n",
" \n",
" | 4039 | \n",
" 9780727861153 | \n",
" Ja Jance;Judith A. Jance | \n",
" http://books.google.com/books/content?id=YDFDP... | \n",
" Life is good for Joanna Brady in the small des... | \n",
" 2004.0 | \n",
" 4.00 | \n",
" 256.0 | \n",
" 39.0 | \n",
" Desert Heat | \n",
" 9780727861153 Life is good for Joanna Brady in... | \n",
" mystery | \n",
" 0.839755 | \n",
" 0.893530 | \n",
" 0.051363 | \n",
" 0.769920 | \n",
" 0.111690 | \n",
" 0.078765 | \n",
" 0.558840 | \n",
" https://books.google.com/books/about/Desert_He... | \n",
"
\n",
" \n",
" | 2261 | \n",
" 9780393059465 | \n",
" Harriet Beecher Stowe;Professor Harriet Beeche... | \n",
" http://books.google.com/books/content?id=bSaWh... | \n",
" An interpretation of the American classic refu... | \n",
" 2007.0 | \n",
" 3.86 | \n",
" 528.0 | \n",
" 160.0 | \n",
" The Annotated Uncle Tom's Cabin | \n",
" 9780393059465 An interpretation of the America... | \n",
" history | \n",
" 0.064134 | \n",
" 0.728139 | \n",
" 0.051363 | \n",
" 0.040564 | \n",
" 0.111690 | \n",
" 0.348772 | \n",
" 0.599532 | \n",
" https://books.google.com/books/about/Uncle_Tom... | \n",
"
\n",
" \n",
" | 6101 | \n",
" 9781841157481 | \n",
" Jonathan Franzen | \n",
" http://books.google.com/books/content?id=n9-ha... | \n",
" Dying St. Louis is turned inside-out by the ap... | \n",
" 2003.0 | \n",
" 3.12 | \n",
" 528.0 | \n",
" 119.0 | \n",
" The Twenty-seventh City | \n",
" 9781841157481 Dying St. Louis is turned inside... | \n",
" fiction | \n",
" 0.470221 | \n",
" 0.114413 | \n",
" 0.066823 | \n",
" 0.402793 | \n",
" 0.111690 | \n",
" 0.216259 | \n",
" 0.735679 | \n",
" https://books.google.com/books/about/The_Twent... | \n",
"
\n",
" \n",
" | 5666 | \n",
" 9781560258247 | \n",
" Norman Mailer;John Buffalo Mailer | \n",
" http://books.google.com/books/content?id=9oBps... | \n",
" Questions are posed, writes Norman Mailer, \"in... | \n",
" 2006.0 | \n",
" 3.31 | \n",
" 218.0 | \n",
" 67.0 | \n",
" The Big Empty Dialogues on Politics, Sex, God,... | \n",
" 9781560258247 Questions are posed, writes Norm... | \n",
" mystery | \n",
" 0.085885 | \n",
" 0.104098 | \n",
" 0.253858 | \n",
" 0.370736 | \n",
" 0.111690 | \n",
" 0.313475 | \n",
" 0.930554 | \n",
" https://books.google.com/books/about/The_Big_E... | \n",
"
\n",
" \n",
" | 1862 | \n",
" 9780349107868 | \n",
" Daniel Jonah Goldhagen | \n",
" http://books.google.com/books/content?id=L11gQ... | \n",
" Daniel Goldhagen re-visits a question which hi... | \n",
" 1997.0 | \n",
" 3.68 | \n",
" 634.0 | \n",
" 80.0 | \n",
" Hitler's Willing Executioners Ordinary Germans... | \n",
" 9780349107868 Daniel Goldhagen re-visits a que... | \n",
" mystery | \n",
" 0.781836 | \n",
" 0.129887 | \n",
" 0.198395 | \n",
" 0.040564 | \n",
" 0.131437 | \n",
" 0.088081 | \n",
" 0.693353 | \n",
" https://books.google.com/books/about/Hitler_s_... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" isbn13 authors \\\n",
"4039 9780727861153 Ja Jance;Judith A. Jance \n",
"2261 9780393059465 Harriet Beecher Stowe;Professor Harriet Beeche... \n",
"6101 9781841157481 Jonathan Franzen \n",
"5666 9781560258247 Norman Mailer;John Buffalo Mailer \n",
"1862 9780349107868 Daniel Jonah Goldhagen \n",
"\n",
" thumbnail \\\n",
"4039 http://books.google.com/books/content?id=YDFDP... \n",
"2261 http://books.google.com/books/content?id=bSaWh... \n",
"6101 http://books.google.com/books/content?id=n9-ha... \n",
"5666 http://books.google.com/books/content?id=9oBps... \n",
"1862 http://books.google.com/books/content?id=L11gQ... \n",
"\n",
" description published_year \\\n",
"4039 Life is good for Joanna Brady in the small des... 2004.0 \n",
"2261 An interpretation of the American classic refu... 2007.0 \n",
"6101 Dying St. Louis is turned inside-out by the ap... 2003.0 \n",
"5666 Questions are posed, writes Norman Mailer, \"in... 2006.0 \n",
"1862 Daniel Goldhagen re-visits a question which hi... 1997.0 \n",
"\n",
" average_rating num_pages ratings_count \\\n",
"4039 4.00 256.0 39.0 \n",
"2261 3.86 528.0 160.0 \n",
"6101 3.12 528.0 119.0 \n",
"5666 3.31 218.0 67.0 \n",
"1862 3.68 634.0 80.0 \n",
"\n",
" title_and_subtitle \\\n",
"4039 Desert Heat \n",
"2261 The Annotated Uncle Tom's Cabin \n",
"6101 The Twenty-seventh City \n",
"5666 The Big Empty Dialogues on Politics, Sex, God,... \n",
"1862 Hitler's Willing Executioners Ordinary Germans... \n",
"\n",
" tagged_description categories anger \\\n",
"4039 9780727861153 Life is good for Joanna Brady in... mystery 0.839755 \n",
"2261 9780393059465 An interpretation of the America... history 0.064134 \n",
"6101 9781841157481 Dying St. Louis is turned inside... fiction 0.470221 \n",
"5666 9781560258247 Questions are posed, writes Norm... mystery 0.085885 \n",
"1862 9780349107868 Daniel Goldhagen re-visits a que... mystery 0.781836 \n",
"\n",
" disgust fear joy sadness surprise neutral \\\n",
"4039 0.893530 0.051363 0.769920 0.111690 0.078765 0.558840 \n",
"2261 0.728139 0.051363 0.040564 0.111690 0.348772 0.599532 \n",
"6101 0.114413 0.066823 0.402793 0.111690 0.216259 0.735679 \n",
"5666 0.104098 0.253858 0.370736 0.111690 0.313475 0.930554 \n",
"1862 0.129887 0.198395 0.040564 0.131437 0.088081 0.693353 \n",
"\n",
" url \n",
"4039 https://books.google.com/books/about/Desert_He... \n",
"2261 https://books.google.com/books/about/Uncle_Tom... \n",
"6101 https://books.google.com/books/about/The_Twent... \n",
"5666 https://books.google.com/books/about/The_Big_E... \n",
"1862 https://books.google.com/books/about/Hitler_s_... "
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check a few joined rows; the fixed seed keeps the preview reproducible\n",
"# across kernel restarts.\n",
"final_df.sample(5, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "c90847c5",
"metadata": {},
"outputs": [],
"source": [
"# Write the tagged_description column on its own, without index or header row.\n",
"# to_csv documents index/header as booleans, so pass False rather than None.\n",
"final_df[\"tagged_description\"].to_csv(\"tagged_description.txt\", index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "5419aa0e",
"metadata": {},
"outputs": [],
"source": [
"# Persist the merged table; the index is just the 0..n-1 row position after the\n",
"# reset, so leave it out (index=False is the documented boolean form).\n",
"final_df.to_csv(\"final_book_df.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32b5edca",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}