{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "7cbe0a72", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "id": "63c42422", "metadata": {}, "outputs": [], "source": [ "final_data_cols = [\n", " 'id',\n", " 'title',\n", " \"authors\",\n", " \"description\",\n", " \"categories\",\n", " \"thumbnail\",\n", " \"published_year\",\n", " \"average_rating\",\n", " \"num_pages\",\n", " \"download_url\",\n", " \"anger\",\n", " \"disgust\",\n", " \"fear\",\n", " \"joy\",\n", " \"sadness\",\n", " \"surprise\",\n", " \"neutral\"\n", " ]" ] }, { "cell_type": "code", "execution_count": 22, "id": "19fa8ab2", "metadata": {}, "outputs": [], "source": [ "df_base = pd.read_csv(\"books_cleaned.csv\")\n", "categories_df = pd.read_csv(\"books_with_categories.csv\")\n", "df_sentiments = pd.read_csv(\"books_with_sentiment.csv\")\n", "df_download_url = pd.read_csv(\"books_with_urls.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "8a9ebdc7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6397, 11)" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 4, "id": "5e81abf9", "metadata": {}, "outputs": [], "source": [ "with open(\"to_drop.txt\", \"r\") as f:\n", " to_drop = f.read().splitlines()" ] }, { "cell_type": "code", "execution_count": 7, "id": "66e10c4c", "metadata": {}, "outputs": [], "source": [ "to_drop = [int(i) for i in to_drop]" ] }, { "cell_type": "code", "execution_count": 26, "id": "919ed91b", "metadata": {}, "outputs": [], "source": [ "df_base = df_base.drop(to_drop, errors=\"ignore\")\n", "categories_df = categories_df.drop(to_drop, errors=\"ignore\")\n", "df_sentiments = df_sentiments.drop(to_drop, errors=\"ignore\")" ] }, { "cell_type": "code", "execution_count": 27, "id": "2b140195", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6381, 
11)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_base.shape" ] }, { "cell_type": "code", "execution_count": 28, "id": "4d1c9d6a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6381, 2)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_download_url.shape" ] }, { "cell_type": "code", "execution_count": 16, "id": "32427a77", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6381, 11)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "categories_df.shape" ] }, { "cell_type": "code", "execution_count": 40, "id": "cbb04023", "metadata": {}, "outputs": [], "source": [ "df_download_url = df_download_url[[\"url\"]]" ] }, { "cell_type": "code", "execution_count": null, "id": "18edb501", "metadata": {}, "outputs": [], "source": [ "df_sentiments = df_sentiments[[\"anger\",\"disgust\",\"fear\",\"joy\",\"sadness\",\"surprise\",\"neutral\"]]\n", "df_sentiments.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "e72f2e81", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleurl
0Gilead by Marilynne Robinson- google bookshttps://books.google.com/books/about/Gilead.ht...
1Spider's Web A Novel by Charles Osborne;Agatha...https://books.google.com/books/about/Spider_s_...
2The One Tree by Stephen R. Donaldson- google ...https://books.google.com/books/about/The_One_T...
3Rage of angels by Sidney Sheldon- google bookshttps://books.google.com/books/about/Rage_of_A...
4The Four Loves by Clive Staples Lewis- google...https://books.google.com/books/about/The_Four_...
\n", "
" ], "text/plain": [ " title \\\n", "0 Gilead by Marilynne Robinson- google books \n", "1 Spider's Web A Novel by Charles Osborne;Agatha... \n", "2 The One Tree by Stephen R. Donaldson- google ... \n", "3 Rage of angels by Sidney Sheldon- google books \n", "4 The Four Loves by Clive Staples Lewis- google... \n", "\n", " url \n", "0 https://books.google.com/books/about/Gilead.ht... \n", "1 https://books.google.com/books/about/Spider_s_... \n", "2 https://books.google.com/books/about/The_One_T... \n", "3 https://books.google.com/books/about/Rage_of_A... \n", "4 https://books.google.com/books/about/The_Four_... " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_base = df_base[[\"isbn13\", \"authors\",\"thumbnail\",\"description\",\"published_year\",\"average_rating\",\"num_pages\",\"ratings_count\",\"title_and_subtitle\",\"tagged_description\"]]" ] }, { "cell_type": "code", "execution_count": null, "id": "e3f317bc", "metadata": {}, "outputs": [], "source": [ "categories_df = categories_df[[\"categories\"]]" ] }, { "cell_type": "code", "execution_count": 46, "id": "d7126b63", "metadata": {}, "outputs": [], "source": [ "df_base = df_base.reset_index().drop(\"index\", axis=1)\n", "categories_df = categories_df.reset_index().drop(\"index\", axis=1)\n", "df_download_url = df_download_url.reset_index().drop(\"index\", axis=1)\n", "df_sentiments = df_sentiments.reset_index().drop(\"index\", axis=1)" ] }, { "cell_type": "code", "execution_count": 50, "id": "cc1e8c55", "metadata": {}, "outputs": [], "source": [ "final_df = pd.concat([df_base,categories_df,df_sentiments,df_download_url], axis=1)" ] }, { "cell_type": "code", "execution_count": 51, "id": "9ba30e30", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13authorsthumbnaildescriptionpublished_yearaverage_ratingnum_pagesratings_counttitle_and_subtitletagged_descriptioncategoriesangerdisgustfearjoysadnesssurpriseneutralurl
40399780727861153Ja Jance;Judith A. Jancehttp://books.google.com/books/content?id=YDFDP...Life is good for Joanna Brady in the small des...2004.04.00256.039.0Desert Heat9780727861153 Life is good for Joanna Brady in...mystery0.8397550.8935300.0513630.7699200.1116900.0787650.558840https://books.google.com/books/about/Desert_He...
22619780393059465Harriet Beecher Stowe;Professor Harriet Beeche...http://books.google.com/books/content?id=bSaWh...An interpretation of the American classic refu...2007.03.86528.0160.0The Annotated Uncle Tom's Cabin9780393059465 An interpretation of the America...history0.0641340.7281390.0513630.0405640.1116900.3487720.599532https://books.google.com/books/about/Uncle_Tom...
61019781841157481Jonathan Franzenhttp://books.google.com/books/content?id=n9-ha...Dying St. Louis is turned inside-out by the ap...2003.03.12528.0119.0The Twenty-seventh City9781841157481 Dying St. Louis is turned inside...fiction0.4702210.1144130.0668230.4027930.1116900.2162590.735679https://books.google.com/books/about/The_Twent...
56669781560258247Norman Mailer;John Buffalo Mailerhttp://books.google.com/books/content?id=9oBps...Questions are posed, writes Norman Mailer, \"in...2006.03.31218.067.0The Big Empty Dialogues on Politics, Sex, God,...9781560258247 Questions are posed, writes Norm...mystery0.0858850.1040980.2538580.3707360.1116900.3134750.930554https://books.google.com/books/about/The_Big_E...
18629780349107868Daniel Jonah Goldhagenhttp://books.google.com/books/content?id=L11gQ...Daniel Goldhagen re-visits a question which hi...1997.03.68634.080.0Hitler's Willing Executioners Ordinary Germans...9780349107868 Daniel Goldhagen re-visits a que...mystery0.7818360.1298870.1983950.0405640.1314370.0880810.693353https://books.google.com/books/about/Hitler_s_...
\n", "
" ], "text/plain": [ " isbn13 authors \\\n", "4039 9780727861153 Ja Jance;Judith A. Jance \n", "2261 9780393059465 Harriet Beecher Stowe;Professor Harriet Beeche... \n", "6101 9781841157481 Jonathan Franzen \n", "5666 9781560258247 Norman Mailer;John Buffalo Mailer \n", "1862 9780349107868 Daniel Jonah Goldhagen \n", "\n", " thumbnail \\\n", "4039 http://books.google.com/books/content?id=YDFDP... \n", "2261 http://books.google.com/books/content?id=bSaWh... \n", "6101 http://books.google.com/books/content?id=n9-ha... \n", "5666 http://books.google.com/books/content?id=9oBps... \n", "1862 http://books.google.com/books/content?id=L11gQ... \n", "\n", " description published_year \\\n", "4039 Life is good for Joanna Brady in the small des... 2004.0 \n", "2261 An interpretation of the American classic refu... 2007.0 \n", "6101 Dying St. Louis is turned inside-out by the ap... 2003.0 \n", "5666 Questions are posed, writes Norman Mailer, \"in... 2006.0 \n", "1862 Daniel Goldhagen re-visits a question which hi... 1997.0 \n", "\n", " average_rating num_pages ratings_count \\\n", "4039 4.00 256.0 39.0 \n", "2261 3.86 528.0 160.0 \n", "6101 3.12 528.0 119.0 \n", "5666 3.31 218.0 67.0 \n", "1862 3.68 634.0 80.0 \n", "\n", " title_and_subtitle \\\n", "4039 Desert Heat \n", "2261 The Annotated Uncle Tom's Cabin \n", "6101 The Twenty-seventh City \n", "5666 The Big Empty Dialogues on Politics, Sex, God,... \n", "1862 Hitler's Willing Executioners Ordinary Germans... \n", "\n", " tagged_description categories anger \\\n", "4039 9780727861153 Life is good for Joanna Brady in... mystery 0.839755 \n", "2261 9780393059465 An interpretation of the America... history 0.064134 \n", "6101 9781841157481 Dying St. Louis is turned inside... fiction 0.470221 \n", "5666 9781560258247 Questions are posed, writes Norm... mystery 0.085885 \n", "1862 9780349107868 Daniel Goldhagen re-visits a que... 
mystery 0.781836 \n", "\n", " disgust fear joy sadness surprise neutral \\\n", "4039 0.893530 0.051363 0.769920 0.111690 0.078765 0.558840 \n", "2261 0.728139 0.051363 0.040564 0.111690 0.348772 0.599532 \n", "6101 0.114413 0.066823 0.402793 0.111690 0.216259 0.735679 \n", "5666 0.104098 0.253858 0.370736 0.111690 0.313475 0.930554 \n", "1862 0.129887 0.198395 0.040564 0.131437 0.088081 0.693353 \n", "\n", " url \n", "4039 https://books.google.com/books/about/Desert_He... \n", "2261 https://books.google.com/books/about/Uncle_Tom... \n", "6101 https://books.google.com/books/about/The_Twent... \n", "5666 https://books.google.com/books/about/The_Big_E... \n", "1862 https://books.google.com/books/about/Hitler_s_... " ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_df.sample(5)" ] },
{ "cell_type": "code", "execution_count": 53, "id": "c90847c5", "metadata": {}, "outputs": [], "source": [ "# Write the tagged_description column on its own, without the index or a header line.\n", "# index/header are documented as booleans; False states the intent that None only achieved by being falsy.\n", "final_df[\"tagged_description\"].to_csv(\"tagged_description.txt\", index=False, header=False)" ] },
{ "cell_type": "code", "execution_count": 54, "id": "5419aa0e", "metadata": {}, "outputs": [], "source": [ "# Persist the combined frame; index=False keeps the positional index out of the CSV.\n", "final_df.to_csv(\"final_book_df.csv\", index=False)" ] },
{ "cell_type": "code", "execution_count": null, "id": "32b5edca", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }