abdulMalik1234567890 committed on
Commit
d38101e
·
0 Parent(s):

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ venv/
2
+ .env
.html ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <html>
2
+ <head>
3
+ <link rel="preconnect" href="https://fonts.gstatic.com/" crossorigin="" />
4
+ <link
5
+ rel="stylesheet"
6
+ as="style"
7
+ onload="this.rel='stylesheet'"
8
+ href="https://fonts.googleapis.com/css2?display=swap&amp;family=Noto+Serif%3Awght%40400%3B500%3B700%3B900&amp;family=Noto+Sans%3Awght%40400%3B500%3B700%3B900"
9
+ />
10
+
11
+ <title>BookWise - Responsive</title>
12
+ <link rel="icon" type="image/x-icon" href="data:image/x-icon;base64," />
13
+
14
+ <script src="https://cdn.tailwindcss.com?plugins=forms,container-queries"></script>
15
+ </head>
16
+ <body>
17
+ <div
18
+ class="relative flex size-full min-h-screen flex-col bg-white group/design-root overflow-x-hidden"
19
+ style='--select-button-svg: url(&apos;data:image/svg+xml,%3csvg xmlns=%27http://www.w3.org/2000/svg%27 width=%2724px%27 height=%2724px%27 fill=%27rgb(99,116,136)%27 viewBox=%270 0 256 256%27%3e%3cpath d=%27M181.66,170.34a8,8,0,0,1,0,11.32l-48,48a8,8,0,0,1-11.32,0l-48-48a8,8,0,0,1,11.32-11.32L128,212.69l42.34-42.35A8,8,0,0,1,181.66,170.34Zm-96-84.68L128,43.31l42.34,42.35a8,8,0,0,0,11.32-11.32l-48-48a8,8,0,0,0-11.32,0l-48,48A8,8,0,0,0,85.66,85.66Z%27%3e%3c/path%3e%3c/svg%3e&apos;); font-family: "Noto Serif", "Noto Sans", sans-serif;'
20
+ >
21
+ <div class="layout-container flex h-full grow flex-col">
22
+ <header class="flex items-center justify-between whitespace-nowrap border-b border-solid border-b-[#f0f2f4] px-4 sm:px-6 lg:px-10 py-3">
23
+ <div class="flex items-center gap-4 sm:gap-8">
24
+ <div class="flex items-center gap-4 text-[#111418]">
25
+ <div class="size-4">
26
+ <svg viewBox="0 0 48 48" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6 6H42L36 24L42 42H6L12 24L6 6Z" fill="currentColor"></path></svg>
27
+ </div>
28
+ <h2 class="text-[#111418] text-lg font-bold leading-tight tracking-[-0.015em]">BookWise</h2>
29
+ </div>
30
+ <div class="flex items-center gap-4 sm:gap-6 lg:gap-9">
31
+ <a class="text-[#111418] text-sm font-medium leading-normal" href="#">Home</a>
32
+ <a class="text-[#111418] text-sm font-medium leading-normal" href="#">My Library</a>
33
+ </div>
34
+ </div>
35
+ <div class="flex flex-1 justify-end gap-2 sm:gap-4 lg:gap-8">
36
+ <label class="hidden sm:flex flex-col min-w-32 md:min-w-40 !h-10 max-w-64">
37
+ <div class="flex w-full flex-1 items-stretch rounded-xl h-full">
38
+ <input
39
+ type="search"
40
+ placeholder="Search books..."
41
+ class="flex-1 px-3 py-2 text-sm border border-[#dce0e5] rounded-xl focus:outline-none focus:border-[#1669c9]"
42
+ />
43
+ </div>
44
+ </label>
45
+ <div
46
+ class="bg-center bg-no-repeat aspect-square bg-cover rounded-full size-8 sm:size-10"
47
+ style='background-image: url("https://lh3.googleusercontent.com/aida-public/AB6AXuBUFyw4SUtL0Bo77m9z_aqCPzx8jrYQ23Iz0YTuhs46ShVxgc5Soj1GQwKpCt9ZxQMSKH5bT-lodTkHcLdV3_qNp12gLkZTdHBtJFt5bDjUqT7CZHFN0QfWSoqRdPy4zx8RW_6N_MEmDJckbi0Ea2st3Kx-6gFPNMFCOLx2ofYQrOiSQ_kbKQw-wWQ7H8CvhkaTaXLXGEcpDXN5EJA8-UbK19-eAe34zXeJkXlqE3873k0hhvB6XGP2etAtUFf0e17br6aohFXWQKw");'
48
+ ></div>
49
+ </div>
50
+ </header>
51
+ <div class="px-4 sm:px-8 lg:px-40 flex flex-1 justify-center py-5">
52
+ <div class="layout-content-container flex flex-col max-w-[960px] flex-1">
53
+ <div class="flex flex-wrap justify-between gap-3 p-4">
54
+ <p class="text-[#111418] tracking-light text-2xl sm:text-[32px] font-bold leading-tight min-w-0">Describe Your Ideal Book</p>
55
+ </div>
56
+ <div class="flex max-w-full sm:max-w-[480px] flex-wrap items-end gap-4 px-4 py-3">
57
+ <label class="flex flex-col min-w-40 flex-1">
58
+ <textarea
59
+ placeholder="Enter a few words or a detailed description of a book you enjoyed or wish to read."
60
+ class="form-input flex w-full min-w-0 flex-1 resize-none overflow-hidden rounded-xl text-[#111418] focus:outline-0 focus:ring-0 border border-[#dce0e5] bg-white focus:border-[#dce0e5] min-h-24 sm:min-h-36 placeholder:text-[#637488] p-[15px] text-base font-normal leading-normal"
61
+ ></textarea>
62
+ </label>
63
+ </div>
64
+ <div class="flex max-w-full sm:max-w-[480px] flex-wrap items-end gap-4 px-4 py-3">
65
+ <label class="flex flex-col min-w-40 flex-1">
66
+ <p class="text-[#111418] text-base font-medium leading-normal pb-2">Genre</p>
67
+ <select
68
+ class="form-input flex w-full min-w-0 flex-1 resize-none overflow-hidden rounded-xl text-[#111418] focus:outline-0 focus:ring-0 border border-[#dce0e5] bg-white focus:border-[#dce0e5] h-14 bg-[image:--select-button-svg] placeholder:text-[#637488] p-[15px] text-base font-normal leading-normal"
69
+ >
70
+ <option value="">Select a genre</option>
71
+ <option value="fiction">Fiction</option>
72
+ <option value="mystery">Mystery</option>
73
+ <option value="romance">Romance</option>
74
+ <option value="scifi">Science Fiction</option>
75
+ <option value="fantasy">Fantasy</option>
76
+ <option value="biography">Biography</option>
77
+ <option value="history">History</option>
78
+ </select>
79
+ </label>
80
+ </div>
81
+ <div class="flex max-w-full sm:max-w-[480px] flex-wrap items-end gap-4 px-4 py-3">
82
+ <label class="flex flex-col min-w-40 flex-1">
83
+ <p class="text-[#111418] text-base font-medium leading-normal pb-2">Tone</p>
84
+ <select
85
+ class="form-input flex w-full min-w-0 flex-1 resize-none overflow-hidden rounded-xl text-[#111418] focus:outline-0 focus:ring-0 border border-[#dce0e5] bg-white focus:border-[#dce0e5] h-14 bg-[image:--select-button-svg] placeholder:text-[#637488] p-[15px] text-base font-normal leading-normal"
86
+ >
87
+ <option value="">Select a tone</option>
88
+ <option value="light">Light & Humorous</option>
89
+ <option value="serious">Serious & Dramatic</option>
90
+ <option value="dark">Dark & Mysterious</option>
91
+ <option value="uplifting">Uplifting & Inspiring</option>
92
+ <option value="thoughtful">Thoughtful & Reflective</option>
93
+ </select>
94
+ </label>
95
+ </div>
96
+ <div class="flex px-4 py-3">
97
+ <button
98
+ class="flex min-w-[84px] max-w-full sm:max-w-[480px] cursor-pointer items-center justify-center overflow-hidden rounded-full h-12 px-5 flex-1 bg-[#1669c9] text-white text-base font-bold leading-normal tracking-[0.015em]"
99
+ >
100
+ <span class="truncate">Search</span>
101
+ </button>
102
+ </div>
103
+ <h2 class="text-[#111418] text-xl sm:text-[22px] font-bold leading-tight tracking-[-0.015em] px-4 pb-3 pt-5">Recommended Books</h2>
104
+ <div class="p-4">
105
+ <div class="flex flex-col sm:flex-row items-stretch justify-between gap-4 rounded-xl">
106
+ <div class="flex flex-col sm:flex-[2_2_0px] gap-4">
107
+ <div class="flex flex-col gap-1">
108
+ <p class="text-[#111418] text-base font-bold leading-tight">The Secret Garden</p>
109
+ <p class="text-[#637488] text-sm font-normal leading-normal">Frances Bennett | A young girl discovers a hidden garden and unlocks its mysteries.</p>
110
+ </div>
111
+ <button
112
+ class="flex min-w-[84px] max-w-[480px] cursor-pointer items-center justify-center overflow-hidden rounded-full h-8 px-4 flex-row-reverse bg-[#f0f2f4] text-[#111418] text-sm font-medium leading-normal w-fit"
113
+ >
114
+ <span class="truncate">More Details</span>
115
+ </button>
116
+ </div>
117
+ <div
118
+ class="w-full bg-center bg-no-repeat aspect-video bg-cover rounded-xl flex-1 min-h-32 sm:min-h-0"
119
+ style='background-image: url("https://lh3.googleusercontent.com/aida-public/AB6AXuAhvJGw2xq2ulM1eOrAvTprpJpVc1vuviCpOk8vShEIBbBx6QE73cIwxDuKmNFPbS1D_uvqCRhBTFMIRIbEhMCoNvAu4T2c3GUpj-Ek1cgDY-S88u5m3Djfv3jKbmWHyzo9bSf3w1MZgWEevsLl5Ug3NWZ49xQB46X4MpQb9BRL6MjvUI12TbRp-P2ho9PALgBlj7Y2ZIVWKVQSHkwgO7_aeYqeQNTKOS4RxQrwHKBB-inDY6CtKFFi4P2WhiVp9PrnR8g5hVhij1k");'
120
+ ></div>
121
+ </div>
122
+ </div>
123
+ <div class="p-4">
124
+ <div class="flex flex-col sm:flex-row items-stretch justify-between gap-4 rounded-xl">
125
+ <div class="flex flex-col sm:flex-[2_2_0px] gap-4">
126
+ <div class="flex flex-col gap-1">
127
+ <p class="text-[#111418] text-base font-bold leading-tight">The Adventures of Tom Sawyer</p>
128
+ <p class="text-[#637488] text-sm font-normal leading-normal">Mark Twain | A mischievous boy's escapades along the Mississippi River.</p>
129
+ </div>
130
+ <button
131
+ class="flex min-w-[84px] max-w-[480px] cursor-pointer items-center justify-center overflow-hidden rounded-full h-8 px-4 flex-row-reverse bg-[#f0f2f4] text-[#111418] text-sm font-medium leading-normal w-fit"
132
+ >
133
+ <span class="truncate">More Details</span>
134
+ </button>
135
+ </div>
136
+ <div
137
+ class="w-full bg-center bg-no-repeat aspect-video bg-cover rounded-xl flex-1 min-h-32 sm:min-h-0"
138
+ style='background-image: url("https://lh3.googleusercontent.com/aida-public/AB6AXuCAiL_cW6RPNH4wyHoJFKsAzdorxksRaMXfr-QII83iGD0eJD1GZ8Jlbsq9oxbEIHkL9O0P_AtPUnrX146wgZ6bJefkas6SjVdM1uRe15ZbtlxjWfxz6k057F-6z7_UJhV8KQ5R1NR9hcxYTRhkWo9J6mCCJIY8NQmVc8YKfwKHgHEAC3UV3rPRXOK3bQfw6zdSQrulwq6jDm69jbmy2TiS5hCTkE1igPfIUtedG4KUxMM8p1IRy6OJfHbJfh4V78FoE1bf9VHijwA");'
139
+ ></div>
140
+ </div>
141
+ </div>
142
+ <div class="p-4">
143
+ <div class="flex flex-col sm:flex-row items-stretch justify-between gap-4 rounded-xl">
144
+ <div class="flex flex-col sm:flex-[2_2_0px] gap-4">
145
+ <div class="flex flex-col gap-1">
146
+ <p class="text-[#111418] text-base font-bold leading-tight">Pride and Prejudice</p>
147
+ <p class="text-[#637488] text-sm font-normal leading-normal">Jane Austen | A classic tale of love and societal expectations in 19th-century England.</p>
148
+ </div>
149
+ <button
150
+ class="flex min-w-[84px] max-w-[480px] cursor-pointer items-center justify-center overflow-hidden rounded-full h-8 px-4 flex-row-reverse bg-[#f0f2f4] text-[#111418] text-sm font-medium leading-normal w-fit"
151
+ >
152
+ <span class="truncate">More Details</span>
153
+ </button>
154
+ </div>
155
+ <div
156
+ class="w-full bg-center bg-no-repeat aspect-video bg-cover rounded-xl flex-1 min-h-32 sm:min-h-0"
157
+ style='background-image: url("https://lh3.googleusercontent.com/aida-public/AB6AXuAvoPL4nOSFDk3mNVaL1VmxSQw32s11eLzXYFT5EnmGXgl7pxTJJ9uEwowWGn54SgqFto0TNEqkwde-sanAoLRLWL_puvGrXW0xwzMX6fQrfDLo9daSrGViJT8rB9WePaw1n_Cm_XK9Uruv4c6M-7RcrpVZfGpEYCJ1wyu9ls87x8w3fA6bi7kUM_aebpza82L8qQ583ikVjOc45xjgiCH0MnJnZJQPbNpTEfQizMBw1EFjE7CI-RnUovvqXJC4-R74Q-KBUbgtJCA");'
158
+ ></div>
159
+ </div>
160
+ </div>
161
+ </div>
162
+ </div>
163
+ </div>
164
+ </div>
165
+ </body>
166
+ </html>
books_cleaned.csv ADDED
The diff for this file is too large to render. See raw diff
 
books_with_categories.csv ADDED
The diff for this file is too large to render. See raw diff
 
books_with_sentiment.csv ADDED
The diff for this file is too large to render. See raw diff
 
books_with_urls.csv ADDED
The diff for this file is too large to render. See raw diff
 
cover-not-found.jpg ADDED
data-exploration.ipynb ADDED
@@ -0,0 +1,1111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {
7
+ "collapsed": true
8
+ },
9
+ "outputs": [
10
+ {
11
+ "name": "stderr",
12
+ "output_type": "stream",
13
+ "text": [
14
+ "c:\\Users\\NonsoDev\\Documents\\Allcodes\\Projects_DL_for resume\\Recommender systems\\book reccomender - llm\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
15
+ " from .autonotebook import tqdm as notebook_tqdm\n"
16
+ ]
17
+ },
18
+ {
19
+ "name": "stdout",
20
+ "output_type": "stream",
21
+ "text": [
22
+ "Path to dataset files: C:\\Users\\NonsoDev\\.cache\\kagglehub\\datasets\\dylanjcastillo\\7k-books-with-metadata\\versions\\3\n"
23
+ ]
24
+ }
25
+ ],
26
+ "source": [
27
+ "import kagglehub\n",
28
+ "\n",
29
+ "# Download latest version\n",
30
+ "path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n",
31
+ "\n",
32
+ "print(\"Path to dataset files:\", path)"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 3,
38
+ "metadata": {},
39
+ "outputs": [
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "Number of books: 6810\n"
45
+ ]
46
+ }
47
+ ],
48
+ "source": [
49
+ "import pandas as pd\n",
50
+ "data = pd.read_csv(path + \"/books.csv\")\n",
51
+ "print(\"Number of books:\", len(data))"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 4,
57
+ "metadata": {},
58
+ "outputs": [
59
+ {
60
+ "data": {
61
+ "text/html": [
62
+ "<div>\n",
63
+ "<style scoped>\n",
64
+ " .dataframe tbody tr th:only-of-type {\n",
65
+ " vertical-align: middle;\n",
66
+ " }\n",
67
+ "\n",
68
+ " .dataframe tbody tr th {\n",
69
+ " vertical-align: top;\n",
70
+ " }\n",
71
+ "\n",
72
+ " .dataframe thead th {\n",
73
+ " text-align: right;\n",
74
+ " }\n",
75
+ "</style>\n",
76
+ "<table border=\"1\" class=\"dataframe\">\n",
77
+ " <thead>\n",
78
+ " <tr style=\"text-align: right;\">\n",
79
+ " <th></th>\n",
80
+ " <th>isbn13</th>\n",
81
+ " <th>isbn10</th>\n",
82
+ " <th>title</th>\n",
83
+ " <th>subtitle</th>\n",
84
+ " <th>authors</th>\n",
85
+ " <th>categories</th>\n",
86
+ " <th>thumbnail</th>\n",
87
+ " <th>description</th>\n",
88
+ " <th>published_year</th>\n",
89
+ " <th>average_rating</th>\n",
90
+ " <th>num_pages</th>\n",
91
+ " <th>ratings_count</th>\n",
92
+ " </tr>\n",
93
+ " </thead>\n",
94
+ " <tbody>\n",
95
+ " <tr>\n",
96
+ " <th>0</th>\n",
97
+ " <td>9780002005883</td>\n",
98
+ " <td>0002005883</td>\n",
99
+ " <td>Gilead</td>\n",
100
+ " <td>NaN</td>\n",
101
+ " <td>Marilynne Robinson</td>\n",
102
+ " <td>Fiction</td>\n",
103
+ " <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
104
+ " <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
105
+ " <td>2004.0</td>\n",
106
+ " <td>3.85</td>\n",
107
+ " <td>247.0</td>\n",
108
+ " <td>361.0</td>\n",
109
+ " </tr>\n",
110
+ " <tr>\n",
111
+ " <th>1</th>\n",
112
+ " <td>9780002261982</td>\n",
113
+ " <td>0002261987</td>\n",
114
+ " <td>Spider's Web</td>\n",
115
+ " <td>A Novel</td>\n",
116
+ " <td>Charles Osborne;Agatha Christie</td>\n",
117
+ " <td>Detective and mystery stories</td>\n",
118
+ " <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
119
+ " <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
120
+ " <td>2000.0</td>\n",
121
+ " <td>3.83</td>\n",
122
+ " <td>241.0</td>\n",
123
+ " <td>5164.0</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>2</th>\n",
127
+ " <td>9780006163831</td>\n",
128
+ " <td>0006163831</td>\n",
129
+ " <td>The One Tree</td>\n",
130
+ " <td>NaN</td>\n",
131
+ " <td>Stephen R. Donaldson</td>\n",
132
+ " <td>American fiction</td>\n",
133
+ " <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
134
+ " <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
135
+ " <td>1982.0</td>\n",
136
+ " <td>3.97</td>\n",
137
+ " <td>479.0</td>\n",
138
+ " <td>172.0</td>\n",
139
+ " </tr>\n",
140
+ " <tr>\n",
141
+ " <th>3</th>\n",
142
+ " <td>9780006178736</td>\n",
143
+ " <td>0006178731</td>\n",
144
+ " <td>Rage of angels</td>\n",
145
+ " <td>NaN</td>\n",
146
+ " <td>Sidney Sheldon</td>\n",
147
+ " <td>Fiction</td>\n",
148
+ " <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
149
+ " <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
150
+ " <td>1993.0</td>\n",
151
+ " <td>3.93</td>\n",
152
+ " <td>512.0</td>\n",
153
+ " <td>29532.0</td>\n",
154
+ " </tr>\n",
155
+ " <tr>\n",
156
+ " <th>4</th>\n",
157
+ " <td>9780006280897</td>\n",
158
+ " <td>0006280897</td>\n",
159
+ " <td>The Four Loves</td>\n",
160
+ " <td>NaN</td>\n",
161
+ " <td>Clive Staples Lewis</td>\n",
162
+ " <td>Christian life</td>\n",
163
+ " <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
164
+ " <td>Lewis' work on the nature of love divides love...</td>\n",
165
+ " <td>2002.0</td>\n",
166
+ " <td>4.15</td>\n",
167
+ " <td>170.0</td>\n",
168
+ " <td>33684.0</td>\n",
169
+ " </tr>\n",
170
+ " </tbody>\n",
171
+ "</table>\n",
172
+ "</div>"
173
+ ],
174
+ "text/plain": [
175
+ " isbn13 isbn10 title subtitle \\\n",
176
+ "0 9780002005883 0002005883 Gilead NaN \n",
177
+ "1 9780002261982 0002261987 Spider's Web A Novel \n",
178
+ "2 9780006163831 0006163831 The One Tree NaN \n",
179
+ "3 9780006178736 0006178731 Rage of angels NaN \n",
180
+ "4 9780006280897 0006280897 The Four Loves NaN \n",
181
+ "\n",
182
+ " authors categories \\\n",
183
+ "0 Marilynne Robinson Fiction \n",
184
+ "1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
185
+ "2 Stephen R. Donaldson American fiction \n",
186
+ "3 Sidney Sheldon Fiction \n",
187
+ "4 Clive Staples Lewis Christian life \n",
188
+ "\n",
189
+ " thumbnail \\\n",
190
+ "0 http://books.google.com/books/content?id=KQZCP... \n",
191
+ "1 http://books.google.com/books/content?id=gA5GP... \n",
192
+ "2 http://books.google.com/books/content?id=OmQaw... \n",
193
+ "3 http://books.google.com/books/content?id=FKo2T... \n",
194
+ "4 http://books.google.com/books/content?id=XhQ5X... \n",
195
+ "\n",
196
+ " description published_year \\\n",
197
+ "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
198
+ "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
199
+ "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
200
+ "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
201
+ "4 Lewis' work on the nature of love divides love... 2002.0 \n",
202
+ "\n",
203
+ " average_rating num_pages ratings_count \n",
204
+ "0 3.85 247.0 361.0 \n",
205
+ "1 3.83 241.0 5164.0 \n",
206
+ "2 3.97 479.0 172.0 \n",
207
+ "3 3.93 512.0 29532.0 \n",
208
+ "4 4.15 170.0 33684.0 "
209
+ ]
210
+ },
211
+ "execution_count": 4,
212
+ "metadata": {},
213
+ "output_type": "execute_result"
214
+ }
215
+ ],
216
+ "source": [
217
+ "data.head()"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 5,
223
+ "metadata": {},
224
+ "outputs": [
225
+ {
226
+ "name": "stdout",
227
+ "output_type": "stream",
228
+ "text": [
229
+ "<class 'pandas.core.frame.DataFrame'>\n",
230
+ "RangeIndex: 6810 entries, 0 to 6809\n",
231
+ "Data columns (total 12 columns):\n",
232
+ " # Column Non-Null Count Dtype \n",
233
+ "--- ------ -------------- ----- \n",
234
+ " 0 isbn13 6810 non-null int64 \n",
235
+ " 1 isbn10 6810 non-null object \n",
236
+ " 2 title 6810 non-null object \n",
237
+ " 3 subtitle 2381 non-null object \n",
238
+ " 4 authors 6738 non-null object \n",
239
+ " 5 categories 6711 non-null object \n",
240
+ " 6 thumbnail 6481 non-null object \n",
241
+ " 7 description 6548 non-null object \n",
242
+ " 8 published_year 6804 non-null float64\n",
243
+ " 9 average_rating 6767 non-null float64\n",
244
+ " 10 num_pages 6767 non-null float64\n",
245
+ " 11 ratings_count 6767 non-null float64\n",
246
+ "dtypes: float64(4), int64(1), object(7)\n",
247
+ "memory usage: 638.6+ KB\n"
248
+ ]
249
+ }
250
+ ],
251
+ "source": [
252
+ "data.info()"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": 6,
258
+ "metadata": {},
259
+ "outputs": [
260
+ {
261
+ "data": {
262
+ "text/plain": [
263
+ "isbn13 0.000000\n",
264
+ "isbn10 0.000000\n",
265
+ "title 0.000000\n",
266
+ "subtitle 65.036711\n",
267
+ "authors 1.057269\n",
268
+ "categories 1.453744\n",
269
+ "thumbnail 4.831131\n",
270
+ "description 3.847283\n",
271
+ "published_year 0.088106\n",
272
+ "average_rating 0.631424\n",
273
+ "num_pages 0.631424\n",
274
+ "ratings_count 0.631424\n",
275
+ "dtype: float64"
276
+ ]
277
+ },
278
+ "execution_count": 6,
279
+ "metadata": {},
280
+ "output_type": "execute_result"
281
+ }
282
+ ],
283
+ "source": [
284
+ "data.isnull().sum() / len(data) * 100"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 7,
290
+ "metadata": {},
291
+ "outputs": [],
292
+ "source": [
293
+ "rows_to_remove = data[(data[\"description\"].isnull()) | (data[\"authors\"].isnull()) | (data[\"published_year\"].isnull() )| (data[\"average_rating\"].isnull()) |( data[\"num_pages\"].isnull()) | ( data[\"ratings_count\"].isnull())]"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": 8,
299
+ "metadata": {},
300
+ "outputs": [
301
+ {
302
+ "data": {
303
+ "text/plain": [
304
+ "5.3744493392070485"
305
+ ]
306
+ },
307
+ "execution_count": 8,
308
+ "metadata": {},
309
+ "output_type": "execute_result"
310
+ }
311
+ ],
312
+ "source": [
313
+ "len(rows_to_remove) / len(data) * 100 # ~5.4% of the data"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "code",
318
+ "execution_count": 9,
319
+ "metadata": {},
320
+ "outputs": [],
321
+ "source": [
322
+ "data = data.drop(index=rows_to_remove.index)"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": 10,
328
+ "metadata": {},
329
+ "outputs": [
330
+ {
331
+ "data": {
332
+ "text/plain": [
333
+ "subtitle 64.959652\n",
334
+ "thumbnail 3.227809\n",
335
+ "categories 0.512104\n",
336
+ "isbn13 0.000000\n",
337
+ "title 0.000000\n",
338
+ "isbn10 0.000000\n",
339
+ "authors 0.000000\n",
340
+ "description 0.000000\n",
341
+ "published_year 0.000000\n",
342
+ "average_rating 0.000000\n",
343
+ "num_pages 0.000000\n",
344
+ "ratings_count 0.000000\n",
345
+ "dtype: float64"
346
+ ]
347
+ },
348
+ "execution_count": 10,
349
+ "metadata": {},
350
+ "output_type": "execute_result"
351
+ }
352
+ ],
353
+ "source": [
354
+ "(data.isnull().sum() / len(data) * 100).sort_values(ascending=False)"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 11,
360
+ "metadata": {},
361
+ "outputs": [
362
+ {
363
+ "data": {
364
+ "text/plain": [
365
+ "categories\n",
366
+ "Fiction 2510\n",
367
+ "Juvenile Fiction 521\n",
368
+ "Biography & Autobiography 390\n",
369
+ "History 256\n",
370
+ "Literary Criticism 163\n",
371
+ " ... \n",
372
+ "Humorous stories 1\n",
373
+ "Ballets 1\n",
374
+ "Aged women 1\n",
375
+ "Catholic women 1\n",
376
+ "Christian fiction 1\n",
377
+ "Name: count, Length: 530, dtype: int64"
378
+ ]
379
+ },
380
+ "execution_count": 11,
381
+ "metadata": {},
382
+ "output_type": "execute_result"
383
+ }
384
+ ],
385
+ "source": [
386
+ "data[\"categories\"].value_counts() # 530 categories is too many; there is something wrong with this column"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": 12,
392
+ "metadata": {},
393
+ "outputs": [
394
+ {
395
+ "data": {
396
+ "text/plain": [
397
+ "5024 Violence erupts in the poor town of Milagro wh...\n",
398
+ "3235 FBI Special Agent Dillon Savich teams up with ...\n",
399
+ "5235 Seventeen-year-old Manhattan society girl Grad...\n",
400
+ "4516 This is the story of the Tuck family, who are ...\n",
401
+ "3204 Prejudice, the intricacies of Mediterranean po...\n",
402
+ "Name: description, dtype: object"
403
+ ]
404
+ },
405
+ "execution_count": 12,
406
+ "metadata": {},
407
+ "output_type": "execute_result"
408
+ }
409
+ ],
410
+ "source": [
411
+ "data[\"description\"].sample(5) # some descriptions are too short to be useful and some are too long\n",
412
+ "# I think descriptions longer than 25 characters are better suited for understanding the context"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 13,
418
+ "metadata": {},
419
+ "outputs": [],
420
+ "source": [
421
+ "data[\"description_chars\"] = data[\"description\"].apply(lambda x: len(x) if isinstance(x, str) else 0)\n",
422
+ "data = data[data[\"description_chars\"] > 25]"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": 14,
428
+ "metadata": {},
429
+ "outputs": [
430
+ {
431
+ "data": {
432
+ "text/plain": [
433
+ "6397"
434
+ ]
435
+ },
436
+ "execution_count": 14,
437
+ "metadata": {},
438
+ "output_type": "execute_result"
439
+ }
440
+ ],
441
+ "source": [
442
+ "len(data)"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 15,
448
+ "metadata": {},
449
+ "outputs": [],
450
+ "source": [
451
+ "# some of the subtitles are missing, so we build a combined title-and-subtitle column to replace both the title and subtitle\n",
452
+ "data[\"title_and_subtitle\"] = data[\"title\"].apply(lambda x: x if isinstance(x, str) else \"\") + \" \" + data[\"subtitle\"].apply(lambda x: x if isinstance(x, str) else \"\")"
453
+ ]
454
+ },
455
+ {
456
+ "cell_type": "code",
457
+ "execution_count": 16,
458
+ "metadata": {},
459
+ "outputs": [
460
+ {
461
+ "data": {
462
+ "text/html": [
463
+ "<div>\n",
464
+ "<style scoped>\n",
465
+ " .dataframe tbody tr th:only-of-type {\n",
466
+ " vertical-align: middle;\n",
467
+ " }\n",
468
+ "\n",
469
+ " .dataframe tbody tr th {\n",
470
+ " vertical-align: top;\n",
471
+ " }\n",
472
+ "\n",
473
+ " .dataframe thead th {\n",
474
+ " text-align: right;\n",
475
+ " }\n",
476
+ "</style>\n",
477
+ "<table border=\"1\" class=\"dataframe\">\n",
478
+ " <thead>\n",
479
+ " <tr style=\"text-align: right;\">\n",
480
+ " <th></th>\n",
481
+ " <th>isbn13</th>\n",
482
+ " <th>isbn10</th>\n",
483
+ " <th>title</th>\n",
484
+ " <th>subtitle</th>\n",
485
+ " <th>authors</th>\n",
486
+ " <th>categories</th>\n",
487
+ " <th>thumbnail</th>\n",
488
+ " <th>description</th>\n",
489
+ " <th>published_year</th>\n",
490
+ " <th>average_rating</th>\n",
491
+ " <th>num_pages</th>\n",
492
+ " <th>ratings_count</th>\n",
493
+ " <th>description_chars</th>\n",
494
+ " <th>title_and_subtitle</th>\n",
495
+ " </tr>\n",
496
+ " </thead>\n",
497
+ " <tbody>\n",
498
+ " <tr>\n",
499
+ " <th>0</th>\n",
500
+ " <td>9780002005883</td>\n",
501
+ " <td>0002005883</td>\n",
502
+ " <td>Gilead</td>\n",
503
+ " <td>NaN</td>\n",
504
+ " <td>Marilynne Robinson</td>\n",
505
+ " <td>Fiction</td>\n",
506
+ " <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
507
+ " <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
508
+ " <td>2004.0</td>\n",
509
+ " <td>3.85</td>\n",
510
+ " <td>247.0</td>\n",
511
+ " <td>361.0</td>\n",
512
+ " <td>1154</td>\n",
513
+ " <td>Gilead</td>\n",
514
+ " </tr>\n",
515
+ " <tr>\n",
516
+ " <th>1</th>\n",
517
+ " <td>9780002261982</td>\n",
518
+ " <td>0002261987</td>\n",
519
+ " <td>Spider's Web</td>\n",
520
+ " <td>A Novel</td>\n",
521
+ " <td>Charles Osborne;Agatha Christie</td>\n",
522
+ " <td>Detective and mystery stories</td>\n",
523
+ " <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
524
+ " <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
525
+ " <td>2000.0</td>\n",
526
+ " <td>3.83</td>\n",
527
+ " <td>241.0</td>\n",
528
+ " <td>5164.0</td>\n",
529
+ " <td>1200</td>\n",
530
+ " <td>Spider's Web A Novel</td>\n",
531
+ " </tr>\n",
532
+ " <tr>\n",
533
+ " <th>2</th>\n",
534
+ " <td>9780006163831</td>\n",
535
+ " <td>0006163831</td>\n",
536
+ " <td>The One Tree</td>\n",
537
+ " <td>NaN</td>\n",
538
+ " <td>Stephen R. Donaldson</td>\n",
539
+ " <td>American fiction</td>\n",
540
+ " <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
541
+ " <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
542
+ " <td>1982.0</td>\n",
543
+ " <td>3.97</td>\n",
544
+ " <td>479.0</td>\n",
545
+ " <td>172.0</td>\n",
546
+ " <td>109</td>\n",
547
+ " <td>The One Tree</td>\n",
548
+ " </tr>\n",
549
+ " <tr>\n",
550
+ " <th>3</th>\n",
551
+ " <td>9780006178736</td>\n",
552
+ " <td>0006178731</td>\n",
553
+ " <td>Rage of angels</td>\n",
554
+ " <td>NaN</td>\n",
555
+ " <td>Sidney Sheldon</td>\n",
556
+ " <td>Fiction</td>\n",
557
+ " <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
558
+ " <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
559
+ " <td>1993.0</td>\n",
560
+ " <td>3.93</td>\n",
561
+ " <td>512.0</td>\n",
562
+ " <td>29532.0</td>\n",
563
+ " <td>359</td>\n",
564
+ " <td>Rage of angels</td>\n",
565
+ " </tr>\n",
566
+ " <tr>\n",
567
+ " <th>4</th>\n",
568
+ " <td>9780006280897</td>\n",
569
+ " <td>0006280897</td>\n",
570
+ " <td>The Four Loves</td>\n",
571
+ " <td>NaN</td>\n",
572
+ " <td>Clive Staples Lewis</td>\n",
573
+ " <td>Christian life</td>\n",
574
+ " <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
575
+ " <td>Lewis' work on the nature of love divides love...</td>\n",
576
+ " <td>2002.0</td>\n",
577
+ " <td>4.15</td>\n",
578
+ " <td>170.0</td>\n",
579
+ " <td>33684.0</td>\n",
580
+ " <td>295</td>\n",
581
+ " <td>The Four Loves</td>\n",
582
+ " </tr>\n",
583
+ " </tbody>\n",
584
+ "</table>\n",
585
+ "</div>"
586
+ ],
587
+ "text/plain": [
588
+ " isbn13 isbn10 title subtitle \\\n",
589
+ "0 9780002005883 0002005883 Gilead NaN \n",
590
+ "1 9780002261982 0002261987 Spider's Web A Novel \n",
591
+ "2 9780006163831 0006163831 The One Tree NaN \n",
592
+ "3 9780006178736 0006178731 Rage of angels NaN \n",
593
+ "4 9780006280897 0006280897 The Four Loves NaN \n",
594
+ "\n",
595
+ " authors categories \\\n",
596
+ "0 Marilynne Robinson Fiction \n",
597
+ "1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
598
+ "2 Stephen R. Donaldson American fiction \n",
599
+ "3 Sidney Sheldon Fiction \n",
600
+ "4 Clive Staples Lewis Christian life \n",
601
+ "\n",
602
+ " thumbnail \\\n",
603
+ "0 http://books.google.com/books/content?id=KQZCP... \n",
604
+ "1 http://books.google.com/books/content?id=gA5GP... \n",
605
+ "2 http://books.google.com/books/content?id=OmQaw... \n",
606
+ "3 http://books.google.com/books/content?id=FKo2T... \n",
607
+ "4 http://books.google.com/books/content?id=XhQ5X... \n",
608
+ "\n",
609
+ " description published_year \\\n",
610
+ "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
611
+ "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
612
+ "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
613
+ "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
614
+ "4 Lewis' work on the nature of love divides love... 2002.0 \n",
615
+ "\n",
616
+ " average_rating num_pages ratings_count description_chars \\\n",
617
+ "0 3.85 247.0 361.0 1154 \n",
618
+ "1 3.83 241.0 5164.0 1200 \n",
619
+ "2 3.97 479.0 172.0 109 \n",
620
+ "3 3.93 512.0 29532.0 359 \n",
621
+ "4 4.15 170.0 33684.0 295 \n",
622
+ "\n",
623
+ " title_and_subtitle \n",
624
+ "0 Gilead \n",
625
+ "1 Spider's Web A Novel \n",
626
+ "2 The One Tree \n",
627
+ "3 Rage of angels \n",
628
+ "4 The Four Loves "
629
+ ]
630
+ },
631
+ "execution_count": 16,
632
+ "metadata": {},
633
+ "output_type": "execute_result"
634
+ }
635
+ ],
636
+ "source": [
637
+ "data.head()"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "code",
642
+ "execution_count": 17,
643
+ "metadata": {},
644
+ "outputs": [],
645
+ "source": [
646
+ "data[\"tagged_description\"] = data[\"isbn13\"].apply(str) + \" \" + data[\"description\"].apply(str)"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "code",
651
+ "execution_count": 18,
652
+ "metadata": {},
653
+ "outputs": [
654
+ {
655
+ "data": {
656
+ "text/html": [
657
+ "<div>\n",
658
+ "<style scoped>\n",
659
+ " .dataframe tbody tr th:only-of-type {\n",
660
+ " vertical-align: middle;\n",
661
+ " }\n",
662
+ "\n",
663
+ " .dataframe tbody tr th {\n",
664
+ " vertical-align: top;\n",
665
+ " }\n",
666
+ "\n",
667
+ " .dataframe thead th {\n",
668
+ " text-align: right;\n",
669
+ " }\n",
670
+ "</style>\n",
671
+ "<table border=\"1\" class=\"dataframe\">\n",
672
+ " <thead>\n",
673
+ " <tr style=\"text-align: right;\">\n",
674
+ " <th></th>\n",
675
+ " <th>isbn13</th>\n",
676
+ " <th>isbn10</th>\n",
677
+ " <th>title</th>\n",
678
+ " <th>subtitle</th>\n",
679
+ " <th>authors</th>\n",
680
+ " <th>categories</th>\n",
681
+ " <th>thumbnail</th>\n",
682
+ " <th>description</th>\n",
683
+ " <th>published_year</th>\n",
684
+ " <th>average_rating</th>\n",
685
+ " <th>num_pages</th>\n",
686
+ " <th>ratings_count</th>\n",
687
+ " <th>description_chars</th>\n",
688
+ " <th>title_and_subtitle</th>\n",
689
+ " <th>tagged_description</th>\n",
690
+ " </tr>\n",
691
+ " </thead>\n",
692
+ " <tbody>\n",
693
+ " <tr>\n",
694
+ " <th>0</th>\n",
695
+ " <td>9780002005883</td>\n",
696
+ " <td>0002005883</td>\n",
697
+ " <td>Gilead</td>\n",
698
+ " <td>NaN</td>\n",
699
+ " <td>Marilynne Robinson</td>\n",
700
+ " <td>Fiction</td>\n",
701
+ " <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
702
+ " <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
703
+ " <td>2004.0</td>\n",
704
+ " <td>3.85</td>\n",
705
+ " <td>247.0</td>\n",
706
+ " <td>361.0</td>\n",
707
+ " <td>1154</td>\n",
708
+ " <td>Gilead</td>\n",
709
+ " <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
710
+ " </tr>\n",
711
+ " <tr>\n",
712
+ " <th>1</th>\n",
713
+ " <td>9780002261982</td>\n",
714
+ " <td>0002261987</td>\n",
715
+ " <td>Spider's Web</td>\n",
716
+ " <td>A Novel</td>\n",
717
+ " <td>Charles Osborne;Agatha Christie</td>\n",
718
+ " <td>Detective and mystery stories</td>\n",
719
+ " <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
720
+ " <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
721
+ " <td>2000.0</td>\n",
722
+ " <td>3.83</td>\n",
723
+ " <td>241.0</td>\n",
724
+ " <td>5164.0</td>\n",
725
+ " <td>1200</td>\n",
726
+ " <td>Spider's Web A Novel</td>\n",
727
+ " <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
728
+ " </tr>\n",
729
+ " <tr>\n",
730
+ " <th>2</th>\n",
731
+ " <td>9780006163831</td>\n",
732
+ " <td>0006163831</td>\n",
733
+ " <td>The One Tree</td>\n",
734
+ " <td>NaN</td>\n",
735
+ " <td>Stephen R. Donaldson</td>\n",
736
+ " <td>American fiction</td>\n",
737
+ " <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
738
+ " <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
739
+ " <td>1982.0</td>\n",
740
+ " <td>3.97</td>\n",
741
+ " <td>479.0</td>\n",
742
+ " <td>172.0</td>\n",
743
+ " <td>109</td>\n",
744
+ " <td>The One Tree</td>\n",
745
+ " <td>9780006163831 Volume Two of Stephen Donaldson'...</td>\n",
746
+ " </tr>\n",
747
+ " <tr>\n",
748
+ " <th>3</th>\n",
749
+ " <td>9780006178736</td>\n",
750
+ " <td>0006178731</td>\n",
751
+ " <td>Rage of angels</td>\n",
752
+ " <td>NaN</td>\n",
753
+ " <td>Sidney Sheldon</td>\n",
754
+ " <td>Fiction</td>\n",
755
+ " <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
756
+ " <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
757
+ " <td>1993.0</td>\n",
758
+ " <td>3.93</td>\n",
759
+ " <td>512.0</td>\n",
760
+ " <td>29532.0</td>\n",
761
+ " <td>359</td>\n",
762
+ " <td>Rage of angels</td>\n",
763
+ " <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
764
+ " </tr>\n",
765
+ " <tr>\n",
766
+ " <th>4</th>\n",
767
+ " <td>9780006280897</td>\n",
768
+ " <td>0006280897</td>\n",
769
+ " <td>The Four Loves</td>\n",
770
+ " <td>NaN</td>\n",
771
+ " <td>Clive Staples Lewis</td>\n",
772
+ " <td>Christian life</td>\n",
773
+ " <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
774
+ " <td>Lewis' work on the nature of love divides love...</td>\n",
775
+ " <td>2002.0</td>\n",
776
+ " <td>4.15</td>\n",
777
+ " <td>170.0</td>\n",
778
+ " <td>33684.0</td>\n",
779
+ " <td>295</td>\n",
780
+ " <td>The Four Loves</td>\n",
781
+ " <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
782
+ " </tr>\n",
783
+ " </tbody>\n",
784
+ "</table>\n",
785
+ "</div>"
786
+ ],
787
+ "text/plain": [
788
+ " isbn13 isbn10 title subtitle \\\n",
789
+ "0 9780002005883 0002005883 Gilead NaN \n",
790
+ "1 9780002261982 0002261987 Spider's Web A Novel \n",
791
+ "2 9780006163831 0006163831 The One Tree NaN \n",
792
+ "3 9780006178736 0006178731 Rage of angels NaN \n",
793
+ "4 9780006280897 0006280897 The Four Loves NaN \n",
794
+ "\n",
795
+ " authors categories \\\n",
796
+ "0 Marilynne Robinson Fiction \n",
797
+ "1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
798
+ "2 Stephen R. Donaldson American fiction \n",
799
+ "3 Sidney Sheldon Fiction \n",
800
+ "4 Clive Staples Lewis Christian life \n",
801
+ "\n",
802
+ " thumbnail \\\n",
803
+ "0 http://books.google.com/books/content?id=KQZCP... \n",
804
+ "1 http://books.google.com/books/content?id=gA5GP... \n",
805
+ "2 http://books.google.com/books/content?id=OmQaw... \n",
806
+ "3 http://books.google.com/books/content?id=FKo2T... \n",
807
+ "4 http://books.google.com/books/content?id=XhQ5X... \n",
808
+ "\n",
809
+ " description published_year \\\n",
810
+ "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
811
+ "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
812
+ "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
813
+ "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
814
+ "4 Lewis' work on the nature of love divides love... 2002.0 \n",
815
+ "\n",
816
+ " average_rating num_pages ratings_count description_chars \\\n",
817
+ "0 3.85 247.0 361.0 1154 \n",
818
+ "1 3.83 241.0 5164.0 1200 \n",
819
+ "2 3.97 479.0 172.0 109 \n",
820
+ "3 3.93 512.0 29532.0 359 \n",
821
+ "4 4.15 170.0 33684.0 295 \n",
822
+ "\n",
823
+ " title_and_subtitle tagged_description \n",
824
+ "0 Gilead 9780002005883 A NOVEL THAT READERS and critics... \n",
825
+ "1 Spider's Web A Novel 9780002261982 A new 'Christie for Christmas' -... \n",
826
+ "2 The One Tree 9780006163831 Volume Two of Stephen Donaldson'... \n",
827
+ "3 Rage of angels 9780006178736 A memorable, mesmerizing heroine... \n",
828
+ "4 The Four Loves 9780006280897 Lewis' work on the nature of lov... "
829
+ ]
830
+ },
831
+ "execution_count": 18,
832
+ "metadata": {},
833
+ "output_type": "execute_result"
834
+ }
835
+ ],
836
+ "source": [
837
+ "data.head()"
838
+ ]
839
+ },
840
+ {
841
+ "cell_type": "code",
842
+ "execution_count": null,
843
+ "metadata": {},
844
+ "outputs": [],
845
+ "source": []
846
+ },
847
+ {
848
+ "cell_type": "code",
849
+ "execution_count": 19,
850
+ "metadata": {},
851
+ "outputs": [
852
+ {
853
+ "data": {
854
+ "text/html": [
855
+ "<div>\n",
856
+ "<style scoped>\n",
857
+ " .dataframe tbody tr th:only-of-type {\n",
858
+ " vertical-align: middle;\n",
859
+ " }\n",
860
+ "\n",
861
+ " .dataframe tbody tr th {\n",
862
+ " vertical-align: top;\n",
863
+ " }\n",
864
+ "\n",
865
+ " .dataframe thead th {\n",
866
+ " text-align: right;\n",
867
+ " }\n",
868
+ "</style>\n",
869
+ "<table border=\"1\" class=\"dataframe\">\n",
870
+ " <thead>\n",
871
+ " <tr style=\"text-align: right;\">\n",
872
+ " <th></th>\n",
873
+ " <th>isbn13</th>\n",
874
+ " <th>isbn10</th>\n",
875
+ " <th>title</th>\n",
876
+ " <th>subtitle</th>\n",
877
+ " <th>authors</th>\n",
878
+ " <th>categories</th>\n",
879
+ " <th>thumbnail</th>\n",
880
+ " <th>description</th>\n",
881
+ " <th>published_year</th>\n",
882
+ " <th>average_rating</th>\n",
883
+ " <th>num_pages</th>\n",
884
+ " <th>ratings_count</th>\n",
885
+ " <th>description_chars</th>\n",
886
+ " <th>title_and_subtitle</th>\n",
887
+ " <th>tagged_description</th>\n",
888
+ " </tr>\n",
889
+ " </thead>\n",
890
+ " <tbody>\n",
891
+ " <tr>\n",
892
+ " <th>0</th>\n",
893
+ " <td>9780002005883</td>\n",
894
+ " <td>0002005883</td>\n",
895
+ " <td>Gilead</td>\n",
896
+ " <td>NaN</td>\n",
897
+ " <td>Marilynne Robinson</td>\n",
898
+ " <td>Fiction</td>\n",
899
+ " <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
900
+ " <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
901
+ " <td>2004.0</td>\n",
902
+ " <td>3.85</td>\n",
903
+ " <td>247.0</td>\n",
904
+ " <td>361.0</td>\n",
905
+ " <td>1154</td>\n",
906
+ " <td>Gilead</td>\n",
907
+ " <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
908
+ " </tr>\n",
909
+ " <tr>\n",
910
+ " <th>1</th>\n",
911
+ " <td>9780002261982</td>\n",
912
+ " <td>0002261987</td>\n",
913
+ " <td>Spider's Web</td>\n",
914
+ " <td>A Novel</td>\n",
915
+ " <td>Charles Osborne;Agatha Christie</td>\n",
916
+ " <td>Detective and mystery stories</td>\n",
917
+ " <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
918
+ " <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
919
+ " <td>2000.0</td>\n",
920
+ " <td>3.83</td>\n",
921
+ " <td>241.0</td>\n",
922
+ " <td>5164.0</td>\n",
923
+ " <td>1200</td>\n",
924
+ " <td>Spider's Web A Novel</td>\n",
925
+ " <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
926
+ " </tr>\n",
927
+ " <tr>\n",
928
+ " <th>2</th>\n",
929
+ " <td>9780006163831</td>\n",
930
+ " <td>0006163831</td>\n",
931
+ " <td>The One Tree</td>\n",
932
+ " <td>NaN</td>\n",
933
+ " <td>Stephen R. Donaldson</td>\n",
934
+ " <td>American fiction</td>\n",
935
+ " <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
936
+ " <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
937
+ " <td>1982.0</td>\n",
938
+ " <td>3.97</td>\n",
939
+ " <td>479.0</td>\n",
940
+ " <td>172.0</td>\n",
941
+ " <td>109</td>\n",
942
+ " <td>The One Tree</td>\n",
943
+ " <td>9780006163831 Volume Two of Stephen Donaldson'...</td>\n",
944
+ " </tr>\n",
945
+ " <tr>\n",
946
+ " <th>3</th>\n",
947
+ " <td>9780006178736</td>\n",
948
+ " <td>0006178731</td>\n",
949
+ " <td>Rage of angels</td>\n",
950
+ " <td>NaN</td>\n",
951
+ " <td>Sidney Sheldon</td>\n",
952
+ " <td>Fiction</td>\n",
953
+ " <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
954
+ " <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
955
+ " <td>1993.0</td>\n",
956
+ " <td>3.93</td>\n",
957
+ " <td>512.0</td>\n",
958
+ " <td>29532.0</td>\n",
959
+ " <td>359</td>\n",
960
+ " <td>Rage of angels</td>\n",
961
+ " <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
962
+ " </tr>\n",
963
+ " <tr>\n",
964
+ " <th>4</th>\n",
965
+ " <td>9780006280897</td>\n",
966
+ " <td>0006280897</td>\n",
967
+ " <td>The Four Loves</td>\n",
968
+ " <td>NaN</td>\n",
969
+ " <td>Clive Staples Lewis</td>\n",
970
+ " <td>Christian life</td>\n",
971
+ " <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
972
+ " <td>Lewis' work on the nature of love divides love...</td>\n",
973
+ " <td>2002.0</td>\n",
974
+ " <td>4.15</td>\n",
975
+ " <td>170.0</td>\n",
976
+ " <td>33684.0</td>\n",
977
+ " <td>295</td>\n",
978
+ " <td>The Four Loves</td>\n",
979
+ " <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
980
+ " </tr>\n",
981
+ " </tbody>\n",
982
+ "</table>\n",
983
+ "</div>"
984
+ ],
985
+ "text/plain": [
986
+ " isbn13 isbn10 title subtitle \\\n",
987
+ "0 9780002005883 0002005883 Gilead NaN \n",
988
+ "1 9780002261982 0002261987 Spider's Web A Novel \n",
989
+ "2 9780006163831 0006163831 The One Tree NaN \n",
990
+ "3 9780006178736 0006178731 Rage of angels NaN \n",
991
+ "4 9780006280897 0006280897 The Four Loves NaN \n",
992
+ "\n",
993
+ " authors categories \\\n",
994
+ "0 Marilynne Robinson Fiction \n",
995
+ "1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
996
+ "2 Stephen R. Donaldson American fiction \n",
997
+ "3 Sidney Sheldon Fiction \n",
998
+ "4 Clive Staples Lewis Christian life \n",
999
+ "\n",
1000
+ " thumbnail \\\n",
1001
+ "0 http://books.google.com/books/content?id=KQZCP... \n",
1002
+ "1 http://books.google.com/books/content?id=gA5GP... \n",
1003
+ "2 http://books.google.com/books/content?id=OmQaw... \n",
1004
+ "3 http://books.google.com/books/content?id=FKo2T... \n",
1005
+ "4 http://books.google.com/books/content?id=XhQ5X... \n",
1006
+ "\n",
1007
+ " description published_year \\\n",
1008
+ "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
1009
+ "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
1010
+ "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
1011
+ "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
1012
+ "4 Lewis' work on the nature of love divides love... 2002.0 \n",
1013
+ "\n",
1014
+ " average_rating num_pages ratings_count description_chars \\\n",
1015
+ "0 3.85 247.0 361.0 1154 \n",
1016
+ "1 3.83 241.0 5164.0 1200 \n",
1017
+ "2 3.97 479.0 172.0 109 \n",
1018
+ "3 3.93 512.0 29532.0 359 \n",
1019
+ "4 4.15 170.0 33684.0 295 \n",
1020
+ "\n",
1021
+ " title_and_subtitle tagged_description \n",
1022
+ "0 Gilead 9780002005883 A NOVEL THAT READERS and critics... \n",
1023
+ "1 Spider's Web A Novel 9780002261982 A new 'Christie for Christmas' -... \n",
1024
+ "2 The One Tree 9780006163831 Volume Two of Stephen Donaldson'... \n",
1025
+ "3 Rage of angels 9780006178736 A memorable, mesmerizing heroine... \n",
1026
+ "4 The Four Loves 9780006280897 Lewis' work on the nature of lov... "
1027
+ ]
1028
+ },
1029
+ "execution_count": 19,
1030
+ "metadata": {},
1031
+ "output_type": "execute_result"
1032
+ }
1033
+ ],
1034
+ "source": [
1035
+ "data.head()"
1036
+ ]
1037
+ },
1038
+ {
1039
+ "cell_type": "code",
1040
+ "execution_count": 20,
1041
+ "metadata": {},
1042
+ "outputs": [],
1043
+ "source": [
1044
+ "data = data.drop(columns=[ \"title\", \"subtitle\", \"description_chars\",\"isbn10\"], axis=1)\n",
1045
+ "data.to_csv(\"books_cleaned.csv\", index=False)"
1046
+ ]
1047
+ },
1048
+ {
1049
+ "cell_type": "code",
1050
+ "execution_count": 21,
1051
+ "metadata": {},
1052
+ "outputs": [
1053
+ {
1054
+ "name": "stdout",
1055
+ "output_type": "stream",
1056
+ "text": [
1057
+ "<class 'pandas.core.frame.DataFrame'>\n",
1058
+ "Index: 6397 entries, 0 to 6809\n",
1059
+ "Data columns (total 11 columns):\n",
1060
+ " # Column Non-Null Count Dtype \n",
1061
+ "--- ------ -------------- ----- \n",
1062
+ " 0 isbn13 6397 non-null int64 \n",
1063
+ " 1 authors 6397 non-null object \n",
1064
+ " 2 categories 6364 non-null object \n",
1065
+ " 3 thumbnail 6190 non-null object \n",
1066
+ " 4 description 6397 non-null object \n",
1067
+ " 5 published_year 6397 non-null float64\n",
1068
+ " 6 average_rating 6397 non-null float64\n",
1069
+ " 7 num_pages 6397 non-null float64\n",
1070
+ " 8 ratings_count 6397 non-null float64\n",
1071
+ " 9 title_and_subtitle 6397 non-null object \n",
1072
+ " 10 tagged_description 6397 non-null object \n",
1073
+ "dtypes: float64(4), int64(1), object(6)\n",
1074
+ "memory usage: 599.7+ KB\n"
1075
+ ]
1076
+ }
1077
+ ],
1078
+ "source": [
1079
+ "data.info()"
1080
+ ]
1081
+ },
1082
+ {
1083
+ "cell_type": "code",
1084
+ "execution_count": null,
1085
+ "metadata": {},
1086
+ "outputs": [],
1087
+ "source": []
1088
+ }
1089
+ ],
1090
+ "metadata": {
1091
+ "kernelspec": {
1092
+ "display_name": "venv",
1093
+ "language": "python",
1094
+ "name": "python3"
1095
+ },
1096
+ "language_info": {
1097
+ "codemirror_mode": {
1098
+ "name": "ipython",
1099
+ "version": 3
1100
+ },
1101
+ "file_extension": ".py",
1102
+ "mimetype": "text/x-python",
1103
+ "name": "python",
1104
+ "nbconvert_exporter": "python",
1105
+ "pygments_lexer": "ipython3",
1106
+ "version": "3.11.9"
1107
+ }
1108
+ },
1109
+ "nbformat": 4,
1110
+ "nbformat_minor": 0
1111
+ }
download_url.ipynb ADDED
@@ -0,0 +1,1353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "colab": {
8
+ "base_uri": "https://localhost:8080/"
9
+ },
10
+ "id": "39eGIDIr-3ju",
11
+ "outputId": "ac47d677-9538-4860-dc27-b0d43ef6e41d"
12
+ },
13
+ "outputs": [
14
+ {
15
+ "name": "stdout",
16
+ "output_type": "stream",
17
+ "text": [
18
+ "Requirement already satisfied: googlesearch-python in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (1.3.0)\n",
19
+ "Requirement already satisfied: beautifulsoup4>=4.9 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from googlesearch-python) (4.13.4)\n",
20
+ "Requirement already satisfied: requests>=2.20 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from googlesearch-python) (2.32.3)\n",
21
+ "Requirement already satisfied: soupsieve>1.2 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from beautifulsoup4>=4.9->googlesearch-python) (2.7)\n",
22
+ "Requirement already satisfied: typing-extensions>=4.0.0 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from beautifulsoup4>=4.9->googlesearch-python) (4.14.0)\n",
23
+ "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from requests>=2.20->googlesearch-python) (3.4.2)\n",
24
+ "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from requests>=2.20->googlesearch-python) (3.10)\n",
25
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from requests>=2.20->googlesearch-python) (2.4.0)\n",
26
+ "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from requests>=2.20->googlesearch-python) (2025.4.26)\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "\n",
34
+ "[notice] A new release of pip is available: 24.0 -> 25.1.1\n",
35
+ "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
36
+ ]
37
+ }
38
+ ],
39
+ "source": [
40
+ "!pip install googlesearch-python\n"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "metadata": {
47
+ "id": "6KyyK4zD_eqC"
48
+ },
49
+ "outputs": [],
50
+ "source": []
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 2,
55
+ "metadata": {
56
+ "id": "9vzs6ees_CDr"
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "import time\n",
61
+ "import random\n",
62
+ "import numpy as np\n",
63
+ "from googlesearch import search\n",
64
+ "import pandas as pd\n",
65
+ "from typing import Optional\n",
66
+ "import logging\n",
67
+ "\n",
68
+ "# Set up logging\n",
69
+ "logging.basicConfig(level=logging.INFO)\n",
70
+ "logger = logging.getLogger(__name__)\n",
71
+ "\n",
72
+ "def fetch_first_google_link_with_backoff(query: str, max_retries: int = 3) -> Optional[str]:\n",
73
+ " \"\"\"\n",
74
+ " Fetch first Google search result with exponential backoff and retry logic\n",
75
+ " \"\"\"\n",
76
+ " for attempt in range(max_retries + 1):\n",
77
+ " try:\n",
78
+ " # Add random delay between 1-5 seconds\n",
79
+ " base_delay = random.uniform(1, 5)\n",
80
+ " time.sleep(base_delay)\n",
81
+ " \n",
82
+ " results = search(query, num_results=1, lang=\"en\")\n",
83
+ " url = list(results)[0]\n",
84
+ " \n",
85
+ " if url.startswith(\"https://books.google.com\"):\n",
86
+ " return url\n",
87
+ " else:\n",
88
+ " # Additional delay before backup search\n",
89
+ " time.sleep(random.uniform(2, 4))\n",
90
+ " results = search(query.replace(\"google\", \"amazon\"), num_results=1, lang=\"en\")\n",
91
+ " return list(results)[0]\n",
92
+ " \n",
93
+ " except Exception as e:\n",
94
+ " if \"429\" in str(e) or \"Too Many Requests\" in str(e):\n",
95
+ " if attempt < max_retries:\n",
96
+ " # Exponential backoff: 2^attempt * base_time + random jitter\n",
97
+ " wait_time = (2 ** attempt) * 10 + random.uniform(5, 15)\n",
98
+ " logger.warning(f\"Rate limited on attempt {attempt + 1}. Waiting {wait_time:.1f} seconds...\")\n",
99
+ " time.sleep(wait_time)\n",
100
+ " continue\n",
101
+ " else:\n",
102
+ " logger.error(f\"Max retries exceeded for query: {query}\")\n",
103
+ " return None\n",
104
+ " else:\n",
105
+ " logger.error(f\"Unexpected error for query '{query}': {e}\")\n",
106
+ " return None\n",
107
+ " \n",
108
+ " return None\n",
109
+ "\n",
110
+ "def process_queries_in_batches(queries_df: pd.DataFrame, batch_size: int = 50, \n",
111
+ " batch_delay: int = 300) -> pd.Series:\n",
112
+ " \"\"\"\n",
113
+ " Process queries in batches with delays between batches\n",
114
+ " \"\"\"\n",
115
+ " results = []\n",
116
+ " total_queries = len(queries_df)\n",
117
+ " \n",
118
+ " for i in range(0, total_queries, batch_size):\n",
119
+ " batch_end = min(i + batch_size, total_queries)\n",
120
+ " batch_queries = queries_df.iloc[i:batch_end]\n",
121
+ " \n",
122
+ " logger.info(f\"Processing batch {i//batch_size + 1}: queries {i+1}-{batch_end} of {total_queries}\")\n",
123
+ " \n",
124
+ " # Process batch\n",
125
+ " batch_results = batch_queries.apply(fetch_first_google_link_with_backoff)\n",
126
+ " results.extend(batch_results.tolist())\n",
127
+ " \n",
128
+ " # Delay between batches (except for the last batch)\n",
129
+ " if batch_end < total_queries:\n",
130
+ " logger.info(f\"Batch complete. Waiting {batch_delay} seconds before next batch...\")\n",
131
+ " time.sleep(batch_delay)\n",
132
+ " \n",
133
+ " return pd.Series(results, index=queries_df.index)\n",
134
+ "\n",
135
+ "# Alternative approach: Save progress incrementally\n",
136
+ "def process_queries_with_checkpoints(queries_df: pd.DataFrame, \n",
137
+ " checkpoint_file: str = \"search_progress.csv\",\n",
138
+ " start_index: int = 0) -> pd.Series:\n",
139
+ " \"\"\"\n",
140
+ " Process queries with periodic checkpoints to resume if interrupted\n",
141
+ " \"\"\"\n",
142
+ " results = [None] * len(queries_df)\n",
143
+ " \n",
144
+ " # Load existing progress if checkpoint exists\n",
145
+ " try:\n",
146
+ " checkpoint_df = pd.read_csv(checkpoint_file)\n",
147
+ " for idx, row in checkpoint_df.iterrows():\n",
148
+ " if row['result'] is not np.nan:\n",
149
+ " results[row['query_index']] = row['result']\n",
150
+ " logger.info(f\"Loaded {len(checkpoint_df)} previous results from checkpoint\")\n",
151
+ " except FileNotFoundError:\n",
152
+ " logger.info(\"No checkpoint file found, starting fresh\")\n",
153
+ " \n",
154
+ " for i in range(start_index, len(queries_df)):\n",
155
+ " if results[i] is not None: # Skip if already processed\n",
156
+ " continue\n",
157
+ " \n",
158
+ " query = queries_df.iloc[i]\n",
159
+ " logger.info(f\"Processing query {i+1}/{len(queries_df)}: {query}\")\n",
160
+ " \n",
161
+ " result = fetch_first_google_link_with_backoff(query)\n",
162
+ " results[i] = result\n",
163
+ " \n",
164
+ " # Save checkpoint every 10 queries\n",
165
+ " if (i + 1) % 10 == 0:\n",
166
+ " checkpoint_data = {\n",
167
+ " 'query_index': range(len(results)),\n",
168
+ " 'query': queries_df.tolist(),\n",
169
+ " 'result': results\n",
170
+ " }\n",
171
+ " pd.DataFrame(checkpoint_data).to_csv(checkpoint_file, index=False)\n",
172
+ " logger.info(f\"Checkpoint saved at query {i+1}\")\n",
173
+ " \n",
174
+ " return pd.Series(results, index=queries_df.index)\n",
175
+ "\n"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": []
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 3,
188
+ "metadata": {
189
+ "id": "guCqCfDy_E_V"
190
+ },
191
+ "outputs": [],
192
+ "source": [
193
+ "df = pd.read_csv(\"books_cleaned.csv\", encoding=\"utf-8\")"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": 4,
199
+ "metadata": {
200
+ "id": "z1uMJic9_X73"
201
+ },
202
+ "outputs": [],
203
+ "source": [
204
+ "queries_df = df[\"title_and_subtitle\"] + \" by \" + df[\"authors\"] + \"- google books\""
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": []
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "metadata": {
218
+ "colab": {
219
+ "base_uri": "https://localhost:8080/",
220
+ "height": 428
221
+ },
222
+ "id": "ocpGW7wW_axk",
223
+ "outputId": "512704e8-6d1a-4b66-e171-45b9157f0942"
224
+ },
225
+ "outputs": [],
226
+ "source": []
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "metadata": {
232
+ "id": "ydbB5Jd9_jcU"
233
+ },
234
+ "outputs": [
235
+ {
236
+ "name": "stderr",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "INFO:__main__:Loaded 6397 previous results from checkpoint\n",
240
+ "INFO:__main__:Processing query 807/6397: City of God by Augustine;Henry Scowcroft Bettenson;Gillian Rosemary Evans- google books\n",
241
+ "INFO:__main__:Processing query 1618/6397: The Complete Stories of Evelyn Waugh by Evelyn Waugh- google books\n",
242
+ "INFO:__main__:Processing query 2355/6397: Hawthorne's Short Stories by Nathaniel Hawthorne- google books\n",
243
+ "INFO:__main__:Processing query 5781/6397: Little Butterfly by Hinako Takanaga;Sachiko Sato- google books\n",
244
+ "INFO:__main__:Processing query 6391/6397: Aspects of the Novel by E. M. Forster- google books\n",
245
+ "INFO:__main__:Processing query 6392/6397: Mistaken Identity by Nayantara Sahgal- google books\n",
246
+ "INFO:__main__:Processing query 6393/6397: Journey to the East by Hermann Hesse- google books\n",
247
+ "INFO:__main__:Processing query 6394/6397: The Monk Who Sold His Ferrari: A Fable About Fulfilling Your Dreams & Reaching Your Destiny by Robin Sharma- google books\n",
248
+ "INFO:__main__:Processing query 6395/6397: I Am that Talks with Sri Nisargadatta Maharaj by Sri Nisargadatta Maharaj;Sudhakar S. Dikshit- google books\n",
249
+ "INFO:__main__:Processing query 6396/6397: The Berlin Phenomenology by Georg Wilhelm Friedrich Hegel- google books\n",
250
+ "INFO:__main__:Processing query 6397/6397: 'I'm Telling You Stories' Jeanette Winterson and the Politics of Reading by Helena Grice;Tim Woods- google books\n"
251
+ ]
252
+ }
253
+ ],
254
+ "source": [
255
+ "process_queries_with_checkpoints(queries_df)"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 5,
261
+ "metadata": {},
262
+ "outputs": [],
263
+ "source": [
264
+ "queries_df = pd.concat([queries_df,pd.read_csv(\"search_progress.csv\")[\"result\"]], axis=1)"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": 6,
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": [
273
+ "queries_df.columns = [\"title\", \"url\"]"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 18,
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "unfinished = queries_df[(queries_df.isnull().any(axis=1)) | ~((queries_df[\"url\"].str.contains(\"amazon\", na=False)) | (queries_df[\"url\"].str.contains(\"google\", na=False)))]"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 19,
288
+ "metadata": {},
289
+ "outputs": [
290
+ {
291
+ "data": {
292
+ "text/html": [
293
+ "<div>\n",
294
+ "<style scoped>\n",
295
+ " .dataframe tbody tr th:only-of-type {\n",
296
+ " vertical-align: middle;\n",
297
+ " }\n",
298
+ "\n",
299
+ " .dataframe tbody tr th {\n",
300
+ " vertical-align: top;\n",
301
+ " }\n",
302
+ "\n",
303
+ " .dataframe thead th {\n",
304
+ " text-align: right;\n",
305
+ " }\n",
306
+ "</style>\n",
307
+ "<table border=\"1\" class=\"dataframe\">\n",
308
+ " <thead>\n",
309
+ " <tr style=\"text-align: right;\">\n",
310
+ " <th></th>\n",
311
+ " <th>title</th>\n",
312
+ " <th>url</th>\n",
313
+ " </tr>\n",
314
+ " </thead>\n",
315
+ " <tbody>\n",
316
+ " <tr>\n",
317
+ " <th>73</th>\n",
318
+ " <td>I Can Read with Me Eyes Shut! by Dr. Seuss- g...</td>\n",
319
+ " <td>/search?num=3</td>\n",
320
+ " </tr>\n",
321
+ " <tr>\n",
322
+ " <th>101</th>\n",
323
+ " <td>Tyranny of the Majority Funamental Fairness in...</td>\n",
324
+ " <td>/search?num=3</td>\n",
325
+ " </tr>\n",
326
+ " <tr>\n",
327
+ " <th>126</th>\n",
328
+ " <td>Mars and Venus Book of Days 365 Inspriations t...</td>\n",
329
+ " <td>/search?num=3</td>\n",
330
+ " </tr>\n",
331
+ " <tr>\n",
332
+ " <th>128</th>\n",
333
+ " <td>Today I Feel Silly &amp; Other Moods That Make My ...</td>\n",
334
+ " <td>/search?num=3</td>\n",
335
+ " </tr>\n",
336
+ " <tr>\n",
337
+ " <th>314</th>\n",
338
+ " <td>DREAM &amp; THE UNDERWOR by James Hillman- google...</td>\n",
339
+ " <td>/search?num=3</td>\n",
340
+ " </tr>\n",
341
+ " <tr>\n",
342
+ " <th>...</th>\n",
343
+ " <td>...</td>\n",
344
+ " <td>...</td>\n",
345
+ " </tr>\n",
346
+ " <tr>\n",
347
+ " <th>6392</th>\n",
348
+ " <td>Journey to the East by Hermann Hesse- google ...</td>\n",
349
+ " <td>NaN</td>\n",
350
+ " </tr>\n",
351
+ " <tr>\n",
352
+ " <th>6393</th>\n",
353
+ " <td>The Monk Who Sold His Ferrari: A Fable About F...</td>\n",
354
+ " <td>NaN</td>\n",
355
+ " </tr>\n",
356
+ " <tr>\n",
357
+ " <th>6394</th>\n",
358
+ " <td>I Am that Talks with Sri Nisargadatta Maharaj ...</td>\n",
359
+ " <td>NaN</td>\n",
360
+ " </tr>\n",
361
+ " <tr>\n",
362
+ " <th>6395</th>\n",
363
+ " <td>The Berlin Phenomenology by Georg Wilhelm Fri...</td>\n",
364
+ " <td>NaN</td>\n",
365
+ " </tr>\n",
366
+ " <tr>\n",
367
+ " <th>6396</th>\n",
368
+ " <td>'I'm Telling You Stories' Jeanette Winterson a...</td>\n",
369
+ " <td>NaN</td>\n",
370
+ " </tr>\n",
371
+ " </tbody>\n",
372
+ "</table>\n",
373
+ "<p>100 rows × 2 columns</p>\n",
374
+ "</div>"
375
+ ],
376
+ "text/plain": [
377
+ " title url\n",
378
+ "73 I Can Read with Me Eyes Shut! by Dr. Seuss- g... /search?num=3\n",
379
+ "101 Tyranny of the Majority Funamental Fairness in... /search?num=3\n",
380
+ "126 Mars and Venus Book of Days 365 Inspriations t... /search?num=3\n",
381
+ "128 Today I Feel Silly & Other Moods That Make My ... /search?num=3\n",
382
+ "314 DREAM & THE UNDERWOR by James Hillman- google... /search?num=3\n",
383
+ "... ... ...\n",
384
+ "6392 Journey to the East by Hermann Hesse- google ... NaN\n",
385
+ "6393 The Monk Who Sold His Ferrari: A Fable About F... NaN\n",
386
+ "6394 I Am that Talks with Sri Nisargadatta Maharaj ... NaN\n",
387
+ "6395 The Berlin Phenomenology by Georg Wilhelm Fri... NaN\n",
388
+ "6396 'I'm Telling You Stories' Jeanette Winterson a... NaN\n",
389
+ "\n",
390
+ "[100 rows x 2 columns]"
391
+ ]
392
+ },
393
+ "execution_count": 19,
394
+ "metadata": {},
395
+ "output_type": "execute_result"
396
+ }
397
+ ],
398
+ "source": [
399
+ "unfinished"
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": 20,
405
+ "metadata": {},
406
+ "outputs": [
407
+ {
408
+ "data": {
409
+ "text/plain": [
410
+ "[None,\n",
411
+ " None,\n",
412
+ " None,\n",
413
+ " None,\n",
414
+ " None,\n",
415
+ " None,\n",
416
+ " None,\n",
417
+ " None,\n",
418
+ " None,\n",
419
+ " None,\n",
420
+ " None,\n",
421
+ " None,\n",
422
+ " None,\n",
423
+ " None,\n",
424
+ " None,\n",
425
+ " None,\n",
426
+ " None,\n",
427
+ " None,\n",
428
+ " None,\n",
429
+ " None,\n",
430
+ " None,\n",
431
+ " None,\n",
432
+ " None,\n",
433
+ " None,\n",
434
+ " None,\n",
435
+ " None,\n",
436
+ " None,\n",
437
+ " None,\n",
438
+ " None,\n",
439
+ " None,\n",
440
+ " None,\n",
441
+ " None,\n",
442
+ " None,\n",
443
+ " None,\n",
444
+ " None,\n",
445
+ " None,\n",
446
+ " None,\n",
447
+ " None,\n",
448
+ " None,\n",
449
+ " None,\n",
450
+ " None,\n",
451
+ " None,\n",
452
+ " None,\n",
453
+ " None,\n",
454
+ " None,\n",
455
+ " None,\n",
456
+ " None,\n",
457
+ " None,\n",
458
+ " None,\n",
459
+ " None,\n",
460
+ " None,\n",
461
+ " None,\n",
462
+ " None,\n",
463
+ " None,\n",
464
+ " None,\n",
465
+ " None,\n",
466
+ " None,\n",
467
+ " None,\n",
468
+ " None,\n",
469
+ " None,\n",
470
+ " None,\n",
471
+ " None,\n",
472
+ " None,\n",
473
+ " None,\n",
474
+ " None,\n",
475
+ " None,\n",
476
+ " None,\n",
477
+ " None,\n",
478
+ " None,\n",
479
+ " None,\n",
480
+ " None,\n",
481
+ " None,\n",
482
+ " None,\n",
483
+ " None,\n",
484
+ " None,\n",
485
+ " None,\n",
486
+ " None,\n",
487
+ " None,\n",
488
+ " None,\n",
489
+ " None,\n",
490
+ " None,\n",
491
+ " None,\n",
492
+ " None,\n",
493
+ " None,\n",
494
+ " None,\n",
495
+ " None,\n",
496
+ " None,\n",
497
+ " None,\n",
498
+ " None,\n",
499
+ " None,\n",
500
+ " None,\n",
501
+ " None,\n",
502
+ " None,\n",
503
+ " None,\n",
504
+ " None,\n",
505
+ " None,\n",
506
+ " None,\n",
507
+ " None,\n",
508
+ " None,\n",
509
+ " None]"
510
+ ]
511
+ },
512
+ "execution_count": 20,
513
+ "metadata": {},
514
+ "output_type": "execute_result"
515
+ }
516
+ ],
517
+ "source": [
518
+ "unfinished_urls"
519
+ ]
520
+ },
521
+ {
522
+ "cell_type": "code",
523
+ "execution_count": null,
524
+ "metadata": {},
525
+ "outputs": [
526
+ {
527
+ "data": {
528
+ "text/plain": [
529
+ "'/search?num=3'"
530
+ ]
531
+ },
532
+ "execution_count": 8,
533
+ "metadata": {},
534
+ "output_type": "execute_result"
535
+ }
536
+ ],
537
+ "source": [
538
+ "fetch_first_google_link_with_backoff(unfinished[\"title\"].tolist()[0])"
539
+ ]
540
+ },
541
+ {
542
+ "cell_type": "code",
543
+ "execution_count": 21,
544
+ "metadata": {},
545
+ "outputs": [
546
+ {
547
+ "name": "stderr",
548
+ "output_type": "stream",
549
+ "text": [
550
+ "C:\\Users\\NonsoDev\\AppData\\Local\\Temp\\ipykernel_40848\\271023033.py:1: SettingWithCopyWarning: \n",
551
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
552
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
553
+ "\n",
554
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
555
+ " unfinished[\"hhh\"] = unfinished_urls\n"
556
+ ]
557
+ }
558
+ ],
559
+ "source": [
560
+ "unfinished[\"hhh\"] = unfinished_urls"
561
+ ]
562
+ },
563
+ {
564
+ "cell_type": "code",
565
+ "execution_count": 22,
566
+ "metadata": {},
567
+ "outputs": [
568
+ {
569
+ "data": {
570
+ "text/html": [
571
+ "<div>\n",
572
+ "<style scoped>\n",
573
+ " .dataframe tbody tr th:only-of-type {\n",
574
+ " vertical-align: middle;\n",
575
+ " }\n",
576
+ "\n",
577
+ " .dataframe tbody tr th {\n",
578
+ " vertical-align: top;\n",
579
+ " }\n",
580
+ "\n",
581
+ " .dataframe thead th {\n",
582
+ " text-align: right;\n",
583
+ " }\n",
584
+ "</style>\n",
585
+ "<table border=\"1\" class=\"dataframe\">\n",
586
+ " <thead>\n",
587
+ " <tr style=\"text-align: right;\">\n",
588
+ " <th></th>\n",
589
+ " <th>title</th>\n",
590
+ " <th>url</th>\n",
591
+ " <th>hhh</th>\n",
592
+ " </tr>\n",
593
+ " </thead>\n",
594
+ " <tbody>\n",
595
+ " <tr>\n",
596
+ " <th>73</th>\n",
597
+ " <td>I Can Read with Me Eyes Shut! by Dr. Seuss- g...</td>\n",
598
+ " <td>/search?num=3</td>\n",
599
+ " <td>None</td>\n",
600
+ " </tr>\n",
601
+ " <tr>\n",
602
+ " <th>101</th>\n",
603
+ " <td>Tyranny of the Majority Funamental Fairness in...</td>\n",
604
+ " <td>/search?num=3</td>\n",
605
+ " <td>None</td>\n",
606
+ " </tr>\n",
607
+ " <tr>\n",
608
+ " <th>126</th>\n",
609
+ " <td>Mars and Venus Book of Days 365 Inspriations t...</td>\n",
610
+ " <td>/search?num=3</td>\n",
611
+ " <td>None</td>\n",
612
+ " </tr>\n",
613
+ " <tr>\n",
614
+ " <th>128</th>\n",
615
+ " <td>Today I Feel Silly &amp; Other Moods That Make My ...</td>\n",
616
+ " <td>/search?num=3</td>\n",
617
+ " <td>None</td>\n",
618
+ " </tr>\n",
619
+ " <tr>\n",
620
+ " <th>314</th>\n",
621
+ " <td>DREAM &amp; THE UNDERWOR by James Hillman- google...</td>\n",
622
+ " <td>/search?num=3</td>\n",
623
+ " <td>None</td>\n",
624
+ " </tr>\n",
625
+ " </tbody>\n",
626
+ "</table>\n",
627
+ "</div>"
628
+ ],
629
+ "text/plain": [
630
+ " title url hhh\n",
631
+ "73 I Can Read with Me Eyes Shut! by Dr. Seuss- g... /search?num=3 None\n",
632
+ "101 Tyranny of the Majority Funamental Fairness in... /search?num=3 None\n",
633
+ "126 Mars and Venus Book of Days 365 Inspriations t... /search?num=3 None\n",
634
+ "128 Today I Feel Silly & Other Moods That Make My ... /search?num=3 None\n",
635
+ "314 DREAM & THE UNDERWOR by James Hillman- google... /search?num=3 None"
636
+ ]
637
+ },
638
+ "execution_count": 22,
639
+ "metadata": {},
640
+ "output_type": "execute_result"
641
+ }
642
+ ],
643
+ "source": [
644
+ "unfinished.head()"
645
+ ]
646
+ },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": 23,
650
+ "metadata": {},
651
+ "outputs": [],
652
+ "source": [
653
+ "df1 = pd.read_csv(\"search_progress1.csv\")"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "code",
658
+ "execution_count": 30,
659
+ "metadata": {},
660
+ "outputs": [
661
+ {
662
+ "data": {
663
+ "text/plain": [
664
+ "806 NaN\n",
665
+ "Name: url, dtype: object"
666
+ ]
667
+ },
668
+ "execution_count": 30,
669
+ "metadata": {},
670
+ "output_type": "execute_result"
671
+ }
672
+ ],
673
+ "source": [
674
+ "df1[\"url\"][df1[\"url\"].isna()]"
675
+ ]
676
+ },
677
+ {
678
+ "cell_type": "code",
679
+ "execution_count": 31,
680
+ "metadata": {},
681
+ "outputs": [
682
+ {
683
+ "data": {
684
+ "text/html": [
685
+ "<div>\n",
686
+ "<style scoped>\n",
687
+ " .dataframe tbody tr th:only-of-type {\n",
688
+ " vertical-align: middle;\n",
689
+ " }\n",
690
+ "\n",
691
+ " .dataframe tbody tr th {\n",
692
+ " vertical-align: top;\n",
693
+ " }\n",
694
+ "\n",
695
+ " .dataframe thead th {\n",
696
+ " text-align: right;\n",
697
+ " }\n",
698
+ "</style>\n",
699
+ "<table border=\"1\" class=\"dataframe\">\n",
700
+ " <thead>\n",
701
+ " <tr style=\"text-align: right;\">\n",
702
+ " <th></th>\n",
703
+ " <th>isbn13</th>\n",
704
+ " <th>authors</th>\n",
705
+ " <th>categories</th>\n",
706
+ " <th>thumbnail</th>\n",
707
+ " <th>description</th>\n",
708
+ " <th>published_year</th>\n",
709
+ " <th>average_rating</th>\n",
710
+ " <th>num_pages</th>\n",
711
+ " <th>ratings_count</th>\n",
712
+ " <th>title_and_subtitle</th>\n",
713
+ " <th>tagged_description</th>\n",
714
+ " </tr>\n",
715
+ " </thead>\n",
716
+ " <tbody>\n",
717
+ " <tr>\n",
718
+ " <th>0</th>\n",
719
+ " <td>9780002005883</td>\n",
720
+ " <td>Marilynne Robinson</td>\n",
721
+ " <td>Fiction</td>\n",
722
+ " <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
723
+ " <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
724
+ " <td>2004.0</td>\n",
725
+ " <td>3.85</td>\n",
726
+ " <td>247.0</td>\n",
727
+ " <td>361.0</td>\n",
728
+ " <td>Gilead</td>\n",
729
+ " <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
730
+ " </tr>\n",
731
+ " <tr>\n",
732
+ " <th>1</th>\n",
733
+ " <td>9780002261982</td>\n",
734
+ " <td>Charles Osborne;Agatha Christie</td>\n",
735
+ " <td>Detective and mystery stories</td>\n",
736
+ " <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
737
+ " <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
738
+ " <td>2000.0</td>\n",
739
+ " <td>3.83</td>\n",
740
+ " <td>241.0</td>\n",
741
+ " <td>5164.0</td>\n",
742
+ " <td>Spider's Web A Novel</td>\n",
743
+ " <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
744
+ " </tr>\n",
745
+ " <tr>\n",
746
+ " <th>2</th>\n",
747
+ " <td>9780006163831</td>\n",
748
+ " <td>Stephen R. Donaldson</td>\n",
749
+ " <td>American fiction</td>\n",
750
+ " <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
751
+ " <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
752
+ " <td>1982.0</td>\n",
753
+ " <td>3.97</td>\n",
754
+ " <td>479.0</td>\n",
755
+ " <td>172.0</td>\n",
756
+ " <td>The One Tree</td>\n",
757
+ " <td>9780006163831 Volume Two of Stephen Donaldson'...</td>\n",
758
+ " </tr>\n",
759
+ " <tr>\n",
760
+ " <th>3</th>\n",
761
+ " <td>9780006178736</td>\n",
762
+ " <td>Sidney Sheldon</td>\n",
763
+ " <td>Fiction</td>\n",
764
+ " <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
765
+ " <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
766
+ " <td>1993.0</td>\n",
767
+ " <td>3.93</td>\n",
768
+ " <td>512.0</td>\n",
769
+ " <td>29532.0</td>\n",
770
+ " <td>Rage of angels</td>\n",
771
+ " <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
772
+ " </tr>\n",
773
+ " <tr>\n",
774
+ " <th>4</th>\n",
775
+ " <td>9780006280897</td>\n",
776
+ " <td>Clive Staples Lewis</td>\n",
777
+ " <td>Christian life</td>\n",
778
+ " <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
779
+ " <td>Lewis' work on the nature of love divides love...</td>\n",
780
+ " <td>2002.0</td>\n",
781
+ " <td>4.15</td>\n",
782
+ " <td>170.0</td>\n",
783
+ " <td>33684.0</td>\n",
784
+ " <td>The Four Loves</td>\n",
785
+ " <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
786
+ " </tr>\n",
787
+ " </tbody>\n",
788
+ "</table>\n",
789
+ "</div>"
790
+ ],
791
+ "text/plain": [
792
+ " isbn13 authors \\\n",
793
+ "0 9780002005883 Marilynne Robinson \n",
794
+ "1 9780002261982 Charles Osborne;Agatha Christie \n",
795
+ "2 9780006163831 Stephen R. Donaldson \n",
796
+ "3 9780006178736 Sidney Sheldon \n",
797
+ "4 9780006280897 Clive Staples Lewis \n",
798
+ "\n",
799
+ " categories \\\n",
800
+ "0 Fiction \n",
801
+ "1 Detective and mystery stories \n",
802
+ "2 American fiction \n",
803
+ "3 Fiction \n",
804
+ "4 Christian life \n",
805
+ "\n",
806
+ " thumbnail \\\n",
807
+ "0 http://books.google.com/books/content?id=KQZCP... \n",
808
+ "1 http://books.google.com/books/content?id=gA5GP... \n",
809
+ "2 http://books.google.com/books/content?id=OmQaw... \n",
810
+ "3 http://books.google.com/books/content?id=FKo2T... \n",
811
+ "4 http://books.google.com/books/content?id=XhQ5X... \n",
812
+ "\n",
813
+ " description published_year \\\n",
814
+ "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
815
+ "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
816
+ "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
817
+ "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
818
+ "4 Lewis' work on the nature of love divides love... 2002.0 \n",
819
+ "\n",
820
+ " average_rating num_pages ratings_count title_and_subtitle \\\n",
821
+ "0 3.85 247.0 361.0 Gilead \n",
822
+ "1 3.83 241.0 5164.0 Spider's Web A Novel \n",
823
+ "2 3.97 479.0 172.0 The One Tree \n",
824
+ "3 3.93 512.0 29532.0 Rage of angels \n",
825
+ "4 4.15 170.0 33684.0 The Four Loves \n",
826
+ "\n",
827
+ " tagged_description \n",
828
+ "0 9780002005883 A NOVEL THAT READERS and critics... \n",
829
+ "1 9780002261982 A new 'Christie for Christmas' -... \n",
830
+ "2 9780006163831 Volume Two of Stephen Donaldson'... \n",
831
+ "3 9780006178736 A memorable, mesmerizing heroine... \n",
832
+ "4 9780006280897 Lewis' work on the nature of lov... "
833
+ ]
834
+ },
835
+ "execution_count": 31,
836
+ "metadata": {},
837
+ "output_type": "execute_result"
838
+ }
839
+ ],
840
+ "source": [
841
+ "df.head()"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "code",
846
+ "execution_count": null,
847
+ "metadata": {},
848
+ "outputs": [
849
+ {
850
+ "data": {
851
+ "text/html": [
852
+ "<div>\n",
853
+ "<style scoped>\n",
854
+ " .dataframe tbody tr th:only-of-type {\n",
855
+ " vertical-align: middle;\n",
856
+ " }\n",
857
+ "\n",
858
+ " .dataframe tbody tr th {\n",
859
+ " vertical-align: top;\n",
860
+ " }\n",
861
+ "\n",
862
+ " .dataframe thead th {\n",
863
+ " text-align: right;\n",
864
+ " }\n",
865
+ "</style>\n",
866
+ "<table border=\"1\" class=\"dataframe\">\n",
867
+ " <thead>\n",
868
+ " <tr style=\"text-align: right;\">\n",
869
+ " <th></th>\n",
870
+ " <th>title</th>\n",
871
+ " <th>url</th>\n",
872
+ " </tr>\n",
873
+ " </thead>\n",
874
+ " <tbody>\n",
875
+ " <tr>\n",
876
+ " <th>0</th>\n",
877
+ " <td>Gilead by Marilynne Robinson- google books</td>\n",
878
+ " <td>https://books.google.com/books/about/Gilead.ht...</td>\n",
879
+ " </tr>\n",
880
+ " <tr>\n",
881
+ " <th>1</th>\n",
882
+ " <td>Spider's Web A Novel by Charles Osborne;Agatha...</td>\n",
883
+ " <td>https://books.google.com/books/about/Spider_s_...</td>\n",
884
+ " </tr>\n",
885
+ " <tr>\n",
886
+ " <th>2</th>\n",
887
+ " <td>The One Tree by Stephen R. Donaldson- google ...</td>\n",
888
+ " <td>https://books.google.com/books/about/The_One_T...</td>\n",
889
+ " </tr>\n",
890
+ " <tr>\n",
891
+ " <th>3</th>\n",
892
+ " <td>Rage of angels by Sidney Sheldon- google books</td>\n",
893
+ " <td>https://books.google.com/books/about/Rage_of_A...</td>\n",
894
+ " </tr>\n",
895
+ " <tr>\n",
896
+ " <th>4</th>\n",
897
+ " <td>The Four Loves by Clive Staples Lewis- google...</td>\n",
898
+ " <td>https://books.google.com/books/about/The_Four_...</td>\n",
899
+ " </tr>\n",
900
+ " </tbody>\n",
901
+ "</table>\n",
902
+ "</div>"
903
+ ],
904
+ "text/plain": [
905
+ " title \\\n",
906
+ "0 Gilead by Marilynne Robinson- google books \n",
907
+ "1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
908
+ "2 The One Tree by Stephen R. Donaldson- google ... \n",
909
+ "3 Rage of angels by Sidney Sheldon- google books \n",
910
+ "4 The Four Loves by Clive Staples Lewis- google... \n",
911
+ "\n",
912
+ " url \n",
913
+ "0 https://books.google.com/books/about/Gilead.ht... \n",
914
+ "1 https://books.google.com/books/about/Spider_s_... \n",
915
+ "2 https://books.google.com/books/about/The_One_T... \n",
916
+ "3 https://books.google.com/books/about/Rage_of_A... \n",
917
+ "4 https://books.google.com/books/about/The_Four_... "
918
+ ]
919
+ },
920
+ "execution_count": 32,
921
+ "metadata": {},
922
+ "output_type": "execute_result"
923
+ }
924
+ ],
925
+ "source": []
926
+ },
927
+ {
928
+ "cell_type": "code",
929
+ "execution_count": 33,
930
+ "metadata": {},
931
+ "outputs": [
932
+ {
933
+ "data": {
934
+ "text/html": [
935
+ "<div>\n",
936
+ "<style scoped>\n",
937
+ " .dataframe tbody tr th:only-of-type {\n",
938
+ " vertical-align: middle;\n",
939
+ " }\n",
940
+ "\n",
941
+ " .dataframe tbody tr th {\n",
942
+ " vertical-align: top;\n",
943
+ " }\n",
944
+ "\n",
945
+ " .dataframe thead th {\n",
946
+ " text-align: right;\n",
947
+ " }\n",
948
+ "</style>\n",
949
+ "<table border=\"1\" class=\"dataframe\">\n",
950
+ " <thead>\n",
951
+ " <tr style=\"text-align: right;\">\n",
952
+ " <th></th>\n",
953
+ " <th>title</th>\n",
954
+ " <th>url</th>\n",
955
+ " </tr>\n",
956
+ " </thead>\n",
957
+ " <tbody>\n",
958
+ " <tr>\n",
959
+ " <th>0</th>\n",
960
+ " <td>Gilead by Marilynne Robinson- google books</td>\n",
961
+ " <td>https://books.google.com/books/about/Gilead.ht...</td>\n",
962
+ " </tr>\n",
963
+ " <tr>\n",
964
+ " <th>1</th>\n",
965
+ " <td>Spider's Web A Novel by Charles Osborne;Agatha...</td>\n",
966
+ " <td>https://books.google.com/books/about/Spider_s_...</td>\n",
967
+ " </tr>\n",
968
+ " <tr>\n",
969
+ " <th>2</th>\n",
970
+ " <td>The One Tree by Stephen R. Donaldson- google ...</td>\n",
971
+ " <td>https://books.google.com/books/about/The_One_T...</td>\n",
972
+ " </tr>\n",
973
+ " <tr>\n",
974
+ " <th>3</th>\n",
975
+ " <td>Rage of angels by Sidney Sheldon- google books</td>\n",
976
+ " <td>https://books.google.com/books/about/Rage_of_A...</td>\n",
977
+ " </tr>\n",
978
+ " <tr>\n",
979
+ " <th>4</th>\n",
980
+ " <td>The Four Loves by Clive Staples Lewis- google...</td>\n",
981
+ " <td>https://books.google.com/books/about/The_Four_...</td>\n",
982
+ " </tr>\n",
983
+ " </tbody>\n",
984
+ "</table>\n",
985
+ "</div>"
986
+ ],
987
+ "text/plain": [
988
+ " title \\\n",
989
+ "0 Gilead by Marilynne Robinson- google books \n",
990
+ "1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
991
+ "2 The One Tree by Stephen R. Donaldson- google ... \n",
992
+ "3 Rage of angels by Sidney Sheldon- google books \n",
993
+ "4 The Four Loves by Clive Staples Lewis- google... \n",
994
+ "\n",
995
+ " url \n",
996
+ "0 https://books.google.com/books/about/Gilead.ht... \n",
997
+ "1 https://books.google.com/books/about/Spider_s_... \n",
998
+ "2 https://books.google.com/books/about/The_One_T... \n",
999
+ "3 https://books.google.com/books/about/Rage_of_A... \n",
1000
+ "4 https://books.google.com/books/about/The_Four_... "
1001
+ ]
1002
+ },
1003
+ "execution_count": 33,
1004
+ "metadata": {},
1005
+ "output_type": "execute_result"
1006
+ }
1007
+ ],
1008
+ "source": [
1009
+ "df1.head()"
1010
+ ]
1011
+ },
1012
+ {
1013
+ "cell_type": "code",
1014
+ "execution_count": 35,
1015
+ "metadata": {},
1016
+ "outputs": [
1017
+ {
1018
+ "data": {
1019
+ "text/html": [
1020
+ "<div>\n",
1021
+ "<style scoped>\n",
1022
+ " .dataframe tbody tr th:only-of-type {\n",
1023
+ " vertical-align: middle;\n",
1024
+ " }\n",
1025
+ "\n",
1026
+ " .dataframe tbody tr th {\n",
1027
+ " vertical-align: top;\n",
1028
+ " }\n",
1029
+ "\n",
1030
+ " .dataframe thead th {\n",
1031
+ " text-align: right;\n",
1032
+ " }\n",
1033
+ "</style>\n",
1034
+ "<table border=\"1\" class=\"dataframe\">\n",
1035
+ " <thead>\n",
1036
+ " <tr style=\"text-align: right;\">\n",
1037
+ " <th></th>\n",
1038
+ " <th>title</th>\n",
1039
+ " <th>url</th>\n",
1040
+ " </tr>\n",
1041
+ " </thead>\n",
1042
+ " <tbody>\n",
1043
+ " <tr>\n",
1044
+ " <th>73</th>\n",
1045
+ " <td>I Can Read with Me Eyes Shut! by Dr. Seuss- g...</td>\n",
1046
+ " <td>/search?num=3</td>\n",
1047
+ " </tr>\n",
1048
+ " <tr>\n",
1049
+ " <th>101</th>\n",
1050
+ " <td>Tyranny of the Majority Funamental Fairness in...</td>\n",
1051
+ " <td>/search?num=3</td>\n",
1052
+ " </tr>\n",
1053
+ " <tr>\n",
1054
+ " <th>126</th>\n",
1055
+ " <td>Mars and Venus Book of Days 365 Inspriations t...</td>\n",
1056
+ " <td>/search?num=3</td>\n",
1057
+ " </tr>\n",
1058
+ " <tr>\n",
1059
+ " <th>128</th>\n",
1060
+ " <td>Today I Feel Silly &amp; Other Moods That Make My ...</td>\n",
1061
+ " <td>/search?num=3</td>\n",
1062
+ " </tr>\n",
1063
+ " <tr>\n",
1064
+ " <th>314</th>\n",
1065
+ " <td>DREAM &amp; THE UNDERWOR by James Hillman- google...</td>\n",
1066
+ " <td>/search?num=3</td>\n",
1067
+ " </tr>\n",
1068
+ " <tr>\n",
1069
+ " <th>...</th>\n",
1070
+ " <td>...</td>\n",
1071
+ " <td>...</td>\n",
1072
+ " </tr>\n",
1073
+ " <tr>\n",
1074
+ " <th>6392</th>\n",
1075
+ " <td>Journey to the East by Hermann Hesse- google ...</td>\n",
1076
+ " <td>NaN</td>\n",
1077
+ " </tr>\n",
1078
+ " <tr>\n",
1079
+ " <th>6393</th>\n",
1080
+ " <td>The Monk Who Sold His Ferrari: A Fable About F...</td>\n",
1081
+ " <td>NaN</td>\n",
1082
+ " </tr>\n",
1083
+ " <tr>\n",
1084
+ " <th>6394</th>\n",
1085
+ " <td>I Am that Talks with Sri Nisargadatta Maharaj ...</td>\n",
1086
+ " <td>NaN</td>\n",
1087
+ " </tr>\n",
1088
+ " <tr>\n",
1089
+ " <th>6395</th>\n",
1090
+ " <td>The Berlin Phenomenology by Georg Wilhelm Fri...</td>\n",
1091
+ " <td>NaN</td>\n",
1092
+ " </tr>\n",
1093
+ " <tr>\n",
1094
+ " <th>6396</th>\n",
1095
+ " <td>'I'm Telling You Stories' Jeanette Winterson a...</td>\n",
1096
+ " <td>NaN</td>\n",
1097
+ " </tr>\n",
1098
+ " </tbody>\n",
1099
+ "</table>\n",
1100
+ "<p>100 rows × 2 columns</p>\n",
1101
+ "</div>"
1102
+ ],
1103
+ "text/plain": [
1104
+ " title url\n",
1105
+ "73 I Can Read with Me Eyes Shut! by Dr. Seuss- g... /search?num=3\n",
1106
+ "101 Tyranny of the Majority Funamental Fairness in... /search?num=3\n",
1107
+ "126 Mars and Venus Book of Days 365 Inspriations t... /search?num=3\n",
1108
+ "128 Today I Feel Silly & Other Moods That Make My ... /search?num=3\n",
1109
+ "314 DREAM & THE UNDERWOR by James Hillman- google... /search?num=3\n",
1110
+ "... ... ...\n",
1111
+ "6392 Journey to the East by Hermann Hesse- google ... NaN\n",
1112
+ "6393 The Monk Who Sold His Ferrari: A Fable About F... NaN\n",
1113
+ "6394 I Am that Talks with Sri Nisargadatta Maharaj ... NaN\n",
1114
+ "6395 The Berlin Phenomenology by Georg Wilhelm Fri... NaN\n",
1115
+ "6396 'I'm Telling You Stories' Jeanette Winterson a... NaN\n",
1116
+ "\n",
1117
+ "[100 rows x 2 columns]"
1118
+ ]
1119
+ },
1120
+ "execution_count": 35,
1121
+ "metadata": {},
1122
+ "output_type": "execute_result"
1123
+ }
1124
+ ],
1125
+ "source": [
1126
+ "queries_df[(queries_df.isnull().any(axis=1)) | ~((queries_df[\"url\"].str.contains(\"amazon\", na=False)) | (queries_df[\"url\"].str.contains(\"google\", na=False)))]"
1127
+ ]
1128
+ },
1129
+ {
1130
+ "cell_type": "code",
1131
+ "execution_count": null,
1132
+ "metadata": {},
1133
+ "outputs": [],
1134
+ "source": [
1135
+ "# i'll drop dataframes without a good url"
1136
+ ]
1137
+ },
1138
+ {
1139
+ "cell_type": "code",
1140
+ "execution_count": 39,
1141
+ "metadata": {},
1142
+ "outputs": [
1143
+ {
1144
+ "data": {
1145
+ "text/plain": [
1146
+ "Index([ 768, 806, 1170, 1269, 1311, 1343, 2311, 2389, 2536, 3270, 3572, 4228,\n",
1147
+ " 4941, 5292, 5293, 6085],\n",
1148
+ " dtype='int64')"
1149
+ ]
1150
+ },
1151
+ "execution_count": 39,
1152
+ "metadata": {},
1153
+ "output_type": "execute_result"
1154
+ }
1155
+ ],
1156
+ "source": [
1157
+ "to_drop = df1[(df1.isnull().any(axis=1)) | ~((df1[\"url\"].str.contains(\"amazon\", na=False)) | (df1[\"url\"].str.contains(\"google\", na=False)))].index\n",
1158
+ "to_drop"
1159
+ ]
1160
+ },
1161
+ {
1162
+ "cell_type": "code",
1163
+ "execution_count": 40,
1164
+ "metadata": {},
1165
+ "outputs": [],
1166
+ "source": [
1167
+ "df1 = df1.drop(index=to_drop)"
1168
+ ]
1169
+ },
1170
+ {
1171
+ "cell_type": "code",
1172
+ "execution_count": 41,
1173
+ "metadata": {},
1174
+ "outputs": [
1175
+ {
1176
+ "data": {
1177
+ "text/html": [
1178
+ "<div>\n",
1179
+ "<style scoped>\n",
1180
+ " .dataframe tbody tr th:only-of-type {\n",
1181
+ " vertical-align: middle;\n",
1182
+ " }\n",
1183
+ "\n",
1184
+ " .dataframe tbody tr th {\n",
1185
+ " vertical-align: top;\n",
1186
+ " }\n",
1187
+ "\n",
1188
+ " .dataframe thead th {\n",
1189
+ " text-align: right;\n",
1190
+ " }\n",
1191
+ "</style>\n",
1192
+ "<table border=\"1\" class=\"dataframe\">\n",
1193
+ " <thead>\n",
1194
+ " <tr style=\"text-align: right;\">\n",
1195
+ " <th></th>\n",
1196
+ " <th>title</th>\n",
1197
+ " <th>url</th>\n",
1198
+ " </tr>\n",
1199
+ " </thead>\n",
1200
+ " <tbody>\n",
1201
+ " <tr>\n",
1202
+ " <th>0</th>\n",
1203
+ " <td>Gilead by Marilynne Robinson- google books</td>\n",
1204
+ " <td>https://books.google.com/books/about/Gilead.ht...</td>\n",
1205
+ " </tr>\n",
1206
+ " <tr>\n",
1207
+ " <th>1</th>\n",
1208
+ " <td>Spider's Web A Novel by Charles Osborne;Agatha...</td>\n",
1209
+ " <td>https://books.google.com/books/about/Spider_s_...</td>\n",
1210
+ " </tr>\n",
1211
+ " <tr>\n",
1212
+ " <th>2</th>\n",
1213
+ " <td>The One Tree by Stephen R. Donaldson- google ...</td>\n",
1214
+ " <td>https://books.google.com/books/about/The_One_T...</td>\n",
1215
+ " </tr>\n",
1216
+ " <tr>\n",
1217
+ " <th>3</th>\n",
1218
+ " <td>Rage of angels by Sidney Sheldon- google books</td>\n",
1219
+ " <td>https://books.google.com/books/about/Rage_of_A...</td>\n",
1220
+ " </tr>\n",
1221
+ " <tr>\n",
1222
+ " <th>4</th>\n",
1223
+ " <td>The Four Loves by Clive Staples Lewis- google...</td>\n",
1224
+ " <td>https://books.google.com/books/about/The_Four_...</td>\n",
1225
+ " </tr>\n",
1226
+ " <tr>\n",
1227
+ " <th>...</th>\n",
1228
+ " <td>...</td>\n",
1229
+ " <td>...</td>\n",
1230
+ " </tr>\n",
1231
+ " <tr>\n",
1232
+ " <th>6392</th>\n",
1233
+ " <td>Journey to the East by Hermann Hesse- google ...</td>\n",
1234
+ " <td>https://books.google.com/books/about/The_Journ...</td>\n",
1235
+ " </tr>\n",
1236
+ " <tr>\n",
1237
+ " <th>6393</th>\n",
1238
+ " <td>The Monk Who Sold His Ferrari: A Fable About F...</td>\n",
1239
+ " <td>https://books.google.com/books/about/The_Monk_...</td>\n",
1240
+ " </tr>\n",
1241
+ " <tr>\n",
1242
+ " <th>6394</th>\n",
1243
+ " <td>I Am that Talks with Sri Nisargadatta Maharaj ...</td>\n",
1244
+ " <td>https://books.google.com/books/about/I_Am_that...</td>\n",
1245
+ " </tr>\n",
1246
+ " <tr>\n",
1247
+ " <th>6395</th>\n",
1248
+ " <td>The Berlin Phenomenology by Georg Wilhelm Fri...</td>\n",
1249
+ " <td>https://books.google.com/books/about/The_Berli...</td>\n",
1250
+ " </tr>\n",
1251
+ " <tr>\n",
1252
+ " <th>6396</th>\n",
1253
+ " <td>'I'm Telling You Stories' Jeanette Winterson a...</td>\n",
1254
+ " <td>https://books.google.com/books/about/I_m_Telli...</td>\n",
1255
+ " </tr>\n",
1256
+ " </tbody>\n",
1257
+ "</table>\n",
1258
+ "<p>6381 rows × 2 columns</p>\n",
1259
+ "</div>"
1260
+ ],
1261
+ "text/plain": [
1262
+ " title \\\n",
1263
+ "0 Gilead by Marilynne Robinson- google books \n",
1264
+ "1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
1265
+ "2 The One Tree by Stephen R. Donaldson- google ... \n",
1266
+ "3 Rage of angels by Sidney Sheldon- google books \n",
1267
+ "4 The Four Loves by Clive Staples Lewis- google... \n",
1268
+ "... ... \n",
1269
+ "6392 Journey to the East by Hermann Hesse- google ... \n",
1270
+ "6393 The Monk Who Sold His Ferrari: A Fable About F... \n",
1271
+ "6394 I Am that Talks with Sri Nisargadatta Maharaj ... \n",
1272
+ "6395 The Berlin Phenomenology by Georg Wilhelm Fri... \n",
1273
+ "6396 'I'm Telling You Stories' Jeanette Winterson a... \n",
1274
+ "\n",
1275
+ " url \n",
1276
+ "0 https://books.google.com/books/about/Gilead.ht... \n",
1277
+ "1 https://books.google.com/books/about/Spider_s_... \n",
1278
+ "2 https://books.google.com/books/about/The_One_T... \n",
1279
+ "3 https://books.google.com/books/about/Rage_of_A... \n",
1280
+ "4 https://books.google.com/books/about/The_Four_... \n",
1281
+ "... ... \n",
1282
+ "6392 https://books.google.com/books/about/The_Journ... \n",
1283
+ "6393 https://books.google.com/books/about/The_Monk_... \n",
1284
+ "6394 https://books.google.com/books/about/I_Am_that... \n",
1285
+ "6395 https://books.google.com/books/about/The_Berli... \n",
1286
+ "6396 https://books.google.com/books/about/I_m_Telli... \n",
1287
+ "\n",
1288
+ "[6381 rows x 2 columns]"
1289
+ ]
1290
+ },
1291
+ "execution_count": 41,
1292
+ "metadata": {},
1293
+ "output_type": "execute_result"
1294
+ }
1295
+ ],
1296
+ "source": [
1297
+ "df1"
1298
+ ]
1299
+ },
1300
+ {
1301
+ "cell_type": "code",
1302
+ "execution_count": 46,
1303
+ "metadata": {},
1304
+ "outputs": [],
1305
+ "source": [
1306
+ "with open(\"to_drop.txt\",\"w\") as f:\n",
1307
+ " f.write(\"\\n\".join(to_drop.astype(str).tolist()))"
1308
+ ]
1309
+ },
1310
+ {
1311
+ "cell_type": "code",
1312
+ "execution_count": 48,
1313
+ "metadata": {},
1314
+ "outputs": [],
1315
+ "source": [
1316
+ "df1.to_csv(\"books_with_urls.csv\", index=False)"
1317
+ ]
1318
+ },
1319
+ {
1320
+ "cell_type": "code",
1321
+ "execution_count": null,
1322
+ "metadata": {},
1323
+ "outputs": [],
1324
+ "source": []
1325
+ }
1326
+ ],
1327
+ "metadata": {
1328
+ "accelerator": "GPU",
1329
+ "colab": {
1330
+ "gpuType": "T4",
1331
+ "provenance": []
1332
+ },
1333
+ "kernelspec": {
1334
+ "display_name": "venv",
1335
+ "language": "python",
1336
+ "name": "python3"
1337
+ },
1338
+ "language_info": {
1339
+ "codemirror_mode": {
1340
+ "name": "ipython",
1341
+ "version": 3
1342
+ },
1343
+ "file_extension": ".py",
1344
+ "mimetype": "text/x-python",
1345
+ "name": "python",
1346
+ "nbconvert_exporter": "python",
1347
+ "pygments_lexer": "ipython3",
1348
+ "version": "3.11.9"
1349
+ }
1350
+ },
1351
+ "nbformat": 4,
1352
+ "nbformat_minor": 0
1353
+ }
final_book_df.csv ADDED
The diff for this file is too large to render. See raw diff
 
final_df.ipynb ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "7cbe0a72",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": null,
17
+ "id": "63c42422",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "final_data_cols = [\n",
22
+ " 'id',\n",
23
+ " 'title',\n",
24
+ " \"authors\",\n",
25
+ " \"description\",\n",
26
+ " \"categories\",\n",
27
+ " \"thumbnail\",\n",
28
+ " \"published_year\",\n",
29
+ " \"average_rating\",\n",
30
+ " \"num_pages\",\n",
31
+ " \"download_url\",\n",
32
+ " \"anger\",\n",
33
+ " \"disgust\",\n",
34
+ " \"fear\",\n",
35
+ " \"joy\",\n",
36
+ " \"sadness\",\n",
37
+ " \"surprise\",\n",
38
+ " \"neutral\"\n",
39
+ " ]"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 22,
45
+ "id": "19fa8ab2",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "df_base = pd.read_csv(\"books_cleaned.csv\")\n",
50
+ "categories_df = pd.read_csv(\"books_with_categories.csv\")\n",
51
+ "df_sentiments = pd.read_csv(\"books_with_sentiment.csv\")\n",
52
+ "df_download_url = pd.read_csv(\"books_with_urls.csv\")"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "id": "8a9ebdc7",
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/plain": [
64
+ "(6397, 11)"
65
+ ]
66
+ },
67
+ "execution_count": 25,
68
+ "metadata": {},
69
+ "output_type": "execute_result"
70
+ }
71
+ ],
72
+ "source": []
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 4,
77
+ "id": "5e81abf9",
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "with open(\"to_drop.txt\", \"r\") as f:\n",
82
+ " to_drop = f.read().splitlines()"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 7,
88
+ "id": "66e10c4c",
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "to_drop = [int(i) for i in to_drop]"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 26,
98
+ "id": "919ed91b",
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "df_base = df_base.drop(to_drop, errors=\"ignore\")\n",
103
+ "categories_df = categories_df.drop(to_drop, errors=\"ignore\")\n",
104
+ "df_sentiments = df_sentiments.drop(to_drop, errors=\"ignore\")"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 27,
110
+ "id": "2b140195",
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "data": {
115
+ "text/plain": [
116
+ "(6381, 11)"
117
+ ]
118
+ },
119
+ "execution_count": 27,
120
+ "metadata": {},
121
+ "output_type": "execute_result"
122
+ }
123
+ ],
124
+ "source": [
125
+ "df_base.shape"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 28,
131
+ "id": "4d1c9d6a",
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "data": {
136
+ "text/plain": [
137
+ "(6381, 2)"
138
+ ]
139
+ },
140
+ "execution_count": 28,
141
+ "metadata": {},
142
+ "output_type": "execute_result"
143
+ }
144
+ ],
145
+ "source": [
146
+ "df_download_url.shape"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 16,
152
+ "id": "32427a77",
153
+ "metadata": {},
154
+ "outputs": [
155
+ {
156
+ "data": {
157
+ "text/plain": [
158
+ "(6381, 11)"
159
+ ]
160
+ },
161
+ "execution_count": 16,
162
+ "metadata": {},
163
+ "output_type": "execute_result"
164
+ }
165
+ ],
166
+ "source": [
167
+ "categories_df.shape"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 40,
173
+ "id": "cbb04023",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "df_download_url = df_download_url[[\"url\"]]"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": null,
183
+ "id": "18edb501",
184
+ "metadata": {},
185
+ "outputs": [],
186
+ "source": [
187
+ "df_sentiments = df_sentiments[[\"anger\",\"disgust\",\"fear\",\"joy\",\"sadness\",\"surprise\",\"neutral\"]]\n",
188
+ "df_sentiments.head()"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "id": "e72f2e81",
195
+ "metadata": {},
196
+ "outputs": [
197
+ {
198
+ "data": {
199
+ "text/html": [
200
+ "<div>\n",
201
+ "<style scoped>\n",
202
+ " .dataframe tbody tr th:only-of-type {\n",
203
+ " vertical-align: middle;\n",
204
+ " }\n",
205
+ "\n",
206
+ " .dataframe tbody tr th {\n",
207
+ " vertical-align: top;\n",
208
+ " }\n",
209
+ "\n",
210
+ " .dataframe thead th {\n",
211
+ " text-align: right;\n",
212
+ " }\n",
213
+ "</style>\n",
214
+ "<table border=\"1\" class=\"dataframe\">\n",
215
+ " <thead>\n",
216
+ " <tr style=\"text-align: right;\">\n",
217
+ " <th></th>\n",
218
+ " <th>title</th>\n",
219
+ " <th>url</th>\n",
220
+ " </tr>\n",
221
+ " </thead>\n",
222
+ " <tbody>\n",
223
+ " <tr>\n",
224
+ " <th>0</th>\n",
225
+ " <td>Gilead by Marilynne Robinson- google books</td>\n",
226
+ " <td>https://books.google.com/books/about/Gilead.ht...</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>1</th>\n",
230
+ " <td>Spider's Web A Novel by Charles Osborne;Agatha...</td>\n",
231
+ " <td>https://books.google.com/books/about/Spider_s_...</td>\n",
232
+ " </tr>\n",
233
+ " <tr>\n",
234
+ " <th>2</th>\n",
235
+ " <td>The One Tree by Stephen R. Donaldson- google ...</td>\n",
236
+ " <td>https://books.google.com/books/about/The_One_T...</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>3</th>\n",
240
+ " <td>Rage of angels by Sidney Sheldon- google books</td>\n",
241
+ " <td>https://books.google.com/books/about/Rage_of_A...</td>\n",
242
+ " </tr>\n",
243
+ " <tr>\n",
244
+ " <th>4</th>\n",
245
+ " <td>The Four Loves by Clive Staples Lewis- google...</td>\n",
246
+ " <td>https://books.google.com/books/about/The_Four_...</td>\n",
247
+ " </tr>\n",
248
+ " </tbody>\n",
249
+ "</table>\n",
250
+ "</div>"
251
+ ],
252
+ "text/plain": [
253
+ " title \\\n",
254
+ "0 Gilead by Marilynne Robinson- google books \n",
255
+ "1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
256
+ "2 The One Tree by Stephen R. Donaldson- google ... \n",
257
+ "3 Rage of angels by Sidney Sheldon- google books \n",
258
+ "4 The Four Loves by Clive Staples Lewis- google... \n",
259
+ "\n",
260
+ " url \n",
261
+ "0 https://books.google.com/books/about/Gilead.ht... \n",
262
+ "1 https://books.google.com/books/about/Spider_s_... \n",
263
+ "2 https://books.google.com/books/about/The_One_T... \n",
264
+ "3 https://books.google.com/books/about/Rage_of_A... \n",
265
+ "4 https://books.google.com/books/about/The_Four_... "
266
+ ]
267
+ },
268
+ "execution_count": 34,
269
+ "metadata": {},
270
+ "output_type": "execute_result"
271
+ }
272
+ ],
273
+ "source": [
274
+ "df_base = df_base[[\"isbn13\", \"authors\",\"thumbnail\",\"description\",\"published_year\",\"average_rating\",\"num_pages\",\"ratings_count\",\"title_and_subtitle\",\"tagged_description\"]]"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "id": "e3f317bc",
281
+ "metadata": {},
282
+ "outputs": [],
283
+ "source": [
284
+ "categories_df = categories_df[[\"categories\"]]"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 46,
290
+ "id": "d7126b63",
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "df_base = df_base.reset_index().drop(\"index\", axis=1)\n",
295
+ "categories_df = categories_df.reset_index().drop(\"index\", axis=1)\n",
296
+ "df_download_url = df_download_url.reset_index().drop(\"index\", axis=1)\n",
297
+ "df_sentiments = df_sentiments.reset_index().drop(\"index\", axis=1)"
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": 50,
303
+ "id": "cc1e8c55",
304
+ "metadata": {},
305
+ "outputs": [],
306
+ "source": [
307
+ "final_df = pd.concat([df_base,categories_df,df_sentiments,df_download_url], axis=1)"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 51,
313
+ "id": "9ba30e30",
314
+ "metadata": {},
315
+ "outputs": [
316
+ {
317
+ "data": {
318
+ "text/html": [
319
+ "<div>\n",
320
+ "<style scoped>\n",
321
+ " .dataframe tbody tr th:only-of-type {\n",
322
+ " vertical-align: middle;\n",
323
+ " }\n",
324
+ "\n",
325
+ " .dataframe tbody tr th {\n",
326
+ " vertical-align: top;\n",
327
+ " }\n",
328
+ "\n",
329
+ " .dataframe thead th {\n",
330
+ " text-align: right;\n",
331
+ " }\n",
332
+ "</style>\n",
333
+ "<table border=\"1\" class=\"dataframe\">\n",
334
+ " <thead>\n",
335
+ " <tr style=\"text-align: right;\">\n",
336
+ " <th></th>\n",
337
+ " <th>isbn13</th>\n",
338
+ " <th>authors</th>\n",
339
+ " <th>thumbnail</th>\n",
340
+ " <th>description</th>\n",
341
+ " <th>published_year</th>\n",
342
+ " <th>average_rating</th>\n",
343
+ " <th>num_pages</th>\n",
344
+ " <th>ratings_count</th>\n",
345
+ " <th>title_and_subtitle</th>\n",
346
+ " <th>tagged_description</th>\n",
347
+ " <th>categories</th>\n",
348
+ " <th>anger</th>\n",
349
+ " <th>disgust</th>\n",
350
+ " <th>fear</th>\n",
351
+ " <th>joy</th>\n",
352
+ " <th>sadness</th>\n",
353
+ " <th>surprise</th>\n",
354
+ " <th>neutral</th>\n",
355
+ " <th>url</th>\n",
356
+ " </tr>\n",
357
+ " </thead>\n",
358
+ " <tbody>\n",
359
+ " <tr>\n",
360
+ " <th>4039</th>\n",
361
+ " <td>9780727861153</td>\n",
362
+ " <td>Ja Jance;Judith A. Jance</td>\n",
363
+ " <td>http://books.google.com/books/content?id=YDFDP...</td>\n",
364
+ " <td>Life is good for Joanna Brady in the small des...</td>\n",
365
+ " <td>2004.0</td>\n",
366
+ " <td>4.00</td>\n",
367
+ " <td>256.0</td>\n",
368
+ " <td>39.0</td>\n",
369
+ " <td>Desert Heat</td>\n",
370
+ " <td>9780727861153 Life is good for Joanna Brady in...</td>\n",
371
+ " <td>mystery</td>\n",
372
+ " <td>0.839755</td>\n",
373
+ " <td>0.893530</td>\n",
374
+ " <td>0.051363</td>\n",
375
+ " <td>0.769920</td>\n",
376
+ " <td>0.111690</td>\n",
377
+ " <td>0.078765</td>\n",
378
+ " <td>0.558840</td>\n",
379
+ " <td>https://books.google.com/books/about/Desert_He...</td>\n",
380
+ " </tr>\n",
381
+ " <tr>\n",
382
+ " <th>2261</th>\n",
383
+ " <td>9780393059465</td>\n",
384
+ " <td>Harriet Beecher Stowe;Professor Harriet Beeche...</td>\n",
385
+ " <td>http://books.google.com/books/content?id=bSaWh...</td>\n",
386
+ " <td>An interpretation of the American classic refu...</td>\n",
387
+ " <td>2007.0</td>\n",
388
+ " <td>3.86</td>\n",
389
+ " <td>528.0</td>\n",
390
+ " <td>160.0</td>\n",
391
+ " <td>The Annotated Uncle Tom's Cabin</td>\n",
392
+ " <td>9780393059465 An interpretation of the America...</td>\n",
393
+ " <td>history</td>\n",
394
+ " <td>0.064134</td>\n",
395
+ " <td>0.728139</td>\n",
396
+ " <td>0.051363</td>\n",
397
+ " <td>0.040564</td>\n",
398
+ " <td>0.111690</td>\n",
399
+ " <td>0.348772</td>\n",
400
+ " <td>0.599532</td>\n",
401
+ " <td>https://books.google.com/books/about/Uncle_Tom...</td>\n",
402
+ " </tr>\n",
403
+ " <tr>\n",
404
+ " <th>6101</th>\n",
405
+ " <td>9781841157481</td>\n",
406
+ " <td>Jonathan Franzen</td>\n",
407
+ " <td>http://books.google.com/books/content?id=n9-ha...</td>\n",
408
+ " <td>Dying St. Louis is turned inside-out by the ap...</td>\n",
409
+ " <td>2003.0</td>\n",
410
+ " <td>3.12</td>\n",
411
+ " <td>528.0</td>\n",
412
+ " <td>119.0</td>\n",
413
+ " <td>The Twenty-seventh City</td>\n",
414
+ " <td>9781841157481 Dying St. Louis is turned inside...</td>\n",
415
+ " <td>fiction</td>\n",
416
+ " <td>0.470221</td>\n",
417
+ " <td>0.114413</td>\n",
418
+ " <td>0.066823</td>\n",
419
+ " <td>0.402793</td>\n",
420
+ " <td>0.111690</td>\n",
421
+ " <td>0.216259</td>\n",
422
+ " <td>0.735679</td>\n",
423
+ " <td>https://books.google.com/books/about/The_Twent...</td>\n",
424
+ " </tr>\n",
425
+ " <tr>\n",
426
+ " <th>5666</th>\n",
427
+ " <td>9781560258247</td>\n",
428
+ " <td>Norman Mailer;John Buffalo Mailer</td>\n",
429
+ " <td>http://books.google.com/books/content?id=9oBps...</td>\n",
430
+ " <td>Questions are posed, writes Norman Mailer, \"in...</td>\n",
431
+ " <td>2006.0</td>\n",
432
+ " <td>3.31</td>\n",
433
+ " <td>218.0</td>\n",
434
+ " <td>67.0</td>\n",
435
+ " <td>The Big Empty Dialogues on Politics, Sex, God,...</td>\n",
436
+ " <td>9781560258247 Questions are posed, writes Norm...</td>\n",
437
+ " <td>mystery</td>\n",
438
+ " <td>0.085885</td>\n",
439
+ " <td>0.104098</td>\n",
440
+ " <td>0.253858</td>\n",
441
+ " <td>0.370736</td>\n",
442
+ " <td>0.111690</td>\n",
443
+ " <td>0.313475</td>\n",
444
+ " <td>0.930554</td>\n",
445
+ " <td>https://books.google.com/books/about/The_Big_E...</td>\n",
446
+ " </tr>\n",
447
+ " <tr>\n",
448
+ " <th>1862</th>\n",
449
+ " <td>9780349107868</td>\n",
450
+ " <td>Daniel Jonah Goldhagen</td>\n",
451
+ " <td>http://books.google.com/books/content?id=L11gQ...</td>\n",
452
+ " <td>Daniel Goldhagen re-visits a question which hi...</td>\n",
453
+ " <td>1997.0</td>\n",
454
+ " <td>3.68</td>\n",
455
+ " <td>634.0</td>\n",
456
+ " <td>80.0</td>\n",
457
+ " <td>Hitler's Willing Executioners Ordinary Germans...</td>\n",
458
+ " <td>9780349107868 Daniel Goldhagen re-visits a que...</td>\n",
459
+ " <td>mystery</td>\n",
460
+ " <td>0.781836</td>\n",
461
+ " <td>0.129887</td>\n",
462
+ " <td>0.198395</td>\n",
463
+ " <td>0.040564</td>\n",
464
+ " <td>0.131437</td>\n",
465
+ " <td>0.088081</td>\n",
466
+ " <td>0.693353</td>\n",
467
+ " <td>https://books.google.com/books/about/Hitler_s_...</td>\n",
468
+ " </tr>\n",
469
+ " </tbody>\n",
470
+ "</table>\n",
471
+ "</div>"
472
+ ],
473
+ "text/plain": [
474
+ " isbn13 authors \\\n",
475
+ "4039 9780727861153 Ja Jance;Judith A. Jance \n",
476
+ "2261 9780393059465 Harriet Beecher Stowe;Professor Harriet Beeche... \n",
477
+ "6101 9781841157481 Jonathan Franzen \n",
478
+ "5666 9781560258247 Norman Mailer;John Buffalo Mailer \n",
479
+ "1862 9780349107868 Daniel Jonah Goldhagen \n",
480
+ "\n",
481
+ " thumbnail \\\n",
482
+ "4039 http://books.google.com/books/content?id=YDFDP... \n",
483
+ "2261 http://books.google.com/books/content?id=bSaWh... \n",
484
+ "6101 http://books.google.com/books/content?id=n9-ha... \n",
485
+ "5666 http://books.google.com/books/content?id=9oBps... \n",
486
+ "1862 http://books.google.com/books/content?id=L11gQ... \n",
487
+ "\n",
488
+ " description published_year \\\n",
489
+ "4039 Life is good for Joanna Brady in the small des... 2004.0 \n",
490
+ "2261 An interpretation of the American classic refu... 2007.0 \n",
491
+ "6101 Dying St. Louis is turned inside-out by the ap... 2003.0 \n",
492
+ "5666 Questions are posed, writes Norman Mailer, \"in... 2006.0 \n",
493
+ "1862 Daniel Goldhagen re-visits a question which hi... 1997.0 \n",
494
+ "\n",
495
+ " average_rating num_pages ratings_count \\\n",
496
+ "4039 4.00 256.0 39.0 \n",
497
+ "2261 3.86 528.0 160.0 \n",
498
+ "6101 3.12 528.0 119.0 \n",
499
+ "5666 3.31 218.0 67.0 \n",
500
+ "1862 3.68 634.0 80.0 \n",
501
+ "\n",
502
+ " title_and_subtitle \\\n",
503
+ "4039 Desert Heat \n",
504
+ "2261 The Annotated Uncle Tom's Cabin \n",
505
+ "6101 The Twenty-seventh City \n",
506
+ "5666 The Big Empty Dialogues on Politics, Sex, God,... \n",
507
+ "1862 Hitler's Willing Executioners Ordinary Germans... \n",
508
+ "\n",
509
+ " tagged_description categories anger \\\n",
510
+ "4039 9780727861153 Life is good for Joanna Brady in... mystery 0.839755 \n",
511
+ "2261 9780393059465 An interpretation of the America... history 0.064134 \n",
512
+ "6101 9781841157481 Dying St. Louis is turned inside... fiction 0.470221 \n",
513
+ "5666 9781560258247 Questions are posed, writes Norm... mystery 0.085885 \n",
514
+ "1862 9780349107868 Daniel Goldhagen re-visits a que... mystery 0.781836 \n",
515
+ "\n",
516
+ " disgust fear joy sadness surprise neutral \\\n",
517
+ "4039 0.893530 0.051363 0.769920 0.111690 0.078765 0.558840 \n",
518
+ "2261 0.728139 0.051363 0.040564 0.111690 0.348772 0.599532 \n",
519
+ "6101 0.114413 0.066823 0.402793 0.111690 0.216259 0.735679 \n",
520
+ "5666 0.104098 0.253858 0.370736 0.111690 0.313475 0.930554 \n",
521
+ "1862 0.129887 0.198395 0.040564 0.131437 0.088081 0.693353 \n",
522
+ "\n",
523
+ " url \n",
524
+ "4039 https://books.google.com/books/about/Desert_He... \n",
525
+ "2261 https://books.google.com/books/about/Uncle_Tom... \n",
526
+ "6101 https://books.google.com/books/about/The_Twent... \n",
527
+ "5666 https://books.google.com/books/about/The_Big_E... \n",
528
+ "1862 https://books.google.com/books/about/Hitler_s_... "
529
+ ]
530
+ },
531
+ "execution_count": 51,
532
+ "metadata": {},
533
+ "output_type": "execute_result"
534
+ }
535
+ ],
536
+ "source": [
537
+ "final_df.sample(5)"
538
+ ]
539
+ },
540
+ {
541
+ "cell_type": "code",
542
+ "execution_count": 53,
543
+ "id": "c90847c5",
544
+ "metadata": {},
545
+ "outputs": [],
546
+ "source": [
547
+ "final_df[\"tagged_description\"].to_csv(\"tagged_description.txt\", index=None, header=None)"
548
+ ]
549
+ },
550
+ {
551
+ "cell_type": "code",
552
+ "execution_count": 54,
553
+ "id": "5419aa0e",
554
+ "metadata": {},
555
+ "outputs": [],
556
+ "source": [
557
+ "final_df.to_csv(\"final_book_df.csv\", index=None)"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": null,
563
+ "id": "32b5edca",
564
+ "metadata": {},
565
+ "outputs": [],
566
+ "source": []
567
+ }
568
+ ],
569
+ "metadata": {
570
+ "kernelspec": {
571
+ "display_name": "venv",
572
+ "language": "python",
573
+ "name": "python3"
574
+ },
575
+ "language_info": {
576
+ "codemirror_mode": {
577
+ "name": "ipython",
578
+ "version": 3
579
+ },
580
+ "file_extension": ".py",
581
+ "mimetype": "text/x-python",
582
+ "name": "python",
583
+ "nbconvert_exporter": "python",
584
+ "pygments_lexer": "ipython3",
585
+ "version": "3.11.9"
586
+ }
587
+ },
588
+ "nbformat": 4,
589
+ "nbformat_minor": 5
590
+ }
gradio_dashboard.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import gradio as gr
3
+ import numpy as np
4
+
5
+
6
+ from langchain_chroma import Chroma
7
+ from langchain_huggingface import HuggingFaceEmbeddings
8
+
9
+
10
# Embedding model used to vectorise queries for the Chroma store.
# all-MiniLM-L6-v2 is the fast option; "sentence-transformers/all-mpnet-base-v2"
# is the higher-quality, slower alternative.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Book metadata table. Each row gets a larger-cover URL; rows without a
# thumbnail (NaN survives the string concatenation) fall back to a placeholder.
books = pd.read_csv("final_book_df.csv")
enlarged_cover = books["thumbnail"] + "&fife=w800"
books["large_thumbnail"] = np.where(enlarged_cover.isna(), "cover-not-found.jpg", enlarged_cover)

# Vector store opened from the "chroma_books" directory, collection "books".
db_books = Chroma(
    collection_name="books",
    embedding_function=embeddings,
    persist_directory="chroma_books",
)
21
+
22
+
23
def retrieve_semantic_recommendations(
        query: str,
        category: str = None,
        tone: str = None,
        initial_top_k: int = 50,
        final_top_k: int = 16,
) -> pd.DataFrame:
    """Look up books that semantically match ``query``.

    Args:
        query: Free-text description passed to the vector store search.
        category: Keep only rows whose ``categories`` value equals this.
            ``None`` and ``"All"`` both disable the filter.
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad" to
            re-rank the final rows by the matching emotion column, highest
            first. Any other value (including ``None`` / ``"All"``) keeps the
            search order.
        initial_top_k: Number of documents requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        Rows of ``books`` for the matched ISBNs, ordered as returned by
        ``similarity_search`` unless a tone re-ranking was requested.

    Raises:
        ValueError, IndexError: If a stored document does not start with an
            integer ISBN token.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # The first whitespace-separated token of each document (after stripping
    # surrounding double quotes) is parsed as the ISBN-13. Remember the
    # position of its first occurrence so the search order can be restored.
    isbn_rank = {}
    for position, rec in enumerate(recs):
        isbn = int(rec.page_content.strip('"').split()[0])
        isbn_rank.setdefault(isbn, position)

    book_recs = books[books["isbn13"].isin(list(isbn_rank))]

    # isin() yields rows in table order, which would make the head() below keep
    # arbitrary books; reorder by search position before truncating.
    search_order = book_recs["isbn13"].map(isbn_rank).to_numpy().argsort(kind="stable")
    book_recs = book_recs.iloc[search_order]

    # The default (None) must behave like "All"; comparing the column against
    # None would silently drop every row.
    if category not in (None, "All"):
        book_recs = book_recs[book_recs["categories"] == category]
    book_recs = book_recs.head(final_top_k)

    tone_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }.get(tone)
    if tone_column is not None:
        # Non-inplace sort: book_recs is derived from the shared `books` table.
        book_recs = book_recs.sort_values(by=tone_column, ascending=False)

    return book_recs
52
+
53
+
54
def recommend_books(
        query: str,
        category: str,
        tone: str
):
    """Build gallery entries for the books recommended for ``query``.

    Args:
        query: Free-text description of the desired book.
        category: Category filter forwarded to the retrieval function.
        tone: Emotional tone forwarded to the retrieval function.

    Returns:
        A list of ``(large_thumbnail, caption)`` tuples, one per recommended
        row. The caption is "<title> by <authors>: <description>", where the
        description is cut to its first 30 words; the byline or description
        part is omitted when the corresponding cell is missing.
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        # Missing cells arrive as float NaN rather than str, so guard before
        # calling string methods on them.
        description = row["description"]
        words = description.split() if isinstance(description, str) else []
        truncated_description = " ".join(words[:30])
        if len(words) > 30:
            # Only signal truncation when something was actually cut.
            truncated_description += "..."

        authors = row["authors"]
        authors_split = authors.split(";") if isinstance(authors, str) else []
        if len(authors_split) == 2:
            authors_str = f"{authors_split[0]} and {authors_split[1]}"
        elif len(authors_split) > 2:
            authors_str = f"{', '.join(authors_split[:-1])}, and {authors_split[-1]}"
        elif authors_split:
            authors_str = authors
        else:
            authors_str = ""

        caption = f"{row['title_and_subtitle']}"
        if authors_str:
            caption += f" by {authors_str}"
        if truncated_description:
            caption += f": {truncated_description}"
        results.append((row["large_thumbnail"], caption))

    return results
78
+
79
+
80
+
81
# Dropdown options; "All" is the default choice for both dropdowns.
categories = ["All", *sorted(books["categories"].unique())]
tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

# NOTE(review): the nesting below was reconstructed from statement order —
# confirm that the dropdowns and the button belong inside the Row.
with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown("# Semantic Book Recommender")
    gr.Markdown("## Find your next favorite book!")

    # Input controls: query text, category filter, tone selector, submit.
    with gr.Row():
        user_query = gr.Textbox(
            lines=1,
            max_lines=1,
            label="please enter a description of your book:",
            placeholder="Enter your query here...",
        )

        category_dropdown = gr.Dropdown(
            choices=categories,
            value="All",
            label="Select a category",
        )
        tone_dropdown = gr.Dropdown(
            choices=tones,
            value="All",
            label="Select an emotional tone",
        )
        submit_button = gr.Button("Submit", variant="primary")

    # Result area: gallery configured with 8 columns and 2 rows.
    gr.Markdown("## Recommendations")
    output = gr.Gallery(
        rows=2,
        columns=8,
        label="Recommended Books",
    )

    # Clicking the button passes the three inputs to recommend_books and
    # sends its return value to the gallery.
    submit_button.click(
        fn=recommend_books,
        inputs=[user_query, category_dropdown, tone_dropdown],
        outputs=output,
    )

if __name__ == "__main__":
    dashboard.launch()
requirements.txt ADDED
Binary file (7.09 kB). View file
 
search_progress.csv ADDED
The diff for this file is too large to render. See raw diff
 
sentiment_analysis.ipynb ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "9faa187f",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "\n",
12
+ "df = pd.read_csv(\"books_with_categories.csv\")"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 3,
18
+ "id": "606d7c6e",
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "data": {
23
+ "text/plain": [
24
+ "True"
25
+ ]
26
+ },
27
+ "execution_count": 3,
28
+ "metadata": {},
29
+ "output_type": "execute_result"
30
+ }
31
+ ],
32
+ "source": [
33
+ "from dotenv import load_dotenv\n",
34
+ "\n",
35
+ "load_dotenv()"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 7,
41
+ "id": "ca93c495",
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stderr",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Device set to use cpu\n"
49
+ ]
50
+ },
51
+ {
52
+ "data": {
53
+ "text/plain": [
54
+ "[[{'label': 'joy', 'score': 0.9771687984466553},\n",
55
+ " {'label': 'surprise', 'score': 0.00852868054062128},\n",
56
+ " {'label': 'neutral', 'score': 0.005764586851000786},\n",
57
+ " {'label': 'anger', 'score': 0.004419783595949411},\n",
58
+ " {'label': 'sadness', 'score': 0.002092392183840275},\n",
59
+ " {'label': 'disgust', 'score': 0.0016119900392368436},\n",
60
+ " {'label': 'fear', 'score': 0.0004138524236623198}]]"
61
+ ]
62
+ },
63
+ "execution_count": 7,
64
+ "metadata": {},
65
+ "output_type": "execute_result"
66
+ }
67
+ ],
68
+ "source": [
69
+ "from transformers import pipeline\n",
70
+ "classifier = pipeline(\"text-classification\", model=\"j-hartmann/emotion-english-distilroberta-base\", top_k=None)\n",
71
+ "classifier(\"I love this!\")\n"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 14,
77
+ "id": "f3708b48",
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "preds = classifier(df[\"description\"][0].split(\".\"))"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 17,
87
+ "id": "e8db387e",
88
+ "metadata": {},
89
+ "outputs": [
90
+ {
91
+ "data": {
92
+ "text/plain": [
93
+ "[[{'label': 'surprise', 'score': 0.7296020984649658},\n",
94
+ " {'label': 'neutral', 'score': 0.14038598537445068},\n",
95
+ " {'label': 'fear', 'score': 0.06816228479146957},\n",
96
+ " {'label': 'joy', 'score': 0.0479426383972168},\n",
97
+ " {'label': 'anger', 'score': 0.009156371466815472},\n",
98
+ " {'label': 'disgust', 'score': 0.0026284765917807817},\n",
99
+ " {'label': 'sadness', 'score': 0.002122163539752364}],\n",
100
+ " [{'label': 'neutral', 'score': 0.4493706524372101},\n",
101
+ " {'label': 'disgust', 'score': 0.2735912799835205},\n",
102
+ " {'label': 'joy', 'score': 0.10908322036266327},\n",
103
+ " {'label': 'sadness', 'score': 0.09362740069627762},\n",
104
+ " {'label': 'anger', 'score': 0.04047828167676926},\n",
105
+ " {'label': 'surprise', 'score': 0.026970166712999344},\n",
106
+ " {'label': 'fear', 'score': 0.006879047024995089}],\n",
107
+ " [{'label': 'neutral', 'score': 0.6462154984474182},\n",
108
+ " {'label': 'sadness', 'score': 0.24273382127285004},\n",
109
+ " {'label': 'disgust', 'score': 0.04342272877693176},\n",
110
+ " {'label': 'surprise', 'score': 0.028300544247031212},\n",
111
+ " {'label': 'joy', 'score': 0.014211482368409634},\n",
112
+ " {'label': 'fear', 'score': 0.014084099791944027},\n",
113
+ " {'label': 'anger', 'score': 0.01103190891444683}],\n",
114
+ " [{'label': 'fear', 'score': 0.9281682968139648},\n",
115
+ " {'label': 'anger', 'score': 0.032190896570682526},\n",
116
+ " {'label': 'neutral', 'score': 0.012808685190975666},\n",
117
+ " {'label': 'sadness', 'score': 0.008756878785789013},\n",
118
+ " {'label': 'surprise', 'score': 0.00859791412949562},\n",
119
+ " {'label': 'disgust', 'score': 0.008431827649474144},\n",
120
+ " {'label': 'joy', 'score': 0.0010455839801579714}],\n",
121
+ " [{'label': 'sadness', 'score': 0.9671575427055359},\n",
122
+ " {'label': 'neutral', 'score': 0.01510414108633995},\n",
123
+ " {'label': 'disgust', 'score': 0.006480586249381304},\n",
124
+ " {'label': 'fear', 'score': 0.005393984727561474},\n",
125
+ " {'label': 'surprise', 'score': 0.0022869384847581387},\n",
126
+ " {'label': 'anger', 'score': 0.0018428878393024206},\n",
127
+ " {'label': 'joy', 'score': 0.001733877114020288}],\n",
128
+ " [{'label': 'joy', 'score': 0.9327980279922485},\n",
129
+ " {'label': 'disgust', 'score': 0.03771715983748436},\n",
130
+ " {'label': 'neutral', 'score': 0.015891825780272484},\n",
131
+ " {'label': 'sadness', 'score': 0.006444509141147137},\n",
132
+ " {'label': 'anger', 'score': 0.005024974700063467},\n",
133
+ " {'label': 'surprise', 'score': 0.001581205753609538},\n",
134
+ " {'label': 'fear', 'score': 0.0005423063994385302}],\n",
135
+ " [{'label': 'joy', 'score': 0.6528708338737488},\n",
136
+ " {'label': 'neutral', 'score': 0.2542746663093567},\n",
137
+ " {'label': 'surprise', 'score': 0.06808295100927353},\n",
138
+ " {'label': 'sadness', 'score': 0.009908987209200859},\n",
139
+ " {'label': 'disgust', 'score': 0.0065122139640152454},\n",
140
+ " {'label': 'anger', 'score': 0.004821307025849819},\n",
141
+ " {'label': 'fear', 'score': 0.0035290210507810116}],\n",
142
+ " [{'label': 'neutral', 'score': 0.549477219581604},\n",
143
+ " {'label': 'sadness', 'score': 0.11169005185365677},\n",
144
+ " {'label': 'disgust', 'score': 0.1040065810084343},\n",
145
+ " {'label': 'surprise', 'score': 0.07876542955636978},\n",
146
+ " {'label': 'anger', 'score': 0.06413355469703674},\n",
147
+ " {'label': 'fear', 'score': 0.051362741738557816},\n",
148
+ " {'label': 'joy', 'score': 0.040564361959695816}]]"
149
+ ]
150
+ },
151
+ "execution_count": 17,
152
+ "metadata": {},
153
+ "output_type": "execute_result"
154
+ }
155
+ ],
156
+ "source": [
157
+ "preds"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 32,
163
+ "id": "67dd5f0d",
164
+ "metadata": {},
165
+ "outputs": [
166
+ {
167
+ "name": "stderr",
168
+ "output_type": "stream",
169
+ "text": [
170
+ " 1%| | 55/6397 [00:16<31:32, 3.35it/s] \n"
171
+ ]
172
+ },
173
+ {
174
+ "ename": "KeyboardInterrupt",
175
+ "evalue": "",
176
+ "output_type": "error",
177
+ "traceback": [
178
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
179
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
180
+ "Cell \u001b[1;32mIn[32], line 23\u001b[0m\n\u001b[0;32m 21\u001b[0m isbns\u001b[38;5;241m.\u001b[39mappend(df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124misbn13\u001b[39m\u001b[38;5;124m\"\u001b[39m][i])\n\u001b[0;32m 22\u001b[0m sentences \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m][i]\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 23\u001b[0m sentence_pred \u001b[38;5;241m=\u001b[39m \u001b[43mclassifier\u001b[49m\u001b[43m(\u001b[49m\u001b[43msentences\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m8\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# batching\u001b[39;00m\n\u001b[0;32m 24\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m emotion \u001b[38;5;129;01min\u001b[39;00m emotions_dict:\n\u001b[0;32m 25\u001b[0m max_score \u001b[38;5;241m=\u001b[39m get_max_emotion_score(emotion, sentence_pred)\n",
181
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\text_classification.py:159\u001b[0m, in \u001b[0;36mTextClassificationPipeline.__call__\u001b[1;34m(self, inputs, **kwargs)\u001b[0m\n\u001b[0;32m 124\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 125\u001b[0m \u001b[38;5;124;03mClassify the text(s) given as inputs.\u001b[39;00m\n\u001b[0;32m 126\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 156\u001b[0m \u001b[38;5;124;03m If `top_k` is used, one such dictionary is returned per label.\u001b[39;00m\n\u001b[0;32m 157\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 158\u001b[0m inputs \u001b[38;5;241m=\u001b[39m (inputs,)\n\u001b[1;32m--> 159\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 160\u001b[0m \u001b[38;5;66;03m# TODO try and retrieve it in a nicer way from _sanitize_parameters.\u001b[39;00m\n\u001b[0;32m 161\u001b[0m _legacy \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtop_k\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m kwargs\n",
182
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\base.py:1343\u001b[0m, in \u001b[0;36mPipeline.__call__\u001b[1;34m(self, inputs, num_workers, batch_size, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m can_use_iterator:\n\u001b[0;32m 1340\u001b[0m final_iterator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_iterator(\n\u001b[0;32m 1341\u001b[0m inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params\n\u001b[0;32m 1342\u001b[0m )\n\u001b[1;32m-> 1343\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(final_iterator)\n\u001b[0;32m 1344\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n\u001b[0;32m 1345\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
183
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\pt_utils.py:124\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloader_batch_item()\n\u001b[0;32m 123\u001b[0m \u001b[38;5;66;03m# We're out of items within a batch\u001b[39;00m\n\u001b[1;32m--> 124\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterator)\n\u001b[0;32m 125\u001b[0m processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfer(item, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparams)\n\u001b[0;32m 126\u001b[0m \u001b[38;5;66;03m# We now have a batch of \"inferred things\".\u001b[39;00m\n",
184
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\pt_utils.py:125\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 123\u001b[0m \u001b[38;5;66;03m# We're out of items within a batch\u001b[39;00m\n\u001b[0;32m 124\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterator)\n\u001b[1;32m--> 125\u001b[0m processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 126\u001b[0m \u001b[38;5;66;03m# We now have a batch of \"inferred things\".\u001b[39;00m\n\u001b[0;32m 127\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloader_batch_size \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 128\u001b[0m \u001b[38;5;66;03m# Try to infer the size of the batch\u001b[39;00m\n",
185
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\base.py:1269\u001b[0m, in \u001b[0;36mPipeline.forward\u001b[1;34m(self, model_inputs, **forward_params)\u001b[0m\n\u001b[0;32m 1267\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m inference_context():\n\u001b[0;32m 1268\u001b[0m model_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_ensure_tensor_on_device(model_inputs, device\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m-> 1269\u001b[0m model_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mforward_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1270\u001b[0m model_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_ensure_tensor_on_device(model_outputs, device\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mdevice(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[0;32m 1271\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
186
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\text_classification.py:190\u001b[0m, in \u001b[0;36mTextClassificationPipeline._forward\u001b[1;34m(self, model_inputs)\u001b[0m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muse_cache\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39msignature(model_forward)\u001b[38;5;241m.\u001b[39mparameters\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m 189\u001b[0m model_inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muse_cache\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m--> 190\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m)\u001b[49m\n",
187
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
188
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
189
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:1318\u001b[0m, in \u001b[0;36mRobertaForSequenceClassification.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m 1310\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[0;32m 1312\u001b[0m \u001b[38;5;124;03m Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\u001b[39;00m\n\u001b[0;32m 1313\u001b[0m \u001b[38;5;124;03m config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\u001b[39;00m\n\u001b[0;32m 1314\u001b[0m \u001b[38;5;124;03m `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\u001b[39;00m\n\u001b[0;32m 1315\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1316\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m-> 1318\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroberta\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1319\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1320\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 
1321\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1322\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1323\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1324\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1325\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1326\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1327\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1328\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1329\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 1330\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclassifier(sequence_output)\n",
190
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
191
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
192
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:976\u001b[0m, in \u001b[0;36mRobertaModel.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m 969\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[0;32m 970\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[0;32m 971\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[0;32m 972\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[0;32m 973\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[0;32m 974\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[1;32m--> 976\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 980\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 981\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_extended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 982\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 983\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 984\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 985\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 986\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 987\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 988\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 989\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler(sequence_output) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
193
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
194
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
195
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:631\u001b[0m, in \u001b[0;36mRobertaEncoder.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m 620\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[0;32m 621\u001b[0m layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[0;32m 622\u001b[0m hidden_states,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 628\u001b[0m output_attentions,\n\u001b[0;32m 629\u001b[0m )\n\u001b[0;32m 630\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 631\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 632\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 633\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 634\u001b[0m \u001b[43m \u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 635\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 636\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 637\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 641\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 642\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n",
196
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
197
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
198
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:520\u001b[0m, in \u001b[0;36mRobertaLayer.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[0;32m 508\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\n\u001b[0;32m 509\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 510\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 517\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[0;32m 518\u001b[0m \u001b[38;5;66;03m# decoder uni-directional self-attention cached key/values tuple is at positions 1,2\u001b[39;00m\n\u001b[0;32m 519\u001b[0m self_attn_past_key_value \u001b[38;5;241m=\u001b[39m past_key_value[:\u001b[38;5;241m2\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m past_key_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m--> 520\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 521\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 522\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 524\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 525\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mself_attn_past_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 526\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 527\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 529\u001b[0m \u001b[38;5;66;03m# if decoder, the last output is tuple of self-attn cache\u001b[39;00m\n",
199
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
200
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
201
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:447\u001b[0m, in \u001b[0;36mRobertaAttention.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[0;32m 437\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\n\u001b[0;32m 438\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 439\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 445\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 446\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[1;32m--> 447\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 448\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 449\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 450\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 451\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 452\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 453\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 454\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 455\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 456\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[0;32m 457\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n",
202
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
203
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
204
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:325\u001b[0m, in \u001b[0;36mRobertaSdpaSelfAttention.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[0;32m 313\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mforward(\n\u001b[0;32m 314\u001b[0m hidden_states,\n\u001b[0;32m 315\u001b[0m attention_mask,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 320\u001b[0m output_attentions,\n\u001b[0;32m 321\u001b[0m )\n\u001b[0;32m 323\u001b[0m bsz, tgt_len, _ \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39msize()\n\u001b[1;32m--> 325\u001b[0m query_layer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtranspose_for_scores(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 327\u001b[0m \u001b[38;5;66;03m# If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention\u001b[39;00m\n\u001b[0;32m 328\u001b[0m \u001b[38;5;66;03m# mask needs to be such that the encoder's padding tokens are not attended to.\u001b[39;00m\n\u001b[0;32m 329\u001b[0m is_cross_attention \u001b[38;5;241m=\u001b[39m encoder_hidden_states \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
205
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
206
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
207
+ "File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\linear.py:116\u001b[0m, in \u001b[0;36mLinear.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 115\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[1;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
208
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
209
+ ]
210
+ }
211
+ ],
212
+ "source": [
213
+ "from tqdm import tqdm\n",
214
+ "\n",
215
+ "isbns = []\n",
216
+ "\n",
217
+ "def get_max_emotion_score(emotion, sentence_pred):\n",
218
+ " scores = []\n",
219
+ " for i in sentence_pred:\n",
220
+ " for j in i:\n",
221
+ " if j[\"label\"] == emotion:\n",
222
+ " scores.append(j[\"score\"])\n",
223
+ " return max(scores)\n",
224
+ "\n",
225
+ "\n",
226
+ "emotions = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
227
+ "\n",
228
+ "\n",
229
+ "emotions_dict = {emotion: [] for emotion in emotions}\n",
230
+ "\n",
231
+ "\n",
232
+ "for i in tqdm(range(len(df))):\n",
233
+ " isbns.append(df[\"isbn13\"][i])\n",
234
+ " sentences = df[\"description\"][i].split(\".\")\n",
235
+ " sentence_pred = classifier(sentences, batch_size=8) # batching\n",
236
+ " for emotion in emotions_dict:\n",
237
+ " max_score = get_max_emotion_score(emotion, sentence_pred)\n",
238
+ " emotions_dict[emotion].append(max_score)\n",
239
+ "\n"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 30,
245
+ "id": "6f596cee",
246
+ "metadata": {},
247
+ "outputs": [
248
+ {
249
+ "data": {
250
+ "text/plain": [
251
+ "{'anger': array([0.04047828, 0.61261988, 0.01603621, 0.35148466, 0.0814124 ,\n",
252
+ " 0.53818434, 0.13283803, 0. ]),\n",
253
+ " 'disgust': array([0.27359128, 0.34828481, 0.0606952 , 0.15072225, 0.18449552,\n",
254
+ " 0.72717494, 0.064666 , 0. ]),\n",
255
+ " 'fear': array([0.9281683 , 0.9723208 , 0.00191786, 0.36070606, 0.04019525,\n",
256
+ " 0.26585764, 0.74742717, 0. ]),\n",
257
+ " 'joy': array([0.93279803, 0.7672382 , 0.2518813 , 0.02480991, 0.035207 ,\n",
258
+ " 0.87256557, 0.00796585, 0. ]),\n",
259
+ " 'sadness': array([0.96715754, 0.06179974, 0.02098823, 0.47588021, 0.16030179,\n",
260
+ " 0.1565351 , 0.40800145, 0. ]),\n",
261
+ " 'surprise': array([0.7296021 , 0.25254625, 0.02968322, 0.07487808, 0.07487808,\n",
262
+ " 0.27190357, 0.02882093, 0. ]),\n",
263
+ " 'neutral': array([0.6462155 , 0.88793951, 0.73268509, 0.56766838, 0.8843897 ,\n",
264
+ " 0.71219414, 0.38535854, 0. ])}"
265
+ ]
266
+ },
267
+ "execution_count": 30,
268
+ "metadata": {},
269
+ "output_type": "execute_result"
270
+ }
271
+ ],
272
+ "source": [
273
+ "emotions_dict"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 31,
279
+ "id": "d3331aa5",
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "data": {
284
+ "text/plain": [
285
+ "[9780002005883,\n",
286
+ " 9780002261982,\n",
287
+ " 9780006163831,\n",
288
+ " 9780006178736,\n",
289
+ " 9780006280897,\n",
290
+ " 9780006280934,\n",
291
+ " 9780006380832,\n",
292
+ " 9780006470229]"
293
+ ]
294
+ },
295
+ "execution_count": 31,
296
+ "metadata": {},
297
+ "output_type": "execute_result"
298
+ }
299
+ ],
300
+ "source": [
301
+ "isbns"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": null,
307
+ "id": "0e8fabc9",
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": []
311
+ }
312
+ ],
313
+ "metadata": {
314
+ "kernelspec": {
315
+ "display_name": "llms",
316
+ "language": "python",
317
+ "name": "python3"
318
+ },
319
+ "language_info": {
320
+ "codemirror_mode": {
321
+ "name": "ipython",
322
+ "version": 3
323
+ },
324
+ "file_extension": ".py",
325
+ "mimetype": "text/x-python",
326
+ "name": "python",
327
+ "nbconvert_exporter": "python",
328
+ "pygments_lexer": "ipython3",
329
+ "version": "3.11.11"
330
+ }
331
+ },
332
+ "nbformat": 4,
333
+ "nbformat_minor": 5
334
+ }
supervised_clean.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import pandas as pd
4
+ from googlesearch import search
5
+ import time
6
+ import random
7
+
8
+ df = pd.read_csv("search_progress.csv")
9
+ df1 = df.drop("query_index", axis=1)
10
+
11
+ print("Initial DataFrame:")
12
+ print(df1.head())
13
+
14
+ df1.columns = ["title", "url"]
15
+
16
+ unfinished = df1[(df1.isnull().any(axis=1)) | ~((df1["url"].str.contains("amazon", na=False)) | (df1["url"].str.contains("google", na=False)))]
17
+
18
+ unfinished_list = unfinished["title"].tolist()
19
+ unfinished_urls = [None] * len(unfinished_list)
20
+
21
+
22
+
23
+ for idx,i in enumerate(unfinished_list):
24
+ print()
25
+ print(f"Processing title {idx + 1}/{len(unfinished_list)}: {i}")
26
+ try:
27
+ results1 = search(i, num_results=3, lang="en")
28
+ results2 = search(i.replace("google", "amazon"), num_results=3, lang="en")
29
+ url = list(results1) + list(results2)
30
+ count = 0
31
+ print("\n")
32
+ print(f"Searching for: {i}")
33
+ for j in url:
34
+ count += 1
35
+ print(count, j)
36
+ index = int(input("Enter the index of the correct URL (1-3): ")) - 1
37
+ unfinished_urls[idx] = url[index]
38
+ except Exception as e:
39
+ print(f"Error occurred while searching for {i}: {e}")
40
+ unfinished_urls[idx] = None
41
+ time.sleep(random.randint(1,5)) # Sleep to avoid hitting the search API too quickly
42
+
43
+ unfinished["url"] = unfinished_urls
44
+ print("Updated DataFrame with URLs:")
45
+ print(unfinished.head())
46
+
47
+ df1.update(unfinished)
48
+ df1.to_csv("search_progress1.csv", index=False)
49
+
50
+
tagged_description.txt ADDED
The diff for this file is too large to render. See raw diff
 
test_classification.ipynb ADDED
@@ -0,0 +1,649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "290dff84",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "\n",
12
+ "df = pd.read_csv(\"books_cleaned.csv\", encoding=\"utf-8\")"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 3,
18
+ "id": "2e2d9604",
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "name": "stdout",
23
+ "output_type": "stream",
24
+ "text": [
25
+ "<class 'pandas.core.frame.DataFrame'>\n",
26
+ "RangeIndex: 6397 entries, 0 to 6396\n",
27
+ "Data columns (total 11 columns):\n",
28
+ " # Column Non-Null Count Dtype \n",
29
+ "--- ------ -------------- ----- \n",
30
+ " 0 isbn13 6397 non-null int64 \n",
31
+ " 1 authors 6397 non-null object \n",
32
+ " 2 categories 6364 non-null object \n",
33
+ " 3 thumbnail 6190 non-null object \n",
34
+ " 4 description 6397 non-null object \n",
35
+ " 5 published_year 6397 non-null float64\n",
36
+ " 6 average_rating 6397 non-null float64\n",
37
+ " 7 num_pages 6397 non-null float64\n",
38
+ " 8 ratings_count 6397 non-null float64\n",
39
+ " 9 title_and_subtitle 6397 non-null object \n",
40
+ " 10 tagged_description 6397 non-null object \n",
41
+ "dtypes: float64(4), int64(1), object(6)\n",
42
+ "memory usage: 549.9+ KB\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "df.info()"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 4,
53
+ "id": "06585b26",
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "data": {
58
+ "text/html": [
59
+ "<div>\n",
60
+ "<style scoped>\n",
61
+ " .dataframe tbody tr th:only-of-type {\n",
62
+ " vertical-align: middle;\n",
63
+ " }\n",
64
+ "\n",
65
+ " .dataframe tbody tr th {\n",
66
+ " vertical-align: top;\n",
67
+ " }\n",
68
+ "\n",
69
+ " .dataframe thead th {\n",
70
+ " text-align: right;\n",
71
+ " }\n",
72
+ "</style>\n",
73
+ "<table border=\"1\" class=\"dataframe\">\n",
74
+ " <thead>\n",
75
+ " <tr style=\"text-align: right;\">\n",
76
+ " <th></th>\n",
77
+ " <th>categories</th>\n",
78
+ " <th>count</th>\n",
79
+ " </tr>\n",
80
+ " </thead>\n",
81
+ " <tbody>\n",
82
+ " <tr>\n",
83
+ " <th>0</th>\n",
84
+ " <td>Fiction</td>\n",
85
+ " <td>2491</td>\n",
86
+ " </tr>\n",
87
+ " <tr>\n",
88
+ " <th>1</th>\n",
89
+ " <td>Juvenile Fiction</td>\n",
90
+ " <td>519</td>\n",
91
+ " </tr>\n",
92
+ " <tr>\n",
93
+ " <th>2</th>\n",
94
+ " <td>Biography &amp; Autobiography</td>\n",
95
+ " <td>388</td>\n",
96
+ " </tr>\n",
97
+ " <tr>\n",
98
+ " <th>3</th>\n",
99
+ " <td>History</td>\n",
100
+ " <td>255</td>\n",
101
+ " </tr>\n",
102
+ " <tr>\n",
103
+ " <th>4</th>\n",
104
+ " <td>Literary Criticism</td>\n",
105
+ " <td>163</td>\n",
106
+ " </tr>\n",
107
+ " <tr>\n",
108
+ " <th>...</th>\n",
109
+ " <td>...</td>\n",
110
+ " <td>...</td>\n",
111
+ " </tr>\n",
112
+ " <tr>\n",
113
+ " <th>520</th>\n",
114
+ " <td>Humorous stories</td>\n",
115
+ " <td>1</td>\n",
116
+ " </tr>\n",
117
+ " <tr>\n",
118
+ " <th>521</th>\n",
119
+ " <td>Ballets</td>\n",
120
+ " <td>1</td>\n",
121
+ " </tr>\n",
122
+ " <tr>\n",
123
+ " <th>522</th>\n",
124
+ " <td>Aged women</td>\n",
125
+ " <td>1</td>\n",
126
+ " </tr>\n",
127
+ " <tr>\n",
128
+ " <th>523</th>\n",
129
+ " <td>Imperialism</td>\n",
130
+ " <td>1</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>524</th>\n",
134
+ " <td>Illinois</td>\n",
135
+ " <td>1</td>\n",
136
+ " </tr>\n",
137
+ " </tbody>\n",
138
+ "</table>\n",
139
+ "<p>525 rows × 2 columns</p>\n",
140
+ "</div>"
141
+ ],
142
+ "text/plain": [
143
+ " categories count\n",
144
+ "0 Fiction 2491\n",
145
+ "1 Juvenile Fiction 519\n",
146
+ "2 Biography & Autobiography 388\n",
147
+ "3 History 255\n",
148
+ "4 Literary Criticism 163\n",
149
+ ".. ... ...\n",
150
+ "520 Humorous stories 1\n",
151
+ "521 Ballets 1\n",
152
+ "522 Aged women 1\n",
153
+ "523 Imperialism 1\n",
154
+ "524 Illinois 1\n",
155
+ "\n",
156
+ "[525 rows x 2 columns]"
157
+ ]
158
+ },
159
+ "execution_count": 4,
160
+ "metadata": {},
161
+ "output_type": "execute_result"
162
+ }
163
+ ],
164
+ "source": [
165
+ "# too many categories\n",
166
+ "df[\"categories\"].value_counts().reset_index()"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 5,
172
+ "id": "1976240c",
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "data": {
177
+ "text/html": [
178
+ "<div>\n",
179
+ "<style scoped>\n",
180
+ " .dataframe tbody tr th:only-of-type {\n",
181
+ " vertical-align: middle;\n",
182
+ " }\n",
183
+ "\n",
184
+ " .dataframe tbody tr th {\n",
185
+ " vertical-align: top;\n",
186
+ " }\n",
187
+ "\n",
188
+ " .dataframe thead th {\n",
189
+ " text-align: right;\n",
190
+ " }\n",
191
+ "</style>\n",
192
+ "<table border=\"1\" class=\"dataframe\">\n",
193
+ " <thead>\n",
194
+ " <tr style=\"text-align: right;\">\n",
195
+ " <th></th>\n",
196
+ " <th>isbn13</th>\n",
197
+ " <th>authors</th>\n",
198
+ " <th>categories</th>\n",
199
+ " <th>thumbnail</th>\n",
200
+ " <th>description</th>\n",
201
+ " <th>published_year</th>\n",
202
+ " <th>average_rating</th>\n",
203
+ " <th>num_pages</th>\n",
204
+ " <th>ratings_count</th>\n",
205
+ " <th>title_and_subtitle</th>\n",
206
+ " <th>tagged_description</th>\n",
207
+ " </tr>\n",
208
+ " </thead>\n",
209
+ " <tbody>\n",
210
+ " </tbody>\n",
211
+ "</table>\n",
212
+ "</div>"
213
+ ],
214
+ "text/plain": [
215
+ "Empty DataFrame\n",
216
+ "Columns: [isbn13, authors, categories, thumbnail, description, published_year, average_rating, num_pages, ratings_count, title_and_subtitle, tagged_description]\n",
217
+ "Index: []"
218
+ ]
219
+ },
220
+ "execution_count": 5,
221
+ "metadata": {},
222
+ "output_type": "execute_result"
223
+ }
224
+ ],
225
+ "source": [
226
+ "df[df[\"description\"].str.len() < 25 ]"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": null,
232
+ "id": "8effbaa7",
233
+ "metadata": {},
234
+ "outputs": [],
235
+ "source": []
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 6,
240
+ "id": "7a11c3d3",
241
+ "metadata": {},
242
+ "outputs": [
243
+ {
244
+ "name": "stderr",
245
+ "output_type": "stream",
246
+ "text": [
247
+ "c:\\Users\\NonsoDev\\Documents\\Allcodes\\Projects_DL_for resume\\Recommender systems\\book reccomender - llm\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
248
+ " from .autonotebook import tqdm as notebook_tqdm\n",
249
+ "Device set to use cpu\n"
250
+ ]
251
+ }
252
+ ],
253
+ "source": [
254
+ "# let us change the categories to a more manageable number, fiction and non fiction with a zero shot classifier\n",
255
+ "from transformers import pipeline\n",
256
+ "classifier = pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\")"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 7,
262
+ "id": "3cc8882a",
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "data": {
267
+ "text/html": [
268
+ "<div>\n",
269
+ "<style scoped>\n",
270
+ " .dataframe tbody tr th:only-of-type {\n",
271
+ " vertical-align: middle;\n",
272
+ " }\n",
273
+ "\n",
274
+ " .dataframe tbody tr th {\n",
275
+ " vertical-align: top;\n",
276
+ " }\n",
277
+ "\n",
278
+ " .dataframe thead th {\n",
279
+ " text-align: right;\n",
280
+ " }\n",
281
+ "</style>\n",
282
+ "<table border=\"1\" class=\"dataframe\">\n",
283
+ " <thead>\n",
284
+ " <tr style=\"text-align: right;\">\n",
285
+ " <th></th>\n",
286
+ " <th>sequence</th>\n",
287
+ " <th>categories</th>\n",
288
+ " <th>scores</th>\n",
289
+ " </tr>\n",
290
+ " </thead>\n",
291
+ " <tbody>\n",
292
+ " <tr>\n",
293
+ " <th>0</th>\n",
294
+ " <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
295
+ " <td>[fiction, history, biography, fantasy, mystery...</td>\n",
296
+ " <td>[0.8558421730995178, 0.6128803491592407, 0.296...</td>\n",
297
+ " </tr>\n",
298
+ " <tr>\n",
299
+ " <th>1</th>\n",
300
+ " <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
301
+ " <td>[mystery, fiction, fantasy, scifi, biography, ...</td>\n",
302
+ " <td>[0.9339157342910767, 0.5139176249504089, 0.155...</td>\n",
303
+ " </tr>\n",
304
+ " <tr>\n",
305
+ " <th>2</th>\n",
306
+ " <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
307
+ " <td>[fiction, fantasy, history, scifi, biography, ...</td>\n",
308
+ " <td>[0.5638813972473145, 0.2660749554634094, 0.249...</td>\n",
309
+ " </tr>\n",
310
+ " <tr>\n",
311
+ " <th>3</th>\n",
312
+ " <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
313
+ " <td>[scifi, biography, fiction, history, romance, ...</td>\n",
314
+ " <td>[0.19755955040454865, 0.09938773512840271, 0.0...</td>\n",
315
+ " </tr>\n",
316
+ " <tr>\n",
317
+ " <th>4</th>\n",
318
+ " <td>Lewis' work on the nature of love divides love...</td>\n",
319
+ " <td>[mystery, romance, history, biography, scifi, ...</td>\n",
320
+ " <td>[0.16078977286815643, 0.06188512220978737, 0.0...</td>\n",
321
+ " </tr>\n",
322
+ " <tr>\n",
323
+ " <th>5</th>\n",
324
+ " <td>\"In The Problem of Pain, C.S. Lewis, one of th...</td>\n",
325
+ " <td>[mystery, history, biography, scifi, romance, ...</td>\n",
326
+ " <td>[0.6848734021186829, 0.11091233044862747, 0.08...</td>\n",
327
+ " </tr>\n",
328
+ " <tr>\n",
329
+ " <th>6</th>\n",
330
+ " <td>Until Vasco da Gama discovered the sea-route t...</td>\n",
331
+ " <td>[history, mystery, biography, scifi, fiction, ...</td>\n",
332
+ " <td>[0.9738430976867676, 0.19697055220603943, 0.18...</td>\n",
333
+ " </tr>\n",
334
+ " <tr>\n",
335
+ " <th>7</th>\n",
336
+ " <td>A new-cover reissue of the fourth book in the ...</td>\n",
337
+ " <td>[scifi, fantasy, fiction, mystery, history, ro...</td>\n",
338
+ " <td>[0.9945376515388489, 0.9806752800941467, 0.934...</td>\n",
339
+ " </tr>\n",
340
+ " <tr>\n",
341
+ " <th>8</th>\n",
342
+ " <td>Kate Blackwell is an enigma and one of the mos...</td>\n",
343
+ " <td>[mystery, biography, fiction, scifi, history, ...</td>\n",
344
+ " <td>[0.9990025162696838, 0.43301281332969666, 0.04...</td>\n",
345
+ " </tr>\n",
346
+ " <tr>\n",
347
+ " <th>9</th>\n",
348
+ " <td>One of Sidney Sheldon's most popular and bests...</td>\n",
349
+ " <td>[romance, mystery, biography, fantasy, scifi, ...</td>\n",
350
+ " <td>[0.6518456935882568, 0.4315004348754883, 0.367...</td>\n",
351
+ " </tr>\n",
352
+ " </tbody>\n",
353
+ "</table>\n",
354
+ "</div>"
355
+ ],
356
+ "text/plain": [
357
+ " sequence \\\n",
358
+ "0 A NOVEL THAT READERS and critics have been eag... \n",
359
+ "1 A new 'Christie for Christmas' -- a full-lengt... \n",
360
+ "2 Volume Two of Stephen Donaldson's acclaimed se... \n",
361
+ "3 A memorable, mesmerizing heroine Jennifer -- b... \n",
362
+ "4 Lewis' work on the nature of love divides love... \n",
363
+ "5 \"In The Problem of Pain, C.S. Lewis, one of th... \n",
364
+ "6 Until Vasco da Gama discovered the sea-route t... \n",
365
+ "7 A new-cover reissue of the fourth book in the ... \n",
366
+ "8 Kate Blackwell is an enigma and one of the mos... \n",
367
+ "9 One of Sidney Sheldon's most popular and bests... \n",
368
+ "\n",
369
+ " categories \\\n",
370
+ "0 [fiction, history, biography, fantasy, mystery... \n",
371
+ "1 [mystery, fiction, fantasy, scifi, biography, ... \n",
372
+ "2 [fiction, fantasy, history, scifi, biography, ... \n",
373
+ "3 [scifi, biography, fiction, history, romance, ... \n",
374
+ "4 [mystery, romance, history, biography, scifi, ... \n",
375
+ "5 [mystery, history, biography, scifi, romance, ... \n",
376
+ "6 [history, mystery, biography, scifi, fiction, ... \n",
377
+ "7 [scifi, fantasy, fiction, mystery, history, ro... \n",
378
+ "8 [mystery, biography, fiction, scifi, history, ... \n",
379
+ "9 [romance, mystery, biography, fantasy, scifi, ... \n",
380
+ "\n",
381
+ " scores \n",
382
+ "0 [0.8558421730995178, 0.6128803491592407, 0.296... \n",
383
+ "1 [0.9339157342910767, 0.5139176249504089, 0.155... \n",
384
+ "2 [0.5638813972473145, 0.2660749554634094, 0.249... \n",
385
+ "3 [0.19755955040454865, 0.09938773512840271, 0.0... \n",
386
+ "4 [0.16078977286815643, 0.06188512220978737, 0.0... \n",
387
+ "5 [0.6848734021186829, 0.11091233044862747, 0.08... \n",
388
+ "6 [0.9738430976867676, 0.19697055220603943, 0.18... \n",
389
+ "7 [0.9945376515388489, 0.9806752800941467, 0.934... \n",
390
+ "8 [0.9990025162696838, 0.43301281332969666, 0.04... \n",
391
+ "9 [0.6518456935882568, 0.4315004348754883, 0.367... "
392
+ ]
393
+ },
394
+ "execution_count": 7,
395
+ "metadata": {},
396
+ "output_type": "execute_result"
397
+ }
398
+ ],
399
+ "source": [
400
+ "fiction_categories = [\"fiction\",\"mystery\",\"romance\",\"scifi\",\"fantasy\",\"biography\",\"history\"]\n",
401
+ "\n",
402
+ "df.head(10).apply(\n",
403
+ " lambda x: classifier(x[\"description\"], candidate_labels=fiction_categories, multi_label=True),\n",
404
+ " axis=1,\n",
405
+ " result_type=\"expand\",\n",
406
+ ").rename(columns={\"labels\": \"categories\", \"scores\": \"scores\"})"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 8,
412
+ "id": "365964c7",
413
+ "metadata": {},
414
+ "outputs": [
415
+ {
416
+ "data": {
417
+ "text/html": [
418
+ "<div>\n",
419
+ "<style scoped>\n",
420
+ " .dataframe tbody tr th:only-of-type {\n",
421
+ " vertical-align: middle;\n",
422
+ " }\n",
423
+ "\n",
424
+ " .dataframe tbody tr th {\n",
425
+ " vertical-align: top;\n",
426
+ " }\n",
427
+ "\n",
428
+ " .dataframe thead th {\n",
429
+ " text-align: right;\n",
430
+ " }\n",
431
+ "</style>\n",
432
+ "<table border=\"1\" class=\"dataframe\">\n",
433
+ " <thead>\n",
434
+ " <tr style=\"text-align: right;\">\n",
435
+ " <th></th>\n",
436
+ " <th>isbn13</th>\n",
437
+ " <th>authors</th>\n",
438
+ " <th>categories</th>\n",
439
+ " <th>thumbnail</th>\n",
440
+ " <th>description</th>\n",
441
+ " <th>published_year</th>\n",
442
+ " <th>average_rating</th>\n",
443
+ " <th>num_pages</th>\n",
444
+ " <th>ratings_count</th>\n",
445
+ " <th>title_and_subtitle</th>\n",
446
+ " <th>tagged_description</th>\n",
447
+ " </tr>\n",
448
+ " </thead>\n",
449
+ " <tbody>\n",
450
+ " <tr>\n",
451
+ " <th>0</th>\n",
452
+ " <td>9780002005883</td>\n",
453
+ " <td>Marilynne Robinson</td>\n",
454
+ " <td>Fiction</td>\n",
455
+ " <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
456
+ " <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
457
+ " <td>2004.0</td>\n",
458
+ " <td>3.85</td>\n",
459
+ " <td>247.0</td>\n",
460
+ " <td>361.0</td>\n",
461
+ " <td>Gilead</td>\n",
462
+ " <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
463
+ " </tr>\n",
464
+ " <tr>\n",
465
+ " <th>1</th>\n",
466
+ " <td>9780002261982</td>\n",
467
+ " <td>Charles Osborne;Agatha Christie</td>\n",
468
+ " <td>Detective and mystery stories</td>\n",
469
+ " <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
470
+ " <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
471
+ " <td>2000.0</td>\n",
472
+ " <td>3.83</td>\n",
473
+ " <td>241.0</td>\n",
474
+ " <td>5164.0</td>\n",
475
+ " <td>Spider's Web A Novel</td>\n",
476
+ " <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
477
+ " </tr>\n",
478
+ " <tr>\n",
479
+ " <th>2</th>\n",
480
+ " <td>9780006163831</td>\n",
481
+ " <td>Stephen R. Donaldson</td>\n",
482
+ " <td>American fiction</td>\n",
483
+ " <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
484
+ " <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
485
+ " <td>1982.0</td>\n",
486
+ " <td>3.97</td>\n",
487
+ " <td>479.0</td>\n",
488
+ " <td>172.0</td>\n",
489
+ " <td>The One Tree</td>\n",
490
+ " <td>9780006163831 Volume Two of Stephen Donaldson'...</td>\n",
491
+ " </tr>\n",
492
+ " <tr>\n",
493
+ " <th>3</th>\n",
494
+ " <td>9780006178736</td>\n",
495
+ " <td>Sidney Sheldon</td>\n",
496
+ " <td>Fiction</td>\n",
497
+ " <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
498
+ " <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
499
+ " <td>1993.0</td>\n",
500
+ " <td>3.93</td>\n",
501
+ " <td>512.0</td>\n",
502
+ " <td>29532.0</td>\n",
503
+ " <td>Rage of angels</td>\n",
504
+ " <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
505
+ " </tr>\n",
506
+ " <tr>\n",
507
+ " <th>4</th>\n",
508
+ " <td>9780006280897</td>\n",
509
+ " <td>Clive Staples Lewis</td>\n",
510
+ " <td>Christian life</td>\n",
511
+ " <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
512
+ " <td>Lewis' work on the nature of love divides love...</td>\n",
513
+ " <td>2002.0</td>\n",
514
+ " <td>4.15</td>\n",
515
+ " <td>170.0</td>\n",
516
+ " <td>33684.0</td>\n",
517
+ " <td>The Four Loves</td>\n",
518
+ " <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
519
+ " </tr>\n",
520
+ " </tbody>\n",
521
+ "</table>\n",
522
+ "</div>"
523
+ ],
524
+ "text/plain": [
525
+ " isbn13 authors \\\n",
526
+ "0 9780002005883 Marilynne Robinson \n",
527
+ "1 9780002261982 Charles Osborne;Agatha Christie \n",
528
+ "2 9780006163831 Stephen R. Donaldson \n",
529
+ "3 9780006178736 Sidney Sheldon \n",
530
+ "4 9780006280897 Clive Staples Lewis \n",
531
+ "\n",
532
+ " categories \\\n",
533
+ "0 Fiction \n",
534
+ "1 Detective and mystery stories \n",
535
+ "2 American fiction \n",
536
+ "3 Fiction \n",
537
+ "4 Christian life \n",
538
+ "\n",
539
+ " thumbnail \\\n",
540
+ "0 http://books.google.com/books/content?id=KQZCP... \n",
541
+ "1 http://books.google.com/books/content?id=gA5GP... \n",
542
+ "2 http://books.google.com/books/content?id=OmQaw... \n",
543
+ "3 http://books.google.com/books/content?id=FKo2T... \n",
544
+ "4 http://books.google.com/books/content?id=XhQ5X... \n",
545
+ "\n",
546
+ " description published_year \\\n",
547
+ "0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
548
+ "1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
549
+ "2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
550
+ "3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
551
+ "4 Lewis' work on the nature of love divides love... 2002.0 \n",
552
+ "\n",
553
+ " average_rating num_pages ratings_count title_and_subtitle \\\n",
554
+ "0 3.85 247.0 361.0 Gilead \n",
555
+ "1 3.83 241.0 5164.0 Spider's Web A Novel \n",
556
+ "2 3.97 479.0 172.0 The One Tree \n",
557
+ "3 3.93 512.0 29532.0 Rage of angels \n",
558
+ "4 4.15 170.0 33684.0 The Four Loves \n",
559
+ "\n",
560
+ " tagged_description \n",
561
+ "0 9780002005883 A NOVEL THAT READERS and critics... \n",
562
+ "1 9780002261982 A new 'Christie for Christmas' -... \n",
563
+ "2 9780006163831 Volume Two of Stephen Donaldson'... \n",
564
+ "3 9780006178736 A memorable, mesmerizing heroine... \n",
565
+ "4 9780006280897 Lewis' work on the nature of lov... "
566
+ ]
567
+ },
568
+ "execution_count": 8,
569
+ "metadata": {},
570
+ "output_type": "execute_result"
571
+ }
572
+ ],
573
+ "source": [
574
+ "df.head()"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": null,
580
+ "id": "96c4d3c9",
581
+ "metadata": {},
582
+ "outputs": [],
583
+ "source": [
584
+ "from googlesearch import search\n",
585
+ "\n",
586
+ "def fetch_first_google_link(query):\n",
587
+ " results = search(query, num_results=1, lang=\"en\")\n",
588
+ " return list(results) if results else None\n",
589
+ "\n",
590
+ "\n"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "execution_count": 25,
596
+ "id": "402a59f3",
597
+ "metadata": {},
598
+ "outputs": [
599
+ {
600
+ "name": "stdout",
601
+ "output_type": "stream",
602
+ "text": [
603
+ "['https://books.google.com/books/about/The_One_Tree.html?id=dXzwAAAAQBAJ&source=kp_cover']\n"
604
+ ]
605
+ }
606
+ ],
607
+ "source": [
608
+ "print(fetch_first_google_link(\"The One Tree by Stephen R. Donaldson -google books\"))"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": null,
614
+ "id": "be3349dc",
615
+ "metadata": {},
616
+ "outputs": [],
617
+ "source": []
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": null,
622
+ "id": "7da59931",
623
+ "metadata": {},
624
+ "outputs": [],
625
+ "source": []
626
+ }
627
+ ],
628
+ "metadata": {
629
+ "kernelspec": {
630
+ "display_name": "venv",
631
+ "language": "python",
632
+ "name": "python3"
633
+ },
634
+ "language_info": {
635
+ "codemirror_mode": {
636
+ "name": "ipython",
637
+ "version": 3
638
+ },
639
+ "file_extension": ".py",
640
+ "mimetype": "text/x-python",
641
+ "name": "python",
642
+ "nbconvert_exporter": "python",
643
+ "pygments_lexer": "ipython3",
644
+ "version": "3.11.9"
645
+ }
646
+ },
647
+ "nbformat": 4,
648
+ "nbformat_minor": 5
649
+ }
to_drop.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 768
2
+ 806
3
+ 1170
4
+ 1269
5
+ 1311
6
+ 1343
7
+ 2311
8
+ 2389
9
+ 2536
10
+ 3270
11
+ 3572
12
+ 4228
13
+ 4941
14
+ 5292
15
+ 5293
16
+ 6085
vector_search.ipynb ADDED
The diff for this file is too large to render. See raw diff