Spaces:
Sleeping
Sleeping
Commit ·
d38101e
0
Parent(s):
first commit
Browse files- .gitignore +2 -0
- .html +166 -0
- books_cleaned.csv +0 -0
- books_with_categories.csv +0 -0
- books_with_sentiment.csv +0 -0
- books_with_urls.csv +0 -0
- cover-not-found.jpg +0 -0
- data-exploration.ipynb +1111 -0
- download_url.ipynb +1353 -0
- final_book_df.csv +0 -0
- final_df.ipynb +590 -0
- gradio_dashboard.py +122 -0
- requirements.txt +0 -0
- search_progress.csv +0 -0
- sentiment_analysis.ipynb +334 -0
- supervised_clean.py +50 -0
- tagged_description.txt +0 -0
- test_classification.ipynb +649 -0
- to_drop.txt +16 -0
- vector_search.ipynb +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
venv/
|
| 2 |
+
.env
|
.html
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<html>
|
| 2 |
+
<head>
|
| 3 |
+
<link rel="preconnect" href="https://fonts.gstatic.com/" crossorigin="" />
|
| 4 |
+
<link
|
| 5 |
+
rel="stylesheet"
|
| 6 |
+
as="style"
|
| 7 |
+
onload="this.rel='stylesheet'"
|
| 8 |
+
href="https://fonts.googleapis.com/css2?display=swap&family=Noto+Serif%3Awght%40400%3B500%3B700%3B900&family=Noto+Sans%3Awght%40400%3B500%3B700%3B900"
|
| 9 |
+
/>
|
| 10 |
+
|
| 11 |
+
<title>BookWise - Responsive</title>
|
| 12 |
+
<link rel="icon" type="image/x-icon" href="data:image/x-icon;base64," />
|
| 13 |
+
|
| 14 |
+
<script src="https://cdn.tailwindcss.com?plugins=forms,container-queries"></script>
|
| 15 |
+
</head>
|
| 16 |
+
<body>
|
| 17 |
+
<div
|
| 18 |
+
class="relative flex size-full min-h-screen flex-col bg-white group/design-root overflow-x-hidden"
|
| 19 |
+
style="--select-button-svg: url('data:image/svg+xml,%3csvg xmlns=%27http://www.w3.org/2000/svg%27 width=%2724px%27 height=%2724px%27 fill=%27rgb(99,116,136)%27 viewBox=%270 0 256 256%27%3e%3cpath d=%27M181.66,170.34a8,8,0,0,1,0,11.32l-48,48a8,8,0,0,1-11.32,0l-48-48a8,8,0,0,1,11.32-11.32L128,212.69l42.34-42.35A8,8,0,0,1,181.66,170.34Zm-96-84.68L128,43.31l42.34,42.35a8,8,0,0,0,11.32-11.32l-48-48a8,8,0,0,0-11.32,0l-48,48A8,8,0,0,0,85.66,85.66Z%27%3e%3c/path%3e%3c/svg%3e'); font-family: 'Noto Serif', 'Noto Sans', sans-serif;"
|
| 20 |
+
>
|
| 21 |
+
<div class="layout-container flex h-full grow flex-col">
|
| 22 |
+
<header class="flex items-center justify-between whitespace-nowrap border-b border-solid border-b-[#f0f2f4] px-4 sm:px-6 lg:px-10 py-3">
|
| 23 |
+
<div class="flex items-center gap-4 sm:gap-8">
|
| 24 |
+
<div class="flex items-center gap-4 text-[#111418]">
|
| 25 |
+
<div class="size-4">
|
| 26 |
+
<svg viewBox="0 0 48 48" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M6 6H42L36 24L42 42H6L12 24L6 6Z" fill="currentColor"></path></svg>
|
| 27 |
+
</div>
|
| 28 |
+
<h2 class="text-[#111418] text-lg font-bold leading-tight tracking-[-0.015em]">BookWise</h2>
|
| 29 |
+
</div>
|
| 30 |
+
<div class="flex items-center gap-4 sm:gap-6 lg:gap-9">
|
| 31 |
+
<a class="text-[#111418] text-sm font-medium leading-normal" href="#">Home</a>
|
| 32 |
+
<a class="text-[#111418] text-sm font-medium leading-normal" href="#">My Library</a>
|
| 33 |
+
</div>
|
| 34 |
+
</div>
|
| 35 |
+
<div class="flex flex-1 justify-end gap-2 sm:gap-4 lg:gap-8">
|
| 36 |
+
<label class="hidden sm:flex flex-col min-w-32 md:min-w-40 !h-10 max-w-64">
|
| 37 |
+
<div class="flex w-full flex-1 items-stretch rounded-xl h-full">
|
| 38 |
+
<input
|
| 39 |
+
type="search"
|
| 40 |
+
placeholder="Search books..."
|
| 41 |
+
class="flex-1 px-3 py-2 text-sm border border-[#dce0e5] rounded-xl focus:outline-none focus:border-[#1669c9]"
|
| 42 |
+
/>
|
| 43 |
+
</div>
|
| 44 |
+
</label>
|
| 45 |
+
<div
|
| 46 |
+
class="bg-center bg-no-repeat aspect-square bg-cover rounded-full size-8 sm:size-10"
|
| 47 |
+
style='background-image: url("https://lh3.googleusercontent.com/aida-public/AB6AXuBUFyw4SUtL0Bo77m9z_aqCPzx8jrYQ23Iz0YTuhs46ShVxgc5Soj1GQwKpCt9ZxQMSKH5bT-lodTkHcLdV3_qNp12gLkZTdHBtJFt5bDjUqT7CZHFN0QfWSoqRdPy4zx8RW_6N_MEmDJckbi0Ea2st3Kx-6gFPNMFCOLx2ofYQrOiSQ_kbKQw-wWQ7H8CvhkaTaXLXGEcpDXN5EJA8-UbK19-eAe34zXeJkXlqE3873k0hhvB6XGP2etAtUFf0e17br6aohFXWQKw");'
|
| 48 |
+
></div>
|
| 49 |
+
</div>
|
| 50 |
+
</header>
|
| 51 |
+
<div class="px-4 sm:px-8 lg:px-40 flex flex-1 justify-center py-5">
|
| 52 |
+
<div class="layout-content-container flex flex-col max-w-[960px] flex-1">
|
| 53 |
+
<div class="flex flex-wrap justify-between gap-3 p-4">
|
| 54 |
+
<p class="text-[#111418] tracking-light text-2xl sm:text-[32px] font-bold leading-tight min-w-0">Describe Your Ideal Book</p>
|
| 55 |
+
</div>
|
| 56 |
+
<div class="flex max-w-full sm:max-w-[480px] flex-wrap items-end gap-4 px-4 py-3">
|
| 57 |
+
<label class="flex flex-col min-w-40 flex-1">
|
| 58 |
+
<textarea
|
| 59 |
+
placeholder="Enter a few words or a detailed description of a book you enjoyed or wish to read."
|
| 60 |
+
class="form-input flex w-full min-w-0 flex-1 resize-none overflow-hidden rounded-xl text-[#111418] focus:outline-0 focus:ring-0 border border-[#dce0e5] bg-white focus:border-[#dce0e5] min-h-24 sm:min-h-36 placeholder:text-[#637488] p-[15px] text-base font-normal leading-normal"
|
| 61 |
+
></textarea>
|
| 62 |
+
</label>
|
| 63 |
+
</div>
|
| 64 |
+
<div class="flex max-w-full sm:max-w-[480px] flex-wrap items-end gap-4 px-4 py-3">
|
| 65 |
+
<label class="flex flex-col min-w-40 flex-1">
|
| 66 |
+
<p class="text-[#111418] text-base font-medium leading-normal pb-2">Genre</p>
|
| 67 |
+
<select
|
| 68 |
+
class="form-input flex w-full min-w-0 flex-1 resize-none overflow-hidden rounded-xl text-[#111418] focus:outline-0 focus:ring-0 border border-[#dce0e5] bg-white focus:border-[#dce0e5] h-14 bg-[image:--select-button-svg] placeholder:text-[#637488] p-[15px] text-base font-normal leading-normal"
|
| 69 |
+
>
|
| 70 |
+
<option value="">Select a genre</option>
|
| 71 |
+
<option value="fiction">Fiction</option>
|
| 72 |
+
<option value="mystery">Mystery</option>
|
| 73 |
+
<option value="romance">Romance</option>
|
| 74 |
+
<option value="scifi">Science Fiction</option>
|
| 75 |
+
<option value="fantasy">Fantasy</option>
|
| 76 |
+
<option value="biography">Biography</option>
|
| 77 |
+
<option value="history">History</option>
|
| 78 |
+
</select>
|
| 79 |
+
</label>
|
| 80 |
+
</div>
|
| 81 |
+
<div class="flex max-w-full sm:max-w-[480px] flex-wrap items-end gap-4 px-4 py-3">
|
| 82 |
+
<label class="flex flex-col min-w-40 flex-1">
|
| 83 |
+
<p class="text-[#111418] text-base font-medium leading-normal pb-2">Tone</p>
|
| 84 |
+
<select
|
| 85 |
+
class="form-input flex w-full min-w-0 flex-1 resize-none overflow-hidden rounded-xl text-[#111418] focus:outline-0 focus:ring-0 border border-[#dce0e5] bg-white focus:border-[#dce0e5] h-14 bg-[image:--select-button-svg] placeholder:text-[#637488] p-[15px] text-base font-normal leading-normal"
|
| 86 |
+
>
|
| 87 |
+
<option value="">Select a tone</option>
|
| 88 |
+
<option value="light">Light & Humorous</option>
|
| 89 |
+
<option value="serious">Serious & Dramatic</option>
|
| 90 |
+
<option value="dark">Dark & Mysterious</option>
|
| 91 |
+
<option value="uplifting">Uplifting & Inspiring</option>
|
| 92 |
+
<option value="thoughtful">Thoughtful & Reflective</option>
|
| 93 |
+
</select>
|
| 94 |
+
</label>
|
| 95 |
+
</div>
|
| 96 |
+
<div class="flex px-4 py-3">
|
| 97 |
+
<button
|
| 98 |
+
class="flex min-w-[84px] max-w-full sm:max-w-[480px] cursor-pointer items-center justify-center overflow-hidden rounded-full h-12 px-5 flex-1 bg-[#1669c9] text-white text-base font-bold leading-normal tracking-[0.015em]"
|
| 99 |
+
>
|
| 100 |
+
<span class="truncate">Search</span>
|
| 101 |
+
</button>
|
| 102 |
+
</div>
|
| 103 |
+
<h2 class="text-[#111418] text-xl sm:text-[22px] font-bold leading-tight tracking-[-0.015em] px-4 pb-3 pt-5">Recommended Books</h2>
|
| 104 |
+
<div class="p-4">
|
| 105 |
+
<div class="flex flex-col sm:flex-row items-stretch justify-between gap-4 rounded-xl">
|
| 106 |
+
<div class="flex flex-col sm:flex-[2_2_0px] gap-4">
|
| 107 |
+
<div class="flex flex-col gap-1">
|
| 108 |
+
<p class="text-[#111418] text-base font-bold leading-tight">The Secret Garden</p>
|
| 109 |
+
<p class="text-[#637488] text-sm font-normal leading-normal">Frances Bennett | A young girl discovers a hidden garden and unlocks its mysteries.</p>
|
| 110 |
+
</div>
|
| 111 |
+
<button
|
| 112 |
+
class="flex min-w-[84px] max-w-[480px] cursor-pointer items-center justify-center overflow-hidden rounded-full h-8 px-4 flex-row-reverse bg-[#f0f2f4] text-[#111418] text-sm font-medium leading-normal w-fit"
|
| 113 |
+
>
|
| 114 |
+
<span class="truncate">More Details</span>
|
| 115 |
+
</button>
|
| 116 |
+
</div>
|
| 117 |
+
<div
|
| 118 |
+
class="w-full bg-center bg-no-repeat aspect-video bg-cover rounded-xl flex-1 min-h-32 sm:min-h-0"
|
| 119 |
+
style='background-image: url("https://lh3.googleusercontent.com/aida-public/AB6AXuAhvJGw2xq2ulM1eOrAvTprpJpVc1vuviCpOk8vShEIBbBx6QE73cIwxDuKmNFPbS1D_uvqCRhBTFMIRIbEhMCoNvAu4T2c3GUpj-Ek1cgDY-S88u5m3Djfv3jKbmWHyzo9bSf3w1MZgWEevsLl5Ug3NWZ49xQB46X4MpQb9BRL6MjvUI12TbRp-P2ho9PALgBlj7Y2ZIVWKVQSHkwgO7_aeYqeQNTKOS4RxQrwHKBB-inDY6CtKFFi4P2WhiVp9PrnR8g5hVhij1k");'
|
| 120 |
+
></div>
|
| 121 |
+
</div>
|
| 122 |
+
</div>
|
| 123 |
+
<div class="p-4">
|
| 124 |
+
<div class="flex flex-col sm:flex-row items-stretch justify-between gap-4 rounded-xl">
|
| 125 |
+
<div class="flex flex-col sm:flex-[2_2_0px] gap-4">
|
| 126 |
+
<div class="flex flex-col gap-1">
|
| 127 |
+
<p class="text-[#111418] text-base font-bold leading-tight">The Adventures of Tom Sawyer</p>
|
| 128 |
+
<p class="text-[#637488] text-sm font-normal leading-normal">Mark Twain | A mischievous boy's escapades along the Mississippi River.</p>
|
| 129 |
+
</div>
|
| 130 |
+
<button
|
| 131 |
+
class="flex min-w-[84px] max-w-[480px] cursor-pointer items-center justify-center overflow-hidden rounded-full h-8 px-4 flex-row-reverse bg-[#f0f2f4] text-[#111418] text-sm font-medium leading-normal w-fit"
|
| 132 |
+
>
|
| 133 |
+
<span class="truncate">More Details</span>
|
| 134 |
+
</button>
|
| 135 |
+
</div>
|
| 136 |
+
<div
|
| 137 |
+
class="w-full bg-center bg-no-repeat aspect-video bg-cover rounded-xl flex-1 min-h-32 sm:min-h-0"
|
| 138 |
+
style='background-image: url("https://lh3.googleusercontent.com/aida-public/AB6AXuCAiL_cW6RPNH4wyHoJFKsAzdorxksRaMXfr-QII83iGD0eJD1GZ8Jlbsq9oxbEIHkL9O0P_AtPUnrX146wgZ6bJefkas6SjVdM1uRe15ZbtlxjWfxz6k057F-6z7_UJhV8KQ5R1NR9hcxYTRhkWo9J6mCCJIY8NQmVc8YKfwKHgHEAC3UV3rPRXOK3bQfw6zdSQrulwq6jDm69jbmy2TiS5hCTkE1igPfIUtedG4KUxMM8p1IRy6OJfHbJfh4V78FoE1bf9VHijwA");'
|
| 139 |
+
></div>
|
| 140 |
+
</div>
|
| 141 |
+
</div>
|
| 142 |
+
<div class="p-4">
|
| 143 |
+
<div class="flex flex-col sm:flex-row items-stretch justify-between gap-4 rounded-xl">
|
| 144 |
+
<div class="flex flex-col sm:flex-[2_2_0px] gap-4">
|
| 145 |
+
<div class="flex flex-col gap-1">
|
| 146 |
+
<p class="text-[#111418] text-base font-bold leading-tight">Pride and Prejudice</p>
|
| 147 |
+
<p class="text-[#637488] text-sm font-normal leading-normal">Jane Austen | A classic tale of love and societal expectations in 19th-century England.</p>
|
| 148 |
+
</div>
|
| 149 |
+
<button
|
| 150 |
+
class="flex min-w-[84px] max-w-[480px] cursor-pointer items-center justify-center overflow-hidden rounded-full h-8 px-4 flex-row-reverse bg-[#f0f2f4] text-[#111418] text-sm font-medium leading-normal w-fit"
|
| 151 |
+
>
|
| 152 |
+
<span class="truncate">More Details</span>
|
| 153 |
+
</button>
|
| 154 |
+
</div>
|
| 155 |
+
<div
|
| 156 |
+
class="w-full bg-center bg-no-repeat aspect-video bg-cover rounded-xl flex-1 min-h-32 sm:min-h-0"
|
| 157 |
+
style='background-image: url("https://lh3.googleusercontent.com/aida-public/AB6AXuAvoPL4nOSFDk3mNVaL1VmxSQw32s11eLzXYFT5EnmGXgl7pxTJJ9uEwowWGn54SgqFto0TNEqkwde-sanAoLRLWL_puvGrXW0xwzMX6fQrfDLo9daSrGViJT8rB9WePaw1n_Cm_XK9Uruv4c6M-7RcrpVZfGpEYCJ1wyu9ls87x8w3fA6bi7kUM_aebpza82L8qQ583ikVjOc45xjgiCH0MnJnZJQPbNpTEfQizMBw1EFjE7CI-RnUovvqXJC4-R74Q-KBUbgtJCA");'
|
| 158 |
+
></div>
|
| 159 |
+
</div>
|
| 160 |
+
</div>
|
| 161 |
+
</div>
|
| 162 |
+
</div>
|
| 163 |
+
</div>
|
| 164 |
+
</div>
|
| 165 |
+
</body>
|
| 166 |
+
</html>
|
books_cleaned.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
books_with_categories.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
books_with_sentiment.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
books_with_urls.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
cover-not-found.jpg
ADDED
|
data-exploration.ipynb
ADDED
|
@@ -0,0 +1,1111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 2,
|
| 6 |
+
"metadata": {
|
| 7 |
+
"collapsed": true
|
| 8 |
+
},
|
| 9 |
+
"outputs": [
|
| 10 |
+
{
|
| 11 |
+
"name": "stderr",
|
| 12 |
+
"output_type": "stream",
|
| 13 |
+
"text": [
|
| 14 |
+
"c:\\Users\\NonsoDev\\Documents\\Allcodes\\Projects_DL_for resume\\Recommender systems\\book reccomender - llm\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 15 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 16 |
+
]
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"name": "stdout",
|
| 20 |
+
"output_type": "stream",
|
| 21 |
+
"text": [
|
| 22 |
+
"Path to dataset files: C:\\Users\\NonsoDev\\.cache\\kagglehub\\datasets\\dylanjcastillo\\7k-books-with-metadata\\versions\\3\n"
|
| 23 |
+
]
|
| 24 |
+
}
|
| 25 |
+
],
|
| 26 |
+
"source": [
|
| 27 |
+
"import kagglehub\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"# Download latest version\n",
|
| 30 |
+
"path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n",
|
| 31 |
+
"\n",
|
| 32 |
+
"print(\"Path to dataset files:\", path)"
|
| 33 |
+
]
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"cell_type": "code",
|
| 37 |
+
"execution_count": 3,
|
| 38 |
+
"metadata": {},
|
| 39 |
+
"outputs": [
|
| 40 |
+
{
|
| 41 |
+
"name": "stdout",
|
| 42 |
+
"output_type": "stream",
|
| 43 |
+
"text": [
|
| 44 |
+
"Number of books: 6810\n"
|
| 45 |
+
]
|
| 46 |
+
}
|
| 47 |
+
],
|
| 48 |
+
"source": [
|
| 49 |
+
"import pandas as pd\n",
|
| 50 |
+
"data = pd.read_csv(path + \"/books.csv\")\n",
|
| 51 |
+
"print(\"Number of books:\", len(data))"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"cell_type": "code",
|
| 56 |
+
"execution_count": 4,
|
| 57 |
+
"metadata": {},
|
| 58 |
+
"outputs": [
|
| 59 |
+
{
|
| 60 |
+
"data": {
|
| 61 |
+
"text/html": [
|
| 62 |
+
"<div>\n",
|
| 63 |
+
"<style scoped>\n",
|
| 64 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 65 |
+
" vertical-align: middle;\n",
|
| 66 |
+
" }\n",
|
| 67 |
+
"\n",
|
| 68 |
+
" .dataframe tbody tr th {\n",
|
| 69 |
+
" vertical-align: top;\n",
|
| 70 |
+
" }\n",
|
| 71 |
+
"\n",
|
| 72 |
+
" .dataframe thead th {\n",
|
| 73 |
+
" text-align: right;\n",
|
| 74 |
+
" }\n",
|
| 75 |
+
"</style>\n",
|
| 76 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 77 |
+
" <thead>\n",
|
| 78 |
+
" <tr style=\"text-align: right;\">\n",
|
| 79 |
+
" <th></th>\n",
|
| 80 |
+
" <th>isbn13</th>\n",
|
| 81 |
+
" <th>isbn10</th>\n",
|
| 82 |
+
" <th>title</th>\n",
|
| 83 |
+
" <th>subtitle</th>\n",
|
| 84 |
+
" <th>authors</th>\n",
|
| 85 |
+
" <th>categories</th>\n",
|
| 86 |
+
" <th>thumbnail</th>\n",
|
| 87 |
+
" <th>description</th>\n",
|
| 88 |
+
" <th>published_year</th>\n",
|
| 89 |
+
" <th>average_rating</th>\n",
|
| 90 |
+
" <th>num_pages</th>\n",
|
| 91 |
+
" <th>ratings_count</th>\n",
|
| 92 |
+
" </tr>\n",
|
| 93 |
+
" </thead>\n",
|
| 94 |
+
" <tbody>\n",
|
| 95 |
+
" <tr>\n",
|
| 96 |
+
" <th>0</th>\n",
|
| 97 |
+
" <td>9780002005883</td>\n",
|
| 98 |
+
" <td>0002005883</td>\n",
|
| 99 |
+
" <td>Gilead</td>\n",
|
| 100 |
+
" <td>NaN</td>\n",
|
| 101 |
+
" <td>Marilynne Robinson</td>\n",
|
| 102 |
+
" <td>Fiction</td>\n",
|
| 103 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 104 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 105 |
+
" <td>2004.0</td>\n",
|
| 106 |
+
" <td>3.85</td>\n",
|
| 107 |
+
" <td>247.0</td>\n",
|
| 108 |
+
" <td>361.0</td>\n",
|
| 109 |
+
" </tr>\n",
|
| 110 |
+
" <tr>\n",
|
| 111 |
+
" <th>1</th>\n",
|
| 112 |
+
" <td>9780002261982</td>\n",
|
| 113 |
+
" <td>0002261987</td>\n",
|
| 114 |
+
" <td>Spider's Web</td>\n",
|
| 115 |
+
" <td>A Novel</td>\n",
|
| 116 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 117 |
+
" <td>Detective and mystery stories</td>\n",
|
| 118 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 119 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 120 |
+
" <td>2000.0</td>\n",
|
| 121 |
+
" <td>3.83</td>\n",
|
| 122 |
+
" <td>241.0</td>\n",
|
| 123 |
+
" <td>5164.0</td>\n",
|
| 124 |
+
" </tr>\n",
|
| 125 |
+
" <tr>\n",
|
| 126 |
+
" <th>2</th>\n",
|
| 127 |
+
" <td>9780006163831</td>\n",
|
| 128 |
+
" <td>0006163831</td>\n",
|
| 129 |
+
" <td>The One Tree</td>\n",
|
| 130 |
+
" <td>NaN</td>\n",
|
| 131 |
+
" <td>Stephen R. Donaldson</td>\n",
|
| 132 |
+
" <td>American fiction</td>\n",
|
| 133 |
+
" <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
|
| 134 |
+
" <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
|
| 135 |
+
" <td>1982.0</td>\n",
|
| 136 |
+
" <td>3.97</td>\n",
|
| 137 |
+
" <td>479.0</td>\n",
|
| 138 |
+
" <td>172.0</td>\n",
|
| 139 |
+
" </tr>\n",
|
| 140 |
+
" <tr>\n",
|
| 141 |
+
" <th>3</th>\n",
|
| 142 |
+
" <td>9780006178736</td>\n",
|
| 143 |
+
" <td>0006178731</td>\n",
|
| 144 |
+
" <td>Rage of angels</td>\n",
|
| 145 |
+
" <td>NaN</td>\n",
|
| 146 |
+
" <td>Sidney Sheldon</td>\n",
|
| 147 |
+
" <td>Fiction</td>\n",
|
| 148 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 149 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 150 |
+
" <td>1993.0</td>\n",
|
| 151 |
+
" <td>3.93</td>\n",
|
| 152 |
+
" <td>512.0</td>\n",
|
| 153 |
+
" <td>29532.0</td>\n",
|
| 154 |
+
" </tr>\n",
|
| 155 |
+
" <tr>\n",
|
| 156 |
+
" <th>4</th>\n",
|
| 157 |
+
" <td>9780006280897</td>\n",
|
| 158 |
+
" <td>0006280897</td>\n",
|
| 159 |
+
" <td>The Four Loves</td>\n",
|
| 160 |
+
" <td>NaN</td>\n",
|
| 161 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 162 |
+
" <td>Christian life</td>\n",
|
| 163 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 164 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 165 |
+
" <td>2002.0</td>\n",
|
| 166 |
+
" <td>4.15</td>\n",
|
| 167 |
+
" <td>170.0</td>\n",
|
| 168 |
+
" <td>33684.0</td>\n",
|
| 169 |
+
" </tr>\n",
|
| 170 |
+
" </tbody>\n",
|
| 171 |
+
"</table>\n",
|
| 172 |
+
"</div>"
|
| 173 |
+
],
|
| 174 |
+
"text/plain": [
|
| 175 |
+
" isbn13 isbn10 title subtitle \\\n",
|
| 176 |
+
"0 9780002005883 0002005883 Gilead NaN \n",
|
| 177 |
+
"1 9780002261982 0002261987 Spider's Web A Novel \n",
|
| 178 |
+
"2 9780006163831 0006163831 The One Tree NaN \n",
|
| 179 |
+
"3 9780006178736 0006178731 Rage of angels NaN \n",
|
| 180 |
+
"4 9780006280897 0006280897 The Four Loves NaN \n",
|
| 181 |
+
"\n",
|
| 182 |
+
" authors categories \\\n",
|
| 183 |
+
"0 Marilynne Robinson Fiction \n",
|
| 184 |
+
"1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
|
| 185 |
+
"2 Stephen R. Donaldson American fiction \n",
|
| 186 |
+
"3 Sidney Sheldon Fiction \n",
|
| 187 |
+
"4 Clive Staples Lewis Christian life \n",
|
| 188 |
+
"\n",
|
| 189 |
+
" thumbnail \\\n",
|
| 190 |
+
"0 http://books.google.com/books/content?id=KQZCP... \n",
|
| 191 |
+
"1 http://books.google.com/books/content?id=gA5GP... \n",
|
| 192 |
+
"2 http://books.google.com/books/content?id=OmQaw... \n",
|
| 193 |
+
"3 http://books.google.com/books/content?id=FKo2T... \n",
|
| 194 |
+
"4 http://books.google.com/books/content?id=XhQ5X... \n",
|
| 195 |
+
"\n",
|
| 196 |
+
" description published_year \\\n",
|
| 197 |
+
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
|
| 198 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
|
| 199 |
+
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
|
| 200 |
+
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
|
| 201 |
+
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
|
| 202 |
+
"\n",
|
| 203 |
+
" average_rating num_pages ratings_count \n",
|
| 204 |
+
"0 3.85 247.0 361.0 \n",
|
| 205 |
+
"1 3.83 241.0 5164.0 \n",
|
| 206 |
+
"2 3.97 479.0 172.0 \n",
|
| 207 |
+
"3 3.93 512.0 29532.0 \n",
|
| 208 |
+
"4 4.15 170.0 33684.0 "
|
| 209 |
+
]
|
| 210 |
+
},
|
| 211 |
+
"execution_count": 4,
|
| 212 |
+
"metadata": {},
|
| 213 |
+
"output_type": "execute_result"
|
| 214 |
+
}
|
| 215 |
+
],
|
| 216 |
+
"source": [
|
| 217 |
+
"data.head()"
|
| 218 |
+
]
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"cell_type": "code",
|
| 222 |
+
"execution_count": 5,
|
| 223 |
+
"metadata": {},
|
| 224 |
+
"outputs": [
|
| 225 |
+
{
|
| 226 |
+
"name": "stdout",
|
| 227 |
+
"output_type": "stream",
|
| 228 |
+
"text": [
|
| 229 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
| 230 |
+
"RangeIndex: 6810 entries, 0 to 6809\n",
|
| 231 |
+
"Data columns (total 12 columns):\n",
|
| 232 |
+
" # Column Non-Null Count Dtype \n",
|
| 233 |
+
"--- ------ -------------- ----- \n",
|
| 234 |
+
" 0 isbn13 6810 non-null int64 \n",
|
| 235 |
+
" 1 isbn10 6810 non-null object \n",
|
| 236 |
+
" 2 title 6810 non-null object \n",
|
| 237 |
+
" 3 subtitle 2381 non-null object \n",
|
| 238 |
+
" 4 authors 6738 non-null object \n",
|
| 239 |
+
" 5 categories 6711 non-null object \n",
|
| 240 |
+
" 6 thumbnail 6481 non-null object \n",
|
| 241 |
+
" 7 description 6548 non-null object \n",
|
| 242 |
+
" 8 published_year 6804 non-null float64\n",
|
| 243 |
+
" 9 average_rating 6767 non-null float64\n",
|
| 244 |
+
" 10 num_pages 6767 non-null float64\n",
|
| 245 |
+
" 11 ratings_count 6767 non-null float64\n",
|
| 246 |
+
"dtypes: float64(4), int64(1), object(7)\n",
|
| 247 |
+
"memory usage: 638.6+ KB\n"
|
| 248 |
+
]
|
| 249 |
+
}
|
| 250 |
+
],
|
| 251 |
+
"source": [
|
| 252 |
+
"data.info()"
|
| 253 |
+
]
|
| 254 |
+
},
|
| 255 |
+
{
|
| 256 |
+
"cell_type": "code",
|
| 257 |
+
"execution_count": 6,
|
| 258 |
+
"metadata": {},
|
| 259 |
+
"outputs": [
|
| 260 |
+
{
|
| 261 |
+
"data": {
|
| 262 |
+
"text/plain": [
|
| 263 |
+
"isbn13 0.000000\n",
|
| 264 |
+
"isbn10 0.000000\n",
|
| 265 |
+
"title 0.000000\n",
|
| 266 |
+
"subtitle 65.036711\n",
|
| 267 |
+
"authors 1.057269\n",
|
| 268 |
+
"categories 1.453744\n",
|
| 269 |
+
"thumbnail 4.831131\n",
|
| 270 |
+
"description 3.847283\n",
|
| 271 |
+
"published_year 0.088106\n",
|
| 272 |
+
"average_rating 0.631424\n",
|
| 273 |
+
"num_pages 0.631424\n",
|
| 274 |
+
"ratings_count 0.631424\n",
|
| 275 |
+
"dtype: float64"
|
| 276 |
+
]
|
| 277 |
+
},
|
| 278 |
+
"execution_count": 6,
|
| 279 |
+
"metadata": {},
|
| 280 |
+
"output_type": "execute_result"
|
| 281 |
+
}
|
| 282 |
+
],
|
| 283 |
+
"source": [
|
| 284 |
+
"data.isnull().sum() / len(data) * 100"
|
| 285 |
+
]
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"cell_type": "code",
|
| 289 |
+
"execution_count": 7,
|
| 290 |
+
"metadata": {},
|
| 291 |
+
"outputs": [],
|
| 292 |
+
"source": [
|
| 293 |
+
"rows_to_remove = data[(data[\"description\"].isnull()) | (data[\"authors\"].isnull()) | (data[\"published_year\"].isnull() )| (data[\"average_rating\"].isnull()) |( data[\"num_pages\"].isnull()) | ( data[\"ratings_count\"].isnull())]"
|
| 294 |
+
]
|
| 295 |
+
},
|
| 296 |
+
{
|
| 297 |
+
"cell_type": "code",
|
| 298 |
+
"execution_count": 8,
|
| 299 |
+
"metadata": {},
|
| 300 |
+
"outputs": [
|
| 301 |
+
{
|
| 302 |
+
"data": {
|
| 303 |
+
"text/plain": [
|
| 304 |
+
"5.3744493392070485"
|
| 305 |
+
]
|
| 306 |
+
},
|
| 307 |
+
"execution_count": 8,
|
| 308 |
+
"metadata": {},
|
| 309 |
+
"output_type": "execute_result"
|
| 310 |
+
}
|
| 311 |
+
],
|
| 312 |
+
"source": [
|
| 313 |
+
"len(rows_to_remove) / len(data) * 100  # ~5.4% of the data"
|
| 314 |
+
]
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"cell_type": "code",
|
| 318 |
+
"execution_count": 9,
|
| 319 |
+
"metadata": {},
|
| 320 |
+
"outputs": [],
|
| 321 |
+
"source": [
|
| 322 |
+
"data = data.drop(index=rows_to_remove.index)"
|
| 323 |
+
]
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"cell_type": "code",
|
| 327 |
+
"execution_count": 10,
|
| 328 |
+
"metadata": {},
|
| 329 |
+
"outputs": [
|
| 330 |
+
{
|
| 331 |
+
"data": {
|
| 332 |
+
"text/plain": [
|
| 333 |
+
"subtitle 64.959652\n",
|
| 334 |
+
"thumbnail 3.227809\n",
|
| 335 |
+
"categories 0.512104\n",
|
| 336 |
+
"isbn13 0.000000\n",
|
| 337 |
+
"title 0.000000\n",
|
| 338 |
+
"isbn10 0.000000\n",
|
| 339 |
+
"authors 0.000000\n",
|
| 340 |
+
"description 0.000000\n",
|
| 341 |
+
"published_year 0.000000\n",
|
| 342 |
+
"average_rating 0.000000\n",
|
| 343 |
+
"num_pages 0.000000\n",
|
| 344 |
+
"ratings_count 0.000000\n",
|
| 345 |
+
"dtype: float64"
|
| 346 |
+
]
|
| 347 |
+
},
|
| 348 |
+
"execution_count": 10,
|
| 349 |
+
"metadata": {},
|
| 350 |
+
"output_type": "execute_result"
|
| 351 |
+
}
|
| 352 |
+
],
|
| 353 |
+
"source": [
|
| 354 |
+
"(data.isnull().sum() / len(data) * 100).sort_values(ascending=False)"
|
| 355 |
+
]
|
| 356 |
+
},
|
| 357 |
+
{
|
| 358 |
+
"cell_type": "code",
|
| 359 |
+
"execution_count": 11,
|
| 360 |
+
"metadata": {},
|
| 361 |
+
"outputs": [
|
| 362 |
+
{
|
| 363 |
+
"data": {
|
| 364 |
+
"text/plain": [
|
| 365 |
+
"categories\n",
|
| 366 |
+
"Fiction 2510\n",
|
| 367 |
+
"Juvenile Fiction 521\n",
|
| 368 |
+
"Biography & Autobiography 390\n",
|
| 369 |
+
"History 256\n",
|
| 370 |
+
"Literary Criticism 163\n",
|
| 371 |
+
" ... \n",
|
| 372 |
+
"Humorous stories 1\n",
|
| 373 |
+
"Ballets 1\n",
|
| 374 |
+
"Aged women 1\n",
|
| 375 |
+
"Catholic women 1\n",
|
| 376 |
+
"Christian fiction 1\n",
|
| 377 |
+
"Name: count, Length: 530, dtype: int64"
|
| 378 |
+
]
|
| 379 |
+
},
|
| 380 |
+
"execution_count": 11,
|
| 381 |
+
"metadata": {},
|
| 382 |
+
"output_type": "execute_result"
|
| 383 |
+
}
|
| 384 |
+
],
|
| 385 |
+
"source": [
|
| 386 |
+
"data[\"categories\"].value_counts() #530 categories is too much, there is something wrong with this column"
|
| 387 |
+
]
|
| 388 |
+
},
|
| 389 |
+
{
|
| 390 |
+
"cell_type": "code",
|
| 391 |
+
"execution_count": 12,
|
| 392 |
+
"metadata": {},
|
| 393 |
+
"outputs": [
|
| 394 |
+
{
|
| 395 |
+
"data": {
|
| 396 |
+
"text/plain": [
|
| 397 |
+
"5024 Violence erupts in the poor town of Milagro wh...\n",
|
| 398 |
+
"3235 FBI Special Agent Dillon Savich teams up with ...\n",
|
| 399 |
+
"5235 Seventeen-year-old Manhattan society girl Grad...\n",
|
| 400 |
+
"4516 This is the story of the Tuck family, who are ...\n",
|
| 401 |
+
"3204 Prejudice, the intricacies of Mediterranean po...\n",
|
| 402 |
+
"Name: description, dtype: object"
|
| 403 |
+
]
|
| 404 |
+
},
|
| 405 |
+
"execution_count": 12,
|
| 406 |
+
"metadata": {},
|
| 407 |
+
"output_type": "execute_result"
|
| 408 |
+
}
|
| 409 |
+
],
|
| 410 |
+
"source": [
|
| 411 |
+
"data[\"description\"].sample(5) #some desscriptions are too short to be useful and some are too long\n",
|
| 412 |
+
"# i think characters greater than 25 are better suited for understanding the context"
|
| 413 |
+
]
|
| 414 |
+
},
|
| 415 |
+
{
|
| 416 |
+
"cell_type": "code",
|
| 417 |
+
"execution_count": 13,
|
| 418 |
+
"metadata": {},
|
| 419 |
+
"outputs": [],
|
| 420 |
+
"source": [
|
| 421 |
+
"data[\"description_chars\"] = data[\"description\"].apply(lambda x: len(x) if isinstance(x, str) else 0)\n",
|
| 422 |
+
"data = data[data[\"description_chars\"] > 25]"
|
| 423 |
+
]
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"cell_type": "code",
|
| 427 |
+
"execution_count": 14,
|
| 428 |
+
"metadata": {},
|
| 429 |
+
"outputs": [
|
| 430 |
+
{
|
| 431 |
+
"data": {
|
| 432 |
+
"text/plain": [
|
| 433 |
+
"6397"
|
| 434 |
+
]
|
| 435 |
+
},
|
| 436 |
+
"execution_count": 14,
|
| 437 |
+
"metadata": {},
|
| 438 |
+
"output_type": "execute_result"
|
| 439 |
+
}
|
| 440 |
+
],
|
| 441 |
+
"source": [
|
| 442 |
+
"len(data)"
|
| 443 |
+
]
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"cell_type": "code",
|
| 447 |
+
"execution_count": 15,
|
| 448 |
+
"metadata": {},
|
| 449 |
+
"outputs": [],
|
| 450 |
+
"source": [
|
| 451 |
+
"# some of the subtitle are missing, so we can have a cojoined title and subtitle, to replace both the title and subtitle\n",
|
| 452 |
+
"data[\"title_and_subtitle\"] = data[\"title\"].apply(lambda x: x if isinstance(x, str) else \"\") + \" \" + data[\"subtitle\"].apply(lambda x: x if isinstance(x, str) else \"\")"
|
| 453 |
+
]
|
| 454 |
+
},
|
| 455 |
+
{
|
| 456 |
+
"cell_type": "code",
|
| 457 |
+
"execution_count": 16,
|
| 458 |
+
"metadata": {},
|
| 459 |
+
"outputs": [
|
| 460 |
+
{
|
| 461 |
+
"data": {
|
| 462 |
+
"text/html": [
|
| 463 |
+
"<div>\n",
|
| 464 |
+
"<style scoped>\n",
|
| 465 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 466 |
+
" vertical-align: middle;\n",
|
| 467 |
+
" }\n",
|
| 468 |
+
"\n",
|
| 469 |
+
" .dataframe tbody tr th {\n",
|
| 470 |
+
" vertical-align: top;\n",
|
| 471 |
+
" }\n",
|
| 472 |
+
"\n",
|
| 473 |
+
" .dataframe thead th {\n",
|
| 474 |
+
" text-align: right;\n",
|
| 475 |
+
" }\n",
|
| 476 |
+
"</style>\n",
|
| 477 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 478 |
+
" <thead>\n",
|
| 479 |
+
" <tr style=\"text-align: right;\">\n",
|
| 480 |
+
" <th></th>\n",
|
| 481 |
+
" <th>isbn13</th>\n",
|
| 482 |
+
" <th>isbn10</th>\n",
|
| 483 |
+
" <th>title</th>\n",
|
| 484 |
+
" <th>subtitle</th>\n",
|
| 485 |
+
" <th>authors</th>\n",
|
| 486 |
+
" <th>categories</th>\n",
|
| 487 |
+
" <th>thumbnail</th>\n",
|
| 488 |
+
" <th>description</th>\n",
|
| 489 |
+
" <th>published_year</th>\n",
|
| 490 |
+
" <th>average_rating</th>\n",
|
| 491 |
+
" <th>num_pages</th>\n",
|
| 492 |
+
" <th>ratings_count</th>\n",
|
| 493 |
+
" <th>description_chars</th>\n",
|
| 494 |
+
" <th>title_and_subtitle</th>\n",
|
| 495 |
+
" </tr>\n",
|
| 496 |
+
" </thead>\n",
|
| 497 |
+
" <tbody>\n",
|
| 498 |
+
" <tr>\n",
|
| 499 |
+
" <th>0</th>\n",
|
| 500 |
+
" <td>9780002005883</td>\n",
|
| 501 |
+
" <td>0002005883</td>\n",
|
| 502 |
+
" <td>Gilead</td>\n",
|
| 503 |
+
" <td>NaN</td>\n",
|
| 504 |
+
" <td>Marilynne Robinson</td>\n",
|
| 505 |
+
" <td>Fiction</td>\n",
|
| 506 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 507 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 508 |
+
" <td>2004.0</td>\n",
|
| 509 |
+
" <td>3.85</td>\n",
|
| 510 |
+
" <td>247.0</td>\n",
|
| 511 |
+
" <td>361.0</td>\n",
|
| 512 |
+
" <td>1154</td>\n",
|
| 513 |
+
" <td>Gilead</td>\n",
|
| 514 |
+
" </tr>\n",
|
| 515 |
+
" <tr>\n",
|
| 516 |
+
" <th>1</th>\n",
|
| 517 |
+
" <td>9780002261982</td>\n",
|
| 518 |
+
" <td>0002261987</td>\n",
|
| 519 |
+
" <td>Spider's Web</td>\n",
|
| 520 |
+
" <td>A Novel</td>\n",
|
| 521 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 522 |
+
" <td>Detective and mystery stories</td>\n",
|
| 523 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 524 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 525 |
+
" <td>2000.0</td>\n",
|
| 526 |
+
" <td>3.83</td>\n",
|
| 527 |
+
" <td>241.0</td>\n",
|
| 528 |
+
" <td>5164.0</td>\n",
|
| 529 |
+
" <td>1200</td>\n",
|
| 530 |
+
" <td>Spider's Web A Novel</td>\n",
|
| 531 |
+
" </tr>\n",
|
| 532 |
+
" <tr>\n",
|
| 533 |
+
" <th>2</th>\n",
|
| 534 |
+
" <td>9780006163831</td>\n",
|
| 535 |
+
" <td>0006163831</td>\n",
|
| 536 |
+
" <td>The One Tree</td>\n",
|
| 537 |
+
" <td>NaN</td>\n",
|
| 538 |
+
" <td>Stephen R. Donaldson</td>\n",
|
| 539 |
+
" <td>American fiction</td>\n",
|
| 540 |
+
" <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
|
| 541 |
+
" <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
|
| 542 |
+
" <td>1982.0</td>\n",
|
| 543 |
+
" <td>3.97</td>\n",
|
| 544 |
+
" <td>479.0</td>\n",
|
| 545 |
+
" <td>172.0</td>\n",
|
| 546 |
+
" <td>109</td>\n",
|
| 547 |
+
" <td>The One Tree</td>\n",
|
| 548 |
+
" </tr>\n",
|
| 549 |
+
" <tr>\n",
|
| 550 |
+
" <th>3</th>\n",
|
| 551 |
+
" <td>9780006178736</td>\n",
|
| 552 |
+
" <td>0006178731</td>\n",
|
| 553 |
+
" <td>Rage of angels</td>\n",
|
| 554 |
+
" <td>NaN</td>\n",
|
| 555 |
+
" <td>Sidney Sheldon</td>\n",
|
| 556 |
+
" <td>Fiction</td>\n",
|
| 557 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 558 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 559 |
+
" <td>1993.0</td>\n",
|
| 560 |
+
" <td>3.93</td>\n",
|
| 561 |
+
" <td>512.0</td>\n",
|
| 562 |
+
" <td>29532.0</td>\n",
|
| 563 |
+
" <td>359</td>\n",
|
| 564 |
+
" <td>Rage of angels</td>\n",
|
| 565 |
+
" </tr>\n",
|
| 566 |
+
" <tr>\n",
|
| 567 |
+
" <th>4</th>\n",
|
| 568 |
+
" <td>9780006280897</td>\n",
|
| 569 |
+
" <td>0006280897</td>\n",
|
| 570 |
+
" <td>The Four Loves</td>\n",
|
| 571 |
+
" <td>NaN</td>\n",
|
| 572 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 573 |
+
" <td>Christian life</td>\n",
|
| 574 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 575 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 576 |
+
" <td>2002.0</td>\n",
|
| 577 |
+
" <td>4.15</td>\n",
|
| 578 |
+
" <td>170.0</td>\n",
|
| 579 |
+
" <td>33684.0</td>\n",
|
| 580 |
+
" <td>295</td>\n",
|
| 581 |
+
" <td>The Four Loves</td>\n",
|
| 582 |
+
" </tr>\n",
|
| 583 |
+
" </tbody>\n",
|
| 584 |
+
"</table>\n",
|
| 585 |
+
"</div>"
|
| 586 |
+
],
|
| 587 |
+
"text/plain": [
|
| 588 |
+
" isbn13 isbn10 title subtitle \\\n",
|
| 589 |
+
"0 9780002005883 0002005883 Gilead NaN \n",
|
| 590 |
+
"1 9780002261982 0002261987 Spider's Web A Novel \n",
|
| 591 |
+
"2 9780006163831 0006163831 The One Tree NaN \n",
|
| 592 |
+
"3 9780006178736 0006178731 Rage of angels NaN \n",
|
| 593 |
+
"4 9780006280897 0006280897 The Four Loves NaN \n",
|
| 594 |
+
"\n",
|
| 595 |
+
" authors categories \\\n",
|
| 596 |
+
"0 Marilynne Robinson Fiction \n",
|
| 597 |
+
"1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
|
| 598 |
+
"2 Stephen R. Donaldson American fiction \n",
|
| 599 |
+
"3 Sidney Sheldon Fiction \n",
|
| 600 |
+
"4 Clive Staples Lewis Christian life \n",
|
| 601 |
+
"\n",
|
| 602 |
+
" thumbnail \\\n",
|
| 603 |
+
"0 http://books.google.com/books/content?id=KQZCP... \n",
|
| 604 |
+
"1 http://books.google.com/books/content?id=gA5GP... \n",
|
| 605 |
+
"2 http://books.google.com/books/content?id=OmQaw... \n",
|
| 606 |
+
"3 http://books.google.com/books/content?id=FKo2T... \n",
|
| 607 |
+
"4 http://books.google.com/books/content?id=XhQ5X... \n",
|
| 608 |
+
"\n",
|
| 609 |
+
" description published_year \\\n",
|
| 610 |
+
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
|
| 611 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
|
| 612 |
+
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
|
| 613 |
+
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
|
| 614 |
+
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
|
| 615 |
+
"\n",
|
| 616 |
+
" average_rating num_pages ratings_count description_chars \\\n",
|
| 617 |
+
"0 3.85 247.0 361.0 1154 \n",
|
| 618 |
+
"1 3.83 241.0 5164.0 1200 \n",
|
| 619 |
+
"2 3.97 479.0 172.0 109 \n",
|
| 620 |
+
"3 3.93 512.0 29532.0 359 \n",
|
| 621 |
+
"4 4.15 170.0 33684.0 295 \n",
|
| 622 |
+
"\n",
|
| 623 |
+
" title_and_subtitle \n",
|
| 624 |
+
"0 Gilead \n",
|
| 625 |
+
"1 Spider's Web A Novel \n",
|
| 626 |
+
"2 The One Tree \n",
|
| 627 |
+
"3 Rage of angels \n",
|
| 628 |
+
"4 The Four Loves "
|
| 629 |
+
]
|
| 630 |
+
},
|
| 631 |
+
"execution_count": 16,
|
| 632 |
+
"metadata": {},
|
| 633 |
+
"output_type": "execute_result"
|
| 634 |
+
}
|
| 635 |
+
],
|
| 636 |
+
"source": [
|
| 637 |
+
"data.head()"
|
| 638 |
+
]
|
| 639 |
+
},
|
| 640 |
+
{
|
| 641 |
+
"cell_type": "code",
|
| 642 |
+
"execution_count": 17,
|
| 643 |
+
"metadata": {},
|
| 644 |
+
"outputs": [],
|
| 645 |
+
"source": [
|
| 646 |
+
"data[\"tagged_description\"] = data[\"isbn13\"].apply(str) + \" \" + data[\"description\"].apply(str)"
|
| 647 |
+
]
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"cell_type": "code",
|
| 651 |
+
"execution_count": 18,
|
| 652 |
+
"metadata": {},
|
| 653 |
+
"outputs": [
|
| 654 |
+
{
|
| 655 |
+
"data": {
|
| 656 |
+
"text/html": [
|
| 657 |
+
"<div>\n",
|
| 658 |
+
"<style scoped>\n",
|
| 659 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 660 |
+
" vertical-align: middle;\n",
|
| 661 |
+
" }\n",
|
| 662 |
+
"\n",
|
| 663 |
+
" .dataframe tbody tr th {\n",
|
| 664 |
+
" vertical-align: top;\n",
|
| 665 |
+
" }\n",
|
| 666 |
+
"\n",
|
| 667 |
+
" .dataframe thead th {\n",
|
| 668 |
+
" text-align: right;\n",
|
| 669 |
+
" }\n",
|
| 670 |
+
"</style>\n",
|
| 671 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 672 |
+
" <thead>\n",
|
| 673 |
+
" <tr style=\"text-align: right;\">\n",
|
| 674 |
+
" <th></th>\n",
|
| 675 |
+
" <th>isbn13</th>\n",
|
| 676 |
+
" <th>isbn10</th>\n",
|
| 677 |
+
" <th>title</th>\n",
|
| 678 |
+
" <th>subtitle</th>\n",
|
| 679 |
+
" <th>authors</th>\n",
|
| 680 |
+
" <th>categories</th>\n",
|
| 681 |
+
" <th>thumbnail</th>\n",
|
| 682 |
+
" <th>description</th>\n",
|
| 683 |
+
" <th>published_year</th>\n",
|
| 684 |
+
" <th>average_rating</th>\n",
|
| 685 |
+
" <th>num_pages</th>\n",
|
| 686 |
+
" <th>ratings_count</th>\n",
|
| 687 |
+
" <th>description_chars</th>\n",
|
| 688 |
+
" <th>title_and_subtitle</th>\n",
|
| 689 |
+
" <th>tagged_description</th>\n",
|
| 690 |
+
" </tr>\n",
|
| 691 |
+
" </thead>\n",
|
| 692 |
+
" <tbody>\n",
|
| 693 |
+
" <tr>\n",
|
| 694 |
+
" <th>0</th>\n",
|
| 695 |
+
" <td>9780002005883</td>\n",
|
| 696 |
+
" <td>0002005883</td>\n",
|
| 697 |
+
" <td>Gilead</td>\n",
|
| 698 |
+
" <td>NaN</td>\n",
|
| 699 |
+
" <td>Marilynne Robinson</td>\n",
|
| 700 |
+
" <td>Fiction</td>\n",
|
| 701 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 702 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 703 |
+
" <td>2004.0</td>\n",
|
| 704 |
+
" <td>3.85</td>\n",
|
| 705 |
+
" <td>247.0</td>\n",
|
| 706 |
+
" <td>361.0</td>\n",
|
| 707 |
+
" <td>1154</td>\n",
|
| 708 |
+
" <td>Gilead</td>\n",
|
| 709 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 710 |
+
" </tr>\n",
|
| 711 |
+
" <tr>\n",
|
| 712 |
+
" <th>1</th>\n",
|
| 713 |
+
" <td>9780002261982</td>\n",
|
| 714 |
+
" <td>0002261987</td>\n",
|
| 715 |
+
" <td>Spider's Web</td>\n",
|
| 716 |
+
" <td>A Novel</td>\n",
|
| 717 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 718 |
+
" <td>Detective and mystery stories</td>\n",
|
| 719 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 720 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 721 |
+
" <td>2000.0</td>\n",
|
| 722 |
+
" <td>3.83</td>\n",
|
| 723 |
+
" <td>241.0</td>\n",
|
| 724 |
+
" <td>5164.0</td>\n",
|
| 725 |
+
" <td>1200</td>\n",
|
| 726 |
+
" <td>Spider's Web A Novel</td>\n",
|
| 727 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 728 |
+
" </tr>\n",
|
| 729 |
+
" <tr>\n",
|
| 730 |
+
" <th>2</th>\n",
|
| 731 |
+
" <td>9780006163831</td>\n",
|
| 732 |
+
" <td>0006163831</td>\n",
|
| 733 |
+
" <td>The One Tree</td>\n",
|
| 734 |
+
" <td>NaN</td>\n",
|
| 735 |
+
" <td>Stephen R. Donaldson</td>\n",
|
| 736 |
+
" <td>American fiction</td>\n",
|
| 737 |
+
" <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
|
| 738 |
+
" <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
|
| 739 |
+
" <td>1982.0</td>\n",
|
| 740 |
+
" <td>3.97</td>\n",
|
| 741 |
+
" <td>479.0</td>\n",
|
| 742 |
+
" <td>172.0</td>\n",
|
| 743 |
+
" <td>109</td>\n",
|
| 744 |
+
" <td>The One Tree</td>\n",
|
| 745 |
+
" <td>9780006163831 Volume Two of Stephen Donaldson'...</td>\n",
|
| 746 |
+
" </tr>\n",
|
| 747 |
+
" <tr>\n",
|
| 748 |
+
" <th>3</th>\n",
|
| 749 |
+
" <td>9780006178736</td>\n",
|
| 750 |
+
" <td>0006178731</td>\n",
|
| 751 |
+
" <td>Rage of angels</td>\n",
|
| 752 |
+
" <td>NaN</td>\n",
|
| 753 |
+
" <td>Sidney Sheldon</td>\n",
|
| 754 |
+
" <td>Fiction</td>\n",
|
| 755 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 756 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 757 |
+
" <td>1993.0</td>\n",
|
| 758 |
+
" <td>3.93</td>\n",
|
| 759 |
+
" <td>512.0</td>\n",
|
| 760 |
+
" <td>29532.0</td>\n",
|
| 761 |
+
" <td>359</td>\n",
|
| 762 |
+
" <td>Rage of angels</td>\n",
|
| 763 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 764 |
+
" </tr>\n",
|
| 765 |
+
" <tr>\n",
|
| 766 |
+
" <th>4</th>\n",
|
| 767 |
+
" <td>9780006280897</td>\n",
|
| 768 |
+
" <td>0006280897</td>\n",
|
| 769 |
+
" <td>The Four Loves</td>\n",
|
| 770 |
+
" <td>NaN</td>\n",
|
| 771 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 772 |
+
" <td>Christian life</td>\n",
|
| 773 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 774 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 775 |
+
" <td>2002.0</td>\n",
|
| 776 |
+
" <td>4.15</td>\n",
|
| 777 |
+
" <td>170.0</td>\n",
|
| 778 |
+
" <td>33684.0</td>\n",
|
| 779 |
+
" <td>295</td>\n",
|
| 780 |
+
" <td>The Four Loves</td>\n",
|
| 781 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 782 |
+
" </tr>\n",
|
| 783 |
+
" </tbody>\n",
|
| 784 |
+
"</table>\n",
|
| 785 |
+
"</div>"
|
| 786 |
+
],
|
| 787 |
+
"text/plain": [
|
| 788 |
+
" isbn13 isbn10 title subtitle \\\n",
|
| 789 |
+
"0 9780002005883 0002005883 Gilead NaN \n",
|
| 790 |
+
"1 9780002261982 0002261987 Spider's Web A Novel \n",
|
| 791 |
+
"2 9780006163831 0006163831 The One Tree NaN \n",
|
| 792 |
+
"3 9780006178736 0006178731 Rage of angels NaN \n",
|
| 793 |
+
"4 9780006280897 0006280897 The Four Loves NaN \n",
|
| 794 |
+
"\n",
|
| 795 |
+
" authors categories \\\n",
|
| 796 |
+
"0 Marilynne Robinson Fiction \n",
|
| 797 |
+
"1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
|
| 798 |
+
"2 Stephen R. Donaldson American fiction \n",
|
| 799 |
+
"3 Sidney Sheldon Fiction \n",
|
| 800 |
+
"4 Clive Staples Lewis Christian life \n",
|
| 801 |
+
"\n",
|
| 802 |
+
" thumbnail \\\n",
|
| 803 |
+
"0 http://books.google.com/books/content?id=KQZCP... \n",
|
| 804 |
+
"1 http://books.google.com/books/content?id=gA5GP... \n",
|
| 805 |
+
"2 http://books.google.com/books/content?id=OmQaw... \n",
|
| 806 |
+
"3 http://books.google.com/books/content?id=FKo2T... \n",
|
| 807 |
+
"4 http://books.google.com/books/content?id=XhQ5X... \n",
|
| 808 |
+
"\n",
|
| 809 |
+
" description published_year \\\n",
|
| 810 |
+
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
|
| 811 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
|
| 812 |
+
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
|
| 813 |
+
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
|
| 814 |
+
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
|
| 815 |
+
"\n",
|
| 816 |
+
" average_rating num_pages ratings_count description_chars \\\n",
|
| 817 |
+
"0 3.85 247.0 361.0 1154 \n",
|
| 818 |
+
"1 3.83 241.0 5164.0 1200 \n",
|
| 819 |
+
"2 3.97 479.0 172.0 109 \n",
|
| 820 |
+
"3 3.93 512.0 29532.0 359 \n",
|
| 821 |
+
"4 4.15 170.0 33684.0 295 \n",
|
| 822 |
+
"\n",
|
| 823 |
+
" title_and_subtitle tagged_description \n",
|
| 824 |
+
"0 Gilead 9780002005883 A NOVEL THAT READERS and critics... \n",
|
| 825 |
+
"1 Spider's Web A Novel 9780002261982 A new 'Christie for Christmas' -... \n",
|
| 826 |
+
"2 The One Tree 9780006163831 Volume Two of Stephen Donaldson'... \n",
|
| 827 |
+
"3 Rage of angels 9780006178736 A memorable, mesmerizing heroine... \n",
|
| 828 |
+
"4 The Four Loves 9780006280897 Lewis' work on the nature of lov... "
|
| 829 |
+
]
|
| 830 |
+
},
|
| 831 |
+
"execution_count": 18,
|
| 832 |
+
"metadata": {},
|
| 833 |
+
"output_type": "execute_result"
|
| 834 |
+
}
|
| 835 |
+
],
|
| 836 |
+
"source": [
|
| 837 |
+
"data.head()"
|
| 838 |
+
]
|
| 839 |
+
},
|
| 840 |
+
{
|
| 841 |
+
"cell_type": "code",
|
| 842 |
+
"execution_count": null,
|
| 843 |
+
"metadata": {},
|
| 844 |
+
"outputs": [],
|
| 845 |
+
"source": []
|
| 846 |
+
},
|
| 847 |
+
{
|
| 848 |
+
"cell_type": "code",
|
| 849 |
+
"execution_count": 19,
|
| 850 |
+
"metadata": {},
|
| 851 |
+
"outputs": [
|
| 852 |
+
{
|
| 853 |
+
"data": {
|
| 854 |
+
"text/html": [
|
| 855 |
+
"<div>\n",
|
| 856 |
+
"<style scoped>\n",
|
| 857 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 858 |
+
" vertical-align: middle;\n",
|
| 859 |
+
" }\n",
|
| 860 |
+
"\n",
|
| 861 |
+
" .dataframe tbody tr th {\n",
|
| 862 |
+
" vertical-align: top;\n",
|
| 863 |
+
" }\n",
|
| 864 |
+
"\n",
|
| 865 |
+
" .dataframe thead th {\n",
|
| 866 |
+
" text-align: right;\n",
|
| 867 |
+
" }\n",
|
| 868 |
+
"</style>\n",
|
| 869 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 870 |
+
" <thead>\n",
|
| 871 |
+
" <tr style=\"text-align: right;\">\n",
|
| 872 |
+
" <th></th>\n",
|
| 873 |
+
" <th>isbn13</th>\n",
|
| 874 |
+
" <th>isbn10</th>\n",
|
| 875 |
+
" <th>title</th>\n",
|
| 876 |
+
" <th>subtitle</th>\n",
|
| 877 |
+
" <th>authors</th>\n",
|
| 878 |
+
" <th>categories</th>\n",
|
| 879 |
+
" <th>thumbnail</th>\n",
|
| 880 |
+
" <th>description</th>\n",
|
| 881 |
+
" <th>published_year</th>\n",
|
| 882 |
+
" <th>average_rating</th>\n",
|
| 883 |
+
" <th>num_pages</th>\n",
|
| 884 |
+
" <th>ratings_count</th>\n",
|
| 885 |
+
" <th>description_chars</th>\n",
|
| 886 |
+
" <th>title_and_subtitle</th>\n",
|
| 887 |
+
" <th>tagged_description</th>\n",
|
| 888 |
+
" </tr>\n",
|
| 889 |
+
" </thead>\n",
|
| 890 |
+
" <tbody>\n",
|
| 891 |
+
" <tr>\n",
|
| 892 |
+
" <th>0</th>\n",
|
| 893 |
+
" <td>9780002005883</td>\n",
|
| 894 |
+
" <td>0002005883</td>\n",
|
| 895 |
+
" <td>Gilead</td>\n",
|
| 896 |
+
" <td>NaN</td>\n",
|
| 897 |
+
" <td>Marilynne Robinson</td>\n",
|
| 898 |
+
" <td>Fiction</td>\n",
|
| 899 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 900 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 901 |
+
" <td>2004.0</td>\n",
|
| 902 |
+
" <td>3.85</td>\n",
|
| 903 |
+
" <td>247.0</td>\n",
|
| 904 |
+
" <td>361.0</td>\n",
|
| 905 |
+
" <td>1154</td>\n",
|
| 906 |
+
" <td>Gilead</td>\n",
|
| 907 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 908 |
+
" </tr>\n",
|
| 909 |
+
" <tr>\n",
|
| 910 |
+
" <th>1</th>\n",
|
| 911 |
+
" <td>9780002261982</td>\n",
|
| 912 |
+
" <td>0002261987</td>\n",
|
| 913 |
+
" <td>Spider's Web</td>\n",
|
| 914 |
+
" <td>A Novel</td>\n",
|
| 915 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 916 |
+
" <td>Detective and mystery stories</td>\n",
|
| 917 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 918 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 919 |
+
" <td>2000.0</td>\n",
|
| 920 |
+
" <td>3.83</td>\n",
|
| 921 |
+
" <td>241.0</td>\n",
|
| 922 |
+
" <td>5164.0</td>\n",
|
| 923 |
+
" <td>1200</td>\n",
|
| 924 |
+
" <td>Spider's Web A Novel</td>\n",
|
| 925 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 926 |
+
" </tr>\n",
|
| 927 |
+
" <tr>\n",
|
| 928 |
+
" <th>2</th>\n",
|
| 929 |
+
" <td>9780006163831</td>\n",
|
| 930 |
+
" <td>0006163831</td>\n",
|
| 931 |
+
" <td>The One Tree</td>\n",
|
| 932 |
+
" <td>NaN</td>\n",
|
| 933 |
+
" <td>Stephen R. Donaldson</td>\n",
|
| 934 |
+
" <td>American fiction</td>\n",
|
| 935 |
+
" <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
|
| 936 |
+
" <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
|
| 937 |
+
" <td>1982.0</td>\n",
|
| 938 |
+
" <td>3.97</td>\n",
|
| 939 |
+
" <td>479.0</td>\n",
|
| 940 |
+
" <td>172.0</td>\n",
|
| 941 |
+
" <td>109</td>\n",
|
| 942 |
+
" <td>The One Tree</td>\n",
|
| 943 |
+
" <td>9780006163831 Volume Two of Stephen Donaldson'...</td>\n",
|
| 944 |
+
" </tr>\n",
|
| 945 |
+
" <tr>\n",
|
| 946 |
+
" <th>3</th>\n",
|
| 947 |
+
" <td>9780006178736</td>\n",
|
| 948 |
+
" <td>0006178731</td>\n",
|
| 949 |
+
" <td>Rage of angels</td>\n",
|
| 950 |
+
" <td>NaN</td>\n",
|
| 951 |
+
" <td>Sidney Sheldon</td>\n",
|
| 952 |
+
" <td>Fiction</td>\n",
|
| 953 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 954 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 955 |
+
" <td>1993.0</td>\n",
|
| 956 |
+
" <td>3.93</td>\n",
|
| 957 |
+
" <td>512.0</td>\n",
|
| 958 |
+
" <td>29532.0</td>\n",
|
| 959 |
+
" <td>359</td>\n",
|
| 960 |
+
" <td>Rage of angels</td>\n",
|
| 961 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 962 |
+
" </tr>\n",
|
| 963 |
+
" <tr>\n",
|
| 964 |
+
" <th>4</th>\n",
|
| 965 |
+
" <td>9780006280897</td>\n",
|
| 966 |
+
" <td>0006280897</td>\n",
|
| 967 |
+
" <td>The Four Loves</td>\n",
|
| 968 |
+
" <td>NaN</td>\n",
|
| 969 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 970 |
+
" <td>Christian life</td>\n",
|
| 971 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 972 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 973 |
+
" <td>2002.0</td>\n",
|
| 974 |
+
" <td>4.15</td>\n",
|
| 975 |
+
" <td>170.0</td>\n",
|
| 976 |
+
" <td>33684.0</td>\n",
|
| 977 |
+
" <td>295</td>\n",
|
| 978 |
+
" <td>The Four Loves</td>\n",
|
| 979 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 980 |
+
" </tr>\n",
|
| 981 |
+
" </tbody>\n",
|
| 982 |
+
"</table>\n",
|
| 983 |
+
"</div>"
|
| 984 |
+
],
|
| 985 |
+
"text/plain": [
|
| 986 |
+
" isbn13 isbn10 title subtitle \\\n",
|
| 987 |
+
"0 9780002005883 0002005883 Gilead NaN \n",
|
| 988 |
+
"1 9780002261982 0002261987 Spider's Web A Novel \n",
|
| 989 |
+
"2 9780006163831 0006163831 The One Tree NaN \n",
|
| 990 |
+
"3 9780006178736 0006178731 Rage of angels NaN \n",
|
| 991 |
+
"4 9780006280897 0006280897 The Four Loves NaN \n",
|
| 992 |
+
"\n",
|
| 993 |
+
" authors categories \\\n",
|
| 994 |
+
"0 Marilynne Robinson Fiction \n",
|
| 995 |
+
"1 Charles Osborne;Agatha Christie Detective and mystery stories \n",
|
| 996 |
+
"2 Stephen R. Donaldson American fiction \n",
|
| 997 |
+
"3 Sidney Sheldon Fiction \n",
|
| 998 |
+
"4 Clive Staples Lewis Christian life \n",
|
| 999 |
+
"\n",
|
| 1000 |
+
" thumbnail \\\n",
|
| 1001 |
+
"0 http://books.google.com/books/content?id=KQZCP... \n",
|
| 1002 |
+
"1 http://books.google.com/books/content?id=gA5GP... \n",
|
| 1003 |
+
"2 http://books.google.com/books/content?id=OmQaw... \n",
|
| 1004 |
+
"3 http://books.google.com/books/content?id=FKo2T... \n",
|
| 1005 |
+
"4 http://books.google.com/books/content?id=XhQ5X... \n",
|
| 1006 |
+
"\n",
|
| 1007 |
+
" description published_year \\\n",
|
| 1008 |
+
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
|
| 1009 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
|
| 1010 |
+
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
|
| 1011 |
+
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
|
| 1012 |
+
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
|
| 1013 |
+
"\n",
|
| 1014 |
+
" average_rating num_pages ratings_count description_chars \\\n",
|
| 1015 |
+
"0 3.85 247.0 361.0 1154 \n",
|
| 1016 |
+
"1 3.83 241.0 5164.0 1200 \n",
|
| 1017 |
+
"2 3.97 479.0 172.0 109 \n",
|
| 1018 |
+
"3 3.93 512.0 29532.0 359 \n",
|
| 1019 |
+
"4 4.15 170.0 33684.0 295 \n",
|
| 1020 |
+
"\n",
|
| 1021 |
+
" title_and_subtitle tagged_description \n",
|
| 1022 |
+
"0 Gilead 9780002005883 A NOVEL THAT READERS and critics... \n",
|
| 1023 |
+
"1 Spider's Web A Novel 9780002261982 A new 'Christie for Christmas' -... \n",
|
| 1024 |
+
"2 The One Tree 9780006163831 Volume Two of Stephen Donaldson'... \n",
|
| 1025 |
+
"3 Rage of angels 9780006178736 A memorable, mesmerizing heroine... \n",
|
| 1026 |
+
"4 The Four Loves 9780006280897 Lewis' work on the nature of lov... "
|
| 1027 |
+
]
|
| 1028 |
+
},
|
| 1029 |
+
"execution_count": 19,
|
| 1030 |
+
"metadata": {},
|
| 1031 |
+
"output_type": "execute_result"
|
| 1032 |
+
}
|
| 1033 |
+
],
|
| 1034 |
+
"source": [
|
| 1035 |
+
"data.head()"
|
| 1036 |
+
]
|
| 1037 |
+
},
|
| 1038 |
+
{
|
| 1039 |
+
"cell_type": "code",
|
| 1040 |
+
"execution_count": 20,
|
| 1041 |
+
"metadata": {},
|
| 1042 |
+
"outputs": [],
|
| 1043 |
+
"source": [
|
| 1044 |
+
"data = data.drop(columns=[ \"title\", \"subtitle\", \"description_chars\",\"isbn10\"], axis=1)\n",
|
| 1045 |
+
"data.to_csv(\"books_cleaned.csv\", index=False)"
|
| 1046 |
+
]
|
| 1047 |
+
},
|
| 1048 |
+
{
|
| 1049 |
+
"cell_type": "code",
|
| 1050 |
+
"execution_count": 21,
|
| 1051 |
+
"metadata": {},
|
| 1052 |
+
"outputs": [
|
| 1053 |
+
{
|
| 1054 |
+
"name": "stdout",
|
| 1055 |
+
"output_type": "stream",
|
| 1056 |
+
"text": [
|
| 1057 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
| 1058 |
+
"Index: 6397 entries, 0 to 6809\n",
|
| 1059 |
+
"Data columns (total 11 columns):\n",
|
| 1060 |
+
" # Column Non-Null Count Dtype \n",
|
| 1061 |
+
"--- ------ -------------- ----- \n",
|
| 1062 |
+
" 0 isbn13 6397 non-null int64 \n",
|
| 1063 |
+
" 1 authors 6397 non-null object \n",
|
| 1064 |
+
" 2 categories 6364 non-null object \n",
|
| 1065 |
+
" 3 thumbnail 6190 non-null object \n",
|
| 1066 |
+
" 4 description 6397 non-null object \n",
|
| 1067 |
+
" 5 published_year 6397 non-null float64\n",
|
| 1068 |
+
" 6 average_rating 6397 non-null float64\n",
|
| 1069 |
+
" 7 num_pages 6397 non-null float64\n",
|
| 1070 |
+
" 8 ratings_count 6397 non-null float64\n",
|
| 1071 |
+
" 9 title_and_subtitle 6397 non-null object \n",
|
| 1072 |
+
" 10 tagged_description 6397 non-null object \n",
|
| 1073 |
+
"dtypes: float64(4), int64(1), object(6)\n",
|
| 1074 |
+
"memory usage: 599.7+ KB\n"
|
| 1075 |
+
]
|
| 1076 |
+
}
|
| 1077 |
+
],
|
| 1078 |
+
"source": [
|
| 1079 |
+
"data.info()"
|
| 1080 |
+
]
|
| 1081 |
+
},
|
| 1082 |
+
{
|
| 1083 |
+
"cell_type": "code",
|
| 1084 |
+
"execution_count": null,
|
| 1085 |
+
"metadata": {},
|
| 1086 |
+
"outputs": [],
|
| 1087 |
+
"source": []
|
| 1088 |
+
}
|
| 1089 |
+
],
|
| 1090 |
+
"metadata": {
|
| 1091 |
+
"kernelspec": {
|
| 1092 |
+
"display_name": "venv",
|
| 1093 |
+
"language": "python",
|
| 1094 |
+
"name": "python3"
|
| 1095 |
+
},
|
| 1096 |
+
"language_info": {
|
| 1097 |
+
"codemirror_mode": {
|
| 1098 |
+
"name": "ipython",
|
| 1099 |
+
"version": 3
|
| 1100 |
+
},
|
| 1101 |
+
"file_extension": ".py",
|
| 1102 |
+
"mimetype": "text/x-python",
|
| 1103 |
+
"name": "python",
|
| 1104 |
+
"nbconvert_exporter": "python",
|
| 1105 |
+
"pygments_lexer": "ipython3",
|
| 1106 |
+
"version": "3.11.9"
|
| 1107 |
+
}
|
| 1108 |
+
},
|
| 1109 |
+
"nbformat": 4,
|
| 1110 |
+
"nbformat_minor": 0
|
| 1111 |
+
}
|
download_url.ipynb
ADDED
|
@@ -0,0 +1,1353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {
|
| 7 |
+
"colab": {
|
| 8 |
+
"base_uri": "https://localhost:8080/"
|
| 9 |
+
},
|
| 10 |
+
"id": "39eGIDIr-3ju",
|
| 11 |
+
"outputId": "ac47d677-9538-4860-dc27-b0d43ef6e41d"
|
| 12 |
+
},
|
| 13 |
+
"outputs": [
|
| 14 |
+
{
|
| 15 |
+
"name": "stdout",
|
| 16 |
+
"output_type": "stream",
|
| 17 |
+
"text": [
|
| 18 |
+
"Requirement already satisfied: googlesearch-python in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (1.3.0)\n",
|
| 19 |
+
"Requirement already satisfied: beautifulsoup4>=4.9 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from googlesearch-python) (4.13.4)\n",
|
| 20 |
+
"Requirement already satisfied: requests>=2.20 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from googlesearch-python) (2.32.3)\n",
|
| 21 |
+
"Requirement already satisfied: soupsieve>1.2 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from beautifulsoup4>=4.9->googlesearch-python) (2.7)\n",
|
| 22 |
+
"Requirement already satisfied: typing-extensions>=4.0.0 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from beautifulsoup4>=4.9->googlesearch-python) (4.14.0)\n",
|
| 23 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from requests>=2.20->googlesearch-python) (3.4.2)\n",
|
| 24 |
+
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from requests>=2.20->googlesearch-python) (3.10)\n",
|
| 25 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from requests>=2.20->googlesearch-python) (2.4.0)\n",
|
| 26 |
+
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\nonsodev\\documents\\allcodes\\projects_dl_for resume\\recommender systems\\book reccomender - llm\\venv\\lib\\site-packages (from requests>=2.20->googlesearch-python) (2025.4.26)\n"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"name": "stderr",
|
| 31 |
+
"output_type": "stream",
|
| 32 |
+
"text": [
|
| 33 |
+
"\n",
|
| 34 |
+
"[notice] A new release of pip is available: 24.0 -> 25.1.1\n",
|
| 35 |
+
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
|
| 36 |
+
]
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"source": [
|
| 40 |
+
"!pip install googlesearch-python\n"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": null,
|
| 46 |
+
"metadata": {
|
| 47 |
+
"id": "6KyyK4zD_eqC"
|
| 48 |
+
},
|
| 49 |
+
"outputs": [],
|
| 50 |
+
"source": []
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"cell_type": "code",
|
| 54 |
+
"execution_count": 2,
|
| 55 |
+
"metadata": {
|
| 56 |
+
"id": "9vzs6ees_CDr"
|
| 57 |
+
},
|
| 58 |
+
"outputs": [],
|
| 59 |
+
"source": [
|
| 60 |
+
"import time\n",
|
| 61 |
+
"import random\n",
|
| 62 |
+
"import numpy as np\n",
|
| 63 |
+
"from googlesearch import search\n",
|
| 64 |
+
"import pandas as pd\n",
|
| 65 |
+
"from typing import Optional\n",
|
| 66 |
+
"import logging\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"# Set up logging\n",
|
| 69 |
+
"logging.basicConfig(level=logging.INFO)\n",
|
| 70 |
+
"logger = logging.getLogger(__name__)\n",
|
| 71 |
+
"\n",
|
| 72 |
def _first_absolute_url(results) -> Optional[str]:
    """Return the first absolute http(s) URL yielded by *results*, or None."""
    for candidate in results:
        if isinstance(candidate, str) and candidate.startswith(("http://", "https://")):
            return candidate
    return None


def fetch_first_google_link_with_backoff(query: str, max_retries: int = 3) -> Optional[str]:
    """
    Fetch the first usable search result for *query*, retrying when rate limited.

    A Google Books link from the primary search is returned as-is. Otherwise the
    search is repeated with "google" replaced by "amazon" in the query and the
    first usable link from that search is returned.

    Only absolute http(s) URLs count as results: the search helper has been
    observed (see this notebook's outputs) to yield relative links such as
    '/search?num=3', and an empty result list used to surface as an
    "Unexpected error" via IndexError. Both cases now yield None so the caller
    can retry the query later.

    Args:
        query: Search string to submit.
        max_retries: Number of extra attempts made after a rate-limit error
            (an exception whose text contains "429" or "Too Many Requests").

    Returns:
        The selected URL, or None when nothing usable was found, the retry
        budget was exhausted, or an unexpected error occurred.
    """
    for attempt in range(max_retries + 1):
        try:
            # Random 1-5 s pause before every request to look less like a bot.
            time.sleep(random.uniform(1, 5))

            # The result generator performs the request lazily, so it must be
            # consumed inside this try block.
            url = _first_absolute_url(search(query, num_results=1, lang="en"))
            if url is not None and url.startswith("https://books.google.com"):
                return url

            # Additional delay before the backup (Amazon) search.
            time.sleep(random.uniform(2, 4))
            fallback_url = _first_absolute_url(
                search(query.replace("google", "amazon"), num_results=1, lang="en")
            )
            if fallback_url is None:
                logger.warning(f"No usable search result for query: {query}")
            return fallback_url

        except Exception as e:
            rate_limited = "429" in str(e) or "Too Many Requests" in str(e)
            if not rate_limited:
                logger.error(f"Unexpected error for query '{query}': {e}")
                return None
            if attempt >= max_retries:
                logger.error(f"Max retries exceeded for query: {query}")
                return None

            # Exponential backoff: 2^attempt * base_time + random jitter
            wait_time = (2 ** attempt) * 10 + random.uniform(5, 15)
            logger.warning(f"Rate limited on attempt {attempt + 1}. Waiting {wait_time:.1f} seconds...")
            time.sleep(wait_time)

    return None
|
| 109 |
+
"\n",
|
| 110 |
def process_queries_in_batches(queries_df: pd.DataFrame, batch_size: int = 50, 
                              batch_delay: int = 300) -> pd.Series:
    """
    Look up every query in fixed-size batches, sleeping between batches.

    Despite the annotation, the code treats ``queries_df`` as a Series of query
    strings: each element is passed to ``fetch_first_google_link_with_backoff``
    and the flat result list is re-indexed with ``queries_df.index``.

    Args:
        queries_df: Queries to look up, in order.
        batch_size: Number of queries handled before pausing.
        batch_delay: Seconds to sleep after each batch except the last.

    Returns:
        A Series of lookup results sharing the index of ``queries_df``.
    """
    total = len(queries_df)
    collected = []

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        stop = min(start + batch_size, total)
        logger.info(f"Processing batch {batch_no}: queries {start + 1}-{stop} of {total}")

        chunk = queries_df.iloc[start:stop]
        collected += chunk.apply(fetch_first_google_link_with_backoff).tolist()

        if stop == total:
            # Last batch: nothing follows, so skip the pause.
            continue
        logger.info(f"Batch complete. Waiting {batch_delay} seconds before next batch...")
        time.sleep(batch_delay)

    return pd.Series(collected, index=queries_df.index)
|
| 134 |
+
"\n",
|
| 135 |
+
"# Alternative approach: Save progress incrementally\n",
|
| 136 |
def _save_checkpoint(checkpoint_file: str, queries_df, results: list) -> None:
    """Write every query with its current result (blank if none yet) to *checkpoint_file*."""
    checkpoint_data = {
        'query_index': range(len(results)),
        'query': queries_df.tolist(),
        'result': results,
    }
    pd.DataFrame(checkpoint_data).to_csv(checkpoint_file, index=False)


def process_queries_with_checkpoints(queries_df: pd.DataFrame,
                                     checkpoint_file: str = "search_progress.csv",
                                     start_index: int = 0,
                                     checkpoint_every: int = 10) -> pd.Series:
    """
    Process queries one by one, persisting progress so a run can be resumed.

    Despite the annotation, the code treats ``queries_df`` as a Series of query
    strings (``.iloc[i]`` is passed to the fetcher, ``.tolist()`` is written to
    the checkpoint).

    Results already present in ``checkpoint_file`` are restored and skipped.
    A checkpoint is written after every ``checkpoint_every`` newly processed
    queries and once more when the loop ends or is interrupted. Previously the
    save was tied to the absolute position ``(i + 1) % 10 == 0``, so queries
    filling gaps between restored results and the trailing queries of a run
    were never written and were repeated on every run.

    Args:
        queries_df: Queries to look up, in order.
        checkpoint_file: CSV path with ``query_index``, ``query`` and
            ``result`` columns; created if it does not exist.
        start_index: Position of the first query to consider.
        checkpoint_every: Number of newly processed queries between saves.

    Returns:
        A Series of results (None where no result is available) sharing the
        index of ``queries_df``.
    """
    total = len(queries_df)
    results = [None] * total

    # Load existing progress if a checkpoint exists.
    try:
        checkpoint_df = pd.read_csv(checkpoint_file)
    except FileNotFoundError:
        logger.info("No checkpoint file found, starting fresh")
    else:
        # dropna() is the reliable missing-value test; comparing by identity
        # with np.nan only works when the value happens to be that very object.
        finished = checkpoint_df.dropna(subset=['query_index', 'result'])
        restored = 0
        for position, saved in zip(finished['query_index'], finished['result']):
            position = int(position)
            # Ignore rows from a checkpoint that does not match this query list.
            if 0 <= position < total:
                results[position] = saved
                restored += 1
        logger.info(f"Loaded {restored} previous results from checkpoint")

    unsaved = 0
    try:
        for i in range(start_index, total):
            if results[i] is not None:  # Skip if already processed
                continue

            query = queries_df.iloc[i]
            logger.info(f"Processing query {i+1}/{total}: {query}")

            results[i] = fetch_first_google_link_with_backoff(query)
            unsaved += 1

            if unsaved >= checkpoint_every:
                _save_checkpoint(checkpoint_file, queries_df, results)
                unsaved = 0
                logger.info(f"Checkpoint saved at query {i+1}")
    finally:
        # Persist the tail of the run, and whatever was done before an
        # interruption or error, so it is not repeated next time.
        if unsaved:
            _save_checkpoint(checkpoint_file, queries_df, results)
            logger.info("Final checkpoint saved")

    return pd.Series(results, index=queries_df.index)
|
| 175 |
+
"\n"
|
| 176 |
+
]
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"cell_type": "code",
|
| 180 |
+
"execution_count": null,
|
| 181 |
+
"metadata": {},
|
| 182 |
+
"outputs": [],
|
| 183 |
+
"source": []
|
| 184 |
+
},
|
| 185 |
+
{
|
| 186 |
+
"cell_type": "code",
|
| 187 |
+
"execution_count": 3,
|
| 188 |
+
"metadata": {
|
| 189 |
+
"id": "guCqCfDy_E_V"
|
| 190 |
+
},
|
| 191 |
+
"outputs": [],
|
| 192 |
+
"source": [
|
| 193 |
+
"df = pd.read_csv(\"books_cleaned.csv\", encoding=\"utf-8\")"
|
| 194 |
+
]
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"cell_type": "code",
|
| 198 |
+
"execution_count": 4,
|
| 199 |
+
"metadata": {
|
| 200 |
+
"id": "z1uMJic9_X73"
|
| 201 |
+
},
|
| 202 |
+
"outputs": [],
|
| 203 |
+
"source": [
|
| 204 |
+
"queries_df = df[\"title_and_subtitle\"] + \" by \" + df[\"authors\"] + \"- google books\""
|
| 205 |
+
]
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"cell_type": "code",
|
| 209 |
+
"execution_count": null,
|
| 210 |
+
"metadata": {},
|
| 211 |
+
"outputs": [],
|
| 212 |
+
"source": []
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"cell_type": "code",
|
| 216 |
+
"execution_count": null,
|
| 217 |
+
"metadata": {
|
| 218 |
+
"colab": {
|
| 219 |
+
"base_uri": "https://localhost:8080/",
|
| 220 |
+
"height": 428
|
| 221 |
+
},
|
| 222 |
+
"id": "ocpGW7wW_axk",
|
| 223 |
+
"outputId": "512704e8-6d1a-4b66-e171-45b9157f0942"
|
| 224 |
+
},
|
| 225 |
+
"outputs": [],
|
| 226 |
+
"source": []
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"cell_type": "code",
|
| 230 |
+
"execution_count": null,
|
| 231 |
+
"metadata": {
|
| 232 |
+
"id": "ydbB5Jd9_jcU"
|
| 233 |
+
},
|
| 234 |
+
"outputs": [
|
| 235 |
+
{
|
| 236 |
+
"name": "stderr",
|
| 237 |
+
"output_type": "stream",
|
| 238 |
+
"text": [
|
| 239 |
+
"INFO:__main__:Loaded 6397 previous results from checkpoint\n",
|
| 240 |
+
"INFO:__main__:Processing query 807/6397: City of God by Augustine;Henry Scowcroft Bettenson;Gillian Rosemary Evans- google books\n",
|
| 241 |
+
"INFO:__main__:Processing query 1618/6397: The Complete Stories of Evelyn Waugh by Evelyn Waugh- google books\n",
|
| 242 |
+
"INFO:__main__:Processing query 2355/6397: Hawthorne's Short Stories by Nathaniel Hawthorne- google books\n",
|
| 243 |
+
"INFO:__main__:Processing query 5781/6397: Little Butterfly by Hinako Takanaga;Sachiko Sato- google books\n",
|
| 244 |
+
"INFO:__main__:Processing query 6391/6397: Aspects of the Novel by E. M. Forster- google books\n",
|
| 245 |
+
"INFO:__main__:Processing query 6392/6397: Mistaken Identity by Nayantara Sahgal- google books\n",
|
| 246 |
+
"INFO:__main__:Processing query 6393/6397: Journey to the East by Hermann Hesse- google books\n",
|
| 247 |
+
"INFO:__main__:Processing query 6394/6397: The Monk Who Sold His Ferrari: A Fable About Fulfilling Your Dreams & Reaching Your Destiny by Robin Sharma- google books\n",
|
| 248 |
+
"INFO:__main__:Processing query 6395/6397: I Am that Talks with Sri Nisargadatta Maharaj by Sri Nisargadatta Maharaj;Sudhakar S. Dikshit- google books\n",
|
| 249 |
+
"INFO:__main__:Processing query 6396/6397: The Berlin Phenomenology by Georg Wilhelm Friedrich Hegel- google books\n",
|
| 250 |
+
"INFO:__main__:Processing query 6397/6397: 'I'm Telling You Stories' Jeanette Winterson and the Politics of Reading by Helena Grice;Tim Woods- google books\n"
|
| 251 |
+
]
|
| 252 |
+
}
|
| 253 |
+
],
|
| 254 |
+
"source": [
|
| 255 |
+
"process_queries_with_checkpoints(queries_df)"
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"cell_type": "code",
|
| 260 |
+
"execution_count": 5,
|
| 261 |
+
"metadata": {},
|
| 262 |
+
"outputs": [],
|
| 263 |
+
"source": [
|
| 264 |
+
"queries_df = pd.concat([queries_df,pd.read_csv(\"search_progress.csv\")[\"result\"]], axis=1)"
|
| 265 |
+
]
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"cell_type": "code",
|
| 269 |
+
"execution_count": 6,
|
| 270 |
+
"metadata": {},
|
| 271 |
+
"outputs": [],
|
| 272 |
+
"source": [
|
| 273 |
+
"queries_df.columns = [\"title\", \"url\"]"
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"cell_type": "code",
|
| 278 |
+
"execution_count": 18,
|
| 279 |
+
"metadata": {},
|
| 280 |
+
"outputs": [],
|
| 281 |
+
"source": [
|
| 282 |
# Rows that still need a lookup: any missing value, or a URL that mentions
# neither "amazon" nor "google". The .copy() makes this an independent frame,
# so adding a column to it later does not raise SettingWithCopyWarning.
unfinished = queries_df[
    (queries_df.isnull().any(axis=1))
    | ~(
        (queries_df["url"].str.contains("amazon", na=False))
        | (queries_df["url"].str.contains("google", na=False))
    )
].copy()
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"cell_type": "code",
|
| 287 |
+
"execution_count": 19,
|
| 288 |
+
"metadata": {},
|
| 289 |
+
"outputs": [
|
| 290 |
+
{
|
| 291 |
+
"data": {
|
| 292 |
+
"text/html": [
|
| 293 |
+
"<div>\n",
|
| 294 |
+
"<style scoped>\n",
|
| 295 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 296 |
+
" vertical-align: middle;\n",
|
| 297 |
+
" }\n",
|
| 298 |
+
"\n",
|
| 299 |
+
" .dataframe tbody tr th {\n",
|
| 300 |
+
" vertical-align: top;\n",
|
| 301 |
+
" }\n",
|
| 302 |
+
"\n",
|
| 303 |
+
" .dataframe thead th {\n",
|
| 304 |
+
" text-align: right;\n",
|
| 305 |
+
" }\n",
|
| 306 |
+
"</style>\n",
|
| 307 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 308 |
+
" <thead>\n",
|
| 309 |
+
" <tr style=\"text-align: right;\">\n",
|
| 310 |
+
" <th></th>\n",
|
| 311 |
+
" <th>title</th>\n",
|
| 312 |
+
" <th>url</th>\n",
|
| 313 |
+
" </tr>\n",
|
| 314 |
+
" </thead>\n",
|
| 315 |
+
" <tbody>\n",
|
| 316 |
+
" <tr>\n",
|
| 317 |
+
" <th>73</th>\n",
|
| 318 |
+
" <td>I Can Read with Me Eyes Shut! by Dr. Seuss- g...</td>\n",
|
| 319 |
+
" <td>/search?num=3</td>\n",
|
| 320 |
+
" </tr>\n",
|
| 321 |
+
" <tr>\n",
|
| 322 |
+
" <th>101</th>\n",
|
| 323 |
+
" <td>Tyranny of the Majority Funamental Fairness in...</td>\n",
|
| 324 |
+
" <td>/search?num=3</td>\n",
|
| 325 |
+
" </tr>\n",
|
| 326 |
+
" <tr>\n",
|
| 327 |
+
" <th>126</th>\n",
|
| 328 |
+
" <td>Mars and Venus Book of Days 365 Inspriations t...</td>\n",
|
| 329 |
+
" <td>/search?num=3</td>\n",
|
| 330 |
+
" </tr>\n",
|
| 331 |
+
" <tr>\n",
|
| 332 |
+
" <th>128</th>\n",
|
| 333 |
+
" <td>Today I Feel Silly & Other Moods That Make My ...</td>\n",
|
| 334 |
+
" <td>/search?num=3</td>\n",
|
| 335 |
+
" </tr>\n",
|
| 336 |
+
" <tr>\n",
|
| 337 |
+
" <th>314</th>\n",
|
| 338 |
+
" <td>DREAM & THE UNDERWOR by James Hillman- google...</td>\n",
|
| 339 |
+
" <td>/search?num=3</td>\n",
|
| 340 |
+
" </tr>\n",
|
| 341 |
+
" <tr>\n",
|
| 342 |
+
" <th>...</th>\n",
|
| 343 |
+
" <td>...</td>\n",
|
| 344 |
+
" <td>...</td>\n",
|
| 345 |
+
" </tr>\n",
|
| 346 |
+
" <tr>\n",
|
| 347 |
+
" <th>6392</th>\n",
|
| 348 |
+
" <td>Journey to the East by Hermann Hesse- google ...</td>\n",
|
| 349 |
+
" <td>NaN</td>\n",
|
| 350 |
+
" </tr>\n",
|
| 351 |
+
" <tr>\n",
|
| 352 |
+
" <th>6393</th>\n",
|
| 353 |
+
" <td>The Monk Who Sold His Ferrari: A Fable About F...</td>\n",
|
| 354 |
+
" <td>NaN</td>\n",
|
| 355 |
+
" </tr>\n",
|
| 356 |
+
" <tr>\n",
|
| 357 |
+
" <th>6394</th>\n",
|
| 358 |
+
" <td>I Am that Talks with Sri Nisargadatta Maharaj ...</td>\n",
|
| 359 |
+
" <td>NaN</td>\n",
|
| 360 |
+
" </tr>\n",
|
| 361 |
+
" <tr>\n",
|
| 362 |
+
" <th>6395</th>\n",
|
| 363 |
+
" <td>The Berlin Phenomenology by Georg Wilhelm Fri...</td>\n",
|
| 364 |
+
" <td>NaN</td>\n",
|
| 365 |
+
" </tr>\n",
|
| 366 |
+
" <tr>\n",
|
| 367 |
+
" <th>6396</th>\n",
|
| 368 |
+
" <td>'I'm Telling You Stories' Jeanette Winterson a...</td>\n",
|
| 369 |
+
" <td>NaN</td>\n",
|
| 370 |
+
" </tr>\n",
|
| 371 |
+
" </tbody>\n",
|
| 372 |
+
"</table>\n",
|
| 373 |
+
"<p>100 rows × 2 columns</p>\n",
|
| 374 |
+
"</div>"
|
| 375 |
+
],
|
| 376 |
+
"text/plain": [
|
| 377 |
+
" title url\n",
|
| 378 |
+
"73 I Can Read with Me Eyes Shut! by Dr. Seuss- g... /search?num=3\n",
|
| 379 |
+
"101 Tyranny of the Majority Funamental Fairness in... /search?num=3\n",
|
| 380 |
+
"126 Mars and Venus Book of Days 365 Inspriations t... /search?num=3\n",
|
| 381 |
+
"128 Today I Feel Silly & Other Moods That Make My ... /search?num=3\n",
|
| 382 |
+
"314 DREAM & THE UNDERWOR by James Hillman- google... /search?num=3\n",
|
| 383 |
+
"... ... ...\n",
|
| 384 |
+
"6392 Journey to the East by Hermann Hesse- google ... NaN\n",
|
| 385 |
+
"6393 The Monk Who Sold His Ferrari: A Fable About F... NaN\n",
|
| 386 |
+
"6394 I Am that Talks with Sri Nisargadatta Maharaj ... NaN\n",
|
| 387 |
+
"6395 The Berlin Phenomenology by Georg Wilhelm Fri... NaN\n",
|
| 388 |
+
"6396 'I'm Telling You Stories' Jeanette Winterson a... NaN\n",
|
| 389 |
+
"\n",
|
| 390 |
+
"[100 rows x 2 columns]"
|
| 391 |
+
]
|
| 392 |
+
},
|
| 393 |
+
"execution_count": 19,
|
| 394 |
+
"metadata": {},
|
| 395 |
+
"output_type": "execute_result"
|
| 396 |
+
}
|
| 397 |
+
],
|
| 398 |
+
"source": [
|
| 399 |
+
"unfinished"
|
| 400 |
+
]
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"cell_type": "code",
|
| 404 |
+
"execution_count": 20,
|
| 405 |
+
"metadata": {},
|
| 406 |
+
"outputs": [
|
| 407 |
+
{
|
| 408 |
+
"data": {
|
| 409 |
+
"text/plain": [
|
| 410 |
+
"[None,\n",
|
| 411 |
+
" None,\n",
|
| 412 |
+
" None,\n",
|
| 413 |
+
" None,\n",
|
| 414 |
+
" None,\n",
|
| 415 |
+
" None,\n",
|
| 416 |
+
" None,\n",
|
| 417 |
+
" None,\n",
|
| 418 |
+
" None,\n",
|
| 419 |
+
" None,\n",
|
| 420 |
+
" None,\n",
|
| 421 |
+
" None,\n",
|
| 422 |
+
" None,\n",
|
| 423 |
+
" None,\n",
|
| 424 |
+
" None,\n",
|
| 425 |
+
" None,\n",
|
| 426 |
+
" None,\n",
|
| 427 |
+
" None,\n",
|
| 428 |
+
" None,\n",
|
| 429 |
+
" None,\n",
|
| 430 |
+
" None,\n",
|
| 431 |
+
" None,\n",
|
| 432 |
+
" None,\n",
|
| 433 |
+
" None,\n",
|
| 434 |
+
" None,\n",
|
| 435 |
+
" None,\n",
|
| 436 |
+
" None,\n",
|
| 437 |
+
" None,\n",
|
| 438 |
+
" None,\n",
|
| 439 |
+
" None,\n",
|
| 440 |
+
" None,\n",
|
| 441 |
+
" None,\n",
|
| 442 |
+
" None,\n",
|
| 443 |
+
" None,\n",
|
| 444 |
+
" None,\n",
|
| 445 |
+
" None,\n",
|
| 446 |
+
" None,\n",
|
| 447 |
+
" None,\n",
|
| 448 |
+
" None,\n",
|
| 449 |
+
" None,\n",
|
| 450 |
+
" None,\n",
|
| 451 |
+
" None,\n",
|
| 452 |
+
" None,\n",
|
| 453 |
+
" None,\n",
|
| 454 |
+
" None,\n",
|
| 455 |
+
" None,\n",
|
| 456 |
+
" None,\n",
|
| 457 |
+
" None,\n",
|
| 458 |
+
" None,\n",
|
| 459 |
+
" None,\n",
|
| 460 |
+
" None,\n",
|
| 461 |
+
" None,\n",
|
| 462 |
+
" None,\n",
|
| 463 |
+
" None,\n",
|
| 464 |
+
" None,\n",
|
| 465 |
+
" None,\n",
|
| 466 |
+
" None,\n",
|
| 467 |
+
" None,\n",
|
| 468 |
+
" None,\n",
|
| 469 |
+
" None,\n",
|
| 470 |
+
" None,\n",
|
| 471 |
+
" None,\n",
|
| 472 |
+
" None,\n",
|
| 473 |
+
" None,\n",
|
| 474 |
+
" None,\n",
|
| 475 |
+
" None,\n",
|
| 476 |
+
" None,\n",
|
| 477 |
+
" None,\n",
|
| 478 |
+
" None,\n",
|
| 479 |
+
" None,\n",
|
| 480 |
+
" None,\n",
|
| 481 |
+
" None,\n",
|
| 482 |
+
" None,\n",
|
| 483 |
+
" None,\n",
|
| 484 |
+
" None,\n",
|
| 485 |
+
" None,\n",
|
| 486 |
+
" None,\n",
|
| 487 |
+
" None,\n",
|
| 488 |
+
" None,\n",
|
| 489 |
+
" None,\n",
|
| 490 |
+
" None,\n",
|
| 491 |
+
" None,\n",
|
| 492 |
+
" None,\n",
|
| 493 |
+
" None,\n",
|
| 494 |
+
" None,\n",
|
| 495 |
+
" None,\n",
|
| 496 |
+
" None,\n",
|
| 497 |
+
" None,\n",
|
| 498 |
+
" None,\n",
|
| 499 |
+
" None,\n",
|
| 500 |
+
" None,\n",
|
| 501 |
+
" None,\n",
|
| 502 |
+
" None,\n",
|
| 503 |
+
" None,\n",
|
| 504 |
+
" None,\n",
|
| 505 |
+
" None,\n",
|
| 506 |
+
" None,\n",
|
| 507 |
+
" None,\n",
|
| 508 |
+
" None,\n",
|
| 509 |
+
" None]"
|
| 510 |
+
]
|
| 511 |
+
},
|
| 512 |
+
"execution_count": 20,
|
| 513 |
+
"metadata": {},
|
| 514 |
+
"output_type": "execute_result"
|
| 515 |
+
}
|
| 516 |
+
],
|
| 517 |
+
"source": [
|
| 518 |
+
"unfinished_urls"
|
| 519 |
+
]
|
| 520 |
+
},
|
| 521 |
+
{
|
| 522 |
+
"cell_type": "code",
|
| 523 |
+
"execution_count": null,
|
| 524 |
+
"metadata": {},
|
| 525 |
+
"outputs": [
|
| 526 |
+
{
|
| 527 |
+
"data": {
|
| 528 |
+
"text/plain": [
|
| 529 |
+
"'/search?num=3'"
|
| 530 |
+
]
|
| 531 |
+
},
|
| 532 |
+
"execution_count": 8,
|
| 533 |
+
"metadata": {},
|
| 534 |
+
"output_type": "execute_result"
|
| 535 |
+
}
|
| 536 |
+
],
|
| 537 |
+
"source": [
|
| 538 |
+
"fetch_first_google_link_with_backoff(unfinished[\"title\"].tolist()[0])"
|
| 539 |
+
]
|
| 540 |
+
},
|
| 541 |
+
{
|
| 542 |
+
"cell_type": "code",
|
| 543 |
+
"execution_count": 21,
|
| 544 |
+
"metadata": {},
|
| 545 |
+
"outputs": [
|
| 546 |
+
{
|
| 547 |
+
"name": "stderr",
|
| 548 |
+
"output_type": "stream",
|
| 549 |
+
"text": [
|
| 550 |
+
"C:\\Users\\NonsoDev\\AppData\\Local\\Temp\\ipykernel_40848\\271023033.py:1: SettingWithCopyWarning: \n",
|
| 551 |
+
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
| 552 |
+
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
| 553 |
+
"\n",
|
| 554 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
| 555 |
+
" unfinished[\"hhh\"] = unfinished_urls\n"
|
| 556 |
+
]
|
| 557 |
+
}
|
| 558 |
+
],
|
| 559 |
+
"source": [
|
| 560 |
+
"unfinished[\"hhh\"] = unfinished_urls"
|
| 561 |
+
]
|
| 562 |
+
},
|
| 563 |
+
{
|
| 564 |
+
"cell_type": "code",
|
| 565 |
+
"execution_count": 22,
|
| 566 |
+
"metadata": {},
|
| 567 |
+
"outputs": [
|
| 568 |
+
{
|
| 569 |
+
"data": {
|
| 570 |
+
"text/html": [
|
| 571 |
+
"<div>\n",
|
| 572 |
+
"<style scoped>\n",
|
| 573 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 574 |
+
" vertical-align: middle;\n",
|
| 575 |
+
" }\n",
|
| 576 |
+
"\n",
|
| 577 |
+
" .dataframe tbody tr th {\n",
|
| 578 |
+
" vertical-align: top;\n",
|
| 579 |
+
" }\n",
|
| 580 |
+
"\n",
|
| 581 |
+
" .dataframe thead th {\n",
|
| 582 |
+
" text-align: right;\n",
|
| 583 |
+
" }\n",
|
| 584 |
+
"</style>\n",
|
| 585 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 586 |
+
" <thead>\n",
|
| 587 |
+
" <tr style=\"text-align: right;\">\n",
|
| 588 |
+
" <th></th>\n",
|
| 589 |
+
" <th>title</th>\n",
|
| 590 |
+
" <th>url</th>\n",
|
| 591 |
+
" <th>hhh</th>\n",
|
| 592 |
+
" </tr>\n",
|
| 593 |
+
" </thead>\n",
|
| 594 |
+
" <tbody>\n",
|
| 595 |
+
" <tr>\n",
|
| 596 |
+
" <th>73</th>\n",
|
| 597 |
+
" <td>I Can Read with Me Eyes Shut! by Dr. Seuss- g...</td>\n",
|
| 598 |
+
" <td>/search?num=3</td>\n",
|
| 599 |
+
" <td>None</td>\n",
|
| 600 |
+
" </tr>\n",
|
| 601 |
+
" <tr>\n",
|
| 602 |
+
" <th>101</th>\n",
|
| 603 |
+
" <td>Tyranny of the Majority Funamental Fairness in...</td>\n",
|
| 604 |
+
" <td>/search?num=3</td>\n",
|
| 605 |
+
" <td>None</td>\n",
|
| 606 |
+
" </tr>\n",
|
| 607 |
+
" <tr>\n",
|
| 608 |
+
" <th>126</th>\n",
|
| 609 |
+
" <td>Mars and Venus Book of Days 365 Inspriations t...</td>\n",
|
| 610 |
+
" <td>/search?num=3</td>\n",
|
| 611 |
+
" <td>None</td>\n",
|
| 612 |
+
" </tr>\n",
|
| 613 |
+
" <tr>\n",
|
| 614 |
+
" <th>128</th>\n",
|
| 615 |
+
" <td>Today I Feel Silly & Other Moods That Make My ...</td>\n",
|
| 616 |
+
" <td>/search?num=3</td>\n",
|
| 617 |
+
" <td>None</td>\n",
|
| 618 |
+
" </tr>\n",
|
| 619 |
+
" <tr>\n",
|
| 620 |
+
" <th>314</th>\n",
|
| 621 |
+
" <td>DREAM & THE UNDERWOR by James Hillman- google...</td>\n",
|
| 622 |
+
" <td>/search?num=3</td>\n",
|
| 623 |
+
" <td>None</td>\n",
|
| 624 |
+
" </tr>\n",
|
| 625 |
+
" </tbody>\n",
|
| 626 |
+
"</table>\n",
|
| 627 |
+
"</div>"
|
| 628 |
+
],
|
| 629 |
+
"text/plain": [
|
| 630 |
+
" title url hhh\n",
|
| 631 |
+
"73 I Can Read with Me Eyes Shut! by Dr. Seuss- g... /search?num=3 None\n",
|
| 632 |
+
"101 Tyranny of the Majority Funamental Fairness in... /search?num=3 None\n",
|
| 633 |
+
"126 Mars and Venus Book of Days 365 Inspriations t... /search?num=3 None\n",
|
| 634 |
+
"128 Today I Feel Silly & Other Moods That Make My ... /search?num=3 None\n",
|
| 635 |
+
"314 DREAM & THE UNDERWOR by James Hillman- google... /search?num=3 None"
|
| 636 |
+
]
|
| 637 |
+
},
|
| 638 |
+
"execution_count": 22,
|
| 639 |
+
"metadata": {},
|
| 640 |
+
"output_type": "execute_result"
|
| 641 |
+
}
|
| 642 |
+
],
|
| 643 |
+
"source": [
|
| 644 |
+
"unfinished.head()"
|
| 645 |
+
]
|
| 646 |
+
},
|
| 647 |
+
{
|
| 648 |
+
"cell_type": "code",
|
| 649 |
+
"execution_count": 23,
|
| 650 |
+
"metadata": {},
|
| 651 |
+
"outputs": [],
|
| 652 |
+
"source": [
|
| 653 |
+
"df1 = pd.read_csv(\"search_progress1.csv\")"
|
| 654 |
+
]
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"cell_type": "code",
|
| 658 |
+
"execution_count": 30,
|
| 659 |
+
"metadata": {},
|
| 660 |
+
"outputs": [
|
| 661 |
+
{
|
| 662 |
+
"data": {
|
| 663 |
+
"text/plain": [
|
| 664 |
+
"806 NaN\n",
|
| 665 |
+
"Name: url, dtype: object"
|
| 666 |
+
]
|
| 667 |
+
},
|
| 668 |
+
"execution_count": 30,
|
| 669 |
+
"metadata": {},
|
| 670 |
+
"output_type": "execute_result"
|
| 671 |
+
}
|
| 672 |
+
],
|
| 673 |
+
"source": [
|
| 674 |
+
"df1[\"url\"][df1[\"url\"].isna()]"
|
| 675 |
+
]
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"cell_type": "code",
|
| 679 |
+
"execution_count": 31,
|
| 680 |
+
"metadata": {},
|
| 681 |
+
"outputs": [
|
| 682 |
+
{
|
| 683 |
+
"data": {
|
| 684 |
+
"text/html": [
|
| 685 |
+
"<div>\n",
|
| 686 |
+
"<style scoped>\n",
|
| 687 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 688 |
+
" vertical-align: middle;\n",
|
| 689 |
+
" }\n",
|
| 690 |
+
"\n",
|
| 691 |
+
" .dataframe tbody tr th {\n",
|
| 692 |
+
" vertical-align: top;\n",
|
| 693 |
+
" }\n",
|
| 694 |
+
"\n",
|
| 695 |
+
" .dataframe thead th {\n",
|
| 696 |
+
" text-align: right;\n",
|
| 697 |
+
" }\n",
|
| 698 |
+
"</style>\n",
|
| 699 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 700 |
+
" <thead>\n",
|
| 701 |
+
" <tr style=\"text-align: right;\">\n",
|
| 702 |
+
" <th></th>\n",
|
| 703 |
+
" <th>isbn13</th>\n",
|
| 704 |
+
" <th>authors</th>\n",
|
| 705 |
+
" <th>categories</th>\n",
|
| 706 |
+
" <th>thumbnail</th>\n",
|
| 707 |
+
" <th>description</th>\n",
|
| 708 |
+
" <th>published_year</th>\n",
|
| 709 |
+
" <th>average_rating</th>\n",
|
| 710 |
+
" <th>num_pages</th>\n",
|
| 711 |
+
" <th>ratings_count</th>\n",
|
| 712 |
+
" <th>title_and_subtitle</th>\n",
|
| 713 |
+
" <th>tagged_description</th>\n",
|
| 714 |
+
" </tr>\n",
|
| 715 |
+
" </thead>\n",
|
| 716 |
+
" <tbody>\n",
|
| 717 |
+
" <tr>\n",
|
| 718 |
+
" <th>0</th>\n",
|
| 719 |
+
" <td>9780002005883</td>\n",
|
| 720 |
+
" <td>Marilynne Robinson</td>\n",
|
| 721 |
+
" <td>Fiction</td>\n",
|
| 722 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 723 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 724 |
+
" <td>2004.0</td>\n",
|
| 725 |
+
" <td>3.85</td>\n",
|
| 726 |
+
" <td>247.0</td>\n",
|
| 727 |
+
" <td>361.0</td>\n",
|
| 728 |
+
" <td>Gilead</td>\n",
|
| 729 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 730 |
+
" </tr>\n",
|
| 731 |
+
" <tr>\n",
|
| 732 |
+
" <th>1</th>\n",
|
| 733 |
+
" <td>9780002261982</td>\n",
|
| 734 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 735 |
+
" <td>Detective and mystery stories</td>\n",
|
| 736 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 737 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 738 |
+
" <td>2000.0</td>\n",
|
| 739 |
+
" <td>3.83</td>\n",
|
| 740 |
+
" <td>241.0</td>\n",
|
| 741 |
+
" <td>5164.0</td>\n",
|
| 742 |
+
" <td>Spider's Web A Novel</td>\n",
|
| 743 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 744 |
+
" </tr>\n",
|
| 745 |
+
" <tr>\n",
|
| 746 |
+
" <th>2</th>\n",
|
| 747 |
+
" <td>9780006163831</td>\n",
|
| 748 |
+
" <td>Stephen R. Donaldson</td>\n",
|
| 749 |
+
" <td>American fiction</td>\n",
|
| 750 |
+
" <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
|
| 751 |
+
" <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
|
| 752 |
+
" <td>1982.0</td>\n",
|
| 753 |
+
" <td>3.97</td>\n",
|
| 754 |
+
" <td>479.0</td>\n",
|
| 755 |
+
" <td>172.0</td>\n",
|
| 756 |
+
" <td>The One Tree</td>\n",
|
| 757 |
+
" <td>9780006163831 Volume Two of Stephen Donaldson'...</td>\n",
|
| 758 |
+
" </tr>\n",
|
| 759 |
+
" <tr>\n",
|
| 760 |
+
" <th>3</th>\n",
|
| 761 |
+
" <td>9780006178736</td>\n",
|
| 762 |
+
" <td>Sidney Sheldon</td>\n",
|
| 763 |
+
" <td>Fiction</td>\n",
|
| 764 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 765 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 766 |
+
" <td>1993.0</td>\n",
|
| 767 |
+
" <td>3.93</td>\n",
|
| 768 |
+
" <td>512.0</td>\n",
|
| 769 |
+
" <td>29532.0</td>\n",
|
| 770 |
+
" <td>Rage of angels</td>\n",
|
| 771 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 772 |
+
" </tr>\n",
|
| 773 |
+
" <tr>\n",
|
| 774 |
+
" <th>4</th>\n",
|
| 775 |
+
" <td>9780006280897</td>\n",
|
| 776 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 777 |
+
" <td>Christian life</td>\n",
|
| 778 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 779 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 780 |
+
" <td>2002.0</td>\n",
|
| 781 |
+
" <td>4.15</td>\n",
|
| 782 |
+
" <td>170.0</td>\n",
|
| 783 |
+
" <td>33684.0</td>\n",
|
| 784 |
+
" <td>The Four Loves</td>\n",
|
| 785 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 786 |
+
" </tr>\n",
|
| 787 |
+
" </tbody>\n",
|
| 788 |
+
"</table>\n",
|
| 789 |
+
"</div>"
|
| 790 |
+
],
|
| 791 |
+
"text/plain": [
|
| 792 |
+
" isbn13 authors \\\n",
|
| 793 |
+
"0 9780002005883 Marilynne Robinson \n",
|
| 794 |
+
"1 9780002261982 Charles Osborne;Agatha Christie \n",
|
| 795 |
+
"2 9780006163831 Stephen R. Donaldson \n",
|
| 796 |
+
"3 9780006178736 Sidney Sheldon \n",
|
| 797 |
+
"4 9780006280897 Clive Staples Lewis \n",
|
| 798 |
+
"\n",
|
| 799 |
+
" categories \\\n",
|
| 800 |
+
"0 Fiction \n",
|
| 801 |
+
"1 Detective and mystery stories \n",
|
| 802 |
+
"2 American fiction \n",
|
| 803 |
+
"3 Fiction \n",
|
| 804 |
+
"4 Christian life \n",
|
| 805 |
+
"\n",
|
| 806 |
+
" thumbnail \\\n",
|
| 807 |
+
"0 http://books.google.com/books/content?id=KQZCP... \n",
|
| 808 |
+
"1 http://books.google.com/books/content?id=gA5GP... \n",
|
| 809 |
+
"2 http://books.google.com/books/content?id=OmQaw... \n",
|
| 810 |
+
"3 http://books.google.com/books/content?id=FKo2T... \n",
|
| 811 |
+
"4 http://books.google.com/books/content?id=XhQ5X... \n",
|
| 812 |
+
"\n",
|
| 813 |
+
" description published_year \\\n",
|
| 814 |
+
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
|
| 815 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
|
| 816 |
+
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
|
| 817 |
+
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
|
| 818 |
+
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
|
| 819 |
+
"\n",
|
| 820 |
+
" average_rating num_pages ratings_count title_and_subtitle \\\n",
|
| 821 |
+
"0 3.85 247.0 361.0 Gilead \n",
|
| 822 |
+
"1 3.83 241.0 5164.0 Spider's Web A Novel \n",
|
| 823 |
+
"2 3.97 479.0 172.0 The One Tree \n",
|
| 824 |
+
"3 3.93 512.0 29532.0 Rage of angels \n",
|
| 825 |
+
"4 4.15 170.0 33684.0 The Four Loves \n",
|
| 826 |
+
"\n",
|
| 827 |
+
" tagged_description \n",
|
| 828 |
+
"0 9780002005883 A NOVEL THAT READERS and critics... \n",
|
| 829 |
+
"1 9780002261982 A new 'Christie for Christmas' -... \n",
|
| 830 |
+
"2 9780006163831 Volume Two of Stephen Donaldson'... \n",
|
| 831 |
+
"3 9780006178736 A memorable, mesmerizing heroine... \n",
|
| 832 |
+
"4 9780006280897 Lewis' work on the nature of lov... "
|
| 833 |
+
]
|
| 834 |
+
},
|
| 835 |
+
"execution_count": 31,
|
| 836 |
+
"metadata": {},
|
| 837 |
+
"output_type": "execute_result"
|
| 838 |
+
}
|
| 839 |
+
],
|
| 840 |
+
"source": [
|
| 841 |
+
"df.head()"
|
| 842 |
+
]
|
| 843 |
+
},
|
| 844 |
+
{
|
| 845 |
+
"cell_type": "code",
|
| 846 |
+
"execution_count": null,
|
| 847 |
+
"metadata": {},
|
| 848 |
+
"outputs": [
|
| 849 |
+
{
|
| 850 |
+
"data": {
|
| 851 |
+
"text/html": [
|
| 852 |
+
"<div>\n",
|
| 853 |
+
"<style scoped>\n",
|
| 854 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 855 |
+
" vertical-align: middle;\n",
|
| 856 |
+
" }\n",
|
| 857 |
+
"\n",
|
| 858 |
+
" .dataframe tbody tr th {\n",
|
| 859 |
+
" vertical-align: top;\n",
|
| 860 |
+
" }\n",
|
| 861 |
+
"\n",
|
| 862 |
+
" .dataframe thead th {\n",
|
| 863 |
+
" text-align: right;\n",
|
| 864 |
+
" }\n",
|
| 865 |
+
"</style>\n",
|
| 866 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 867 |
+
" <thead>\n",
|
| 868 |
+
" <tr style=\"text-align: right;\">\n",
|
| 869 |
+
" <th></th>\n",
|
| 870 |
+
" <th>title</th>\n",
|
| 871 |
+
" <th>url</th>\n",
|
| 872 |
+
" </tr>\n",
|
| 873 |
+
" </thead>\n",
|
| 874 |
+
" <tbody>\n",
|
| 875 |
+
" <tr>\n",
|
| 876 |
+
" <th>0</th>\n",
|
| 877 |
+
" <td>Gilead by Marilynne Robinson- google books</td>\n",
|
| 878 |
+
" <td>https://books.google.com/books/about/Gilead.ht...</td>\n",
|
| 879 |
+
" </tr>\n",
|
| 880 |
+
" <tr>\n",
|
| 881 |
+
" <th>1</th>\n",
|
| 882 |
+
" <td>Spider's Web A Novel by Charles Osborne;Agatha...</td>\n",
|
| 883 |
+
" <td>https://books.google.com/books/about/Spider_s_...</td>\n",
|
| 884 |
+
" </tr>\n",
|
| 885 |
+
" <tr>\n",
|
| 886 |
+
" <th>2</th>\n",
|
| 887 |
+
" <td>The One Tree by Stephen R. Donaldson- google ...</td>\n",
|
| 888 |
+
" <td>https://books.google.com/books/about/The_One_T...</td>\n",
|
| 889 |
+
" </tr>\n",
|
| 890 |
+
" <tr>\n",
|
| 891 |
+
" <th>3</th>\n",
|
| 892 |
+
" <td>Rage of angels by Sidney Sheldon- google books</td>\n",
|
| 893 |
+
" <td>https://books.google.com/books/about/Rage_of_A...</td>\n",
|
| 894 |
+
" </tr>\n",
|
| 895 |
+
" <tr>\n",
|
| 896 |
+
" <th>4</th>\n",
|
| 897 |
+
" <td>The Four Loves by Clive Staples Lewis- google...</td>\n",
|
| 898 |
+
" <td>https://books.google.com/books/about/The_Four_...</td>\n",
|
| 899 |
+
" </tr>\n",
|
| 900 |
+
" </tbody>\n",
|
| 901 |
+
"</table>\n",
|
| 902 |
+
"</div>"
|
| 903 |
+
],
|
| 904 |
+
"text/plain": [
|
| 905 |
+
" title \\\n",
|
| 906 |
+
"0 Gilead by Marilynne Robinson- google books \n",
|
| 907 |
+
"1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
|
| 908 |
+
"2 The One Tree by Stephen R. Donaldson- google ... \n",
|
| 909 |
+
"3 Rage of angels by Sidney Sheldon- google books \n",
|
| 910 |
+
"4 The Four Loves by Clive Staples Lewis- google... \n",
|
| 911 |
+
"\n",
|
| 912 |
+
" url \n",
|
| 913 |
+
"0 https://books.google.com/books/about/Gilead.ht... \n",
|
| 914 |
+
"1 https://books.google.com/books/about/Spider_s_... \n",
|
| 915 |
+
"2 https://books.google.com/books/about/The_One_T... \n",
|
| 916 |
+
"3 https://books.google.com/books/about/Rage_of_A... \n",
|
| 917 |
+
"4 https://books.google.com/books/about/The_Four_... "
|
| 918 |
+
]
|
| 919 |
+
},
|
| 920 |
+
"execution_count": 32,
|
| 921 |
+
"metadata": {},
|
| 922 |
+
"output_type": "execute_result"
|
| 923 |
+
}
|
| 924 |
+
],
|
| 925 |
+
"source": []
|
| 926 |
+
},
|
| 927 |
+
{
|
| 928 |
+
"cell_type": "code",
|
| 929 |
+
"execution_count": 33,
|
| 930 |
+
"metadata": {},
|
| 931 |
+
"outputs": [
|
| 932 |
+
{
|
| 933 |
+
"data": {
|
| 934 |
+
"text/html": [
|
| 935 |
+
"<div>\n",
|
| 936 |
+
"<style scoped>\n",
|
| 937 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 938 |
+
" vertical-align: middle;\n",
|
| 939 |
+
" }\n",
|
| 940 |
+
"\n",
|
| 941 |
+
" .dataframe tbody tr th {\n",
|
| 942 |
+
" vertical-align: top;\n",
|
| 943 |
+
" }\n",
|
| 944 |
+
"\n",
|
| 945 |
+
" .dataframe thead th {\n",
|
| 946 |
+
" text-align: right;\n",
|
| 947 |
+
" }\n",
|
| 948 |
+
"</style>\n",
|
| 949 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 950 |
+
" <thead>\n",
|
| 951 |
+
" <tr style=\"text-align: right;\">\n",
|
| 952 |
+
" <th></th>\n",
|
| 953 |
+
" <th>title</th>\n",
|
| 954 |
+
" <th>url</th>\n",
|
| 955 |
+
" </tr>\n",
|
| 956 |
+
" </thead>\n",
|
| 957 |
+
" <tbody>\n",
|
| 958 |
+
" <tr>\n",
|
| 959 |
+
" <th>0</th>\n",
|
| 960 |
+
" <td>Gilead by Marilynne Robinson- google books</td>\n",
|
| 961 |
+
" <td>https://books.google.com/books/about/Gilead.ht...</td>\n",
|
| 962 |
+
" </tr>\n",
|
| 963 |
+
" <tr>\n",
|
| 964 |
+
" <th>1</th>\n",
|
| 965 |
+
" <td>Spider's Web A Novel by Charles Osborne;Agatha...</td>\n",
|
| 966 |
+
" <td>https://books.google.com/books/about/Spider_s_...</td>\n",
|
| 967 |
+
" </tr>\n",
|
| 968 |
+
" <tr>\n",
|
| 969 |
+
" <th>2</th>\n",
|
| 970 |
+
" <td>The One Tree by Stephen R. Donaldson- google ...</td>\n",
|
| 971 |
+
" <td>https://books.google.com/books/about/The_One_T...</td>\n",
|
| 972 |
+
" </tr>\n",
|
| 973 |
+
" <tr>\n",
|
| 974 |
+
" <th>3</th>\n",
|
| 975 |
+
" <td>Rage of angels by Sidney Sheldon- google books</td>\n",
|
| 976 |
+
" <td>https://books.google.com/books/about/Rage_of_A...</td>\n",
|
| 977 |
+
" </tr>\n",
|
| 978 |
+
" <tr>\n",
|
| 979 |
+
" <th>4</th>\n",
|
| 980 |
+
" <td>The Four Loves by Clive Staples Lewis- google...</td>\n",
|
| 981 |
+
" <td>https://books.google.com/books/about/The_Four_...</td>\n",
|
| 982 |
+
" </tr>\n",
|
| 983 |
+
" </tbody>\n",
|
| 984 |
+
"</table>\n",
|
| 985 |
+
"</div>"
|
| 986 |
+
],
|
| 987 |
+
"text/plain": [
|
| 988 |
+
" title \\\n",
|
| 989 |
+
"0 Gilead by Marilynne Robinson- google books \n",
|
| 990 |
+
"1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
|
| 991 |
+
"2 The One Tree by Stephen R. Donaldson- google ... \n",
|
| 992 |
+
"3 Rage of angels by Sidney Sheldon- google books \n",
|
| 993 |
+
"4 The Four Loves by Clive Staples Lewis- google... \n",
|
| 994 |
+
"\n",
|
| 995 |
+
" url \n",
|
| 996 |
+
"0 https://books.google.com/books/about/Gilead.ht... \n",
|
| 997 |
+
"1 https://books.google.com/books/about/Spider_s_... \n",
|
| 998 |
+
"2 https://books.google.com/books/about/The_One_T... \n",
|
| 999 |
+
"3 https://books.google.com/books/about/Rage_of_A... \n",
|
| 1000 |
+
"4 https://books.google.com/books/about/The_Four_... "
|
| 1001 |
+
]
|
| 1002 |
+
},
|
| 1003 |
+
"execution_count": 33,
|
| 1004 |
+
"metadata": {},
|
| 1005 |
+
"output_type": "execute_result"
|
| 1006 |
+
}
|
| 1007 |
+
],
|
| 1008 |
+
"source": [
|
| 1009 |
+
"df1.head()"
|
| 1010 |
+
]
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"cell_type": "code",
|
| 1014 |
+
"execution_count": 35,
|
| 1015 |
+
"metadata": {},
|
| 1016 |
+
"outputs": [
|
| 1017 |
+
{
|
| 1018 |
+
"data": {
|
| 1019 |
+
"text/html": [
|
| 1020 |
+
"<div>\n",
|
| 1021 |
+
"<style scoped>\n",
|
| 1022 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 1023 |
+
" vertical-align: middle;\n",
|
| 1024 |
+
" }\n",
|
| 1025 |
+
"\n",
|
| 1026 |
+
" .dataframe tbody tr th {\n",
|
| 1027 |
+
" vertical-align: top;\n",
|
| 1028 |
+
" }\n",
|
| 1029 |
+
"\n",
|
| 1030 |
+
" .dataframe thead th {\n",
|
| 1031 |
+
" text-align: right;\n",
|
| 1032 |
+
" }\n",
|
| 1033 |
+
"</style>\n",
|
| 1034 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 1035 |
+
" <thead>\n",
|
| 1036 |
+
" <tr style=\"text-align: right;\">\n",
|
| 1037 |
+
" <th></th>\n",
|
| 1038 |
+
" <th>title</th>\n",
|
| 1039 |
+
" <th>url</th>\n",
|
| 1040 |
+
" </tr>\n",
|
| 1041 |
+
" </thead>\n",
|
| 1042 |
+
" <tbody>\n",
|
| 1043 |
+
" <tr>\n",
|
| 1044 |
+
" <th>73</th>\n",
|
| 1045 |
+
" <td>I Can Read with Me Eyes Shut! by Dr. Seuss- g...</td>\n",
|
| 1046 |
+
" <td>/search?num=3</td>\n",
|
| 1047 |
+
" </tr>\n",
|
| 1048 |
+
" <tr>\n",
|
| 1049 |
+
" <th>101</th>\n",
|
| 1050 |
+
" <td>Tyranny of the Majority Funamental Fairness in...</td>\n",
|
| 1051 |
+
" <td>/search?num=3</td>\n",
|
| 1052 |
+
" </tr>\n",
|
| 1053 |
+
" <tr>\n",
|
| 1054 |
+
" <th>126</th>\n",
|
| 1055 |
+
" <td>Mars and Venus Book of Days 365 Inspriations t...</td>\n",
|
| 1056 |
+
" <td>/search?num=3</td>\n",
|
| 1057 |
+
" </tr>\n",
|
| 1058 |
+
" <tr>\n",
|
| 1059 |
+
" <th>128</th>\n",
|
| 1060 |
+
" <td>Today I Feel Silly & Other Moods That Make My ...</td>\n",
|
| 1061 |
+
" <td>/search?num=3</td>\n",
|
| 1062 |
+
" </tr>\n",
|
| 1063 |
+
" <tr>\n",
|
| 1064 |
+
" <th>314</th>\n",
|
| 1065 |
+
" <td>DREAM & THE UNDERWOR by James Hillman- google...</td>\n",
|
| 1066 |
+
" <td>/search?num=3</td>\n",
|
| 1067 |
+
" </tr>\n",
|
| 1068 |
+
" <tr>\n",
|
| 1069 |
+
" <th>...</th>\n",
|
| 1070 |
+
" <td>...</td>\n",
|
| 1071 |
+
" <td>...</td>\n",
|
| 1072 |
+
" </tr>\n",
|
| 1073 |
+
" <tr>\n",
|
| 1074 |
+
" <th>6392</th>\n",
|
| 1075 |
+
" <td>Journey to the East by Hermann Hesse- google ...</td>\n",
|
| 1076 |
+
" <td>NaN</td>\n",
|
| 1077 |
+
" </tr>\n",
|
| 1078 |
+
" <tr>\n",
|
| 1079 |
+
" <th>6393</th>\n",
|
| 1080 |
+
" <td>The Monk Who Sold His Ferrari: A Fable About F...</td>\n",
|
| 1081 |
+
" <td>NaN</td>\n",
|
| 1082 |
+
" </tr>\n",
|
| 1083 |
+
" <tr>\n",
|
| 1084 |
+
" <th>6394</th>\n",
|
| 1085 |
+
" <td>I Am that Talks with Sri Nisargadatta Maharaj ...</td>\n",
|
| 1086 |
+
" <td>NaN</td>\n",
|
| 1087 |
+
" </tr>\n",
|
| 1088 |
+
" <tr>\n",
|
| 1089 |
+
" <th>6395</th>\n",
|
| 1090 |
+
" <td>The Berlin Phenomenology by Georg Wilhelm Fri...</td>\n",
|
| 1091 |
+
" <td>NaN</td>\n",
|
| 1092 |
+
" </tr>\n",
|
| 1093 |
+
" <tr>\n",
|
| 1094 |
+
" <th>6396</th>\n",
|
| 1095 |
+
" <td>'I'm Telling You Stories' Jeanette Winterson a...</td>\n",
|
| 1096 |
+
" <td>NaN</td>\n",
|
| 1097 |
+
" </tr>\n",
|
| 1098 |
+
" </tbody>\n",
|
| 1099 |
+
"</table>\n",
|
| 1100 |
+
"<p>100 rows × 2 columns</p>\n",
|
| 1101 |
+
"</div>"
|
| 1102 |
+
],
|
| 1103 |
+
"text/plain": [
|
| 1104 |
+
" title url\n",
|
| 1105 |
+
"73 I Can Read with Me Eyes Shut! by Dr. Seuss- g... /search?num=3\n",
|
| 1106 |
+
"101 Tyranny of the Majority Funamental Fairness in... /search?num=3\n",
|
| 1107 |
+
"126 Mars and Venus Book of Days 365 Inspriations t... /search?num=3\n",
|
| 1108 |
+
"128 Today I Feel Silly & Other Moods That Make My ... /search?num=3\n",
|
| 1109 |
+
"314 DREAM & THE UNDERWOR by James Hillman- google... /search?num=3\n",
|
| 1110 |
+
"... ... ...\n",
|
| 1111 |
+
"6392 Journey to the East by Hermann Hesse- google ... NaN\n",
|
| 1112 |
+
"6393 The Monk Who Sold His Ferrari: A Fable About F... NaN\n",
|
| 1113 |
+
"6394 I Am that Talks with Sri Nisargadatta Maharaj ... NaN\n",
|
| 1114 |
+
"6395 The Berlin Phenomenology by Georg Wilhelm Fri... NaN\n",
|
| 1115 |
+
"6396 'I'm Telling You Stories' Jeanette Winterson a... NaN\n",
|
| 1116 |
+
"\n",
|
| 1117 |
+
"[100 rows x 2 columns]"
|
| 1118 |
+
]
|
| 1119 |
+
},
|
| 1120 |
+
"execution_count": 35,
|
| 1121 |
+
"metadata": {},
|
| 1122 |
+
"output_type": "execute_result"
|
| 1123 |
+
}
|
| 1124 |
+
],
|
| 1125 |
+
"source": [
|
| 1126 |
+
"queries_df[(queries_df.isnull().any(axis=1)) | ~((queries_df[\"url\"].str.contains(\"amazon\", na=False)) | (queries_df[\"url\"].str.contains(\"google\", na=False)))]"
|
| 1127 |
+
]
|
| 1128 |
+
},
|
| 1129 |
+
{
|
| 1130 |
+
"cell_type": "code",
|
| 1131 |
+
"execution_count": null,
|
| 1132 |
+
"metadata": {},
|
| 1133 |
+
"outputs": [],
|
| 1134 |
+
"source": [
|
| 1135 |
+
"# i'll drop dataframes without a good url"
|
| 1136 |
+
]
|
| 1137 |
+
},
|
| 1138 |
+
{
|
| 1139 |
+
"cell_type": "code",
|
| 1140 |
+
"execution_count": 39,
|
| 1141 |
+
"metadata": {},
|
| 1142 |
+
"outputs": [
|
| 1143 |
+
{
|
| 1144 |
+
"data": {
|
| 1145 |
+
"text/plain": [
|
| 1146 |
+
"Index([ 768, 806, 1170, 1269, 1311, 1343, 2311, 2389, 2536, 3270, 3572, 4228,\n",
|
| 1147 |
+
" 4941, 5292, 5293, 6085],\n",
|
| 1148 |
+
" dtype='int64')"
|
| 1149 |
+
]
|
| 1150 |
+
},
|
| 1151 |
+
"execution_count": 39,
|
| 1152 |
+
"metadata": {},
|
| 1153 |
+
"output_type": "execute_result"
|
| 1154 |
+
}
|
| 1155 |
+
],
|
| 1156 |
+
"source": [
|
| 1157 |
+
"to_drop = df1[(df1.isnull().any(axis=1)) | ~((df1[\"url\"].str.contains(\"amazon\", na=False)) | (df1[\"url\"].str.contains(\"google\", na=False)))].index\n",
|
| 1158 |
+
"to_drop"
|
| 1159 |
+
]
|
| 1160 |
+
},
|
| 1161 |
+
{
|
| 1162 |
+
"cell_type": "code",
|
| 1163 |
+
"execution_count": 40,
|
| 1164 |
+
"metadata": {},
|
| 1165 |
+
"outputs": [],
|
| 1166 |
+
"source": [
|
| 1167 |
+
"df1 = df1.drop(index=to_drop)"
|
| 1168 |
+
]
|
| 1169 |
+
},
|
| 1170 |
+
{
|
| 1171 |
+
"cell_type": "code",
|
| 1172 |
+
"execution_count": 41,
|
| 1173 |
+
"metadata": {},
|
| 1174 |
+
"outputs": [
|
| 1175 |
+
{
|
| 1176 |
+
"data": {
|
| 1177 |
+
"text/html": [
|
| 1178 |
+
"<div>\n",
|
| 1179 |
+
"<style scoped>\n",
|
| 1180 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 1181 |
+
" vertical-align: middle;\n",
|
| 1182 |
+
" }\n",
|
| 1183 |
+
"\n",
|
| 1184 |
+
" .dataframe tbody tr th {\n",
|
| 1185 |
+
" vertical-align: top;\n",
|
| 1186 |
+
" }\n",
|
| 1187 |
+
"\n",
|
| 1188 |
+
" .dataframe thead th {\n",
|
| 1189 |
+
" text-align: right;\n",
|
| 1190 |
+
" }\n",
|
| 1191 |
+
"</style>\n",
|
| 1192 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 1193 |
+
" <thead>\n",
|
| 1194 |
+
" <tr style=\"text-align: right;\">\n",
|
| 1195 |
+
" <th></th>\n",
|
| 1196 |
+
" <th>title</th>\n",
|
| 1197 |
+
" <th>url</th>\n",
|
| 1198 |
+
" </tr>\n",
|
| 1199 |
+
" </thead>\n",
|
| 1200 |
+
" <tbody>\n",
|
| 1201 |
+
" <tr>\n",
|
| 1202 |
+
" <th>0</th>\n",
|
| 1203 |
+
" <td>Gilead by Marilynne Robinson- google books</td>\n",
|
| 1204 |
+
" <td>https://books.google.com/books/about/Gilead.ht...</td>\n",
|
| 1205 |
+
" </tr>\n",
|
| 1206 |
+
" <tr>\n",
|
| 1207 |
+
" <th>1</th>\n",
|
| 1208 |
+
" <td>Spider's Web A Novel by Charles Osborne;Agatha...</td>\n",
|
| 1209 |
+
" <td>https://books.google.com/books/about/Spider_s_...</td>\n",
|
| 1210 |
+
" </tr>\n",
|
| 1211 |
+
" <tr>\n",
|
| 1212 |
+
" <th>2</th>\n",
|
| 1213 |
+
" <td>The One Tree by Stephen R. Donaldson- google ...</td>\n",
|
| 1214 |
+
" <td>https://books.google.com/books/about/The_One_T...</td>\n",
|
| 1215 |
+
" </tr>\n",
|
| 1216 |
+
" <tr>\n",
|
| 1217 |
+
" <th>3</th>\n",
|
| 1218 |
+
" <td>Rage of angels by Sidney Sheldon- google books</td>\n",
|
| 1219 |
+
" <td>https://books.google.com/books/about/Rage_of_A...</td>\n",
|
| 1220 |
+
" </tr>\n",
|
| 1221 |
+
" <tr>\n",
|
| 1222 |
+
" <th>4</th>\n",
|
| 1223 |
+
" <td>The Four Loves by Clive Staples Lewis- google...</td>\n",
|
| 1224 |
+
" <td>https://books.google.com/books/about/The_Four_...</td>\n",
|
| 1225 |
+
" </tr>\n",
|
| 1226 |
+
" <tr>\n",
|
| 1227 |
+
" <th>...</th>\n",
|
| 1228 |
+
" <td>...</td>\n",
|
| 1229 |
+
" <td>...</td>\n",
|
| 1230 |
+
" </tr>\n",
|
| 1231 |
+
" <tr>\n",
|
| 1232 |
+
" <th>6392</th>\n",
|
| 1233 |
+
" <td>Journey to the East by Hermann Hesse- google ...</td>\n",
|
| 1234 |
+
" <td>https://books.google.com/books/about/The_Journ...</td>\n",
|
| 1235 |
+
" </tr>\n",
|
| 1236 |
+
" <tr>\n",
|
| 1237 |
+
" <th>6393</th>\n",
|
| 1238 |
+
" <td>The Monk Who Sold His Ferrari: A Fable About F...</td>\n",
|
| 1239 |
+
" <td>https://books.google.com/books/about/The_Monk_...</td>\n",
|
| 1240 |
+
" </tr>\n",
|
| 1241 |
+
" <tr>\n",
|
| 1242 |
+
" <th>6394</th>\n",
|
| 1243 |
+
" <td>I Am that Talks with Sri Nisargadatta Maharaj ...</td>\n",
|
| 1244 |
+
" <td>https://books.google.com/books/about/I_Am_that...</td>\n",
|
| 1245 |
+
" </tr>\n",
|
| 1246 |
+
" <tr>\n",
|
| 1247 |
+
" <th>6395</th>\n",
|
| 1248 |
+
" <td>The Berlin Phenomenology by Georg Wilhelm Fri...</td>\n",
|
| 1249 |
+
" <td>https://books.google.com/books/about/The_Berli...</td>\n",
|
| 1250 |
+
" </tr>\n",
|
| 1251 |
+
" <tr>\n",
|
| 1252 |
+
" <th>6396</th>\n",
|
| 1253 |
+
" <td>'I'm Telling You Stories' Jeanette Winterson a...</td>\n",
|
| 1254 |
+
" <td>https://books.google.com/books/about/I_m_Telli...</td>\n",
|
| 1255 |
+
" </tr>\n",
|
| 1256 |
+
" </tbody>\n",
|
| 1257 |
+
"</table>\n",
|
| 1258 |
+
"<p>6381 rows × 2 columns</p>\n",
|
| 1259 |
+
"</div>"
|
| 1260 |
+
],
|
| 1261 |
+
"text/plain": [
|
| 1262 |
+
" title \\\n",
|
| 1263 |
+
"0 Gilead by Marilynne Robinson- google books \n",
|
| 1264 |
+
"1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
|
| 1265 |
+
"2 The One Tree by Stephen R. Donaldson- google ... \n",
|
| 1266 |
+
"3 Rage of angels by Sidney Sheldon- google books \n",
|
| 1267 |
+
"4 The Four Loves by Clive Staples Lewis- google... \n",
|
| 1268 |
+
"... ... \n",
|
| 1269 |
+
"6392 Journey to the East by Hermann Hesse- google ... \n",
|
| 1270 |
+
"6393 The Monk Who Sold His Ferrari: A Fable About F... \n",
|
| 1271 |
+
"6394 I Am that Talks with Sri Nisargadatta Maharaj ... \n",
|
| 1272 |
+
"6395 The Berlin Phenomenology by Georg Wilhelm Fri... \n",
|
| 1273 |
+
"6396 'I'm Telling You Stories' Jeanette Winterson a... \n",
|
| 1274 |
+
"\n",
|
| 1275 |
+
" url \n",
|
| 1276 |
+
"0 https://books.google.com/books/about/Gilead.ht... \n",
|
| 1277 |
+
"1 https://books.google.com/books/about/Spider_s_... \n",
|
| 1278 |
+
"2 https://books.google.com/books/about/The_One_T... \n",
|
| 1279 |
+
"3 https://books.google.com/books/about/Rage_of_A... \n",
|
| 1280 |
+
"4 https://books.google.com/books/about/The_Four_... \n",
|
| 1281 |
+
"... ... \n",
|
| 1282 |
+
"6392 https://books.google.com/books/about/The_Journ... \n",
|
| 1283 |
+
"6393 https://books.google.com/books/about/The_Monk_... \n",
|
| 1284 |
+
"6394 https://books.google.com/books/about/I_Am_that... \n",
|
| 1285 |
+
"6395 https://books.google.com/books/about/The_Berli... \n",
|
| 1286 |
+
"6396 https://books.google.com/books/about/I_m_Telli... \n",
|
| 1287 |
+
"\n",
|
| 1288 |
+
"[6381 rows x 2 columns]"
|
| 1289 |
+
]
|
| 1290 |
+
},
|
| 1291 |
+
"execution_count": 41,
|
| 1292 |
+
"metadata": {},
|
| 1293 |
+
"output_type": "execute_result"
|
| 1294 |
+
}
|
| 1295 |
+
],
|
| 1296 |
+
"source": [
|
| 1297 |
+
"df1"
|
| 1298 |
+
]
|
| 1299 |
+
},
|
| 1300 |
+
{
|
| 1301 |
+
"cell_type": "code",
|
| 1302 |
+
"execution_count": 46,
|
| 1303 |
+
"metadata": {},
|
| 1304 |
+
"outputs": [],
|
| 1305 |
+
"source": [
|
| 1306 |
+
"with open(\"to_drop.txt\",\"w\") as f:\n",
|
| 1307 |
+
" f.write(\"\\n\".join(to_drop.astype(str).tolist()))"
|
| 1308 |
+
]
|
| 1309 |
+
},
|
| 1310 |
+
{
|
| 1311 |
+
"cell_type": "code",
|
| 1312 |
+
"execution_count": 48,
|
| 1313 |
+
"metadata": {},
|
| 1314 |
+
"outputs": [],
|
| 1315 |
+
"source": [
|
| 1316 |
+
"df1.to_csv(\"books_with_urls.csv\", index=False)"
|
| 1317 |
+
]
|
| 1318 |
+
},
|
| 1319 |
+
{
|
| 1320 |
+
"cell_type": "code",
|
| 1321 |
+
"execution_count": null,
|
| 1322 |
+
"metadata": {},
|
| 1323 |
+
"outputs": [],
|
| 1324 |
+
"source": []
|
| 1325 |
+
}
|
| 1326 |
+
],
|
| 1327 |
+
"metadata": {
|
| 1328 |
+
"accelerator": "GPU",
|
| 1329 |
+
"colab": {
|
| 1330 |
+
"gpuType": "T4",
|
| 1331 |
+
"provenance": []
|
| 1332 |
+
},
|
| 1333 |
+
"kernelspec": {
|
| 1334 |
+
"display_name": "venv",
|
| 1335 |
+
"language": "python",
|
| 1336 |
+
"name": "python3"
|
| 1337 |
+
},
|
| 1338 |
+
"language_info": {
|
| 1339 |
+
"codemirror_mode": {
|
| 1340 |
+
"name": "ipython",
|
| 1341 |
+
"version": 3
|
| 1342 |
+
},
|
| 1343 |
+
"file_extension": ".py",
|
| 1344 |
+
"mimetype": "text/x-python",
|
| 1345 |
+
"name": "python",
|
| 1346 |
+
"nbconvert_exporter": "python",
|
| 1347 |
+
"pygments_lexer": "ipython3",
|
| 1348 |
+
"version": "3.11.9"
|
| 1349 |
+
}
|
| 1350 |
+
},
|
| 1351 |
+
"nbformat": 4,
|
| 1352 |
+
"nbformat_minor": 0
|
| 1353 |
+
}
|
final_book_df.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
final_df.ipynb
ADDED
|
@@ -0,0 +1,590 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "7cbe0a72",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import pandas as pd\n",
|
| 11 |
+
"import numpy as np"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "code",
|
| 16 |
+
"execution_count": null,
|
| 17 |
+
"id": "63c42422",
|
| 18 |
+
"metadata": {},
|
| 19 |
+
"outputs": [],
|
| 20 |
+
"source": [
|
| 21 |
+
"final_data_cols = [\n",
|
| 22 |
+
" 'id',\n",
|
| 23 |
+
" 'title',\n",
|
| 24 |
+
" \"authors\",\n",
|
| 25 |
+
" \"description\",\n",
|
| 26 |
+
" \"categories\",\n",
|
| 27 |
+
" \"thumbnail\",\n",
|
| 28 |
+
" \"published_year\",\n",
|
| 29 |
+
" \"average_rating\",\n",
|
| 30 |
+
" \"num_pages\",\n",
|
| 31 |
+
" \"download_url\",\n",
|
| 32 |
+
" \"anger\",\n",
|
| 33 |
+
" \"disgust\",\n",
|
| 34 |
+
" \"fear\",\n",
|
| 35 |
+
" \"joy\",\n",
|
| 36 |
+
" \"sadness\",\n",
|
| 37 |
+
" \"surprise\",\n",
|
| 38 |
+
" \"neutral\"\n",
|
| 39 |
+
" ]"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": 22,
|
| 45 |
+
"id": "19fa8ab2",
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [],
|
| 48 |
+
"source": [
|
| 49 |
+
"df_base = pd.read_csv(\"books_cleaned.csv\")\n",
|
| 50 |
+
"categories_df = pd.read_csv(\"books_with_categories.csv\")\n",
|
| 51 |
+
"df_sentiments = pd.read_csv(\"books_with_sentiment.csv\")\n",
|
| 52 |
+
"df_download_url = pd.read_csv(\"books_with_urls.csv\")"
|
| 53 |
+
]
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"cell_type": "code",
|
| 57 |
+
"execution_count": null,
|
| 58 |
+
"id": "8a9ebdc7",
|
| 59 |
+
"metadata": {},
|
| 60 |
+
"outputs": [
|
| 61 |
+
{
|
| 62 |
+
"data": {
|
| 63 |
+
"text/plain": [
|
| 64 |
+
"(6397, 11)"
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
"execution_count": 25,
|
| 68 |
+
"metadata": {},
|
| 69 |
+
"output_type": "execute_result"
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"source": []
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"cell_type": "code",
|
| 76 |
+
"execution_count": 4,
|
| 77 |
+
"id": "5e81abf9",
|
| 78 |
+
"metadata": {},
|
| 79 |
+
"outputs": [],
|
| 80 |
+
"source": [
|
| 81 |
+
"with open(\"to_drop.txt\", \"r\") as f:\n",
|
| 82 |
+
" to_drop = f.read().splitlines()"
|
| 83 |
+
]
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"cell_type": "code",
|
| 87 |
+
"execution_count": 7,
|
| 88 |
+
"id": "66e10c4c",
|
| 89 |
+
"metadata": {},
|
| 90 |
+
"outputs": [],
|
| 91 |
+
"source": [
|
| 92 |
+
"to_drop = [int(i) for i in to_drop]"
|
| 93 |
+
]
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"cell_type": "code",
|
| 97 |
+
"execution_count": 26,
|
| 98 |
+
"id": "919ed91b",
|
| 99 |
+
"metadata": {},
|
| 100 |
+
"outputs": [],
|
| 101 |
+
"source": [
|
| 102 |
+
"df_base = df_base.drop(to_drop, errors=\"ignore\")\n",
|
| 103 |
+
"categories_df = categories_df.drop(to_drop, errors=\"ignore\")\n",
|
| 104 |
+
"df_sentiments = df_sentiments.drop(to_drop, errors=\"ignore\")"
|
| 105 |
+
]
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"cell_type": "code",
|
| 109 |
+
"execution_count": 27,
|
| 110 |
+
"id": "2b140195",
|
| 111 |
+
"metadata": {},
|
| 112 |
+
"outputs": [
|
| 113 |
+
{
|
| 114 |
+
"data": {
|
| 115 |
+
"text/plain": [
|
| 116 |
+
"(6381, 11)"
|
| 117 |
+
]
|
| 118 |
+
},
|
| 119 |
+
"execution_count": 27,
|
| 120 |
+
"metadata": {},
|
| 121 |
+
"output_type": "execute_result"
|
| 122 |
+
}
|
| 123 |
+
],
|
| 124 |
+
"source": [
|
| 125 |
+
"df_base.shape"
|
| 126 |
+
]
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"cell_type": "code",
|
| 130 |
+
"execution_count": 28,
|
| 131 |
+
"id": "4d1c9d6a",
|
| 132 |
+
"metadata": {},
|
| 133 |
+
"outputs": [
|
| 134 |
+
{
|
| 135 |
+
"data": {
|
| 136 |
+
"text/plain": [
|
| 137 |
+
"(6381, 2)"
|
| 138 |
+
]
|
| 139 |
+
},
|
| 140 |
+
"execution_count": 28,
|
| 141 |
+
"metadata": {},
|
| 142 |
+
"output_type": "execute_result"
|
| 143 |
+
}
|
| 144 |
+
],
|
| 145 |
+
"source": [
|
| 146 |
+
"df_download_url.shape"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"cell_type": "code",
|
| 151 |
+
"execution_count": 16,
|
| 152 |
+
"id": "32427a77",
|
| 153 |
+
"metadata": {},
|
| 154 |
+
"outputs": [
|
| 155 |
+
{
|
| 156 |
+
"data": {
|
| 157 |
+
"text/plain": [
|
| 158 |
+
"(6381, 11)"
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
"execution_count": 16,
|
| 162 |
+
"metadata": {},
|
| 163 |
+
"output_type": "execute_result"
|
| 164 |
+
}
|
| 165 |
+
],
|
| 166 |
+
"source": [
|
| 167 |
+
"categories_df.shape"
|
| 168 |
+
]
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"cell_type": "code",
|
| 172 |
+
"execution_count": 40,
|
| 173 |
+
"id": "cbb04023",
|
| 174 |
+
"metadata": {},
|
| 175 |
+
"outputs": [],
|
| 176 |
+
"source": [
|
| 177 |
+
"df_download_url = df_download_url[[\"url\"]]"
|
| 178 |
+
]
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"cell_type": "code",
|
| 182 |
+
"execution_count": null,
|
| 183 |
+
"id": "18edb501",
|
| 184 |
+
"metadata": {},
|
| 185 |
+
"outputs": [],
|
| 186 |
+
"source": [
|
| 187 |
+
"df_sentiments = df_sentiments[[\"anger\",\"disgust\",\"fear\",\"joy\",\"sadness\",\"surprise\",\"neutral\"]]\n",
|
| 188 |
+
"df_sentiments.head()"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"cell_type": "code",
|
| 193 |
+
"execution_count": null,
|
| 194 |
+
"id": "e72f2e81",
|
| 195 |
+
"metadata": {},
|
| 196 |
+
"outputs": [
|
| 197 |
+
{
|
| 198 |
+
"data": {
|
| 199 |
+
"text/html": [
|
| 200 |
+
"<div>\n",
|
| 201 |
+
"<style scoped>\n",
|
| 202 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 203 |
+
" vertical-align: middle;\n",
|
| 204 |
+
" }\n",
|
| 205 |
+
"\n",
|
| 206 |
+
" .dataframe tbody tr th {\n",
|
| 207 |
+
" vertical-align: top;\n",
|
| 208 |
+
" }\n",
|
| 209 |
+
"\n",
|
| 210 |
+
" .dataframe thead th {\n",
|
| 211 |
+
" text-align: right;\n",
|
| 212 |
+
" }\n",
|
| 213 |
+
"</style>\n",
|
| 214 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 215 |
+
" <thead>\n",
|
| 216 |
+
" <tr style=\"text-align: right;\">\n",
|
| 217 |
+
" <th></th>\n",
|
| 218 |
+
" <th>title</th>\n",
|
| 219 |
+
" <th>url</th>\n",
|
| 220 |
+
" </tr>\n",
|
| 221 |
+
" </thead>\n",
|
| 222 |
+
" <tbody>\n",
|
| 223 |
+
" <tr>\n",
|
| 224 |
+
" <th>0</th>\n",
|
| 225 |
+
" <td>Gilead by Marilynne Robinson- google books</td>\n",
|
| 226 |
+
" <td>https://books.google.com/books/about/Gilead.ht...</td>\n",
|
| 227 |
+
" </tr>\n",
|
| 228 |
+
" <tr>\n",
|
| 229 |
+
" <th>1</th>\n",
|
| 230 |
+
" <td>Spider's Web A Novel by Charles Osborne;Agatha...</td>\n",
|
| 231 |
+
" <td>https://books.google.com/books/about/Spider_s_...</td>\n",
|
| 232 |
+
" </tr>\n",
|
| 233 |
+
" <tr>\n",
|
| 234 |
+
" <th>2</th>\n",
|
| 235 |
+
" <td>The One Tree by Stephen R. Donaldson- google ...</td>\n",
|
| 236 |
+
" <td>https://books.google.com/books/about/The_One_T...</td>\n",
|
| 237 |
+
" </tr>\n",
|
| 238 |
+
" <tr>\n",
|
| 239 |
+
" <th>3</th>\n",
|
| 240 |
+
" <td>Rage of angels by Sidney Sheldon- google books</td>\n",
|
| 241 |
+
" <td>https://books.google.com/books/about/Rage_of_A...</td>\n",
|
| 242 |
+
" </tr>\n",
|
| 243 |
+
" <tr>\n",
|
| 244 |
+
" <th>4</th>\n",
|
| 245 |
+
" <td>The Four Loves by Clive Staples Lewis- google...</td>\n",
|
| 246 |
+
" <td>https://books.google.com/books/about/The_Four_...</td>\n",
|
| 247 |
+
" </tr>\n",
|
| 248 |
+
" </tbody>\n",
|
| 249 |
+
"</table>\n",
|
| 250 |
+
"</div>"
|
| 251 |
+
],
|
| 252 |
+
"text/plain": [
|
| 253 |
+
" title \\\n",
|
| 254 |
+
"0 Gilead by Marilynne Robinson- google books \n",
|
| 255 |
+
"1 Spider's Web A Novel by Charles Osborne;Agatha... \n",
|
| 256 |
+
"2 The One Tree by Stephen R. Donaldson- google ... \n",
|
| 257 |
+
"3 Rage of angels by Sidney Sheldon- google books \n",
|
| 258 |
+
"4 The Four Loves by Clive Staples Lewis- google... \n",
|
| 259 |
+
"\n",
|
| 260 |
+
" url \n",
|
| 261 |
+
"0 https://books.google.com/books/about/Gilead.ht... \n",
|
| 262 |
+
"1 https://books.google.com/books/about/Spider_s_... \n",
|
| 263 |
+
"2 https://books.google.com/books/about/The_One_T... \n",
|
| 264 |
+
"3 https://books.google.com/books/about/Rage_of_A... \n",
|
| 265 |
+
"4 https://books.google.com/books/about/The_Four_... "
|
| 266 |
+
]
|
| 267 |
+
},
|
| 268 |
+
"execution_count": 34,
|
| 269 |
+
"metadata": {},
|
| 270 |
+
"output_type": "execute_result"
|
| 271 |
+
}
|
| 272 |
+
],
|
| 273 |
+
"source": [
|
| 274 |
+
"df_base = df_base[[\"isbn13\", \"authors\",\"thumbnail\",\"description\",\"published_year\",\"average_rating\",\"num_pages\",\"ratings_count\",\"title_and_subtitle\",\"tagged_description\"]]"
|
| 275 |
+
]
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"cell_type": "code",
|
| 279 |
+
"execution_count": null,
|
| 280 |
+
"id": "e3f317bc",
|
| 281 |
+
"metadata": {},
|
| 282 |
+
"outputs": [],
|
| 283 |
+
"source": [
|
| 284 |
+
"categories_df = categories_df[[\"categories\"]]"
|
| 285 |
+
]
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"cell_type": "code",
|
| 289 |
+
"execution_count": 46,
|
| 290 |
+
"id": "d7126b63",
|
| 291 |
+
"metadata": {},
|
| 292 |
+
"outputs": [],
|
| 293 |
+
"source": [
|
| 294 |
+
"df_base = df_base.reset_index().drop(\"index\", axis=1)\n",
|
| 295 |
+
"categories_df = categories_df.reset_index().drop(\"index\", axis=1)\n",
|
| 296 |
+
"df_download_url = df_download_url.reset_index().drop(\"index\", axis=1)\n",
|
| 297 |
+
"df_sentiments = df_sentiments.reset_index().drop(\"index\", axis=1)"
|
| 298 |
+
]
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"cell_type": "code",
|
| 302 |
+
"execution_count": 50,
|
| 303 |
+
"id": "cc1e8c55",
|
| 304 |
+
"metadata": {},
|
| 305 |
+
"outputs": [],
|
| 306 |
+
"source": [
|
| 307 |
+
"final_df = pd.concat([df_base,categories_df,df_sentiments,df_download_url], axis=1)"
|
| 308 |
+
]
|
| 309 |
+
},
|
| 310 |
+
{
|
| 311 |
+
"cell_type": "code",
|
| 312 |
+
"execution_count": 51,
|
| 313 |
+
"id": "9ba30e30",
|
| 314 |
+
"metadata": {},
|
| 315 |
+
"outputs": [
|
| 316 |
+
{
|
| 317 |
+
"data": {
|
| 318 |
+
"text/html": [
|
| 319 |
+
"<div>\n",
|
| 320 |
+
"<style scoped>\n",
|
| 321 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 322 |
+
" vertical-align: middle;\n",
|
| 323 |
+
" }\n",
|
| 324 |
+
"\n",
|
| 325 |
+
" .dataframe tbody tr th {\n",
|
| 326 |
+
" vertical-align: top;\n",
|
| 327 |
+
" }\n",
|
| 328 |
+
"\n",
|
| 329 |
+
" .dataframe thead th {\n",
|
| 330 |
+
" text-align: right;\n",
|
| 331 |
+
" }\n",
|
| 332 |
+
"</style>\n",
|
| 333 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 334 |
+
" <thead>\n",
|
| 335 |
+
" <tr style=\"text-align: right;\">\n",
|
| 336 |
+
" <th></th>\n",
|
| 337 |
+
" <th>isbn13</th>\n",
|
| 338 |
+
" <th>authors</th>\n",
|
| 339 |
+
" <th>thumbnail</th>\n",
|
| 340 |
+
" <th>description</th>\n",
|
| 341 |
+
" <th>published_year</th>\n",
|
| 342 |
+
" <th>average_rating</th>\n",
|
| 343 |
+
" <th>num_pages</th>\n",
|
| 344 |
+
" <th>ratings_count</th>\n",
|
| 345 |
+
" <th>title_and_subtitle</th>\n",
|
| 346 |
+
" <th>tagged_description</th>\n",
|
| 347 |
+
" <th>categories</th>\n",
|
| 348 |
+
" <th>anger</th>\n",
|
| 349 |
+
" <th>disgust</th>\n",
|
| 350 |
+
" <th>fear</th>\n",
|
| 351 |
+
" <th>joy</th>\n",
|
| 352 |
+
" <th>sadness</th>\n",
|
| 353 |
+
" <th>surprise</th>\n",
|
| 354 |
+
" <th>neutral</th>\n",
|
| 355 |
+
" <th>url</th>\n",
|
| 356 |
+
" </tr>\n",
|
| 357 |
+
" </thead>\n",
|
| 358 |
+
" <tbody>\n",
|
| 359 |
+
" <tr>\n",
|
| 360 |
+
" <th>4039</th>\n",
|
| 361 |
+
" <td>9780727861153</td>\n",
|
| 362 |
+
" <td>Ja Jance;Judith A. Jance</td>\n",
|
| 363 |
+
" <td>http://books.google.com/books/content?id=YDFDP...</td>\n",
|
| 364 |
+
" <td>Life is good for Joanna Brady in the small des...</td>\n",
|
| 365 |
+
" <td>2004.0</td>\n",
|
| 366 |
+
" <td>4.00</td>\n",
|
| 367 |
+
" <td>256.0</td>\n",
|
| 368 |
+
" <td>39.0</td>\n",
|
| 369 |
+
" <td>Desert Heat</td>\n",
|
| 370 |
+
" <td>9780727861153 Life is good for Joanna Brady in...</td>\n",
|
| 371 |
+
" <td>mystery</td>\n",
|
| 372 |
+
" <td>0.839755</td>\n",
|
| 373 |
+
" <td>0.893530</td>\n",
|
| 374 |
+
" <td>0.051363</td>\n",
|
| 375 |
+
" <td>0.769920</td>\n",
|
| 376 |
+
" <td>0.111690</td>\n",
|
| 377 |
+
" <td>0.078765</td>\n",
|
| 378 |
+
" <td>0.558840</td>\n",
|
| 379 |
+
" <td>https://books.google.com/books/about/Desert_He...</td>\n",
|
| 380 |
+
" </tr>\n",
|
| 381 |
+
" <tr>\n",
|
| 382 |
+
" <th>2261</th>\n",
|
| 383 |
+
" <td>9780393059465</td>\n",
|
| 384 |
+
" <td>Harriet Beecher Stowe;Professor Harriet Beeche...</td>\n",
|
| 385 |
+
" <td>http://books.google.com/books/content?id=bSaWh...</td>\n",
|
| 386 |
+
" <td>An interpretation of the American classic refu...</td>\n",
|
| 387 |
+
" <td>2007.0</td>\n",
|
| 388 |
+
" <td>3.86</td>\n",
|
| 389 |
+
" <td>528.0</td>\n",
|
| 390 |
+
" <td>160.0</td>\n",
|
| 391 |
+
" <td>The Annotated Uncle Tom's Cabin</td>\n",
|
| 392 |
+
" <td>9780393059465 An interpretation of the America...</td>\n",
|
| 393 |
+
" <td>history</td>\n",
|
| 394 |
+
" <td>0.064134</td>\n",
|
| 395 |
+
" <td>0.728139</td>\n",
|
| 396 |
+
" <td>0.051363</td>\n",
|
| 397 |
+
" <td>0.040564</td>\n",
|
| 398 |
+
" <td>0.111690</td>\n",
|
| 399 |
+
" <td>0.348772</td>\n",
|
| 400 |
+
" <td>0.599532</td>\n",
|
| 401 |
+
" <td>https://books.google.com/books/about/Uncle_Tom...</td>\n",
|
| 402 |
+
" </tr>\n",
|
| 403 |
+
" <tr>\n",
|
| 404 |
+
" <th>6101</th>\n",
|
| 405 |
+
" <td>9781841157481</td>\n",
|
| 406 |
+
" <td>Jonathan Franzen</td>\n",
|
| 407 |
+
" <td>http://books.google.com/books/content?id=n9-ha...</td>\n",
|
| 408 |
+
" <td>Dying St. Louis is turned inside-out by the ap...</td>\n",
|
| 409 |
+
" <td>2003.0</td>\n",
|
| 410 |
+
" <td>3.12</td>\n",
|
| 411 |
+
" <td>528.0</td>\n",
|
| 412 |
+
" <td>119.0</td>\n",
|
| 413 |
+
" <td>The Twenty-seventh City</td>\n",
|
| 414 |
+
" <td>9781841157481 Dying St. Louis is turned inside...</td>\n",
|
| 415 |
+
" <td>fiction</td>\n",
|
| 416 |
+
" <td>0.470221</td>\n",
|
| 417 |
+
" <td>0.114413</td>\n",
|
| 418 |
+
" <td>0.066823</td>\n",
|
| 419 |
+
" <td>0.402793</td>\n",
|
| 420 |
+
" <td>0.111690</td>\n",
|
| 421 |
+
" <td>0.216259</td>\n",
|
| 422 |
+
" <td>0.735679</td>\n",
|
| 423 |
+
" <td>https://books.google.com/books/about/The_Twent...</td>\n",
|
| 424 |
+
" </tr>\n",
|
| 425 |
+
" <tr>\n",
|
| 426 |
+
" <th>5666</th>\n",
|
| 427 |
+
" <td>9781560258247</td>\n",
|
| 428 |
+
" <td>Norman Mailer;John Buffalo Mailer</td>\n",
|
| 429 |
+
" <td>http://books.google.com/books/content?id=9oBps...</td>\n",
|
| 430 |
+
" <td>Questions are posed, writes Norman Mailer, \"in...</td>\n",
|
| 431 |
+
" <td>2006.0</td>\n",
|
| 432 |
+
" <td>3.31</td>\n",
|
| 433 |
+
" <td>218.0</td>\n",
|
| 434 |
+
" <td>67.0</td>\n",
|
| 435 |
+
" <td>The Big Empty Dialogues on Politics, Sex, God,...</td>\n",
|
| 436 |
+
" <td>9781560258247 Questions are posed, writes Norm...</td>\n",
|
| 437 |
+
" <td>mystery</td>\n",
|
| 438 |
+
" <td>0.085885</td>\n",
|
| 439 |
+
" <td>0.104098</td>\n",
|
| 440 |
+
" <td>0.253858</td>\n",
|
| 441 |
+
" <td>0.370736</td>\n",
|
| 442 |
+
" <td>0.111690</td>\n",
|
| 443 |
+
" <td>0.313475</td>\n",
|
| 444 |
+
" <td>0.930554</td>\n",
|
| 445 |
+
" <td>https://books.google.com/books/about/The_Big_E...</td>\n",
|
| 446 |
+
" </tr>\n",
|
| 447 |
+
" <tr>\n",
|
| 448 |
+
" <th>1862</th>\n",
|
| 449 |
+
" <td>9780349107868</td>\n",
|
| 450 |
+
" <td>Daniel Jonah Goldhagen</td>\n",
|
| 451 |
+
" <td>http://books.google.com/books/content?id=L11gQ...</td>\n",
|
| 452 |
+
" <td>Daniel Goldhagen re-visits a question which hi...</td>\n",
|
| 453 |
+
" <td>1997.0</td>\n",
|
| 454 |
+
" <td>3.68</td>\n",
|
| 455 |
+
" <td>634.0</td>\n",
|
| 456 |
+
" <td>80.0</td>\n",
|
| 457 |
+
" <td>Hitler's Willing Executioners Ordinary Germans...</td>\n",
|
| 458 |
+
" <td>9780349107868 Daniel Goldhagen re-visits a que...</td>\n",
|
| 459 |
+
" <td>mystery</td>\n",
|
| 460 |
+
" <td>0.781836</td>\n",
|
| 461 |
+
" <td>0.129887</td>\n",
|
| 462 |
+
" <td>0.198395</td>\n",
|
| 463 |
+
" <td>0.040564</td>\n",
|
| 464 |
+
" <td>0.131437</td>\n",
|
| 465 |
+
" <td>0.088081</td>\n",
|
| 466 |
+
" <td>0.693353</td>\n",
|
| 467 |
+
" <td>https://books.google.com/books/about/Hitler_s_...</td>\n",
|
| 468 |
+
" </tr>\n",
|
| 469 |
+
" </tbody>\n",
|
| 470 |
+
"</table>\n",
|
| 471 |
+
"</div>"
|
| 472 |
+
],
|
| 473 |
+
"text/plain": [
|
| 474 |
+
" isbn13 authors \\\n",
|
| 475 |
+
"4039 9780727861153 Ja Jance;Judith A. Jance \n",
|
| 476 |
+
"2261 9780393059465 Harriet Beecher Stowe;Professor Harriet Beeche... \n",
|
| 477 |
+
"6101 9781841157481 Jonathan Franzen \n",
|
| 478 |
+
"5666 9781560258247 Norman Mailer;John Buffalo Mailer \n",
|
| 479 |
+
"1862 9780349107868 Daniel Jonah Goldhagen \n",
|
| 480 |
+
"\n",
|
| 481 |
+
" thumbnail \\\n",
|
| 482 |
+
"4039 http://books.google.com/books/content?id=YDFDP... \n",
|
| 483 |
+
"2261 http://books.google.com/books/content?id=bSaWh... \n",
|
| 484 |
+
"6101 http://books.google.com/books/content?id=n9-ha... \n",
|
| 485 |
+
"5666 http://books.google.com/books/content?id=9oBps... \n",
|
| 486 |
+
"1862 http://books.google.com/books/content?id=L11gQ... \n",
|
| 487 |
+
"\n",
|
| 488 |
+
" description published_year \\\n",
|
| 489 |
+
"4039 Life is good for Joanna Brady in the small des... 2004.0 \n",
|
| 490 |
+
"2261 An interpretation of the American classic refu... 2007.0 \n",
|
| 491 |
+
"6101 Dying St. Louis is turned inside-out by the ap... 2003.0 \n",
|
| 492 |
+
"5666 Questions are posed, writes Norman Mailer, \"in... 2006.0 \n",
|
| 493 |
+
"1862 Daniel Goldhagen re-visits a question which hi... 1997.0 \n",
|
| 494 |
+
"\n",
|
| 495 |
+
" average_rating num_pages ratings_count \\\n",
|
| 496 |
+
"4039 4.00 256.0 39.0 \n",
|
| 497 |
+
"2261 3.86 528.0 160.0 \n",
|
| 498 |
+
"6101 3.12 528.0 119.0 \n",
|
| 499 |
+
"5666 3.31 218.0 67.0 \n",
|
| 500 |
+
"1862 3.68 634.0 80.0 \n",
|
| 501 |
+
"\n",
|
| 502 |
+
" title_and_subtitle \\\n",
|
| 503 |
+
"4039 Desert Heat \n",
|
| 504 |
+
"2261 The Annotated Uncle Tom's Cabin \n",
|
| 505 |
+
"6101 The Twenty-seventh City \n",
|
| 506 |
+
"5666 The Big Empty Dialogues on Politics, Sex, God,... \n",
|
| 507 |
+
"1862 Hitler's Willing Executioners Ordinary Germans... \n",
|
| 508 |
+
"\n",
|
| 509 |
+
" tagged_description categories anger \\\n",
|
| 510 |
+
"4039 9780727861153 Life is good for Joanna Brady in... mystery 0.839755 \n",
|
| 511 |
+
"2261 9780393059465 An interpretation of the America... history 0.064134 \n",
|
| 512 |
+
"6101 9781841157481 Dying St. Louis is turned inside... fiction 0.470221 \n",
|
| 513 |
+
"5666 9781560258247 Questions are posed, writes Norm... mystery 0.085885 \n",
|
| 514 |
+
"1862 9780349107868 Daniel Goldhagen re-visits a que... mystery 0.781836 \n",
|
| 515 |
+
"\n",
|
| 516 |
+
" disgust fear joy sadness surprise neutral \\\n",
|
| 517 |
+
"4039 0.893530 0.051363 0.769920 0.111690 0.078765 0.558840 \n",
|
| 518 |
+
"2261 0.728139 0.051363 0.040564 0.111690 0.348772 0.599532 \n",
|
| 519 |
+
"6101 0.114413 0.066823 0.402793 0.111690 0.216259 0.735679 \n",
|
| 520 |
+
"5666 0.104098 0.253858 0.370736 0.111690 0.313475 0.930554 \n",
|
| 521 |
+
"1862 0.129887 0.198395 0.040564 0.131437 0.088081 0.693353 \n",
|
| 522 |
+
"\n",
|
| 523 |
+
" url \n",
|
| 524 |
+
"4039 https://books.google.com/books/about/Desert_He... \n",
|
| 525 |
+
"2261 https://books.google.com/books/about/Uncle_Tom... \n",
|
| 526 |
+
"6101 https://books.google.com/books/about/The_Twent... \n",
|
| 527 |
+
"5666 https://books.google.com/books/about/The_Big_E... \n",
|
| 528 |
+
"1862 https://books.google.com/books/about/Hitler_s_... "
|
| 529 |
+
]
|
| 530 |
+
},
|
| 531 |
+
"execution_count": 51,
|
| 532 |
+
"metadata": {},
|
| 533 |
+
"output_type": "execute_result"
|
| 534 |
+
}
|
| 535 |
+
],
|
| 536 |
+
"source": [
|
| 537 |
+
"final_df.sample(5)"
|
| 538 |
+
]
|
| 539 |
+
},
|
| 540 |
+
{
|
| 541 |
+
"cell_type": "code",
|
| 542 |
+
"execution_count": 53,
|
| 543 |
+
"id": "c90847c5",
|
| 544 |
+
"metadata": {},
|
| 545 |
+
"outputs": [],
|
| 546 |
+
"source": [
|
| 547 |
+
"final_df[\"tagged_description\"].to_csv(\"tagged_description.txt\", index=None, header=None)"
|
| 548 |
+
]
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
"cell_type": "code",
|
| 552 |
+
"execution_count": 54,
|
| 553 |
+
"id": "5419aa0e",
|
| 554 |
+
"metadata": {},
|
| 555 |
+
"outputs": [],
|
| 556 |
+
"source": [
|
| 557 |
+
"final_df.to_csv(\"final_book_df.csv\", index=None)"
|
| 558 |
+
]
|
| 559 |
+
},
|
| 560 |
+
{
|
| 561 |
+
"cell_type": "code",
|
| 562 |
+
"execution_count": null,
|
| 563 |
+
"id": "32b5edca",
|
| 564 |
+
"metadata": {},
|
| 565 |
+
"outputs": [],
|
| 566 |
+
"source": []
|
| 567 |
+
}
|
| 568 |
+
],
|
| 569 |
+
"metadata": {
|
| 570 |
+
"kernelspec": {
|
| 571 |
+
"display_name": "venv",
|
| 572 |
+
"language": "python",
|
| 573 |
+
"name": "python3"
|
| 574 |
+
},
|
| 575 |
+
"language_info": {
|
| 576 |
+
"codemirror_mode": {
|
| 577 |
+
"name": "ipython",
|
| 578 |
+
"version": 3
|
| 579 |
+
},
|
| 580 |
+
"file_extension": ".py",
|
| 581 |
+
"mimetype": "text/x-python",
|
| 582 |
+
"name": "python",
|
| 583 |
+
"nbconvert_exporter": "python",
|
| 584 |
+
"pygments_lexer": "ipython3",
|
| 585 |
+
"version": "3.11.9"
|
| 586 |
+
}
|
| 587 |
+
},
|
| 588 |
+
"nbformat": 4,
|
| 589 |
+
"nbformat_minor": 5
|
| 590 |
+
}
|
gradio_dashboard.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
from langchain_chroma import Chroma
|
| 7 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Embedding model handed to the Chroma collection below as its
# embedding function.
embeddings = HuggingFaceEmbeddings(
    # Fast and good quality; "sentence-transformers/all-mpnet-base-v2" is a
    # higher-quality but slower alternative.
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


# Book metadata table. Every row also gets a cover image reference for display.
books = pd.read_csv("final_book_df.csv")
# Appends the "&fife=w800" URL parameter to each thumbnail URL
# (presumably requests a larger rendition of the cover -- confirm).
_enlarged_covers = books["thumbnail"] + "&fife=w800"
# A missing thumbnail stays NaN after the concatenation above, so those rows
# fall back to the bundled placeholder image.
books["large_thumbnail"] = np.where(
    _enlarged_covers.isna(),
    "cover-not-found.jpg",
    _enlarged_covers,
)

# Persisted Chroma collection that the recommender runs similarity searches on.
db_books = Chroma(
    collection_name="books",
    embedding_function=embeddings,
    persist_directory="chroma_books",
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def retrieve_semantic_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    initial_top_k: int = 50,
    final_top_k: int = 16,
) -> pd.DataFrame:
    """Return books semantically similar to ``query``, filtered and re-ranked.

    Args:
        query: Free-text description used for the vector similarity search.
        category: Value of the ``categories`` column to keep. ``None`` or
            ``"All"`` disables the category filter.
        tone: One of ``"Happy"``, ``"Surprising"``, ``"Angry"``,
            ``"Suspenseful"`` or ``"Sad"``; the result is then sorted by the
            matching emotion score, highest first. Any other value (including
            ``None`` and ``"All"``) keeps the similarity order.
        initial_top_k: Number of documents requested from the vector store.
        final_top_k: Maximum number of rows in the returned frame.

    Returns:
        A new DataFrame holding the rows of ``books`` whose ``isbn13`` was
        matched by the search, at most ``final_top_k`` of them.
    """
    hits = db_books.similarity_search(query, k=initial_top_k)
    # Each stored document is expected to begin with the book's ISBN-13,
    # optionally wrapped in double quotes.
    ranked_isbns = [int(hit.page_content.strip('"').split()[0]) for hit in hits]

    # Remember the best (first) similarity position of every ISBN so the
    # ranking survives the DataFrame lookup below.
    rank_by_isbn = {}
    for position, isbn in enumerate(ranked_isbns):
        rank_by_isbn.setdefault(isbn, position)

    candidates = books[books["isbn13"].isin(ranked_isbns)]
    # ``isin`` yields rows in table order; restore the similarity order so the
    # truncations below keep the closest matches instead of whichever rows
    # happen to come first in the CSV.
    similarity_rank = candidates["isbn13"].map(rank_by_isbn).to_numpy()
    candidates = candidates.iloc[similarity_rank.argsort(kind="stable")]
    candidates = candidates.head(initial_top_k)

    # ``None`` (the declared default) means "no filter", just like "All";
    # comparing the column against None would otherwise drop every row.
    if category is not None and category != "All":
        candidates = candidates[candidates["categories"] == category]
    candidates = candidates.head(final_top_k)

    tone_to_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    sort_column = tone_to_column.get(tone)
    if sort_column is not None:
        # Non-inplace sort: ``candidates`` is a slice of ``books`` and must not
        # be mutated in place.
        candidates = candidates.sort_values(by=sort_column, ascending=False)

    return candidates
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _format_authors(raw_authors) -> str:
    """Turn a ``;``-separated author field into a readable phrase."""
    if not isinstance(raw_authors, str):
        # An empty CSV cell is loaded by pandas as NaN (a float), which has no
        # ``split``; show a neutral placeholder instead of crashing.
        return "Unknown author"

    names = raw_authors.split(";")
    if len(names) == 2:
        return f"{names[0]} and {names[1]}"
    if len(names) > 2:
        return f"{', '.join(names[:-1])}, and {names[-1]}"
    return raw_authors


def recommend_books(
    query: str,
    category: str,
    tone: str
):
    """Build the gallery entries for a recommendation request.

    Args:
        query: Free-text description of the desired book.
        category: Category filter forwarded to the retriever.
        tone: Emotional tone forwarded to the retriever.

    Returns:
        A list of ``(image, caption)`` tuples, one per recommended book. The
        caption combines the title, the formatted authors and the first 30
        words of the description followed by ``"..."``.
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        description = row["description"]
        # A missing description (NaN) is treated as empty text.
        words = description.split() if isinstance(description, str) else []
        truncated_description = " ".join(words[:30]) + "..."

        authors_str = _format_authors(row["authors"])

        caption = f"{row['title_and_subtitle']} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))
    return results
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# Dropdown options. The leading "All" entry matches none of the specific
# category/tone branches in retrieve_semantic_recommendations, so selecting it
# applies no category filter and no tone-based sorting.
categories = ["All"] + sorted(books["categories"].unique())
tones = ["All"] + ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

# NOTE(review): the nesting below was reconstructed from a listing whose
# indentation was lost -- confirm which components belong inside gr.Row().
with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown("# Semantic Book Recommender")
    gr.Markdown("## Find your next favorite book!")

    # Input controls: query text, category filter, tone selector, submit button.
    with gr.Row():
        user_query = gr.Textbox(
            label="please enter a description of your book:",
            placeholder="Enter your query here...",
            lines=1,
            max_lines=1,
        )

        category_dropdown = gr.Dropdown(
            label="Select a category",
            choices=categories,
            value="All",
        )
        tone_dropdown = gr.Dropdown(
            label="Select an emotional tone",
            choices=tones,
            value="All",
        )
        submit_button = gr.Button("Submit", variant="primary")

    gr.Markdown("## Recommendations")
    # Gallery that receives the (image, caption) tuples built by recommend_books.
    output = gr.Gallery(
        label="Recommended Books",
        columns=8,
        rows=2,
    )

    # Clicking the button calls recommend_books with the three input values
    # and sends its return value to the gallery.
    submit_button.click(
        fn=recommend_books,
        inputs=[user_query, category_dropdown, tone_dropdown],
        outputs=output,
    )

# Launch the dashboard only when the file is run as a script.
if __name__ == "__main__":
    dashboard.launch()
|
requirements.txt
ADDED
|
Binary file (7.09 kB). View file
|
|
|
search_progress.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sentiment_analysis.ipynb
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "9faa187f",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import pandas as pd\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"df = pd.read_csv(\"books_with_categories.csv\")"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"cell_type": "code",
|
| 17 |
+
"execution_count": 3,
|
| 18 |
+
"id": "606d7c6e",
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"outputs": [
|
| 21 |
+
{
|
| 22 |
+
"data": {
|
| 23 |
+
"text/plain": [
|
| 24 |
+
"True"
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
"execution_count": 3,
|
| 28 |
+
"metadata": {},
|
| 29 |
+
"output_type": "execute_result"
|
| 30 |
+
}
|
| 31 |
+
],
|
| 32 |
+
"source": [
|
| 33 |
+
"from dotenv import load_dotenv\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"load_dotenv()"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"execution_count": 7,
|
| 41 |
+
"id": "ca93c495",
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"outputs": [
|
| 44 |
+
{
|
| 45 |
+
"name": "stderr",
|
| 46 |
+
"output_type": "stream",
|
| 47 |
+
"text": [
|
| 48 |
+
"Device set to use cpu\n"
|
| 49 |
+
]
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"data": {
|
| 53 |
+
"text/plain": [
|
| 54 |
+
"[[{'label': 'joy', 'score': 0.9771687984466553},\n",
|
| 55 |
+
" {'label': 'surprise', 'score': 0.00852868054062128},\n",
|
| 56 |
+
" {'label': 'neutral', 'score': 0.005764586851000786},\n",
|
| 57 |
+
" {'label': 'anger', 'score': 0.004419783595949411},\n",
|
| 58 |
+
" {'label': 'sadness', 'score': 0.002092392183840275},\n",
|
| 59 |
+
" {'label': 'disgust', 'score': 0.0016119900392368436},\n",
|
| 60 |
+
" {'label': 'fear', 'score': 0.0004138524236623198}]]"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
"execution_count": 7,
|
| 64 |
+
"metadata": {},
|
| 65 |
+
"output_type": "execute_result"
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"source": [
|
| 69 |
+
"from transformers import pipeline\n",
|
| 70 |
+
"classifier = pipeline(\"text-classification\", model=\"j-hartmann/emotion-english-distilroberta-base\", top_k=None)\n",
|
| 71 |
+
"classifier(\"I love this!\")\n"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"cell_type": "code",
|
| 76 |
+
"execution_count": 14,
|
| 77 |
+
"id": "f3708b48",
|
| 78 |
+
"metadata": {},
|
| 79 |
+
"outputs": [],
|
| 80 |
+
"source": [
|
| 81 |
+
"preds = classifier(df[\"description\"][0].split(\".\"))"
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"cell_type": "code",
|
| 86 |
+
"execution_count": 17,
|
| 87 |
+
"id": "e8db387e",
|
| 88 |
+
"metadata": {},
|
| 89 |
+
"outputs": [
|
| 90 |
+
{
|
| 91 |
+
"data": {
|
| 92 |
+
"text/plain": [
|
| 93 |
+
"[[{'label': 'surprise', 'score': 0.7296020984649658},\n",
|
| 94 |
+
" {'label': 'neutral', 'score': 0.14038598537445068},\n",
|
| 95 |
+
" {'label': 'fear', 'score': 0.06816228479146957},\n",
|
| 96 |
+
" {'label': 'joy', 'score': 0.0479426383972168},\n",
|
| 97 |
+
" {'label': 'anger', 'score': 0.009156371466815472},\n",
|
| 98 |
+
" {'label': 'disgust', 'score': 0.0026284765917807817},\n",
|
| 99 |
+
" {'label': 'sadness', 'score': 0.002122163539752364}],\n",
|
| 100 |
+
" [{'label': 'neutral', 'score': 0.4493706524372101},\n",
|
| 101 |
+
" {'label': 'disgust', 'score': 0.2735912799835205},\n",
|
| 102 |
+
" {'label': 'joy', 'score': 0.10908322036266327},\n",
|
| 103 |
+
" {'label': 'sadness', 'score': 0.09362740069627762},\n",
|
| 104 |
+
" {'label': 'anger', 'score': 0.04047828167676926},\n",
|
| 105 |
+
" {'label': 'surprise', 'score': 0.026970166712999344},\n",
|
| 106 |
+
" {'label': 'fear', 'score': 0.006879047024995089}],\n",
|
| 107 |
+
" [{'label': 'neutral', 'score': 0.6462154984474182},\n",
|
| 108 |
+
" {'label': 'sadness', 'score': 0.24273382127285004},\n",
|
| 109 |
+
" {'label': 'disgust', 'score': 0.04342272877693176},\n",
|
| 110 |
+
" {'label': 'surprise', 'score': 0.028300544247031212},\n",
|
| 111 |
+
" {'label': 'joy', 'score': 0.014211482368409634},\n",
|
| 112 |
+
" {'label': 'fear', 'score': 0.014084099791944027},\n",
|
| 113 |
+
" {'label': 'anger', 'score': 0.01103190891444683}],\n",
|
| 114 |
+
" [{'label': 'fear', 'score': 0.9281682968139648},\n",
|
| 115 |
+
" {'label': 'anger', 'score': 0.032190896570682526},\n",
|
| 116 |
+
" {'label': 'neutral', 'score': 0.012808685190975666},\n",
|
| 117 |
+
" {'label': 'sadness', 'score': 0.008756878785789013},\n",
|
| 118 |
+
" {'label': 'surprise', 'score': 0.00859791412949562},\n",
|
| 119 |
+
" {'label': 'disgust', 'score': 0.008431827649474144},\n",
|
| 120 |
+
" {'label': 'joy', 'score': 0.0010455839801579714}],\n",
|
| 121 |
+
" [{'label': 'sadness', 'score': 0.9671575427055359},\n",
|
| 122 |
+
" {'label': 'neutral', 'score': 0.01510414108633995},\n",
|
| 123 |
+
" {'label': 'disgust', 'score': 0.006480586249381304},\n",
|
| 124 |
+
" {'label': 'fear', 'score': 0.005393984727561474},\n",
|
| 125 |
+
" {'label': 'surprise', 'score': 0.0022869384847581387},\n",
|
| 126 |
+
" {'label': 'anger', 'score': 0.0018428878393024206},\n",
|
| 127 |
+
" {'label': 'joy', 'score': 0.001733877114020288}],\n",
|
| 128 |
+
" [{'label': 'joy', 'score': 0.9327980279922485},\n",
|
| 129 |
+
" {'label': 'disgust', 'score': 0.03771715983748436},\n",
|
| 130 |
+
" {'label': 'neutral', 'score': 0.015891825780272484},\n",
|
| 131 |
+
" {'label': 'sadness', 'score': 0.006444509141147137},\n",
|
| 132 |
+
" {'label': 'anger', 'score': 0.005024974700063467},\n",
|
| 133 |
+
" {'label': 'surprise', 'score': 0.001581205753609538},\n",
|
| 134 |
+
" {'label': 'fear', 'score': 0.0005423063994385302}],\n",
|
| 135 |
+
" [{'label': 'joy', 'score': 0.6528708338737488},\n",
|
| 136 |
+
" {'label': 'neutral', 'score': 0.2542746663093567},\n",
|
| 137 |
+
" {'label': 'surprise', 'score': 0.06808295100927353},\n",
|
| 138 |
+
" {'label': 'sadness', 'score': 0.009908987209200859},\n",
|
| 139 |
+
" {'label': 'disgust', 'score': 0.0065122139640152454},\n",
|
| 140 |
+
" {'label': 'anger', 'score': 0.004821307025849819},\n",
|
| 141 |
+
" {'label': 'fear', 'score': 0.0035290210507810116}],\n",
|
| 142 |
+
" [{'label': 'neutral', 'score': 0.549477219581604},\n",
|
| 143 |
+
" {'label': 'sadness', 'score': 0.11169005185365677},\n",
|
| 144 |
+
" {'label': 'disgust', 'score': 0.1040065810084343},\n",
|
| 145 |
+
" {'label': 'surprise', 'score': 0.07876542955636978},\n",
|
| 146 |
+
" {'label': 'anger', 'score': 0.06413355469703674},\n",
|
| 147 |
+
" {'label': 'fear', 'score': 0.051362741738557816},\n",
|
| 148 |
+
" {'label': 'joy', 'score': 0.040564361959695816}]]"
|
| 149 |
+
]
|
| 150 |
+
},
|
| 151 |
+
"execution_count": 17,
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"output_type": "execute_result"
|
| 154 |
+
}
|
| 155 |
+
],
|
| 156 |
+
"source": [
|
| 157 |
+
"preds"
|
| 158 |
+
]
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"cell_type": "code",
|
| 162 |
+
"execution_count": 32,
|
| 163 |
+
"id": "67dd5f0d",
|
| 164 |
+
"metadata": {},
|
| 165 |
+
"outputs": [
|
| 166 |
+
{
|
| 167 |
+
"name": "stderr",
|
| 168 |
+
"output_type": "stream",
|
| 169 |
+
"text": [
|
| 170 |
+
" 1%| | 55/6397 [00:16<31:32, 3.35it/s] \n"
|
| 171 |
+
]
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"ename": "KeyboardInterrupt",
|
| 175 |
+
"evalue": "",
|
| 176 |
+
"output_type": "error",
|
| 177 |
+
"traceback": [
|
| 178 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 179 |
+
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
| 180 |
+
"Cell \u001b[1;32mIn[32], line 23\u001b[0m\n\u001b[0;32m 21\u001b[0m isbns\u001b[38;5;241m.\u001b[39mappend(df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124misbn13\u001b[39m\u001b[38;5;124m\"\u001b[39m][i])\n\u001b[0;32m 22\u001b[0m sentences \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m][i]\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 23\u001b[0m sentence_pred \u001b[38;5;241m=\u001b[39m \u001b[43mclassifier\u001b[49m\u001b[43m(\u001b[49m\u001b[43msentences\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m8\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# batching\u001b[39;00m\n\u001b[0;32m 24\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m emotion \u001b[38;5;129;01min\u001b[39;00m emotions_dict:\n\u001b[0;32m 25\u001b[0m max_score \u001b[38;5;241m=\u001b[39m get_max_emotion_score(emotion, sentence_pred)\n",
|
| 181 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\text_classification.py:159\u001b[0m, in \u001b[0;36mTextClassificationPipeline.__call__\u001b[1;34m(self, inputs, **kwargs)\u001b[0m\n\u001b[0;32m 124\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 125\u001b[0m \u001b[38;5;124;03mClassify the text(s) given as inputs.\u001b[39;00m\n\u001b[0;32m 126\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 156\u001b[0m \u001b[38;5;124;03m If `top_k` is used, one such dictionary is returned per label.\u001b[39;00m\n\u001b[0;32m 157\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 158\u001b[0m inputs \u001b[38;5;241m=\u001b[39m (inputs,)\n\u001b[1;32m--> 159\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 160\u001b[0m \u001b[38;5;66;03m# TODO try and retrieve it in a nicer way from _sanitize_parameters.\u001b[39;00m\n\u001b[0;32m 161\u001b[0m _legacy \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtop_k\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m kwargs\n",
|
| 182 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\base.py:1343\u001b[0m, in \u001b[0;36mPipeline.__call__\u001b[1;34m(self, inputs, num_workers, batch_size, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m can_use_iterator:\n\u001b[0;32m 1340\u001b[0m final_iterator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_iterator(\n\u001b[0;32m 1341\u001b[0m inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params\n\u001b[0;32m 1342\u001b[0m )\n\u001b[1;32m-> 1343\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(final_iterator)\n\u001b[0;32m 1344\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n\u001b[0;32m 1345\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
| 183 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\pt_utils.py:124\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloader_batch_item()\n\u001b[0;32m 123\u001b[0m \u001b[38;5;66;03m# We're out of items within a batch\u001b[39;00m\n\u001b[1;32m--> 124\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterator)\n\u001b[0;32m 125\u001b[0m processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfer(item, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparams)\n\u001b[0;32m 126\u001b[0m \u001b[38;5;66;03m# We now have a batch of \"inferred things\".\u001b[39;00m\n",
|
| 184 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\pt_utils.py:125\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 123\u001b[0m \u001b[38;5;66;03m# We're out of items within a batch\u001b[39;00m\n\u001b[0;32m 124\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterator)\n\u001b[1;32m--> 125\u001b[0m processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 126\u001b[0m \u001b[38;5;66;03m# We now have a batch of \"inferred things\".\u001b[39;00m\n\u001b[0;32m 127\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloader_batch_size \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 128\u001b[0m \u001b[38;5;66;03m# Try to infer the size of the batch\u001b[39;00m\n",
|
| 185 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\base.py:1269\u001b[0m, in \u001b[0;36mPipeline.forward\u001b[1;34m(self, model_inputs, **forward_params)\u001b[0m\n\u001b[0;32m 1267\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m inference_context():\n\u001b[0;32m 1268\u001b[0m model_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_ensure_tensor_on_device(model_inputs, device\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m-> 1269\u001b[0m model_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mforward_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1270\u001b[0m model_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_ensure_tensor_on_device(model_outputs, device\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mdevice(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[0;32m 1271\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
| 186 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\pipelines\\text_classification.py:190\u001b[0m, in \u001b[0;36mTextClassificationPipeline._forward\u001b[1;34m(self, model_inputs)\u001b[0m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muse_cache\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39msignature(model_forward)\u001b[38;5;241m.\u001b[39mparameters\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m 189\u001b[0m model_inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muse_cache\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m--> 190\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 187 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 188 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 189 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:1318\u001b[0m, in \u001b[0;36mRobertaForSequenceClassification.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m 1310\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[0;32m 1312\u001b[0m \u001b[38;5;124;03m Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\u001b[39;00m\n\u001b[0;32m 1313\u001b[0m \u001b[38;5;124;03m config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\u001b[39;00m\n\u001b[0;32m 1314\u001b[0m \u001b[38;5;124;03m `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\u001b[39;00m\n\u001b[0;32m 1315\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1316\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m-> 1318\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroberta\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1319\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1320\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 
1321\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1322\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1323\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1324\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1325\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1326\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1327\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1328\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1329\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 1330\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclassifier(sequence_output)\n",
|
| 190 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 191 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 192 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:976\u001b[0m, in \u001b[0;36mRobertaModel.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m 969\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[0;32m 970\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[0;32m 971\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[0;32m 972\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[0;32m 973\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[0;32m 974\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[1;32m--> 976\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 980\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 981\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_extended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 982\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 983\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 984\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 985\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 986\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 987\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 988\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 989\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler(sequence_output) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 193 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 194 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 195 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:631\u001b[0m, in \u001b[0;36mRobertaEncoder.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m 620\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[0;32m 621\u001b[0m layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[0;32m 622\u001b[0m hidden_states,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 628\u001b[0m output_attentions,\n\u001b[0;32m 629\u001b[0m )\n\u001b[0;32m 630\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 631\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 632\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 633\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 634\u001b[0m \u001b[43m \u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 635\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 636\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 637\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 641\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 642\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n",
|
| 196 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 197 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 198 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:520\u001b[0m, in \u001b[0;36mRobertaLayer.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[0;32m 508\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\n\u001b[0;32m 509\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 510\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 517\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[0;32m 518\u001b[0m \u001b[38;5;66;03m# decoder uni-directional self-attention cached key/values tuple is at positions 1,2\u001b[39;00m\n\u001b[0;32m 519\u001b[0m self_attn_past_key_value \u001b[38;5;241m=\u001b[39m past_key_value[:\u001b[38;5;241m2\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m past_key_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m--> 520\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 521\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 522\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 524\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 525\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mself_attn_past_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 526\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 527\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 529\u001b[0m \u001b[38;5;66;03m# if decoder, the last output is tuple of self-attn cache\u001b[39;00m\n",
|
| 199 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 200 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 201 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:447\u001b[0m, in \u001b[0;36mRobertaAttention.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[0;32m 437\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\n\u001b[0;32m 438\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 439\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 445\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 446\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[1;32m--> 447\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 448\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 449\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 450\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 451\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 452\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 453\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 454\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 455\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 456\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[0;32m 457\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n",
|
| 202 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 203 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 204 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\transformers\\models\\roberta\\modeling_roberta.py:325\u001b[0m, in \u001b[0;36mRobertaSdpaSelfAttention.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[0;32m 313\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mforward(\n\u001b[0;32m 314\u001b[0m hidden_states,\n\u001b[0;32m 315\u001b[0m attention_mask,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 320\u001b[0m output_attentions,\n\u001b[0;32m 321\u001b[0m )\n\u001b[0;32m 323\u001b[0m bsz, tgt_len, _ \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39msize()\n\u001b[1;32m--> 325\u001b[0m query_layer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtranspose_for_scores(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 327\u001b[0m \u001b[38;5;66;03m# If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention\u001b[39;00m\n\u001b[0;32m 328\u001b[0m \u001b[38;5;66;03m# mask needs to be such that the encoder's padding tokens are not attended to.\u001b[39;00m\n\u001b[0;32m 329\u001b[0m is_cross_attention \u001b[38;5;241m=\u001b[39m encoder_hidden_states \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 205 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 206 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
| 207 |
+
"File \u001b[1;32mc:\\Users\\NonsoDev\\anaconda3\\envs\\llms\\Lib\\site-packages\\torch\\nn\\modules\\linear.py:116\u001b[0m, in \u001b[0;36mLinear.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 115\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[1;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 208 |
+
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
| 209 |
+
]
|
| 210 |
+
}
|
| 211 |
+
],
|
| 212 |
+
"source": [
|
| 213 |
+
"from tqdm import tqdm\n",
|
| 214 |
+
"\n",
|
| 215 |
+
"isbns = []\n",
|
| 216 |
+
"\n",
|
| 217 |
+
"def get_max_emotion_score(emotion, sentence_pred):\n",
|
| 218 |
+
" scores = []\n",
|
| 219 |
+
" for i in sentence_pred:\n",
|
| 220 |
+
" for j in i:\n",
|
| 221 |
+
" if j[\"label\"] == emotion:\n",
|
| 222 |
+
" scores.append(j[\"score\"])\n",
|
| 223 |
+
" return max(scores)\n",
|
| 224 |
+
"\n",
|
| 225 |
+
"\n",
|
| 226 |
+
"emotions = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
|
| 227 |
+
"\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"emotions_dict = {emotion: [] for emotion in emotions}\n",
|
| 230 |
+
"\n",
|
| 231 |
+
"\n",
|
| 232 |
+
"for i in tqdm(range(len(df))):\n",
|
| 233 |
+
" isbns.append(df[\"isbn13\"][i])\n",
|
| 234 |
+
" sentences = df[\"description\"][i].split(\".\")\n",
|
| 235 |
+
" sentence_pred = classifier(sentences, batch_size=8) # batching\n",
|
| 236 |
+
" for emotion in emotions_dict:\n",
|
| 237 |
+
" max_score = get_max_emotion_score(emotion, sentence_pred)\n",
|
| 238 |
+
" emotions_dict[emotion].append(max_score)\n",
|
| 239 |
+
"\n"
|
| 240 |
+
]
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"cell_type": "code",
|
| 244 |
+
"execution_count": 30,
|
| 245 |
+
"id": "6f596cee",
|
| 246 |
+
"metadata": {},
|
| 247 |
+
"outputs": [
|
| 248 |
+
{
|
| 249 |
+
"data": {
|
| 250 |
+
"text/plain": [
|
| 251 |
+
"{'anger': array([0.04047828, 0.61261988, 0.01603621, 0.35148466, 0.0814124 ,\n",
|
| 252 |
+
" 0.53818434, 0.13283803, 0. ]),\n",
|
| 253 |
+
" 'disgust': array([0.27359128, 0.34828481, 0.0606952 , 0.15072225, 0.18449552,\n",
|
| 254 |
+
" 0.72717494, 0.064666 , 0. ]),\n",
|
| 255 |
+
" 'fear': array([0.9281683 , 0.9723208 , 0.00191786, 0.36070606, 0.04019525,\n",
|
| 256 |
+
" 0.26585764, 0.74742717, 0. ]),\n",
|
| 257 |
+
" 'joy': array([0.93279803, 0.7672382 , 0.2518813 , 0.02480991, 0.035207 ,\n",
|
| 258 |
+
" 0.87256557, 0.00796585, 0. ]),\n",
|
| 259 |
+
" 'sadness': array([0.96715754, 0.06179974, 0.02098823, 0.47588021, 0.16030179,\n",
|
| 260 |
+
" 0.1565351 , 0.40800145, 0. ]),\n",
|
| 261 |
+
" 'surprise': array([0.7296021 , 0.25254625, 0.02968322, 0.07487808, 0.07487808,\n",
|
| 262 |
+
" 0.27190357, 0.02882093, 0. ]),\n",
|
| 263 |
+
" 'neutral': array([0.6462155 , 0.88793951, 0.73268509, 0.56766838, 0.8843897 ,\n",
|
| 264 |
+
" 0.71219414, 0.38535854, 0. ])}"
|
| 265 |
+
]
|
| 266 |
+
},
|
| 267 |
+
"execution_count": 30,
|
| 268 |
+
"metadata": {},
|
| 269 |
+
"output_type": "execute_result"
|
| 270 |
+
}
|
| 271 |
+
],
|
| 272 |
+
"source": [
|
| 273 |
+
"emotions_dict"
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"cell_type": "code",
|
| 278 |
+
"execution_count": 31,
|
| 279 |
+
"id": "d3331aa5",
|
| 280 |
+
"metadata": {},
|
| 281 |
+
"outputs": [
|
| 282 |
+
{
|
| 283 |
+
"data": {
|
| 284 |
+
"text/plain": [
|
| 285 |
+
"[9780002005883,\n",
|
| 286 |
+
" 9780002261982,\n",
|
| 287 |
+
" 9780006163831,\n",
|
| 288 |
+
" 9780006178736,\n",
|
| 289 |
+
" 9780006280897,\n",
|
| 290 |
+
" 9780006280934,\n",
|
| 291 |
+
" 9780006380832,\n",
|
| 292 |
+
" 9780006470229]"
|
| 293 |
+
]
|
| 294 |
+
},
|
| 295 |
+
"execution_count": 31,
|
| 296 |
+
"metadata": {},
|
| 297 |
+
"output_type": "execute_result"
|
| 298 |
+
}
|
| 299 |
+
],
|
| 300 |
+
"source": [
|
| 301 |
+
"isbns"
|
| 302 |
+
]
|
| 303 |
+
},
|
| 304 |
+
{
|
| 305 |
+
"cell_type": "code",
|
| 306 |
+
"execution_count": null,
|
| 307 |
+
"id": "0e8fabc9",
|
| 308 |
+
"metadata": {},
|
| 309 |
+
"outputs": [],
|
| 310 |
+
"source": []
|
| 311 |
+
}
|
| 312 |
+
],
|
| 313 |
+
"metadata": {
|
| 314 |
+
"kernelspec": {
|
| 315 |
+
"display_name": "llms",
|
| 316 |
+
"language": "python",
|
| 317 |
+
"name": "python3"
|
| 318 |
+
},
|
| 319 |
+
"language_info": {
|
| 320 |
+
"codemirror_mode": {
|
| 321 |
+
"name": "ipython",
|
| 322 |
+
"version": 3
|
| 323 |
+
},
|
| 324 |
+
"file_extension": ".py",
|
| 325 |
+
"mimetype": "text/x-python",
|
| 326 |
+
"name": "python",
|
| 327 |
+
"nbconvert_exporter": "python",
|
| 328 |
+
"pygments_lexer": "ipython3",
|
| 329 |
+
"version": "3.11.11"
|
| 330 |
+
}
|
| 331 |
+
},
|
| 332 |
+
"nbformat": 4,
|
| 333 |
+
"nbformat_minor": 5
|
| 334 |
+
}
|
supervised_clean.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from googlesearch import search
|
| 5 |
+
import time
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
# Interactive clean-up pass over search_progress.csv.
#
# Every row whose URL is missing, or whose URL mentions neither "amazon" nor
# "google", is searched again; the candidate URLs are printed and the operator
# picks the right one by number.  The merged result is written to
# search_progress1.csv (the input file is left untouched).
df = pd.read_csv("search_progress.csv")
df1 = df.drop("query_index", axis=1)

print("Initial DataFrame:")
print(df1.head())

df1.columns = ["title", "url"]

# A row still needs work when any field is null or its URL is not from one of
# the two accepted sites.
has_null = df1.isnull().any(axis=1)
is_accepted_url = (
    df1["url"].str.contains("amazon", na=False)
    | df1["url"].str.contains("google", na=False)
)
# .copy() makes `unfinished` an independent frame, so the "url" assignment
# further down does not write into a slice of df1 (SettingWithCopyWarning).
unfinished = df1[has_null | ~is_accepted_url].copy()

unfinished_list = unfinished["title"].tolist()
unfinished_urls = [None] * len(unfinished_list)

for idx, i in enumerate(unfinished_list):
    print()
    print(f"Processing title {idx + 1}/{len(unfinished_list)}: {i}")
    try:
        results1 = search(i, num_results=3, lang="en")
        results2 = search(i.replace("google", "amazon"), num_results=3, lang="en")
        url = list(results1) + list(results2)
        print("\n")
        print(f"Searching for: {i}")
        for count, j in enumerate(url, start=1):
            print(count, j)
        if not url:
            # Nothing to choose from; skip the prompt and record the miss.
            raise ValueError("search returned no results")
        # Both result lists are shown, so the valid range is 1..len(url),
        # not the fixed 1-3 of a single search.
        choice = int(input(f"Enter the index of the correct URL (1-{len(url)}): "))
        if not 1 <= choice <= len(url):
            # Without this check, 0 would become index -1 and silently pick
            # the last URL in the list.
            raise ValueError(f"index {choice} is outside 1-{len(url)}")
        index = choice - 1
        unfinished_urls[idx] = url[index]
    except Exception as e:
        # Best effort: a failed search or a bad selection leaves this title
        # unresolved rather than aborting the whole session.
        print(f"Error occurred while searching for {i}: {e}")
        unfinished_urls[idx] = None
    time.sleep(random.randint(1, 5))  # Sleep to avoid hitting the search API too quickly

unfinished["url"] = unfinished_urls
print("Updated DataFrame with URLs:")
print(unfinished.head())

# DataFrame.update aligns on the index and only copies non-NA values, so
# titles that stayed unresolved (None) keep whatever df1 already had.
df1.update(unfinished)
df1.to_csv("search_progress1.csv", index=False)
|
| 49 |
+
|
| 50 |
+
|
tagged_description.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
test_classification.ipynb
ADDED
|
@@ -0,0 +1,649 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 2,
|
| 6 |
+
"id": "290dff84",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import pandas as pd\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"df = pd.read_csv(\"books_cleaned.csv\", encoding=\"utf-8\")"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"cell_type": "code",
|
| 17 |
+
"execution_count": 3,
|
| 18 |
+
"id": "2e2d9604",
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"outputs": [
|
| 21 |
+
{
|
| 22 |
+
"name": "stdout",
|
| 23 |
+
"output_type": "stream",
|
| 24 |
+
"text": [
|
| 25 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
| 26 |
+
"RangeIndex: 6397 entries, 0 to 6396\n",
|
| 27 |
+
"Data columns (total 11 columns):\n",
|
| 28 |
+
" # Column Non-Null Count Dtype \n",
|
| 29 |
+
"--- ------ -------------- ----- \n",
|
| 30 |
+
" 0 isbn13 6397 non-null int64 \n",
|
| 31 |
+
" 1 authors 6397 non-null object \n",
|
| 32 |
+
" 2 categories 6364 non-null object \n",
|
| 33 |
+
" 3 thumbnail 6190 non-null object \n",
|
| 34 |
+
" 4 description 6397 non-null object \n",
|
| 35 |
+
" 5 published_year 6397 non-null float64\n",
|
| 36 |
+
" 6 average_rating 6397 non-null float64\n",
|
| 37 |
+
" 7 num_pages 6397 non-null float64\n",
|
| 38 |
+
" 8 ratings_count 6397 non-null float64\n",
|
| 39 |
+
" 9 title_and_subtitle 6397 non-null object \n",
|
| 40 |
+
" 10 tagged_description 6397 non-null object \n",
|
| 41 |
+
"dtypes: float64(4), int64(1), object(6)\n",
|
| 42 |
+
"memory usage: 549.9+ KB\n"
|
| 43 |
+
]
|
| 44 |
+
}
|
| 45 |
+
],
|
| 46 |
+
"source": [
|
| 47 |
+
"df.info()"
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"cell_type": "code",
|
| 52 |
+
"execution_count": 4,
|
| 53 |
+
"id": "06585b26",
|
| 54 |
+
"metadata": {},
|
| 55 |
+
"outputs": [
|
| 56 |
+
{
|
| 57 |
+
"data": {
|
| 58 |
+
"text/html": [
|
| 59 |
+
"<div>\n",
|
| 60 |
+
"<style scoped>\n",
|
| 61 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 62 |
+
" vertical-align: middle;\n",
|
| 63 |
+
" }\n",
|
| 64 |
+
"\n",
|
| 65 |
+
" .dataframe tbody tr th {\n",
|
| 66 |
+
" vertical-align: top;\n",
|
| 67 |
+
" }\n",
|
| 68 |
+
"\n",
|
| 69 |
+
" .dataframe thead th {\n",
|
| 70 |
+
" text-align: right;\n",
|
| 71 |
+
" }\n",
|
| 72 |
+
"</style>\n",
|
| 73 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 74 |
+
" <thead>\n",
|
| 75 |
+
" <tr style=\"text-align: right;\">\n",
|
| 76 |
+
" <th></th>\n",
|
| 77 |
+
" <th>categories</th>\n",
|
| 78 |
+
" <th>count</th>\n",
|
| 79 |
+
" </tr>\n",
|
| 80 |
+
" </thead>\n",
|
| 81 |
+
" <tbody>\n",
|
| 82 |
+
" <tr>\n",
|
| 83 |
+
" <th>0</th>\n",
|
| 84 |
+
" <td>Fiction</td>\n",
|
| 85 |
+
" <td>2491</td>\n",
|
| 86 |
+
" </tr>\n",
|
| 87 |
+
" <tr>\n",
|
| 88 |
+
" <th>1</th>\n",
|
| 89 |
+
" <td>Juvenile Fiction</td>\n",
|
| 90 |
+
" <td>519</td>\n",
|
| 91 |
+
" </tr>\n",
|
| 92 |
+
" <tr>\n",
|
| 93 |
+
" <th>2</th>\n",
|
| 94 |
+
" <td>Biography & Autobiography</td>\n",
|
| 95 |
+
" <td>388</td>\n",
|
| 96 |
+
" </tr>\n",
|
| 97 |
+
" <tr>\n",
|
| 98 |
+
" <th>3</th>\n",
|
| 99 |
+
" <td>History</td>\n",
|
| 100 |
+
" <td>255</td>\n",
|
| 101 |
+
" </tr>\n",
|
| 102 |
+
" <tr>\n",
|
| 103 |
+
" <th>4</th>\n",
|
| 104 |
+
" <td>Literary Criticism</td>\n",
|
| 105 |
+
" <td>163</td>\n",
|
| 106 |
+
" </tr>\n",
|
| 107 |
+
" <tr>\n",
|
| 108 |
+
" <th>...</th>\n",
|
| 109 |
+
" <td>...</td>\n",
|
| 110 |
+
" <td>...</td>\n",
|
| 111 |
+
" </tr>\n",
|
| 112 |
+
" <tr>\n",
|
| 113 |
+
" <th>520</th>\n",
|
| 114 |
+
" <td>Humorous stories</td>\n",
|
| 115 |
+
" <td>1</td>\n",
|
| 116 |
+
" </tr>\n",
|
| 117 |
+
" <tr>\n",
|
| 118 |
+
" <th>521</th>\n",
|
| 119 |
+
" <td>Ballets</td>\n",
|
| 120 |
+
" <td>1</td>\n",
|
| 121 |
+
" </tr>\n",
|
| 122 |
+
" <tr>\n",
|
| 123 |
+
" <th>522</th>\n",
|
| 124 |
+
" <td>Aged women</td>\n",
|
| 125 |
+
" <td>1</td>\n",
|
| 126 |
+
" </tr>\n",
|
| 127 |
+
" <tr>\n",
|
| 128 |
+
" <th>523</th>\n",
|
| 129 |
+
" <td>Imperialism</td>\n",
|
| 130 |
+
" <td>1</td>\n",
|
| 131 |
+
" </tr>\n",
|
| 132 |
+
" <tr>\n",
|
| 133 |
+
" <th>524</th>\n",
|
| 134 |
+
" <td>Illinois</td>\n",
|
| 135 |
+
" <td>1</td>\n",
|
| 136 |
+
" </tr>\n",
|
| 137 |
+
" </tbody>\n",
|
| 138 |
+
"</table>\n",
|
| 139 |
+
"<p>525 rows × 2 columns</p>\n",
|
| 140 |
+
"</div>"
|
| 141 |
+
],
|
| 142 |
+
"text/plain": [
|
| 143 |
+
" categories count\n",
|
| 144 |
+
"0 Fiction 2491\n",
|
| 145 |
+
"1 Juvenile Fiction 519\n",
|
| 146 |
+
"2 Biography & Autobiography 388\n",
|
| 147 |
+
"3 History 255\n",
|
| 148 |
+
"4 Literary Criticism 163\n",
|
| 149 |
+
".. ... ...\n",
|
| 150 |
+
"520 Humorous stories 1\n",
|
| 151 |
+
"521 Ballets 1\n",
|
| 152 |
+
"522 Aged women 1\n",
|
| 153 |
+
"523 Imperialism 1\n",
|
| 154 |
+
"524 Illinois 1\n",
|
| 155 |
+
"\n",
|
| 156 |
+
"[525 rows x 2 columns]"
|
| 157 |
+
]
|
| 158 |
+
},
|
| 159 |
+
"execution_count": 4,
|
| 160 |
+
"metadata": {},
|
| 161 |
+
"output_type": "execute_result"
|
| 162 |
+
}
|
| 163 |
+
],
|
| 164 |
+
"source": [
|
| 165 |
+
"# too many categories\n",
|
| 166 |
+
"df[\"categories\"].value_counts().reset_index()"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"cell_type": "code",
|
| 171 |
+
"execution_count": 5,
|
| 172 |
+
"id": "1976240c",
|
| 173 |
+
"metadata": {},
|
| 174 |
+
"outputs": [
|
| 175 |
+
{
|
| 176 |
+
"data": {
|
| 177 |
+
"text/html": [
|
| 178 |
+
"<div>\n",
|
| 179 |
+
"<style scoped>\n",
|
| 180 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 181 |
+
" vertical-align: middle;\n",
|
| 182 |
+
" }\n",
|
| 183 |
+
"\n",
|
| 184 |
+
" .dataframe tbody tr th {\n",
|
| 185 |
+
" vertical-align: top;\n",
|
| 186 |
+
" }\n",
|
| 187 |
+
"\n",
|
| 188 |
+
" .dataframe thead th {\n",
|
| 189 |
+
" text-align: right;\n",
|
| 190 |
+
" }\n",
|
| 191 |
+
"</style>\n",
|
| 192 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 193 |
+
" <thead>\n",
|
| 194 |
+
" <tr style=\"text-align: right;\">\n",
|
| 195 |
+
" <th></th>\n",
|
| 196 |
+
" <th>isbn13</th>\n",
|
| 197 |
+
" <th>authors</th>\n",
|
| 198 |
+
" <th>categories</th>\n",
|
| 199 |
+
" <th>thumbnail</th>\n",
|
| 200 |
+
" <th>description</th>\n",
|
| 201 |
+
" <th>published_year</th>\n",
|
| 202 |
+
" <th>average_rating</th>\n",
|
| 203 |
+
" <th>num_pages</th>\n",
|
| 204 |
+
" <th>ratings_count</th>\n",
|
| 205 |
+
" <th>title_and_subtitle</th>\n",
|
| 206 |
+
" <th>tagged_description</th>\n",
|
| 207 |
+
" </tr>\n",
|
| 208 |
+
" </thead>\n",
|
| 209 |
+
" <tbody>\n",
|
| 210 |
+
" </tbody>\n",
|
| 211 |
+
"</table>\n",
|
| 212 |
+
"</div>"
|
| 213 |
+
],
|
| 214 |
+
"text/plain": [
|
| 215 |
+
"Empty DataFrame\n",
|
| 216 |
+
"Columns: [isbn13, authors, categories, thumbnail, description, published_year, average_rating, num_pages, ratings_count, title_and_subtitle, tagged_description]\n",
|
| 217 |
+
"Index: []"
|
| 218 |
+
]
|
| 219 |
+
},
|
| 220 |
+
"execution_count": 5,
|
| 221 |
+
"metadata": {},
|
| 222 |
+
"output_type": "execute_result"
|
| 223 |
+
}
|
| 224 |
+
],
|
| 225 |
+
"source": [
|
| 226 |
+
"df[df[\"description\"].str.len() < 25 ]"
|
| 227 |
+
]
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"cell_type": "code",
|
| 231 |
+
"execution_count": null,
|
| 232 |
+
"id": "8effbaa7",
|
| 233 |
+
"metadata": {},
|
| 234 |
+
"outputs": [],
|
| 235 |
+
"source": []
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"cell_type": "code",
|
| 239 |
+
"execution_count": 6,
|
| 240 |
+
"id": "7a11c3d3",
|
| 241 |
+
"metadata": {},
|
| 242 |
+
"outputs": [
|
| 243 |
+
{
|
| 244 |
+
"name": "stderr",
|
| 245 |
+
"output_type": "stream",
|
| 246 |
+
"text": [
|
| 247 |
+
"c:\\Users\\NonsoDev\\Documents\\Allcodes\\Projects_DL_for resume\\Recommender systems\\book reccomender - llm\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 248 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
| 249 |
+
"Device set to use cpu\n"
|
| 250 |
+
]
|
| 251 |
+
}
|
| 252 |
+
],
|
| 253 |
+
"source": [
|
| 254 |
+
"# let us change the categories to a more manageable number, fiction and non fiction with a zero shot classifier\n",
|
| 255 |
+
"from transformers import pipeline\n",
|
| 256 |
+
"classifier = pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\")"
|
| 257 |
+
]
|
| 258 |
+
},
|
| 259 |
+
{
|
| 260 |
+
"cell_type": "code",
|
| 261 |
+
"execution_count": 7,
|
| 262 |
+
"id": "3cc8882a",
|
| 263 |
+
"metadata": {},
|
| 264 |
+
"outputs": [
|
| 265 |
+
{
|
| 266 |
+
"data": {
|
| 267 |
+
"text/html": [
|
| 268 |
+
"<div>\n",
|
| 269 |
+
"<style scoped>\n",
|
| 270 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 271 |
+
" vertical-align: middle;\n",
|
| 272 |
+
" }\n",
|
| 273 |
+
"\n",
|
| 274 |
+
" .dataframe tbody tr th {\n",
|
| 275 |
+
" vertical-align: top;\n",
|
| 276 |
+
" }\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" .dataframe thead th {\n",
|
| 279 |
+
" text-align: right;\n",
|
| 280 |
+
" }\n",
|
| 281 |
+
"</style>\n",
|
| 282 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 283 |
+
" <thead>\n",
|
| 284 |
+
" <tr style=\"text-align: right;\">\n",
|
| 285 |
+
" <th></th>\n",
|
| 286 |
+
" <th>sequence</th>\n",
|
| 287 |
+
" <th>categories</th>\n",
|
| 288 |
+
" <th>scores</th>\n",
|
| 289 |
+
" </tr>\n",
|
| 290 |
+
" </thead>\n",
|
| 291 |
+
" <tbody>\n",
|
| 292 |
+
" <tr>\n",
|
| 293 |
+
" <th>0</th>\n",
|
| 294 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 295 |
+
" <td>[fiction, history, biography, fantasy, mystery...</td>\n",
|
| 296 |
+
" <td>[0.8558421730995178, 0.6128803491592407, 0.296...</td>\n",
|
| 297 |
+
" </tr>\n",
|
| 298 |
+
" <tr>\n",
|
| 299 |
+
" <th>1</th>\n",
|
| 300 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 301 |
+
" <td>[mystery, fiction, fantasy, scifi, biography, ...</td>\n",
|
| 302 |
+
" <td>[0.9339157342910767, 0.5139176249504089, 0.155...</td>\n",
|
| 303 |
+
" </tr>\n",
|
| 304 |
+
" <tr>\n",
|
| 305 |
+
" <th>2</th>\n",
|
| 306 |
+
" <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
|
| 307 |
+
" <td>[fiction, fantasy, history, scifi, biography, ...</td>\n",
|
| 308 |
+
" <td>[0.5638813972473145, 0.2660749554634094, 0.249...</td>\n",
|
| 309 |
+
" </tr>\n",
|
| 310 |
+
" <tr>\n",
|
| 311 |
+
" <th>3</th>\n",
|
| 312 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 313 |
+
" <td>[scifi, biography, fiction, history, romance, ...</td>\n",
|
| 314 |
+
" <td>[0.19755955040454865, 0.09938773512840271, 0.0...</td>\n",
|
| 315 |
+
" </tr>\n",
|
| 316 |
+
" <tr>\n",
|
| 317 |
+
" <th>4</th>\n",
|
| 318 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 319 |
+
" <td>[mystery, romance, history, biography, scifi, ...</td>\n",
|
| 320 |
+
" <td>[0.16078977286815643, 0.06188512220978737, 0.0...</td>\n",
|
| 321 |
+
" </tr>\n",
|
| 322 |
+
" <tr>\n",
|
| 323 |
+
" <th>5</th>\n",
|
| 324 |
+
" <td>\"In The Problem of Pain, C.S. Lewis, one of th...</td>\n",
|
| 325 |
+
" <td>[mystery, history, biography, scifi, romance, ...</td>\n",
|
| 326 |
+
" <td>[0.6848734021186829, 0.11091233044862747, 0.08...</td>\n",
|
| 327 |
+
" </tr>\n",
|
| 328 |
+
" <tr>\n",
|
| 329 |
+
" <th>6</th>\n",
|
| 330 |
+
" <td>Until Vasco da Gama discovered the sea-route t...</td>\n",
|
| 331 |
+
" <td>[history, mystery, biography, scifi, fiction, ...</td>\n",
|
| 332 |
+
" <td>[0.9738430976867676, 0.19697055220603943, 0.18...</td>\n",
|
| 333 |
+
" </tr>\n",
|
| 334 |
+
" <tr>\n",
|
| 335 |
+
" <th>7</th>\n",
|
| 336 |
+
" <td>A new-cover reissue of the fourth book in the ...</td>\n",
|
| 337 |
+
" <td>[scifi, fantasy, fiction, mystery, history, ro...</td>\n",
|
| 338 |
+
" <td>[0.9945376515388489, 0.9806752800941467, 0.934...</td>\n",
|
| 339 |
+
" </tr>\n",
|
| 340 |
+
" <tr>\n",
|
| 341 |
+
" <th>8</th>\n",
|
| 342 |
+
" <td>Kate Blackwell is an enigma and one of the mos...</td>\n",
|
| 343 |
+
" <td>[mystery, biography, fiction, scifi, history, ...</td>\n",
|
| 344 |
+
" <td>[0.9990025162696838, 0.43301281332969666, 0.04...</td>\n",
|
| 345 |
+
" </tr>\n",
|
| 346 |
+
" <tr>\n",
|
| 347 |
+
" <th>9</th>\n",
|
| 348 |
+
" <td>One of Sidney Sheldon's most popular and bests...</td>\n",
|
| 349 |
+
" <td>[romance, mystery, biography, fantasy, scifi, ...</td>\n",
|
| 350 |
+
" <td>[0.6518456935882568, 0.4315004348754883, 0.367...</td>\n",
|
| 351 |
+
" </tr>\n",
|
| 352 |
+
" </tbody>\n",
|
| 353 |
+
"</table>\n",
|
| 354 |
+
"</div>"
|
| 355 |
+
],
|
| 356 |
+
"text/plain": [
|
| 357 |
+
" sequence \\\n",
|
| 358 |
+
"0 A NOVEL THAT READERS and critics have been eag... \n",
|
| 359 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... \n",
|
| 360 |
+
"2 Volume Two of Stephen Donaldson's acclaimed se... \n",
|
| 361 |
+
"3 A memorable, mesmerizing heroine Jennifer -- b... \n",
|
| 362 |
+
"4 Lewis' work on the nature of love divides love... \n",
|
| 363 |
+
"5 \"In The Problem of Pain, C.S. Lewis, one of th... \n",
|
| 364 |
+
"6 Until Vasco da Gama discovered the sea-route t... \n",
|
| 365 |
+
"7 A new-cover reissue of the fourth book in the ... \n",
|
| 366 |
+
"8 Kate Blackwell is an enigma and one of the mos... \n",
|
| 367 |
+
"9 One of Sidney Sheldon's most popular and bests... \n",
|
| 368 |
+
"\n",
|
| 369 |
+
" categories \\\n",
|
| 370 |
+
"0 [fiction, history, biography, fantasy, mystery... \n",
|
| 371 |
+
"1 [mystery, fiction, fantasy, scifi, biography, ... \n",
|
| 372 |
+
"2 [fiction, fantasy, history, scifi, biography, ... \n",
|
| 373 |
+
"3 [scifi, biography, fiction, history, romance, ... \n",
|
| 374 |
+
"4 [mystery, romance, history, biography, scifi, ... \n",
|
| 375 |
+
"5 [mystery, history, biography, scifi, romance, ... \n",
|
| 376 |
+
"6 [history, mystery, biography, scifi, fiction, ... \n",
|
| 377 |
+
"7 [scifi, fantasy, fiction, mystery, history, ro... \n",
|
| 378 |
+
"8 [mystery, biography, fiction, scifi, history, ... \n",
|
| 379 |
+
"9 [romance, mystery, biography, fantasy, scifi, ... \n",
|
| 380 |
+
"\n",
|
| 381 |
+
" scores \n",
|
| 382 |
+
"0 [0.8558421730995178, 0.6128803491592407, 0.296... \n",
|
| 383 |
+
"1 [0.9339157342910767, 0.5139176249504089, 0.155... \n",
|
| 384 |
+
"2 [0.5638813972473145, 0.2660749554634094, 0.249... \n",
|
| 385 |
+
"3 [0.19755955040454865, 0.09938773512840271, 0.0... \n",
|
| 386 |
+
"4 [0.16078977286815643, 0.06188512220978737, 0.0... \n",
|
| 387 |
+
"5 [0.6848734021186829, 0.11091233044862747, 0.08... \n",
|
| 388 |
+
"6 [0.9738430976867676, 0.19697055220603943, 0.18... \n",
|
| 389 |
+
"7 [0.9945376515388489, 0.9806752800941467, 0.934... \n",
|
| 390 |
+
"8 [0.9990025162696838, 0.43301281332969666, 0.04... \n",
|
| 391 |
+
"9 [0.6518456935882568, 0.4315004348754883, 0.367... "
|
| 392 |
+
]
|
| 393 |
+
},
|
| 394 |
+
"execution_count": 7,
|
| 395 |
+
"metadata": {},
|
| 396 |
+
"output_type": "execute_result"
|
| 397 |
+
}
|
| 398 |
+
],
|
| 399 |
+
"source": [
|
| 400 |
+
"fiction_categories = [\"fiction\",\"mystery\",\"romance\",\"scifi\",\"fantasy\",\"biography\",\"history\"]\n",
|
| 401 |
+
"\n",
|
| 402 |
+
"df.head(10).apply(\n",
|
| 403 |
+
" lambda x: classifier(x[\"description\"], candidate_labels=fiction_categories, multi_label=True),\n",
|
| 404 |
+
" axis=1,\n",
|
| 405 |
+
" result_type=\"expand\",\n",
|
| 406 |
+
").rename(columns={\"labels\": \"categories\", \"scores\": \"scores\"})"
|
| 407 |
+
]
|
| 408 |
+
},
|
| 409 |
+
{
|
| 410 |
+
"cell_type": "code",
|
| 411 |
+
"execution_count": 8,
|
| 412 |
+
"id": "365964c7",
|
| 413 |
+
"metadata": {},
|
| 414 |
+
"outputs": [
|
| 415 |
+
{
|
| 416 |
+
"data": {
|
| 417 |
+
"text/html": [
|
| 418 |
+
"<div>\n",
|
| 419 |
+
"<style scoped>\n",
|
| 420 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 421 |
+
" vertical-align: middle;\n",
|
| 422 |
+
" }\n",
|
| 423 |
+
"\n",
|
| 424 |
+
" .dataframe tbody tr th {\n",
|
| 425 |
+
" vertical-align: top;\n",
|
| 426 |
+
" }\n",
|
| 427 |
+
"\n",
|
| 428 |
+
" .dataframe thead th {\n",
|
| 429 |
+
" text-align: right;\n",
|
| 430 |
+
" }\n",
|
| 431 |
+
"</style>\n",
|
| 432 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 433 |
+
" <thead>\n",
|
| 434 |
+
" <tr style=\"text-align: right;\">\n",
|
| 435 |
+
" <th></th>\n",
|
| 436 |
+
" <th>isbn13</th>\n",
|
| 437 |
+
" <th>authors</th>\n",
|
| 438 |
+
" <th>categories</th>\n",
|
| 439 |
+
" <th>thumbnail</th>\n",
|
| 440 |
+
" <th>description</th>\n",
|
| 441 |
+
" <th>published_year</th>\n",
|
| 442 |
+
" <th>average_rating</th>\n",
|
| 443 |
+
" <th>num_pages</th>\n",
|
| 444 |
+
" <th>ratings_count</th>\n",
|
| 445 |
+
" <th>title_and_subtitle</th>\n",
|
| 446 |
+
" <th>tagged_description</th>\n",
|
| 447 |
+
" </tr>\n",
|
| 448 |
+
" </thead>\n",
|
| 449 |
+
" <tbody>\n",
|
| 450 |
+
" <tr>\n",
|
| 451 |
+
" <th>0</th>\n",
|
| 452 |
+
" <td>9780002005883</td>\n",
|
| 453 |
+
" <td>Marilynne Robinson</td>\n",
|
| 454 |
+
" <td>Fiction</td>\n",
|
| 455 |
+
" <td>http://books.google.com/books/content?id=KQZCP...</td>\n",
|
| 456 |
+
" <td>A NOVEL THAT READERS and critics have been eag...</td>\n",
|
| 457 |
+
" <td>2004.0</td>\n",
|
| 458 |
+
" <td>3.85</td>\n",
|
| 459 |
+
" <td>247.0</td>\n",
|
| 460 |
+
" <td>361.0</td>\n",
|
| 461 |
+
" <td>Gilead</td>\n",
|
| 462 |
+
" <td>9780002005883 A NOVEL THAT READERS and critics...</td>\n",
|
| 463 |
+
" </tr>\n",
|
| 464 |
+
" <tr>\n",
|
| 465 |
+
" <th>1</th>\n",
|
| 466 |
+
" <td>9780002261982</td>\n",
|
| 467 |
+
" <td>Charles Osborne;Agatha Christie</td>\n",
|
| 468 |
+
" <td>Detective and mystery stories</td>\n",
|
| 469 |
+
" <td>http://books.google.com/books/content?id=gA5GP...</td>\n",
|
| 470 |
+
" <td>A new 'Christie for Christmas' -- a full-lengt...</td>\n",
|
| 471 |
+
" <td>2000.0</td>\n",
|
| 472 |
+
" <td>3.83</td>\n",
|
| 473 |
+
" <td>241.0</td>\n",
|
| 474 |
+
" <td>5164.0</td>\n",
|
| 475 |
+
" <td>Spider's Web A Novel</td>\n",
|
| 476 |
+
" <td>9780002261982 A new 'Christie for Christmas' -...</td>\n",
|
| 477 |
+
" </tr>\n",
|
| 478 |
+
" <tr>\n",
|
| 479 |
+
" <th>2</th>\n",
|
| 480 |
+
" <td>9780006163831</td>\n",
|
| 481 |
+
" <td>Stephen R. Donaldson</td>\n",
|
| 482 |
+
" <td>American fiction</td>\n",
|
| 483 |
+
" <td>http://books.google.com/books/content?id=OmQaw...</td>\n",
|
| 484 |
+
" <td>Volume Two of Stephen Donaldson's acclaimed se...</td>\n",
|
| 485 |
+
" <td>1982.0</td>\n",
|
| 486 |
+
" <td>3.97</td>\n",
|
| 487 |
+
" <td>479.0</td>\n",
|
| 488 |
+
" <td>172.0</td>\n",
|
| 489 |
+
" <td>The One Tree</td>\n",
|
| 490 |
+
" <td>9780006163831 Volume Two of Stephen Donaldson'...</td>\n",
|
| 491 |
+
" </tr>\n",
|
| 492 |
+
" <tr>\n",
|
| 493 |
+
" <th>3</th>\n",
|
| 494 |
+
" <td>9780006178736</td>\n",
|
| 495 |
+
" <td>Sidney Sheldon</td>\n",
|
| 496 |
+
" <td>Fiction</td>\n",
|
| 497 |
+
" <td>http://books.google.com/books/content?id=FKo2T...</td>\n",
|
| 498 |
+
" <td>A memorable, mesmerizing heroine Jennifer -- b...</td>\n",
|
| 499 |
+
" <td>1993.0</td>\n",
|
| 500 |
+
" <td>3.93</td>\n",
|
| 501 |
+
" <td>512.0</td>\n",
|
| 502 |
+
" <td>29532.0</td>\n",
|
| 503 |
+
" <td>Rage of angels</td>\n",
|
| 504 |
+
" <td>9780006178736 A memorable, mesmerizing heroine...</td>\n",
|
| 505 |
+
" </tr>\n",
|
| 506 |
+
" <tr>\n",
|
| 507 |
+
" <th>4</th>\n",
|
| 508 |
+
" <td>9780006280897</td>\n",
|
| 509 |
+
" <td>Clive Staples Lewis</td>\n",
|
| 510 |
+
" <td>Christian life</td>\n",
|
| 511 |
+
" <td>http://books.google.com/books/content?id=XhQ5X...</td>\n",
|
| 512 |
+
" <td>Lewis' work on the nature of love divides love...</td>\n",
|
| 513 |
+
" <td>2002.0</td>\n",
|
| 514 |
+
" <td>4.15</td>\n",
|
| 515 |
+
" <td>170.0</td>\n",
|
| 516 |
+
" <td>33684.0</td>\n",
|
| 517 |
+
" <td>The Four Loves</td>\n",
|
| 518 |
+
" <td>9780006280897 Lewis' work on the nature of lov...</td>\n",
|
| 519 |
+
" </tr>\n",
|
| 520 |
+
" </tbody>\n",
|
| 521 |
+
"</table>\n",
|
| 522 |
+
"</div>"
|
| 523 |
+
],
|
| 524 |
+
"text/plain": [
|
| 525 |
+
" isbn13 authors \\\n",
|
| 526 |
+
"0 9780002005883 Marilynne Robinson \n",
|
| 527 |
+
"1 9780002261982 Charles Osborne;Agatha Christie \n",
|
| 528 |
+
"2 9780006163831 Stephen R. Donaldson \n",
|
| 529 |
+
"3 9780006178736 Sidney Sheldon \n",
|
| 530 |
+
"4 9780006280897 Clive Staples Lewis \n",
|
| 531 |
+
"\n",
|
| 532 |
+
" categories \\\n",
|
| 533 |
+
"0 Fiction \n",
|
| 534 |
+
"1 Detective and mystery stories \n",
|
| 535 |
+
"2 American fiction \n",
|
| 536 |
+
"3 Fiction \n",
|
| 537 |
+
"4 Christian life \n",
|
| 538 |
+
"\n",
|
| 539 |
+
" thumbnail \\\n",
|
| 540 |
+
"0 http://books.google.com/books/content?id=KQZCP... \n",
|
| 541 |
+
"1 http://books.google.com/books/content?id=gA5GP... \n",
|
| 542 |
+
"2 http://books.google.com/books/content?id=OmQaw... \n",
|
| 543 |
+
"3 http://books.google.com/books/content?id=FKo2T... \n",
|
| 544 |
+
"4 http://books.google.com/books/content?id=XhQ5X... \n",
|
| 545 |
+
"\n",
|
| 546 |
+
" description published_year \\\n",
|
| 547 |
+
"0 A NOVEL THAT READERS and critics have been eag... 2004.0 \n",
|
| 548 |
+
"1 A new 'Christie for Christmas' -- a full-lengt... 2000.0 \n",
|
| 549 |
+
"2 Volume Two of Stephen Donaldson's acclaimed se... 1982.0 \n",
|
| 550 |
+
"3 A memorable, mesmerizing heroine Jennifer -- b... 1993.0 \n",
|
| 551 |
+
"4 Lewis' work on the nature of love divides love... 2002.0 \n",
|
| 552 |
+
"\n",
|
| 553 |
+
" average_rating num_pages ratings_count title_and_subtitle \\\n",
|
| 554 |
+
"0 3.85 247.0 361.0 Gilead \n",
|
| 555 |
+
"1 3.83 241.0 5164.0 Spider's Web A Novel \n",
|
| 556 |
+
"2 3.97 479.0 172.0 The One Tree \n",
|
| 557 |
+
"3 3.93 512.0 29532.0 Rage of angels \n",
|
| 558 |
+
"4 4.15 170.0 33684.0 The Four Loves \n",
|
| 559 |
+
"\n",
|
| 560 |
+
" tagged_description \n",
|
| 561 |
+
"0 9780002005883 A NOVEL THAT READERS and critics... \n",
|
| 562 |
+
"1 9780002261982 A new 'Christie for Christmas' -... \n",
|
| 563 |
+
"2 9780006163831 Volume Two of Stephen Donaldson'... \n",
|
| 564 |
+
"3 9780006178736 A memorable, mesmerizing heroine... \n",
|
| 565 |
+
"4 9780006280897 Lewis' work on the nature of lov... "
|
| 566 |
+
]
|
| 567 |
+
},
|
| 568 |
+
"execution_count": 8,
|
| 569 |
+
"metadata": {},
|
| 570 |
+
"output_type": "execute_result"
|
| 571 |
+
}
|
| 572 |
+
],
|
| 573 |
+
"source": [
|
| 574 |
+
"df.head()"
|
| 575 |
+
]
|
| 576 |
+
},
|
| 577 |
+
{
|
| 578 |
+
"cell_type": "code",
|
| 579 |
+
"execution_count": null,
|
| 580 |
+
"id": "96c4d3c9",
|
| 581 |
+
"metadata": {},
|
| 582 |
+
"outputs": [],
|
| 583 |
+
"source": [
|
| 584 |
+
"from googlesearch import search\n",
|
| 585 |
+
"\n",
|
| 586 |
+
"def fetch_first_google_link(query):\n",
|
| 587 |
+
" results = search(query, num_results=1, lang=\"en\")\n",
|
| 588 |
+
" return list(results) if results else None\n",
|
| 589 |
+
"\n",
|
| 590 |
+
"\n"
|
| 591 |
+
]
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"cell_type": "code",
|
| 595 |
+
"execution_count": 25,
|
| 596 |
+
"id": "402a59f3",
|
| 597 |
+
"metadata": {},
|
| 598 |
+
"outputs": [
|
| 599 |
+
{
|
| 600 |
+
"name": "stdout",
|
| 601 |
+
"output_type": "stream",
|
| 602 |
+
"text": [
|
| 603 |
+
"['https://books.google.com/books/about/The_One_Tree.html?id=dXzwAAAAQBAJ&source=kp_cover']\n"
|
| 604 |
+
]
|
| 605 |
+
}
|
| 606 |
+
],
|
| 607 |
+
"source": [
|
| 608 |
+
"print(fetch_first_google_link(\"The One Tree by Stephen R. Donaldson -google books\"))"
|
| 609 |
+
]
|
| 610 |
+
},
|
| 611 |
+
{
|
| 612 |
+
"cell_type": "code",
|
| 613 |
+
"execution_count": null,
|
| 614 |
+
"id": "be3349dc",
|
| 615 |
+
"metadata": {},
|
| 616 |
+
"outputs": [],
|
| 617 |
+
"source": []
|
| 618 |
+
},
|
| 619 |
+
{
|
| 620 |
+
"cell_type": "code",
|
| 621 |
+
"execution_count": null,
|
| 622 |
+
"id": "7da59931",
|
| 623 |
+
"metadata": {},
|
| 624 |
+
"outputs": [],
|
| 625 |
+
"source": []
|
| 626 |
+
}
|
| 627 |
+
],
|
| 628 |
+
"metadata": {
|
| 629 |
+
"kernelspec": {
|
| 630 |
+
"display_name": "venv",
|
| 631 |
+
"language": "python",
|
| 632 |
+
"name": "python3"
|
| 633 |
+
},
|
| 634 |
+
"language_info": {
|
| 635 |
+
"codemirror_mode": {
|
| 636 |
+
"name": "ipython",
|
| 637 |
+
"version": 3
|
| 638 |
+
},
|
| 639 |
+
"file_extension": ".py",
|
| 640 |
+
"mimetype": "text/x-python",
|
| 641 |
+
"name": "python",
|
| 642 |
+
"nbconvert_exporter": "python",
|
| 643 |
+
"pygments_lexer": "ipython3",
|
| 644 |
+
"version": "3.11.9"
|
| 645 |
+
}
|
| 646 |
+
},
|
| 647 |
+
"nbformat": 4,
|
| 648 |
+
"nbformat_minor": 5
|
| 649 |
+
}
|
to_drop.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
768
|
| 2 |
+
806
|
| 3 |
+
1170
|
| 4 |
+
1269
|
| 5 |
+
1311
|
| 6 |
+
1343
|
| 7 |
+
2311
|
| 8 |
+
2389
|
| 9 |
+
2536
|
| 10 |
+
3270
|
| 11 |
+
3572
|
| 12 |
+
4228
|
| 13 |
+
4941
|
| 14 |
+
5292
|
| 15 |
+
5293
|
| 16 |
+
6085
|
vector_search.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|