isan2001 committed on
Commit
4d599ef
·
verified ·
1 Parent(s): 22904bf

Upload scrap.ipynb

Browse files
Files changed (1) hide show
  1. scrap.ipynb +266 -0
scrap.ipynb ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Requirement already satisfied: selenium in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (4.23.1)\n",
13
+ "Requirement already satisfied: urllib3<3,>=1.26 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from urllib3[socks]<3,>=1.26->selenium) (2.2.2)\n",
14
+ "Requirement already satisfied: trio~=0.17 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from selenium) (0.26.0)\n",
15
+ "Requirement already satisfied: trio-websocket~=0.9 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from selenium) (0.11.1)\n",
16
+ "Requirement already satisfied: certifi>=2021.10.8 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from selenium) (2024.7.4)\n",
17
+ "Requirement already satisfied: typing_extensions~=4.9 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from selenium) (4.12.2)\n",
18
+ "Requirement already satisfied: websocket-client~=1.8 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from selenium) (1.8.0)\n",
19
+ "Requirement already satisfied: attrs>=23.2.0 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from trio~=0.17->selenium) (23.2.0)\n",
20
+ "Requirement already satisfied: sortedcontainers in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from trio~=0.17->selenium) (2.4.0)\n",
21
+ "Requirement already satisfied: idna in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from trio~=0.17->selenium) (3.7)\n",
22
+ "Requirement already satisfied: outcome in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from trio~=0.17->selenium) (1.3.0.post0)\n",
23
+ "Requirement already satisfied: sniffio>=1.3.0 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from trio~=0.17->selenium) (1.3.1)\n",
24
+ "Requirement already satisfied: cffi>=1.14 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from trio~=0.17->selenium) (1.16.0)\n",
25
+ "Requirement already satisfied: wsproto>=0.14 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from trio-websocket~=0.9->selenium) (1.2.0)\n",
26
+ "Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1)\n",
27
+ "Requirement already satisfied: pycparser in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from cffi>=1.14->trio~=0.17->selenium) (2.22)\n",
28
+ "Requirement already satisfied: h11<1,>=0.9.0 in c:\\users\\isan\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0)\n"
29
+ ]
30
+ },
31
+ {
32
+ "name": "stderr",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "\n",
36
+ "[notice] A new release of pip is available: 24.0 -> 24.1.2\n",
37
+ "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
38
+ ]
39
+ },
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "^C\n"
45
+ ]
46
+ },
47
+ {
48
+ "name": "stderr",
49
+ "output_type": "stream",
50
+ "text": [
51
+ "ERROR: unknown command \"Install\" - maybe you meant \"install\"\n",
52
+ "\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "!pip install selenium\n",
58
+ "!pip install wget"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "from selenium import webdriver\n",
68
+ "from selenium.webdriver.common.keys import Keys\n",
69
+ "from selenium.webdriver.support import expected_conditions as EC\n",
70
+ "from selenium.webdriver.common.by import By\n",
71
+ "from selenium.webdriver.support.wait import WebDriverWait\n",
72
+ "import time, urllib.request, requests"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "driver = webdriver.Chrome()"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": null,
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "driver.get(\"http://instagram.com\")"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, \"input[name='username']\")))\n",
100
+ "password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, \"input[name='password']\")))"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": [
109
+ "username.clear()\n",
110
+ "username.send_keys(\"firdaus_ihsan20\")\n",
111
+ "password.clear()\n",
112
+ "password.send_keys(\"Herlina20\")"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, \"button[type='submit']\"))).click()"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": null,
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "time.sleep(5)\n",
131
+ "alert = WebDriverWait(driver,15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '[role=\"button\"][tabindex=\"0\"]'))).click()"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 10,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "time.sleep(5)\n",
141
+ "alert = WebDriverWait(driver,15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), \"Not Now\")]'))).click()"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 18,
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "driver.get(\"http://instagram.com/detikcom/\")"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 20,
156
+ "metadata": {},
157
+ "outputs": [],
158
+ "source": [
159
+ "n_scrolls = 3\n",
160
+ "for j in range(0, n_scrolls):\n",
161
+ " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
162
+ " time.sleep(5) "
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 21,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "['https://www.instagram.com/p/C92k86NyqGx/', 'https://www.instagram.com/p/C92gRMDs6R-/', 'https://www.instagram.com/p/C92bj6EshrZ/', 'https://www.instagram.com/p/C92ZUWos_ZT/', 'https://www.instagram.com/p/C92SY51MAZ_/', 'https://www.instagram.com/p/C92O9cvMXqt/', 'https://www.instagram.com/p/C92BOUIMj-b/', 'https://www.instagram.com/p/C92AS9ySjlx/', 'https://www.instagram.com/p/C911GNGJvnp/', 'https://www.instagram.com/p/C91wgp0sAAp/', 'https://www.instagram.com/p/C91vj4LxOxx/', 'https://www.instagram.com/p/C91kDvlsyro/', 'https://www.instagram.com/p/C91ii9vM0hq/', 'https://www.instagram.com/p/C91YS5lvfoy/', 'https://www.instagram.com/p/C91UnYXo8rP/', 'https://www.instagram.com/p/C91Q9C9JDEG/', 'https://www.instagram.com/p/C91Ojq3SYFL/', 'https://www.instagram.com/p/C91KotiI_pA/', 'https://www.instagram.com/p/C91Ecr7PqoX/', 'https://www.instagram.com/p/C908jj3oeFU/', 'https://www.instagram.com/p/C9ztoFIs3hf/', 'https://www.instagram.com/p/C9zqKtHsvwJ/', 'https://www.instagram.com/p/C9zpJgBsdnC/', 'https://www.instagram.com/p/C9zocQMMjOy/']\n"
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "posts = []\n",
180
+ "links = driver.find_elements(By.TAG_NAME, 'a')\n",
181
+ "for link in links:\n",
182
+ " post = link.get_attribute('href')\n",
183
+ " if post and '/p/' in post:\n",
184
+ " posts.append(post)\n",
185
+ "\n",
186
+ "print(posts)"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 9,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "searchbox = WebDriverWait(driver,15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '[aria-label = \"Search\"]'))).click()"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 10,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "searchboxx = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, \"//input[@placeholder='Search']\")))"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 11,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "searchboxx.clear()\n",
214
+ "keyword = \"detikcom\"\n",
215
+ "searchboxx.send_keys(keyword)\n",
216
+ "time.sleep(5)\n",
217
+ "searchboxx.send_keys(Keys.ENTER)\n",
218
+ "searchboxx.send_keys(Keys.ENTER)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 74,
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "ename": "TimeoutException",
228
+ "evalue": "Message: \nStacktrace:\n\tGetHandleVerifier [0x00007FF6C2E99632+30946]\n\t(No symbol) [0x00007FF6C2E4E3C9]\n\t(No symbol) [0x00007FF6C2D46FDA]\n\t(No symbol) [0x00007FF6C2D9822C]\n\t(No symbol) [0x00007FF6C2D9850C]\n\t(No symbol) [0x00007FF6C2DDDCB7]\n\t(No symbol) [0x00007FF6C2DBCAAF]\n\t(No symbol) [0x00007FF6C2DDB041]\n\t(No symbol) [0x00007FF6C2DBC813]\n\t(No symbol) [0x00007FF6C2D8A6E5]\n\t(No symbol) [0x00007FF6C2D8B021]\n\tGetHandleVerifier [0x00007FF6C2FCF83D+1301229]\n\tGetHandleVerifier [0x00007FF6C2FDBDB7+1351783]\n\tGetHandleVerifier [0x00007FF6C2FD2A03+1313971]\n\tGetHandleVerifier [0x00007FF6C2ECDD06+245686]\n\t(No symbol) [0x00007FF6C2E5758F]\n\t(No symbol) [0x00007FF6C2E53804]\n\t(No symbol) [0x00007FF6C2E53992]\n\t(No symbol) [0x00007FF6C2E4A3EF]\n\tBaseThreadInitThunk [0x00007FF80AC1257D+29]\n\tRtlUserThreadStart [0x00007FF80C16AF28+40]\n",
229
+ "output_type": "error",
230
+ "traceback": [
231
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
232
+ "\u001b[1;31mTimeoutException\u001b[0m Traceback (most recent call last)",
233
+ "Cell \u001b[1;32mIn[74], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m5\u001b[39m)\n\u001b[1;32m----> 2\u001b[0m ambil \u001b[38;5;241m=\u001b[39m \u001b[43mWebDriverWait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdriver\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muntil\u001b[49m\u001b[43m(\u001b[49m\u001b[43mEC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43melement_to_be_clickable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mXPATH\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m//input[@placeholder=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSearch\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m]// a[1]\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mclick()\n",
234
+ "File \u001b[1;32mc:\\Users\\isan\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\selenium\\webdriver\\support\\wait.py:105\u001b[0m, in \u001b[0;36mWebDriverWait.until\u001b[1;34m(self, method, message)\u001b[0m\n\u001b[0;32m 103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time\u001b[38;5;241m.\u001b[39mmonotonic() \u001b[38;5;241m>\u001b[39m end_time:\n\u001b[0;32m 104\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m--> 105\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m TimeoutException(message, screen, stacktrace)\n",
235
+ "\u001b[1;31mTimeoutException\u001b[0m: Message: \nStacktrace:\n\tGetHandleVerifier [0x00007FF6C2E99632+30946]\n\t(No symbol) [0x00007FF6C2E4E3C9]\n\t(No symbol) [0x00007FF6C2D46FDA]\n\t(No symbol) [0x00007FF6C2D9822C]\n\t(No symbol) [0x00007FF6C2D9850C]\n\t(No symbol) [0x00007FF6C2DDDCB7]\n\t(No symbol) [0x00007FF6C2DBCAAF]\n\t(No symbol) [0x00007FF6C2DDB041]\n\t(No symbol) [0x00007FF6C2DBC813]\n\t(No symbol) [0x00007FF6C2D8A6E5]\n\t(No symbol) [0x00007FF6C2D8B021]\n\tGetHandleVerifier [0x00007FF6C2FCF83D+1301229]\n\tGetHandleVerifier [0x00007FF6C2FDBDB7+1351783]\n\tGetHandleVerifier [0x00007FF6C2FD2A03+1313971]\n\tGetHandleVerifier [0x00007FF6C2ECDD06+245686]\n\t(No symbol) [0x00007FF6C2E5758F]\n\t(No symbol) [0x00007FF6C2E53804]\n\t(No symbol) [0x00007FF6C2E53992]\n\t(No symbol) [0x00007FF6C2E4A3EF]\n\tBaseThreadInitThunk [0x00007FF80AC1257D+29]\n\tRtlUserThreadStart [0x00007FF80C16AF28+40]\n"
236
+ ]
237
+ }
238
+ ],
239
+ "source": [
240
+ "time.sleep(5)\n",
241
+ "ambil = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, \"//input[@placeholder='Search']// a[1]\"))).click()"
242
+ ]
243
+ }
244
+ ],
245
+ "metadata": {
246
+ "kernelspec": {
247
+ "display_name": "Python 3",
248
+ "language": "python",
249
+ "name": "python3"
250
+ },
251
+ "language_info": {
252
+ "codemirror_mode": {
253
+ "name": "ipython",
254
+ "version": 3
255
+ },
256
+ "file_extension": ".py",
257
+ "mimetype": "text/x-python",
258
+ "name": "python",
259
+ "nbconvert_exporter": "python",
260
+ "pygments_lexer": "ipython3",
261
+ "version": "3.12.4"
262
+ }
263
+ },
264
+ "nbformat": 4,
265
+ "nbformat_minor": 2
266
+ }