| |
| """WarOnlineForum.ipynb""" |
|
|
| |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import re |
| import pandas as pd |
| import urllib.request as urllib |
| import warnings |
| warnings.filterwarnings("ignore") |
|
|
| |
# Module-level accumulator: one row per (quoted message, reply) pair scraped
# from the forum; filled in-place by collectDataFromPage() and written to CSV
# at the end of the script.
corpus = pd.DataFrame(columns=['Quote', 'Response'])
|
|
def remove_substring(string, substring):
    """Delete the whitespace-delimited word containing *substring*.

    Only the first occurrence is handled. The space that terminated the
    removed word is kept (callers re-normalise whitespace afterwards).
    Returns *string* unchanged when *substring* does not occur.
    """
    hit = string.find(substring)
    if hit == -1:
        return string

    # Word boundaries: character after the previous space, up to (but not
    # including) the next space; rfind() returning -1 conveniently gives 0.
    word_start = string.rfind(" ", 0, hit) + 1
    word_end = string.find(" ", hit)
    if word_end < 0:
        word_end = len(string)

    return string[:word_start] + string[word_end:]
|
|
def remove_attachments(string, substring='Посмотреть вложение'):
    """Remove a forum attachment reference ("Посмотреть вложение <id>").

    Cuts the first occurrence of the marker plus the token following it
    (the attachment id) out of *string*. Returns *string* unchanged when
    the marker is absent.

    Bug fixed: the previous version searched for the terminating space
    starting at the marker itself — but the default marker contains a
    space, so only "Посмотреть" was ever removed and "вложение <id>" was
    left behind. The search now starts past the end of the marker.
    """
    index = string.find(substring)
    if index == -1:
        return string

    # Skip the marker and the separator after it, then cut up to the next
    # space so the attachment-id token is removed as well.
    end_index = string.find(" ", index + len(substring) + 1)
    if end_index == -1:
        end_index = len(string)

    return string[:index] + string[end_index:]
|
|
def collectDataFromPage(url):
    """Scrape one forum page and append (quote, response) pairs to `corpus`.

    Fetches *url*, locates every message body (``div.bbWrapper``) and, for
    each message that quotes another post, extracts the quoted text and the
    reply text, cleans both (URLs, bare .com tokens, attachment markers,
    @mentions, XenForo "click to expand" boilerplate), and appends
    non-empty pairs to the module-level `corpus` DataFrame.

    Fixed: the original wrapped the whole body in a bare ``except: pass``,
    silently hiding every error; only the missing-<blockquote> case
    (AttributeError on ``.text``) is expected, so only that is caught.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    for message_content in soup.find_all("div", class_="bbWrapper"):
        message_text = message_content.text.strip()

        # Messages without a <blockquote> quote nothing — skip them.
        try:
            quoted_text = message_content.find("blockquote").text.strip()
        except AttributeError:
            continue

        quoted_text = ''.join(BeautifulSoup(quoted_text, "html.parser").findAll(string=True))
        quoted_text = quoted_text.replace('Нажмите для раскрытия...', '')
        message_text = message_text.replace('Нажмите для раскрытия...', '')

        # The quote header ends with "(а): " (e.g. "X написал(а): ");
        # keep only the text after it, then strip URLs and noise tokens.
        Quote = re.sub(r'http\S+', '', ' '.join(quoted_text.split()).partition('(а): ')[2])
        Quote = remove_substring(Quote, ".com")
        Quote = remove_attachments(Quote)
        Quote = ' '.join(remove_substring(Quote, "@").split())

        # The reply is the message body minus the quoted text.
        Message = ' '.join(message_text.replace(quoted_text, '').split())
        Message = remove_substring(Message, ".com")
        Message = remove_attachments(Message)
        Message = ' '.join(remove_substring(Message, "@").split())

        if Message and Quote:
            corpus.loc[len(corpus)] = [Quote, Message]
|
|
def compare_pages(url1, url2):
    """Heuristic page equality: True when both HTML bodies have equal length."""
    length_a = len(requests.get(url1).text)
    length_b = len(requests.get(url2).text)
    return length_a == length_b
|
|
def compare_pages2(url1, url2):
    """True when both URLs resolve (after redirects) to the same final URL.

    Used by the pagination loops below: when /page-N and /page-N+1 land on
    the same final URL, page N is treated as the last page.
    """
    final1 = urllib.urlopen(url1).geturl()
    final2 = urllib.urlopen(url2).geturl()
    return final1 == final2
|
|
|
|
def pages_of_thread(thread, startingPage=1):
    """Scrape every page of *thread*, starting at *startingPage*.

    Stops when a page fails to load (non-200 status) or when the next
    page URL resolves to the same final page as the current one (i.e. the
    current page is the last one).
    """
    page = startingPage
    while True:
        page_url = thread + '/page-' + str(page)
        if requests.get(page_url).status_code != 200:
            break
        collectDataFromPage(url=page_url)
        print(f'finished page #{page}')
        next_url = thread + '/page-' + str(page + 1)
        if compare_pages2(page_url, next_url):
            break
        page += 1
|
|
| |
| |
|
|
| """______________________________________ Main Code __________________________________________""" |
|
|
| |
base_url = 'https://waronline.org'

# Sub-forum index to crawl (URL-encoded Russian title).
url = "https://waronline.org/fora/index.php?forums/%D0%92%D0%9C%D0%A4-%D0%B3%D1%80%D0%B0%D0%B6%D0%B4%D0%B0%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%84%D0%BB%D0%BE%D1%82.12/"

base_page = 1
lastSubForumPage = False

# Walk every page of the sub-forum: collect thread links from the page,
# scrape every thread, then advance until the pagination runs out.
while not lastSubForumPage:
    response = requests.get(url + 'page-' + str(base_page))
    forum_threads = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        # Thread links contain 'threads' in their href; strip the trailing
        # path segment (post anchor etc.) to get the thread's root URL.
        for link in soup.find_all("a"):
            lnk = link.get("href")
            if lnk and 'threads' in lnk:
                forum_threads.append((base_url + lnk).rsplit("/", 1)[0])

        # De-duplicate: the same thread is linked several times per page.
        forum_threads = list(set(forum_threads))

        for trd in forum_threads:
            pages_of_thread(trd)
            print(f'finished thread: {trd}')

        if not compare_pages2(url + 'page-' + str(base_page), url + 'page-' + str(base_page + 1)):
            print(f'finished subforum page #{base_page}')
            base_page += 1
        else:
            lastSubForumPage = True
    else:
        print("Failed to load the page")
        lastSubForumPage = True

# Normalise the corpus: lower-case both text columns.
corpus['Quote'] = corpus['Quote'].apply(lambda x: x.lower() if isinstance(x, str) else x)
corpus['Response'] = corpus['Response'].apply(lambda x: x.lower() if isinstance(x, str) else x)

# NOTE(review): the original script had
#     corpus.Quote.str.replace('[^a-zA-Z]', '')
#     corpus.Response.str.replace('[^a-zA-Z]', '')
# with the results discarded — Series.str.replace returns a new Series, so
# these were no-ops. Assigning them back would also erase every Cyrillic
# character (the corpus language), so the dead statements were removed
# rather than "fixed".

pathToDrive = ''
filename = 'part5.csv'
corpus.to_csv(pathToDrive + filename, index=False)