| |
from collections import namedtuple
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple

import streamlit as st
from gazpacho import Soup, get

from source import Source, Summary
|
|
# Lightweight record describing an article that has been listed by a source
# but whose full text has not been scraped yet.
stub = namedtuple('npr_stub', ['link', 'hed', 'entities', 'source'])
# Plain (non-f) string: the text contains no placeholders to interpolate.
stub.__doc__ = """
• A namedtuple to represent an unscraped news article.

• link is the extension of the article. Added to the source's source_url
    it is used to retrieve the full article and data.
• hed is the headline ('hed' is journalism jargon, as is 'dek' for 'subheader')
• entities is the list of entity names discovered in this headline,
    each entity representing one cluster the article is in.
• source is a reference to the Source object that created the stub.
"""
|
|
|
|
@dataclass
class NPRLite(Source):
    """Implementation of abstract Source class that retrieves via webscraping at text.npr.org/1001"""

    @staticmethod
    def _as_list(found) -> list:
        """Normalize a gazpacho ``find`` result (None, a single Soup, or a list) to a list."""
        if found is None:
            return []
        if isinstance(found, list):
            return found
        return [found]

    def retrieve_cluster_data(self, limit: Optional[int] = None) -> Tuple[List[stub], int]:
        """Create an article stub for each article listed on text.npr.org.

        Args:
            limit: Maximum number of listed articles to keep, counted from the
                top of the listing. ``None`` keeps every listed article.

        Returns:
            A 2-tuple ``(stubs, count)``: the list of ``stub`` records (link,
            headline, an empty entity list, and this source) and the number
            of headlines kept after applying ``limit``.
        """
        soup = Soup(get(self.source_url))

        # Walk the listing once; every anchor supplies both the headline text
        # and the link. Each hop is guarded because ``find`` may return None
        # when the expected element is absent from the page.
        topic = soup.find('div', {'class': 'topic-container'})
        listing = topic.find('ul') if isinstance(topic, Soup) else None
        anchors = self._as_list(listing.find('a')) if isinstance(listing, Soup) else []

        if limit is not None:
            anchors = anchors[:limit]

        article_tuples = [stub(a.attrs['href'], a.text, [], self) for a in anchors]

        return article_tuples, len(anchors)

    def retrieve_article(self, indata: stub) -> Tuple[Optional[str], Optional[List[Any]]]:
        """Retrieve the text and metadata of one article from text.npr.org.

        Args:
            indata: The ``stub`` produced by ``retrieve_cluster_data`` for the
                article to fetch.

        Returns:
            ``(whole_text, article_data)`` on success, where ``article_data``
            is the list ``[source, entities, link, hed, None, date,
            [author], word_count]``. Returns ``(None, None)`` when the page
            does not have the expected shape: no paragraph container, fewer
            than two body paragraphs, or fewer than two story-head lines.
        """
        # The last five characters of source_url are dropped before the link
        # is appended (presumably the '/1001' listing path -- confirm).
        url = self.source_url[:-5] + indata.link
        st.write(f"""Retrieving article from:\n\t{url}\n""")
        container = Soup(get(url))

        body = container.find('div', {'class': "paragraphs-container"})
        text_container = body.find('p') if isinstance(body, Soup) else None
        if not isinstance(text_container, list):
            # None (nothing found) or a lone Soup (a single paragraph) is not
            # treated as a usable article.
            return None, None

        # NOTE(review): paragraphs are joined with no separator, so the last
        # word of one paragraph runs into the first word of the next.
        whole_text = ''.join([art.strip() for art in text_container])

        story_head = container.find('div', {'class': 'story-head'})
        head_lines = story_head.find('p') if isinstance(story_head, Soup) else None
        if not isinstance(head_lines, list) or len(head_lines) < 2:
            # Byline and date are read positionally; without both lines the
            # metadata cannot be built.
            return None, None

        # Drop the first three characters of the byline
        # (presumably a leading "By " -- confirm against the live markup).
        author = head_lines[0].text[3:]
        story_date = head_lines[1].text

        return whole_text, [
            self,
            indata.entities,
            indata.link,
            indata.hed,
            None,  # never populated here (presumably the subhead slot -- confirm)
            story_date,
            [author],
            len(whole_text.split(' ')),
        ]
| |
|
|
@dataclass
class CNNText(Source):
    """Implementation of abstract Source class that retrieves via webscraping at lite.cnn.com"""

    # CSS class of the <div> that wraps both the headline listing and the
    # article body. Kept in one place because it is used by both methods.
    # (No annotation on purpose: it must not become a dataclass field.)
    _CONTAINER_CLASS = 'afe4286c'

    # Number of times an article download is attempted before giving up.
    _FETCH_ATTEMPTS = 2

    @staticmethod
    def _as_list(found) -> list:
        """Normalize a gazpacho ``find`` result (None, a single Soup, or a list) to a list."""
        if found is None:
            return []
        if isinstance(found, list):
            return found
        return [found]

    def retrieve_cluster_data(self, limit: Optional[int] = None) -> Tuple[List[stub], int]:
        """Create a stub for each article listed on lite.cnn.com.

        Headlines containing 'Opinion' or 'Analysis' are skipped.

        Args:
            limit: Maximum number of listed articles to consider, counted from
                the top of the listing. ``None`` considers every article.

        Returns:
            A 2-tuple ``(stubs, count)``. ``count`` is the number of headlines
            kept after applying ``limit`` but *before* the Opinion/Analysis
            filter, so it can exceed ``len(stubs)``.
        """
        soup = Soup(get(self.source_url))

        # Walk the listing once; every anchor supplies both headline and link.
        listing = soup.find('div', {'class': self._CONTAINER_CLASS})
        anchors = self._as_list(listing.find('a')) if isinstance(listing, Soup) else []

        if limit is not None:
            anchors = anchors[:limit]

        article_tuples = [
            stub(a.attrs['href'], a.text, [], self)
            for a in anchors
            if 'Opinion' not in a.text and 'Analysis' not in a.text
        ]

        return article_tuples, len(anchors)

    def retrieve_article(self, indata: stub) -> Tuple[Optional[str], Optional[List[Any]]]:
        """Retrieve the text and metadata of one article from lite.cnn.com.

        The download is attempted up to two times; any exception raised while
        fetching or parsing is printed and the attempt is repeated.

        Args:
            indata: The ``stub`` produced by ``retrieve_cluster_data`` for the
                article to fetch.

        Returns:
            ``(whole_text, article_data)`` on success, where ``article_data``
            is the list ``[source, entities, link, hed, None, date,
            [author], word_count]``. Returns ``(None, None)`` when every
            download attempt fails or the page lacks the expected story
            container, byline, date, or paragraph list.
        """
        url = self.source_url + indata.link
        st.write(f"""Retrieving article from:\n\t{url}\n""")

        container = None
        for _ in range(self._FETCH_ATTEMPTS):
            try:
                container = Soup(get(url))
                break
            except Exception as e:
                # Deliberately broad: this is a best-effort fetch, and a
                # failed attempt is reported and retried rather than raised.
                print(f"Error:\n{e}")
                print(f"Problem url: \n\t{url}")
        if container is None:
            return None, None

        story_container = container.find('div', {'class': self._CONTAINER_CLASS})
        if not isinstance(story_container, Soup):
            return None, None

        byline = story_container.find('p', {'id': 'byline'})
        published = story_container.find('p', {'id': 'published datetime'})
        paragraphs = story_container.find('p')
        if not (
            isinstance(byline, Soup)
            and isinstance(published, Soup)
            and isinstance(paragraphs, list)
        ):
            # The page does not have the expected layout; report failure the
            # same way a failed download is reported.
            return None, None

        author = byline.text
        # Drop the first nine characters of the date line
        # (presumably an "Updated: "-style label -- confirm).
        story_date = published.text[9:]

        # The first four <p> elements are skipped
        # (presumably header/metadata lines rather than body text -- confirm).
        body_paragraphs = paragraphs[4:]

        # NOTE(review): paragraphs are joined with no separator, so words at
        # paragraph boundaries run together in the text and the word count.
        whole_text = ''.join([p.text for p in body_paragraphs if p.text is not None])
        article_data = [
            self,
            indata.entities,
            indata.link,
            indata.hed,
            None,  # never populated here (presumably the subhead slot -- confirm)
            story_date,
            [author],
            len(whole_text.split(' ')),
        ]

        return whole_text, article_data
|
|