| |
| from abc import ABC, abstractmethod |
| from dataclasses import dataclass |
| from collections import namedtuple |
| from typing import List, Optional |
|
|
# Record type for one summarized article. Field order is part of the public
# interface (callers may construct Summary positionally), so do not reorder.
Summary = namedtuple(
    "Summary",
    [
        "source",
        "cluster_list",
        "link_ext",
        "hed",
        "dek",
        "date",
        "authors",
        "original_length",
        "summary_text",
        "summary_length",
        "chunk_time",
        "query_time",
        "mean_query_time",
        "summary_time",
    ],
)
# Plain string on purpose: there are no placeholders to interpolate, so an
# f-string prefix would only risk accidental brace expansion later.
Summary.__doc__ = """
Summary: a namedtuple for storing summaries and relevant metadata.

    • source: A Source object for the source of the summarized document.
    • cluster_list: A list of the NER entities detected in this article's hed (headline).
    • link_ext: The link extension of the article (on the base url, the source's source_url).
    • hed, dek: Headline and subheader. These are standard industry terms.
      dek is None if not applicable.
    • date: Date of publication/update listed in the article.
    • authors: List of authors, currently a string containing the byline.
    • original_length: Length of the original article.
    • summary_text: List of summarized chunks.
    • summary_length: Length of the summary text.
    • chunk_time, query_time, mean_query_time, summary_time: Run statistics
      recorded alongside the summary (exact units/semantics are defined by
      the code that fills them in, not by this module).
"""
|
|
@dataclass
class Source(ABC):
    """Abstract base class describing a source of documents.

    A concrete source subclasses this and implements both retrieval hooks
    below. Because they are abstract, ``Source`` itself (and any subclass
    that leaves one unimplemented) cannot be instantiated.
    """

    # Name of the source.
    source_name: Optional[str] = ""
    # Base URL of the source; article link extensions are taken relative to it.
    source_url: Optional[str] = ""

    # Checkpoints used for summarization and NER for this source
    # (presumably model names or paths -- confirm against the callers).
    source_summarization_checkpoint: Optional[str] = ""
    source_ner_checkpoint: Optional[str] = ""

    @abstractmethod
    def retrieve_cluster_data(self) -> List[namedtuple]:
        """Retrieve the data used to create clusters.

        User must implement this source-dependent method.
        This gets called when clustering is performed.

        Returns:
            A list of namedtuple records, one per item to be clustered.
        """

    @abstractmethod
    def retrieve_article(self) -> List[namedtuple]:
        """Retrieve texts for summarization.

        User must implement this source-dependent method.
        This gets called once topics for digestion have been selected.

        Returns:
            A list of namedtuple records holding the texts to summarize.
        """
|
|