import ssl
import time
import warnings

import feedparser
import html2text
from langchain.docstore.document import Document
|
|
|
|
class RSS_Url_loader:
    """Load RSS/Atom feed entries as langchain ``Document`` objects.

    Each feed URL is parsed with ``feedparser``; every entry's HTML body is
    converted to plain text with ``html2text`` and wrapped in a ``Document``
    whose metadata carries the entry's ``title`` and ``link``.
    """

    def __init__(self, urls=None, interval=60):
        """Store the feed URLs and the polling interval.

        Args:
            urls: A single URL string or a list of URL strings. ``None``
                (the default) leaves the loader with no URLs.
            interval: Value stored on ``self.interval``; nothing in this
                class currently reads it back except as documented on
                ``scheduled_execution``.

        Any other type for ``urls`` is not fatal: a warning is emitted and
        the loader is left with an empty URL list.
        """
        self.urls = []
        self.interval = interval
        if urls is None:
            return
        if isinstance(urls, str):
            self.urls = [urls]
        elif isinstance(urls, list):
            self.urls = urls
        else:
            # The original built a ``Warning(...)`` instance and dropped it,
            # so callers never learned their argument was ignored.
            warnings.warn('urls must be a list or a string.', stacklevel=2)

    def scheduled_execution(self):
        """Load every configured feed once and return the documents.

        NOTE(review): the previous body was a ``while True`` loop that
        returned right after the first ``load()``, which left its
        ``time.sleep(self.interval)`` unreachable. The observable behavior
        (one load, then return) is kept; if periodic polling was intended,
        it needs a design that does not return from inside the loop.

        Returns:
            The list produced by ``load()``.
        """
        return self.load()

    def load(self):
        """Fetch and parse all feeds in ``self.urls``.

        Returns:
            A list of ``Document`` objects, one per feed entry, in feed
            order. Empty when no URLs are configured.
        """
        # NOTE(security): this replaces the ssl module's default HTTPS
        # context factory with the unverified one, which turns off
        # certificate verification for the whole process, not only for the
        # feed requests below. Kept for compatibility with feeds served
        # over broken TLS; reconsider before using with untrusted networks.
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context

        documents = []
        for url in self.urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                # Prefer the full content body; fall back to the shorter
                # description/summary, and finally to an empty string so a
                # single sparse entry cannot abort the whole load.
                content = entry.get("content")
                if content:
                    raw_html = content[0].value
                else:
                    raw_html = (
                        entry.get("description")
                        or entry.get("summary")
                        or ""
                    )
                text = html2text.html2text(raw_html)
                metadata = {
                    "title": entry.get("title", ""),
                    "link": entry.get("link", ""),
                }
                documents.append(Document(page_content=text, metadata=metadata))
        return documents
|
|
if __name__ == "__main__":
    # Manual smoke test: fetch two public feeds and print every document.
    feed_urls = ["https://www.zhihu.com/rss", "https://www.36kr.com/feed"]
    for document in RSS_Url_loader(feed_urls).load():
        print(document)