|
|
| import json |
| import string |
|
|
| import wikipedia |
| from langchain import PromptTemplate |
| from langchain.vectorstores import Chroma |
| from langchain.text_splitter import CharacterTextSplitter |
|
|
| from src.tools.llms import openai_llm |
| from src.tools.wiki import Wiki |
|
|
|
|
|
|
|
|
def get_wikilist(task: dict) -> list:
    """
    Ask the LLM for wikipedia page titles useful for writing the given
    paragraph, then expand each title with related wikipedia search results.

    :param task: dict with keys 'description', 'doc_description', 'above',
        'before' and 'after' describing the paragraph to write and its
        position in the document hierarchy.
    :returns: deduplicated list of wikipedia page titles (order not preserved).
    """

    llm = openai_llm
    # The task fields are interpolated immediately via f-string, so the
    # prompt is complete as-is; no PromptTemplate variables are needed.
    # (The original built an unused PromptTemplate here — removed.)
    template = (f"\n"
                f" Your task consists in finding the list of wikipedia page titles which provide useful content "
                f" for a paragraph whose description is delimited by triple backticks: ```{task['description']}```\n"
                f" \n"
                f" The paragraph belongs at the top level of the hierarchy to a document"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" \n"
                f" Format your response as a JSON list of strings separated by commas.\n"
                f" \n"
                f"\n"
                f" ")

    llm_list = llm(template)
    wikilist = extract_list(llm_list)

    # Broaden coverage: replace each suggested title by the top
    # `expand_factor` wikipedia search hits for that title.
    expand_factor = 2
    expanded_wikilist = []
    for wikipage in wikilist:
        # second positional arg of wikipedia.search is the max number of results
        expanded_wikilist += wikipedia.search(wikipage, expand_factor)

    # set() removes duplicates introduced by overlapping searches.
    wikilist = list(set(expanded_wikilist))

    return wikilist
|
|
|
|
def extract_list(llm_list: str) -> list:
    """
    Extract a list of wikipedia page titles from a raw LLM response.

    The prompt asks for a JSON list of strings, so a strict ``json.loads``
    is attempted first; if the response is not valid JSON, fall back to the
    quote-splitting heuristic. Elements that look like separators or noise
    (too short, or less than 3/4 ASCII letters) are filtered out.

    :param llm_list: raw text returned by the LLM.
    :returns: list of plausible page-title strings; empty list on failure.
    """
    print(llm_list)

    def filter_(el: str) -> bool:
        # Keep only plausible titles: longer than 2 chars and mostly
        # ASCII letters (rejects ", " separators and stray punctuation).
        if len(el) <= 2:
            return False
        usable_length = sum(c in string.ascii_letters for c in el)
        return len(el) * 3 / 4 < usable_length

    # Preferred path: the response is the JSON list we asked for.
    try:
        parsed = json.loads(llm_list)
        if isinstance(parsed, list):
            wikilist = [el for el in parsed if isinstance(el, str) and filter_(el)]
            print(wikilist)
            return wikilist
    except (ValueError, TypeError):
        # Not valid JSON (or not a string at all) — fall through to heuristic.
        pass

    # Fallback: strip the outer brackets and split on double quotes.
    try:
        wikilist = llm_list[1:-1].split('"')
        wikilist = [el for el in wikilist if filter_(el)]
        print(wikilist)
    except (AttributeError, TypeError):
        # Narrowed from a bare except: only input-shape errors are expected here.
        wikilist = []
        print('issues with the wikilist')
    return wikilist
|
|
|
|
def get_public_paragraph(task: dict) -> str:
    """Generate the paragraph directly with the LLM, with no retrieval context.

    :param task: dict with keys 'description', 'doc_description', 'above',
        'before' and 'after' describing the paragraph to write and its
        position in the document hierarchy.
    :returns: the paragraph text produced by the LLM.
    """

    llm = openai_llm
    # Task fields are interpolated immediately via f-string; the template is
    # sent to the LLM as-is. Note the `\\n` sequences are deliberate literal
    # backslash-n characters in the prompt text, not newlines.
    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    p = llm(template)

    return p
|
|
|
|
def create_index(wikilist: list):
    """
    Build a Chroma vector index from the given wikipedia page titles.

    Pages whose fetch fails (``Wiki().fetch`` returns a ``str`` error message
    in that case — TODO confirm against ``src.tools.wiki``) are skipped.

    :param wikilist: list of wikipedia page titles to fetch and index.
    :returns: a Chroma index built over sentence-split chunks.
    """
    fetch = Wiki().fetch

    # Fetch each page exactly once (the original fetched every title twice:
    # once in the filter and once in the tuple — two network round-trips).
    pages = []
    for title in wikilist:
        page = fetch(title)
        if not isinstance(page, str):
            pages.append((title, page))

    chunk = 800
    texts = [WikiPage(title=title, fulltext=page.page_content)
             for title, page in pages]

    doc_splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=chunk,
        chunk_overlap=100,
        length_function=len,
    )

    # NOTE(review): only the FIRST fetched page is chunked and indexed —
    # looks like a bug (should this loop over all of `texts`?), preserved
    # here to avoid a behavior change; confirm with the author.
    paragraphs = texts[0].get_paragraphs(chunk=chunk)

    split_texts = []
    for p in paragraphs:
        split_texts += doc_splitter.split_text(p)

    # Sanity-check the splitter output before indexing.
    for split_text in split_texts:
        assert isinstance(split_text, str)
        assert 0 < len(split_text) < 2 * 500

    wiki_index = Chroma.from_texts(split_texts)

    return wiki_index
|
|
|
|
def get_wiki_paragraph(wiki_index, task: dict) -> str:
    """Generate the paragraph grounded in semantically-similar wiki chunks.

    A draft paragraph is first generated without context and used as the
    semantic-search query against the wiki index; the retrieved chunks are
    then injected into the generation prompt.

    :param wiki_index: vector index over wiki chunks (built by create_index).
    :param task: dict with keys 'description', 'doc_description', 'above',
        'before' and 'after' describing the paragraph and its position.
    :returns: the paragraph text produced by the LLM.
    """

    task_description = get_public_paragraph(task)
    wiki_paragraphs = semantic_search(wiki_index, task_description)
    text_content = ""
    for p in wiki_paragraphs:
        # Fixed: was "/n/n" (literal slashes); chunks are now separated by
        # blank lines as intended.
        text_content += p.page_content + "\n\n"

    # Note: the `\\n` sequences below are deliberate literal backslash-n
    # characters in the prompt text, not newlines.
    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    p = llm(template)

    return p
|
|
|
|
def get_private_paragraph(texts, task: dict) -> str:
    """Generate the paragraph grounded in the caller-supplied text snippets.

    :param texts: iterable of strings to use as source material for the
        generation prompt.
    :param task: dict with keys 'description', 'doc_description', 'above',
        'before' and 'after' describing the paragraph and its position.
    :returns: the paragraph text produced by the LLM.
    """

    # Fixed: was "/n/n" (literal slashes); snippets are now separated by
    # blank lines as intended.
    text_content = ""
    for t in texts:
        text_content += t + "\n\n"

    # Note: the `\\n` sequences below are deliberate literal backslash-n
    # characters in the prompt text, not newlines.
    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    p = llm(template)

    return p
|
|