| import re |
| import streamlit as st |
| from modelcards import CardData, ModelCard |
| from markdownTagExtract import tag_checker,listToString,to_markdown |
| |
|
|
|
|
| |
| |
|
|
|
|
| |
| |
| |
|
|
def file_upload():
    """Return the value stored under ``markdown_upload`` in Streamlit's session state."""
    return st.session_state.markdown_upload
|
|
|
|
| |
# Model card content fetched from the Streamlit upload at import time.
# NOTE(review): the str-based regexes in this module are applied to this value
# directly, so it is presumably already decoded text rather than bytes — confirm
# against the uploader widget.
model_card_md = file_upload()
| |
# Compiled patterns for the markdown constructs recognised in a model card.
# All patterns are raw strings so that regex escapes such as ``\s`` and ``\|``
# are not treated as (invalid) Python string escapes.

# Front matter: a '---' ... '---' block anchored at the very start of the text.
metadata_re = re.compile(r"^---(.*?)---", re.DOTALL)

# Header levels 1-4. The leading ``\s*`` also matches newlines, so a match may
# begin on blank lines that precede the header line.
header_re = re.compile(r"^\s*# (.*)", re.MULTILINE)
subheader_re = re.compile(r"^\s*## (.*)", re.MULTILINE)
subsubheader_re = re.compile(r"^\s*### (.*)", re.MULTILINE)
subsubsubheader_re = re.compile(r"^\s*#### (.*)", re.MULTILINE)

# "**Key** value" / "__Key__ value" lines: group 1 is the emphasised key,
# group 2 is the remainder of the line.
key_value_re = re.compile(r"^\s*([*_]{2}[^*_]+[*_]{2})([^\n]*)", re.MULTILINE)

# Bulleted list items introduced by '-', '*' or '+'.
list_item_re = re.compile(r"^\s*[-*+]\s+.*", re.MULTILINE)

# Lines starting with a digit (enumerated items), '|' (table rows), or an
# ASCII letter / opening parenthesis (free text).
enum_re = re.compile(r"^\s*[0-9].*", re.MULTILINE)
table_re = re.compile(r"^\s*\|.*", re.MULTILINE)
text_item_re = re.compile(r"^\s*[A-Za-z(](.*)", re.MULTILINE)

# Text wrapped in a single '_' or '*', optionally spanning one line break.
italicized_text_item_re = re.compile(
    r"^[_*][^_*\s].*\n?.*[^_*][_*]$", flags=re.MULTILINE
)

# Lines starting with '<' (markup tags) and inline markdown images.
tag_re = re.compile(r"^\s*<.*", re.MULTILINE)
image_re = re.compile(r"!\[.*\]\(.*\)", re.MULTILINE)
|
|
|
|
# Maps each header-level pattern to the pattern of the level directly below it.
subheader_re_dict = {
    header_re: subheader_re,
    subheader_re: subsubheader_re,
    subsubheader_re: subsubsubheader_re,
}
|
|
|
|
def get_metadata(section_text):
    """Return every ``metadata_re`` match found in ``section_text`` as a list."""
    return [*metadata_re.finditer(section_text)]
|
|
|
|
def find_images(section_text):
    """Return every ``image_re`` match found in ``section_text`` as a list."""
    return [*image_re.finditer(section_text)]
|
|
|
|
def find_tags(section_text):
    """Return every ``tag_re`` match found in ``section_text`` as a list."""
    return [*tag_re.finditer(section_text)]
|
|
|
|
def find_tables(section_text):
    """Return every ``table_re`` match found in ``section_text`` as a list."""
    return [*table_re.finditer(section_text)]
|
|
|
|
def find_enums(section_text):
    """Return every ``enum_re`` match found in ``section_text`` as a list."""
    return [*enum_re.finditer(section_text)]
|
|
|
|
| |
def find_key_values(section_text):
    """Return every ``key_value_re`` match found in ``section_text`` as a list."""
    return [*key_value_re.finditer(section_text)]
|
|
|
|
def find_lists(section_text):
    """Return every ``list_item_re`` match found in ``section_text`` as a list."""
    return [*list_item_re.finditer(section_text)]
|
|
|
|
def find_texts(section_text):
    """Return free-text matches found in ``section_text``.

    The result lists all ``text_item_re`` matches first, followed by all
    ``italicized_text_item_re`` matches.
    """
    return [
        *text_item_re.finditer(section_text),
        *italicized_text_item_re.finditer(section_text),
    ]
|
|
|
|
def find_headers(full_text):
    """Return a 4-tuple of match lists, one per header level (``#`` to ``####``)."""
    level_patterns = (header_re, subheader_re, subsubheader_re, subsubsubheader_re)
    return tuple(list(pattern.finditer(full_text)) for pattern in level_patterns)
|
|
|
|
# Strip the leading metadata block (if any) so that the remaining patterns only
# see the body of the model card.
metadata_list = get_metadata(model_card_md)
if not metadata_list:
    print("No metadata found")
else:
    metadata_end = metadata_list[-1].end()
    print("Metadata extracted")
    model_card_md = model_card_md[metadata_end:]
|
|
| |
# Headers: one match list per level, unpacked into module-level names.
headers_list = find_headers(model_card_md)
print("Headers extracted")
headers, subheaders, subsubheaders, subsubsubheaders = headers_list

# Remaining element types, each collected over the whole (metadata-free) text.
lists_list = find_lists(model_card_md)
print("Bulleted lists extracted")

enums_list = find_enums(model_card_md)
print("Enumerated lists extracted")

key_value_list = find_key_values(model_card_md)
print("Key values extracted")

tables_list = find_tables(model_card_md)
print("Tables extracted")

tags_list = find_tags(model_card_md)
print("Markup tags extracted")

images_list = find_images(model_card_md)
print("Images extracted")

texts_list = find_texts(model_card_md)
print("Free text extracted")
|
|
|
|
| |
| |
| |
# Type labels attached to non-header spans (see create_span_dict).
LIST_ITEM = "List item"
KEY_VALUE = "Key: Value"
FREE_TEXT = "Free text"
ENUM_LIST_ITEM = "Enum item"
TABLE_ITEM = "Table item"
TAG_ITEM = "Markup tag"
IMAGE_ITEM = "Image"
|
|
|
|
def create_span_dict(match_list, match_type):
    """
    Build a ``{span: (matched_text, match_type)}`` mapping from regex matches.

    Matches whose text is empty or whitespace-only are left out. The mapping
    records which type each region of the .md file was parsed as, which also
    makes it possible to spot regions that no pattern covered.
    """
    return {
        found.span(): (found.group(), match_type)
        for found in match_list
        if found.group().strip()
    }
|
|
|
|
# Per-type span dictionaries: {span: (text, type label)}.
metadata_span_dict = create_span_dict(metadata_list, "Metadata")

header_span_dict = create_span_dict(headers, "# Header")
subheader_span_dict = create_span_dict(subheaders, "## Subheader")
subsubheader_span_dict = create_span_dict(subsubheaders, "### Subsubheader")
subsubsubheader_span_dict = create_span_dict(subsubsubheaders, "#### Subsubsubheader")
key_value_span_dict = create_span_dict(key_value_list, KEY_VALUE)
lists_span_dict = create_span_dict(lists_list, LIST_ITEM)
enums_span_dict = create_span_dict(enums_list, ENUM_LIST_ITEM)
tables_span_dict = create_span_dict(tables_list, TABLE_ITEM)
tags_span_dict = create_span_dict(tags_list, TAG_ITEM)
images_span_dict = create_span_dict(images_list, IMAGE_ITEM)
texts_span_dict = create_span_dict(texts_list, FREE_TEXT)

# All span dictionaries grouped by kind. The metadata spans are not included.
# Insertion order matters: when merged, a later entry wins for an identical span.
all_spans_dict = {
    "headers": header_span_dict,
    "subheaders": subheader_span_dict,
    "subsubheaders": subsubheader_span_dict,
    "subsubsubheaders": subsubsubheader_span_dict,
    LIST_ITEM: lists_span_dict,
    KEY_VALUE: key_value_span_dict,
    TABLE_ITEM: tables_span_dict,
    ENUM_LIST_ITEM: enums_span_dict,
    TAG_ITEM: tags_span_dict,
    IMAGE_ITEM: images_span_dict,
    FREE_TEXT: texts_span_dict,
}
|
|
|
|
def get_sorted_spans(spans_dict):
    """
    Flatten a dict of span dicts and return ``(sorted_spans, merged_spans)``.

    ``merged_spans`` combines every inner dictionary; when the same span occurs
    in several of them, the value from the later one is kept. ``sorted_spans``
    is the sorted list of the merged keys.
    """
    merged_spans = {
        span: payload
        for group in spans_dict.values()
        for span, payload in group.items()
    }
    return sorted(merged_spans), merged_spans
|
|
|
|
sorted_spans, merged_spans = get_sorted_spans(all_spans_dict)

# Report any text before the first parsed span. The truthiness guard avoids an
# IndexError when no span at all was found (e.g. an empty model card).
if sorted_spans and sorted_spans[0][0] != 0:
    print("FYI, our spans don't start at the start of the file.")
    print("We did not catch this start:")
    print(model_card_md[: sorted_spans[0][0]])

# Walk consecutive span pairs and report non-blank text that falls between them.
for start_nonparse, end_nonparse in zip(sorted_spans, sorted_spans[1:]):
    # A gap of a single character between spans is tolerated.
    if end_nonparse[0] > start_nonparse[1] + 1:
        text = model_card_md[start_nonparse[1] : end_nonparse[0]]
        if text.strip():
            print("Found an unparsed span in the file:")
            print(start_nonparse)
            print(" ---> ")
            print(end_nonparse)
            print(text)
|
|
| |
def section_map_to_help_text(text_retrieved):
    """
    Return the help text for a model card section header.

    Args:
        text_retrieved: Section header including its markdown prefix,
            e.g. ``"## Model Details"``.

    Returns:
        The help string for that section, or ``None`` when the header is not
        one of the known sections.
    """
    help_texts = {
        "## Model Details": "Give an overview of your model, the relevant research paper, who trained it, etc.",
        "## How to Get Started with the Model": "Give an overview of how to get started with the model",
        "## Limitations and Biases": "Provide an overview of the possible Limitations and Risks that may be associated with this model",
        "## Uses": "Detail the potential uses, intended use and out-of-scope uses for this model",
        "## Training": "Provide an overview of the Training Data and Training Procedure for this model",
        "## Evaluation Results": "Detail the Evaluation Results for this model",
        "## Environmental Impact": "Provide an estimate for the carbon emissions: Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here.",
        "## Citation Information": "How to best cite the model authors",
        "## Glossary": "If relevant, include terms and calculations in this section that can help readers understand the model or model card.",
        "## More Information": "Any additional information",
        "## Model Card Authors": "This section provides another layer of transparency and accountability. Whose views is this model card representing? How many voices were included in its construction? Etc.",
        "## Model Card Contact": "Mediums to use, in order to contact the model creators",
        # Legacy spelling without the "## " prefix, kept so existing callers
        # that pass it still get an answer.
        "Model Card Contact": "Mediums to use, in order to contact the model creators",
        "## Technical Specifications": " Additional technical information",
        "## Model Examination": " Examining the model",
    }
    # A plain dict lookup: the previous code *called* the dict, which raised
    # TypeError on every successful match.
    return help_texts.get(text_retrieved)
|
|
|
|
def section_map_to_persist(text_retrieved):
    """
    Return the persisted-state key whose section header equals ``text_retrieved``.

    The comparison is exact (case-sensitive). ``None`` is returned when no
    section header matches.
    """
    header_by_state = {
        "Model_details_text": "## Model Details",
        "Model_how_to": "## How to Get Started with the Model",
        "Model_Limits_n_Risks": "## Limitations and Biases",
        "Model_uses": "## Uses",
        "Model_training": "## Training",
        "Model_Eval": "## Evaluation Results",
        "Model_carbon": "## Environmental Impact",
        "Model_cite": "## Citation Information",
        "Glossary": "## Glossary",
        "More_info": "## More Information",
        "Model_card_authors": "## Model Card Authors",
        "Model_card_contact": "## Model Card Contact",
        "Technical_specs": "## Technical specifications",
        "Model_examin": "## Model Examination",
    }
    matching_states = (
        state for state, header in header_by_state.items() if header == text_retrieved
    )
    return next(matching_states, None)
|
|
|
|
def main():
    """Print the text extracted for the ``Model_details_text`` section."""
    details = extract_it("Model_details_text")
    print(details)
|
|
|
|
def extract_headers():
    """
    Index the header spans of the parsed document by level.

    Reads the module-level ``sorted_spans`` and ``merged_spans``.

    Returns:
        A 4-tuple ``(headers, subheaders, subsubheaders, subsubsubheaders)``.
        Each element maps a header span to ``(position, previous_position)``:
        the span's index in ``sorted_spans`` and the index of the preceding
        header of the same level (``None`` for the first one of that level).
    """
    level_by_type = {
        "# Header": 0,
        "## Subheader": 1,
        "### Subsubheader": 2,
        "#### Subsubsubheader": 3,
    }
    found = ({}, {}, {}, {})
    previous = [None, None, None, None]

    # enumerate() yields the same index that sorted_spans.index(span) would
    # (spans are unique dict keys) without rescanning the list for each header.
    for position, span in enumerate(sorted_spans):
        level = level_by_type.get(merged_spans[span][1])
        if level is None:
            continue  # not a header span
        found[level][span] = (position, previous[level])
        previous[level] = position

    return found
|
|
|
|
def stringify():
    """
    Join the parsed spans into one text blob per header, for each header level.

    Reads the module-level ``sorted_spans`` and ``merged_spans`` and the header
    index returned by ``extract_headers()``.

    For every header that has a predecessor of the same level, the spans from
    that predecessor up to (not including) the header itself are concatenated.
    Headers without a predecessor are skipped.

    Returns:
        A 4-tuple of dicts ``(headers_strings, subheaders_strings,
        subsubheaders_strings, subsubsubheaders_strings)`` mapping a cleaned-up
        title to the concatenated text.

    NOTE(review): because each iteration collects the *previous* section, the
    text following the last header of a level is never collected — confirm
    whether that is intended.
    """
    headers, subheaders, subsubheaders, subsubsubheaders = extract_headers()
    headers_strings = {}
    subheaders_strings = {}
    subsubheaders_strings = {}
    subsubsubheaders_strings = {}

    # Level 1: keyed by the first collected span (the previous header's text).
    for i in headers:
        if headers[i][1] is None:
            continue
        sub_spans = sorted_spans[headers[i][1] : headers[i][0]]
        lines = [merged_spans[x][0] for x in sub_spans]
        name = lines[0] if lines else "Model Details"
        lines = "".join(lines)

        # NOTE(review): the two consecutive space replacements are identical as
        # written; the first may originally have targeted another whitespace
        # sequence — confirm.
        headers_strings[
            name.replace("\n# ", "")
            .replace(" ", "")
            .replace(" ", "")
            .replace("\n", "")
            .replace("{{", "")
            .replace("}}", "")
        ] = lines

    # Level 2: collection stops at a top-level header. On the first processed
    # subheader (``first`` still None) it also stops at any subheader span.
    first = None
    for i in subheaders:
        if subheaders[i][1] is None:
            continue
        sub_spans = sorted_spans[subheaders[i][1] : subheaders[i][0]]
        lines = []
        for x in sub_spans:
            if merged_spans[x][1] == "## Subheader" and first is None:
                break
            elif merged_spans[x][1] == "# Header":
                break
            else:
                lines.append(merged_spans[x][0])
        # Nothing collected: fall back to the default section name.
        name = lines[0] if lines else "Model Details"
        lines = "".join(lines)

        subheaders_strings[
            name.replace("\n# ", "").replace(" ", "").replace(" ", "")
        ] = lines
        first = i

    # Level 3: keyed by the *current* header's text, while the collected text
    # comes from the spans preceding it.
    # NOTE(review): that pairing looks off by one section — confirm.
    first = None
    for i in subsubheaders:
        if subsubheaders[i][1] is None:
            continue
        sub_spans = sorted_spans[subsubheaders[i][1] : subsubheaders[i][0]]
        lines = []
        for x in sub_spans:
            if merged_spans[x][1] == "## Subheader" or (
                merged_spans[x][1] == "### Subsubheader" and first is None
            ):
                break
            else:
                lines.append(merged_spans[x][0])
        lines = "".join(lines)

        subsubheaders_strings[
            merged_spans[i][0].replace("\n", "").replace("### ", "").replace(" ", "")
        ] = lines
        first = i

    # Level 4: collection stops at the first level-2 or level-3 header span.
    for i in subsubsubheaders:
        if subsubsubheaders[i][1] is None:
            continue
        sub_spans = sorted_spans[subsubsubheaders[i][1] : subsubsubheaders[i][0]]
        lines = []
        for x in sub_spans:
            if (
                merged_spans[x][1] == "## Subheader"
                or merged_spans[x][1] == "### Subsubheader"
            ):
                break
            else:
                lines.append(merged_spans[x][0])
        lines = "".join(lines)

        subsubsubheaders_strings[
            merged_spans[i][0].replace("#### ", "").replace("**", "").replace("\n", "")
        ] = lines

    return (
        headers_strings,
        subheaders_strings,
        subsubheaders_strings,
        subsubsubheaders_strings,
    )
|
|
|
|
def extract_it(text_to_retrieve):
    """
    Return the text extracted from the model card for one persisted-state key.

    Every title produced by ``stringify()`` (all four header levels, top level
    first) whose lower-cased form contains a section's search term contributes
    its text; the contributions are concatenated in that order.

    Args:
        text_to_retrieve: One of the state keys, e.g. ``"Model_details_text"``
            or ``"Model_uses"``. ``"Technical_specs"`` and ``"Model_examin"``
            return fixed header strings rather than extracted text.

    Returns:
        The collected text followed by a single space.

    Raises:
        KeyError: If ``text_to_retrieve`` is not a known state key.
    """
    print("Span\t\tType\t\tText")
    print("-------------------------------------")

    # One {title: text} mapping per header level, top level first.
    strings_by_level = stringify()

    needed = [
        "model details",
        "howto",
        "limitations",
        "uses",
        "training",
        "evaluation",
        "environmental",
        "citation",
        "glossary",
        "more information",
        "authors",
        "contact",
    ]

    info_strings = {}
    for term in needed:
        # stringify() removes spaces from most titles, so multi-word terms such
        # as "model details" could never match them. Comparing with spaces
        # removed on both sides fixes that and still matches everything that
        # matched before.
        compact_term = term.replace(" ", "")
        info_strings[term] = "".join(
            text
            for level_strings in strings_by_level
            for title, text in level_strings.items()
            if compact_term in title.lower().replace(" ", "")
        )

    extracted_info = {
        "Model_details_text": info_strings["model details"],
        "Model_how_to": info_strings["howto"],
        "Model_Limits_n_Risks": info_strings["limitations"],
        "Model_uses": info_strings["uses"],
        "Model_training": info_strings["training"],
        "Model_Eval": info_strings["evaluation"],
        "Model_carbon": info_strings["environmental"],
        "Model_cite": info_strings["citation"],
        "Glossary": info_strings["glossary"],
        "More_info": info_strings["more information"],
        "Model_card_authors": info_strings["authors"],
        "Model_card_contact": info_strings["contact"],
        "Technical_specs": "## Technical specifications",
        "Model_examin": "## Model Examination",
    }

    return extracted_info[text_to_retrieve] + " "
|
|
|
|
# Run the demo extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|