| from transformers import pipeline |
| import spacy |
|
|
class ClinicalNERProcessor:
    """Clinical text analysis: clinical NER, anatomy NER, and POS tagging.

    Wraps two HuggingFace token-classification pipelines (a clinical NER
    model and an optional anatomy NER model) plus an optional spaCy model
    for POS tagging, and can emit results either as Python structures or
    as Prolog facts.
    """

    def __init__(self, use_pos=True, use_anatomy=True):
        """Load the NER pipelines and, optionally, the spaCy POS model.

        Args:
            use_pos: If True, load spaCy 'en_core_web_sm' for POS tagging.
            use_anatomy: If True, load the OpenMed anatomy NER model.
        """
        self.ner_pipeline = pipeline(
            "ner",
            model="samrawal/bert-base-uncased_clinical-ner",
            aggregation_strategy="simple"
        )

        self.anatomy_pipeline = None
        if use_anatomy:
            try:
                self.anatomy_pipeline = pipeline(
                    "ner",
                    model="OpenMed/OpenMed-NER-AnatomyDetect-BioPatient-108M",
                    aggregation_strategy="simple"
                )
            except Exception as e:
                # Anatomy NER is optional: degrade gracefully rather than fail.
                print(f"Warning: Could not load anatomy model: {e}")

        self.nlp = None
        if use_pos:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                print("Warning: spaCy model 'en_core_web_sm' not found.")
                print("Install it with: python -m spacy download en_core_web_sm")

    def _merge_subwords(self, entities):
        """Merge WordPiece continuation tokens ('##...') into whole words.

        Consecutive entities whose word starts with '##' and whose
        entity_group matches the preceding entity are folded into it.
        The merged entity keeps the first piece's start offset and score
        and takes the last piece's end offset.

        Args:
            entities: list of pipeline entity dicts with keys
                'word', 'entity_group', 'start', 'end', 'score'.

        Returns:
            A new list of merged entity dicts (inputs are not mutated).
        """
        if not entities:
            return []

        merged = []
        i = 0
        while i < len(entities):
            # Copy so the caller's dicts are never mutated.
            current = entities[i].copy()
            word = current['word']
            end = current['end']

            # Absorb every following '##'-prefixed piece of the same group.
            j = i + 1
            while j < len(entities):
                next_entity = entities[j]
                if (next_entity['word'].startswith('##') and
                        next_entity['entity_group'] == current['entity_group']):
                    word += next_entity['word'][2:]
                    end = next_entity['end']
                    j += 1
                else:
                    break

            current['word'] = word
            current['end'] = end
            merged.append(current)
            i = j

        return merged

    @staticmethod
    def _escape_atom(text):
        """Escape a string for embedding in a single-quoted Prolog atom.

        Backslashes must be escaped before quotes; otherwise a literal
        backslash in the input would corrupt the generated fact.
        """
        return text.replace("\\", "\\\\").replace("'", "\\'")

    def _prolog_facts(self, entities, functor):
        """Render merged entity dicts as newline-joined Prolog facts.

        Each fact has the shape:
            functor(Index, 'Group', 'Word', Start, End, Score).
        """
        facts = []
        for i, entity in enumerate(entities):
            word = self._escape_atom(entity['word'])
            facts.append(
                f"{functor}({i}, '{entity['entity_group']}', "
                f"'{word}', {entity['start']}, "
                f"{entity['end']}, {entity['score']:.4f})."
            )
        return "\n".join(facts)

    def basic_ner(self, text):
        """Run clinical NER and return merged entity dicts."""
        entities = self.ner_pipeline(text)
        return self._merge_subwords(entities)

    def prolog_ner(self, text):
        """Run clinical NER and return the result as 'entity/6' Prolog facts."""
        return self._prolog_facts(self.basic_ner(text), "entity")

    def anatomy_ner(self, text):
        """Run anatomy NER and return merged entity dicts.

        Raises:
            RuntimeError: if the anatomy pipeline was not loaded.
        """
        if self.anatomy_pipeline is None:
            raise RuntimeError("Anatomy NER pipeline not initialized.")

        entities = self.anatomy_pipeline(text)
        return self._merge_subwords(entities)

    def prolog_anatomy(self, text):
        """Run anatomy NER and return the result as 'anatomy/6' Prolog facts.

        Raises:
            RuntimeError: if the anatomy pipeline was not loaded.
        """
        return self._prolog_facts(self.anatomy_ner(text), "anatomy")

    def pos_tagging(self, text):
        """POS-tag *text* with spaCy and return per-token dicts.

        Each dict carries: token, lemma, pos, tag, dep, start, end
        (character offsets).

        Raises:
            RuntimeError: if the spaCy model was not loaded.
        """
        if self.nlp is None:
            raise RuntimeError("POS tagger not initialized. Install spaCy model: python -m spacy download en_core_web_sm")

        doc = self.nlp(text)
        return [
            {
                'token': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'tag': token.tag_,
                'dep': token.dep_,
                'start': token.idx,
                'end': token.idx + len(token.text)
            }
            for token in doc
        ]

    def prolog_pos(self, text):
        """POS-tag *text* and return the result as 'pos/8' Prolog facts.

        Raises:
            RuntimeError: if the spaCy model was not loaded.
        """
        if self.nlp is None:
            raise RuntimeError("POS tagger not initialized. Install spaCy model: python -m spacy download en_core_web_sm")

        pos_results = self.pos_tagging(text)

        prolog_facts = []
        for i, token_info in enumerate(pos_results):
            token = self._escape_atom(token_info['token'])
            lemma = self._escape_atom(token_info['lemma'])
            prolog_facts.append(
                f"pos({i}, '{token}', '{lemma}', '{token_info['pos']}', "
                f"'{token_info['tag']}', '{token_info['dep']}', "
                f"{token_info['start']}, {token_info['end']})."
            )
        return "\n".join(prolog_facts)

    def combined_analysis(self, text):
        """Run every available analysis and return a dict of results.

        Keys: 'clinical_entities' (always populated), 'anatomy_entities'
        and 'pos_tags' (empty lists when the optional models are absent).
        """
        result = {
            'clinical_entities': self.basic_ner(text),
            'anatomy_entities': [],
            'pos_tags': []
        }

        if self.anatomy_pipeline:
            result['anatomy_entities'] = self.anatomy_ner(text)

        if self.nlp:
            result['pos_tags'] = self.pos_tagging(text)

        return result

    def prolog_combined(self, text):
        """Run every available analysis and return one Prolog document.

        Sections are separated by blank lines and prefixed with a
        '%' comment header; optional analyses are skipped when their
        model is absent.
        """
        sections = []

        clinical_facts = self.prolog_ner(text)
        if clinical_facts:
            sections.append(f"% Clinical Entities\n{clinical_facts}")

        if self.anatomy_pipeline:
            anatomy_facts = self.prolog_anatomy(text)
            if anatomy_facts:
                sections.append(f"% Anatomy Entities\n{anatomy_facts}")

        if self.nlp:
            pos_facts = self.prolog_pos(text)
            if pos_facts:
                sections.append(f"% POS Tags\n{pos_facts}")

        return "\n\n".join(sections)