File size: 965 Bytes
"""Raw text parser for knowledge ingestion."""
from __future__ import annotations
import re
def extract_drug_mentions(text: str) -> list[str]:
    """Return the distinct candidate tokens found in *text*, sorted.

    The text is lowercased first.  A token is any run of at least four
    characters drawn from ASCII letters, underscore and hyphen.  No
    vocabulary lookup is performed here: every such run is returned,
    including runs made only of hyphens or underscores.

    Args:
        text: Raw text to scan.

    Returns:
        The unique lowercase tokens in ascending order.
    """
    lowered = text.lower()
    unique_words = {
        match.group(0) for match in re.finditer(r"[a-zA-Z_-]{4,}", lowered)
    }
    ordered = list(unique_words)
    ordered.sort()
    return ordered
def extract_components(text: str) -> list[str]:
    """Extract component names from label-style lines of *text*.

    Supports "active ingredient(s): ..." and similar label patterns.  A
    line is considered only if, once lowercased, it contains "ingredient",
    "component" or "contains".  Everything after the first colon,
    semicolon or sentence period on that line is treated as the item list
    and split on commas, slashes and the word "and".  Each item is reduced
    to the characters a-z, 0-9, underscore, space and hyphen, stripped, and
    has its spaces replaced by underscores; only tokens of 3 to 40
    characters are kept.

    Args:
        text: Raw label text; may span multiple lines.

    Returns:
        The distinct normalized component tokens in ascending order.
    """
    markers = ("ingredient", "component", "contains")
    components: set[str] = set()
    for raw_line in text.splitlines():
        line = raw_line.strip().lower()
        if not line:
            continue
        if not any(marker in line for marker in markers):
            continue
        # Split once at the first label delimiter.  The period is escaped
        # with a single backslash: the earlier r"\\." matched a literal
        # backslash followed by any character, so "ingredients. x, y" was
        # never split.  (?!\d) keeps decimal quantities such as "2.5 ml"
        # from being mistaken for the delimiter.
        parts = re.split(r":|\.(?!\d)|;", line, maxsplit=1)
        if len(parts) < 2:
            continue
        for item in re.split(r",|/| and ", parts[1]):
            token = re.sub(r"[^a-z0-9_ -]", "", item).strip().replace(" ", "_")
            if 3 <= len(token) <= 40:
                components.add(token)
    return sorted(components)
|