"""Raw text parser for knowledge ingestion.""" from __future__ import annotations import re def extract_drug_mentions(text: str) -> list[str]: tokens = re.findall(r"[a-zA-Z_-]{4,}", text.lower()) return sorted(set(tokens)) def extract_components(text: str) -> list[str]: # Supports "active ingredient(s): ..." and similar label patterns. lines = [line.strip().lower() for line in text.splitlines() if line.strip()] components: list[str] = [] for line in lines: if "ingredient" in line or "component" in line or "contains" in line: parts = re.split(r":|\\.|;", line, maxsplit=1) if len(parts) > 1: rhs = parts[1] for item in re.split(r",|/| and ", rhs): token = re.sub(r"[^a-z0-9_ -]", "", item).strip().replace(" ", "_") if 3 <= len(token) <= 40: components.append(token) return sorted(set(components))