Spaces:
Running
Running
| """Raw text parser for knowledge ingestion.""" | |
| from __future__ import annotations | |
| import re | |
def extract_drug_mentions(text: str) -> list[str]:
    """Return the distinct candidate word tokens found in *text*, sorted.

    The input is lowercased first; every run of four or more ASCII letters,
    underscores or hyphens is then collected as a token. Duplicates are
    discarded and the remaining tokens are returned in ascending order.
    """
    lowered = text.lower()
    # A set comprehension de-duplicates while scanning the matches.
    unique_tokens = {
        match.group(0) for match in re.finditer(r"[a-zA-Z_-]{4,}", lowered)
    }
    return sorted(unique_tokens)
def extract_components(text: str) -> list[str]:
    """Extract normalized component names from label-style lines in *text*.

    Supports "active ingredient(s): ..." and similar label patterns. A line
    is considered when, after lowercasing, it contains "ingredient",
    "component" or "contains". The line is split once at the first ":",
    ";" or "." delimiter, and everything after it is treated as the list of
    components, separated by ",", "/" or " and ".

    Each item is normalized by dropping every character other than
    lowercase letters, digits, underscores, spaces and hyphens, trimming
    surrounding whitespace and replacing spaces with underscores. Only
    names of 3 to 40 characters are kept.

    Args:
        text: Raw label text, possibly spanning several lines.

    Returns:
        The distinct normalized component names in ascending order; an
        empty list when no matching line yields a component.
    """
    keywords = ("ingredient", "component", "contains")
    components: set[str] = set()

    for raw_line in text.splitlines():
        line = raw_line.strip().lower()
        if not any(keyword in line for keyword in keywords):
            continue

        # Separate the label header from its value list. The period must be
        # escaped with a single backslash inside a raw string; it is also
        # ignored when followed by a digit so that decimal quantities such
        # as "2.5 mg" are not mistaken for the header delimiter.
        parts = re.split(r"[:;]|\.(?!\d)", line, maxsplit=1)
        if len(parts) < 2:
            continue

        for item in re.split(r",|/| and ", parts[1]):
            token = re.sub(r"[^a-z0-9_ -]", "", item).strip().replace(" ", "_")
            # Length bounds filter out empty fragments and run-on sentences.
            if 3 <= len(token) <= 40:
                components.add(token)

    return sorted(components)