TheJackBright's picture
Deploy PolyGuard OpenEnv Space
877add7 verified
"""Raw text parser for knowledge ingestion."""
from __future__ import annotations
import re
def extract_drug_mentions(text: str) -> list[str]:
    """Return the sorted, de-duplicated candidate drug tokens found in *text*.

    The text is lowercased and scanned for runs of at least four characters
    drawn from ASCII letters, underscores and hyphens. Runs that contain no
    letter at all (e.g. ``"----"`` rules or ``"____"`` blanks) are discarded.

    Args:
        text: Raw text to scan.

    Returns:
        Unique lowercase tokens in ascending order; empty if none match.
    """
    # Input is lowercased first, so only a-z needs to appear in the class.
    candidates = re.findall(r"[a-z_-]{4,}", text.lower())
    # strip("_-") leaves an empty string for separator-only runs.
    return sorted({token for token in candidates if token.strip("_-")})
def extract_components(text: str) -> list[str]:
    """Extract normalized component names from label-style lines of *text*.

    Supports "active ingredient(s): ..." and similar label patterns. A line is
    considered only if, once stripped and lowercased, it contains one of the
    keywords "ingredient", "component" or "contains". The line is split once
    at the first ``:``, ``;`` or sentence period, and everything after that
    separator is broken into items on ``,``, ``/`` and the word " and ".

    Each item is reduced to lowercase letters, digits, underscores, hyphens
    and spaces, trimmed, and has its spaces replaced by underscores. Items
    whose normalized length is outside 3..40 characters are dropped.

    Args:
        text: Raw, possibly multi-line label text.

    Returns:
        Unique normalized component tokens in ascending order; empty if no
        line matches.
    """
    keywords = ("ingredient", "component", "contains")
    # The period must be escaped once (raw string) to match a literal ".".
    # A period directly followed by a digit is treated as a decimal point
    # (e.g. "2.5 mg") rather than as the label/value separator.
    label_separator = re.compile(r":|;|\.(?!\d)")
    item_separator = re.compile(r",|/| and ")
    disallowed_chars = re.compile(r"[^a-z0-9_ -]")

    found: set[str] = set()
    for raw_line in text.splitlines():
        line = raw_line.strip().lower()
        if not any(keyword in line for keyword in keywords):
            continue
        parts = label_separator.split(line, maxsplit=1)
        if len(parts) < 2:
            # No separator: there is no value part to read components from.
            continue
        for item in item_separator.split(parts[1]):
            token = disallowed_chars.sub("", item).strip().replace(" ", "_")
            if 3 <= len(token) <= 40:
                found.add(token)
    return sorted(found)