File size: 965 Bytes
21c7db9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
"""Raw text parser for knowledge ingestion."""

from __future__ import annotations

import re


def extract_drug_mentions(text: str) -> list[str]:
    """Return the distinct candidate tokens found in *text*, sorted.

    The text is lowercased first. A token is any run of four or more
    ASCII letters, underscores or hyphens. No vocabulary filtering is
    applied here: every such run is returned, de-duplicated and in
    ascending order.
    """
    lowered = text.lower()
    unique = {match.group(0) for match in re.finditer(r"[a-zA-Z_-]{4,}", lowered)}
    mentions = list(unique)
    mentions.sort()
    return mentions


def extract_components(text: str) -> list[str]:
    """Extract component names from label-style lines in *text*.

    A line is considered only if, once stripped and lowercased, it contains
    "ingredient", "component" or "contains" (e.g. "Active ingredient(s): ...").
    The line is cut at its first ":", "." or ";", and everything after that
    delimiter is split on ",", "/" or " and " into individual items. Each item
    is reduced to lowercase letters, digits, "_", "-" and spaces, trimmed, and
    has its spaces replaced by underscores. Items whose normalised form is
    between 3 and 40 characters long are kept.

    Returns:
        The unique normalised component names, sorted in ascending order.
        Lines without a delimiter contribute nothing.
    """
    keywords = ("ingredient", "component", "contains")
    components: set[str] = set()
    for raw_line in text.splitlines():
        line = raw_line.strip().lower()
        if not any(keyword in line for keyword in keywords):
            continue
        # A character class avoids the escaping trap of the previous pattern,
        # whose doubled backslash in a raw string matched "backslash + any
        # character" rather than a literal period.
        parts = re.split(r"[:.;]", line, maxsplit=1)
        if len(parts) < 2:
            continue
        for item in re.split(r",|/| and ", parts[1]):
            token = re.sub(r"[^a-z0-9_ -]", "", item).strip().replace(" ", "_")
            if 3 <= len(token) <= 40:
                components.add(token)
    return sorted(components)