"""LLM provider factory β€” runtime injection via configurable_alternatives.

Usage::

    from providers import get_chat_model

    # Default profile (env: LLM_PROFILE)
    llm = get_chat_model()

    # Explicit profile selection
    llm = get_chat_model("dummy")

    # Runtime override inside a graph:
    graph.invoke(state, config={"configurable": {"llm_profile": "ollama"}})

The configurable_alternatives pattern lets you switch the provider at runtime
after the graph is compiled β€” no restart required.

The 3 profiles:
  * ``vllm``   β€” Qwen 2.5 served by vLLM on AMD MI300X (OpenAI-compatible API). Production default.
  * ``ollama`` β€” local fallback (Qwen 2.5 7B Instruct via Ollama). Dev / data-privacy.
  * ``dummy``  β€” deterministic stub (CI / eval / load tests). No network calls.
"""

from __future__ import annotations

from typing import Literal

from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.runnables import ConfigurableField, Runnable

from config import settings
from providers.dummy_provider import DummyChatModel, build_dummy_chat
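
# ``settings.llm_profile`` is expected to carry the default profile name, read
# from the ``LLM_PROFILE`` environment variable. A minimal sketch of that
# config field (illustrative only; the real ``config.py`` may differ):
#
#     from typing import Literal
#     from pydantic_settings import BaseSettings
#
#     class Settings(BaseSettings):
#         llm_profile: Literal["vllm", "ollama", "dummy"] = "vllm"  # env: LLM_PROFILE
#
#     settings = Settings()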


# Cached singleton β€” same configurable instance returned every time
_chat_model: Runnable | None = None
_embeddings = None  # lazy: SentenceTransformerEmbeddings | None


def get_chat_model(
    profile: Literal["vllm", "ollama", "dummy"] | None = None,
) -> Runnable:
    """Return the application chat-model. Profile selectable at runtime.

    If ``profile=None`` (default): uses ``settings.llm_profile``.

    Returns a Runnable that can switch providers at runtime via
    ``configurable_alternatives``. All three BaseChatModel implementations
    support ``bind_tools()`` and ``with_structured_output()``.
    """
    global _chat_model
    if _chat_model is None:
        env_profile = settings.llm_profile
        base = _build_base_chat(env_profile)
        # configurable_alternatives registers the other two profiles alongside
        # the default, but only those whose underlying package can be imported.
        # If e.g. langchain-openai is not installed (CI dummy-only run), the
        # vllm alternative is simply skipped; selecting it at runtime then
        # fails fast because the profile key was never registered.
        alternatives: dict[str, BaseChatModel] = {}
        for alt_profile in ("vllm", "ollama", "dummy"):
            if alt_profile == env_profile:
                continue
            try:
                alternatives[alt_profile] = _build_base_chat(alt_profile)
            except ImportError:  # ModuleNotFoundError is a subclass, so this covers both
                # Provider package is not installed β€” that's OK, just no swap available
                continue
        _chat_model = base.configurable_alternatives(
            ConfigurableField(id="llm_profile"),
            default_key=env_profile,
            **alternatives,
        )

    if profile is None or profile == settings.llm_profile:
        return _chat_model

    # Explicit profile selection via Runnable.with_config
    return _chat_model.with_config({"configurable": {"llm_profile": profile}})


def _build_base_chat(profile: str) -> BaseChatModel:
    """Build a BaseChatModel for a single profile.

    The vllm/ollama providers are lazy-imported so dummy-only runs do not
    require ``langchain-openai`` or ``langchain-ollama`` to be installed
    (CI-friendly).
    """
    if profile == "dummy":
        return build_dummy_chat()
    if profile == "vllm":
        from providers.vllm_provider import build_vllm_chat
        return build_vllm_chat()
    if profile == "ollama":
        from providers.ollama_provider import build_ollama_chat
        return build_ollama_chat()
    raise ValueError(
        f"Unknown LLM profile: {profile!r}. Available: vllm|ollama|dummy"
    )
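
# For orientation, a provider builder is assumed to look roughly like the
# sketch below (not the actual ``providers/vllm_provider.py``; the settings
# fields and defaults shown are hypothetical). vLLM exposes an OpenAI-compatible
# /v1 endpoint, so ChatOpenAI can talk to it with only the base_url changed:
#
#     from langchain_openai import ChatOpenAI
#
#     def build_vllm_chat() -> ChatOpenAI:
#         return ChatOpenAI(
#             base_url=settings.vllm_base_url,    # e.g. "http://localhost:8000/v1"
#             api_key="EMPTY",                    # vLLM ignores the key by default
#             model=settings.vllm_model_name,     # e.g. "Qwen/Qwen2.5-72B-Instruct"
#             temperature=0,
#         )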


def get_embeddings():
    """Embedding model singleton (sentence-transformers, local).

    Lazy-imported: the sentence-transformers package is only loaded when
    embeddings are actually needed (Phase 3+). Phase 1 smoke tests do not
    require it, so the lazy import protects CI/dummy-only runs.
    """
    global _embeddings
    if _embeddings is None:
        from providers.embeddings import build_embeddings
        _embeddings = build_embeddings()
    return _embeddings
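
# Similarly, ``build_embeddings`` is assumed to be a thin local wrapper
# (a sketch, not the actual ``providers/embeddings.py``; the model name is
# just an example):
#
#     from langchain_community.embeddings import SentenceTransformerEmbeddings
#
#     def build_embeddings() -> SentenceTransformerEmbeddings:
#         # Runs fully locally via sentence-transformers; no network calls.
#         return SentenceTransformerEmbeddings(
#             model_name="sentence-transformers/all-MiniLM-L6-v2",
#         )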


def get_dummy_handle() -> DummyChatModel:
    """Return a direct handle to the dummy provider (for state management).

    The UI calls ``set_docs_hint(filenames)``: after upload, the dummy reads
    the actual file list to choose tool parameters. Returns a fresh
    DummyChatModel instance because the configurable_alternatives Runnable's
    inner state is not exposed via the public API. The UI must set the
    docs_hint on the SINGLETON instance (not on this returned handle) right
    before invoking the graph β€” the LangGraph compile holds the singleton.

    See ``app/main.py`` session-init for the correct pattern.
    """
    return build_dummy_chat()


__all__ = [
    "get_chat_model",
    "get_embeddings",
    "get_dummy_handle",
    "DummyChatModel",
]