File size: 1,510 Bytes
0d489f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
"""Tests for src.rag.embed — fastembed wrapper."""
from __future__ import annotations

import numpy as np
import pytest

from src.rag.embed import Embedder, EMBEDDING_DIM


class TestEmbedder:
    @pytest.fixture(scope="class")
    def embedder(self) -> Embedder:
        return Embedder()

    def test_dim_constant_matches_model(self, embedder: Embedder) -> None:
        out = embedder.encode(["hello"])
        assert out.shape == (1, EMBEDDING_DIM)

    def test_batch_encoding(self, embedder: Embedder) -> None:
        out = embedder.encode(["hello", "world", "blood-brain barrier"])
        assert out.shape == (3, EMBEDDING_DIM)
        assert out.dtype == np.float32

    def test_empty_list_returns_empty_array(self, embedder: Embedder) -> None:
        out = embedder.encode([])
        assert out.shape == (0, EMBEDDING_DIM)

    def test_similar_strings_have_higher_similarity_than_dissimilar(
        self, embedder: Embedder
    ) -> None:
        vecs = embedder.encode([
            "blood-brain barrier permeability",
            "BBB drug penetration",
            "MRI multi-site harmonization",
        ])
        # cosine similarity (vectors should be normalized for stable comparison)
        from numpy.linalg import norm
        def cos(a, b):
            return float(np.dot(a, b) / (norm(a) * norm(b)))
        sim_ab = cos(vecs[0], vecs[1])
        sim_ac = cos(vecs[0], vecs[2])
        assert sim_ab > sim_ac, f"Expected BBB-related strings closer; got {sim_ab=} vs {sim_ac=}"