Eylon Caplan committed on
Commit
ddb7b62
·
1 Parent(s): c82bc86

Deploy app code targeting HF Storage Bucket

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ bm25_indexes/
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ from core_logic import (
5
+ query_bm25_index,
6
+ lift_at_k,
7
+ lift_ci,
8
+ compute_keyword_similarity
9
+ )
10
+
11
+ # Path to the index directory where the Hugging Face Storage Bucket is mounted.
12
+ # Assuming the bucket is mounted at /data in the Space settings.
13
+ INDEX_DIR = '/data/bm25_indexes'
14
+
15
def get_available_indices():
    """List subdirectories of INDEX_DIR, one per available BM25 index.

    Returns a placeholder entry when the storage bucket is not mounted,
    so the dropdown is never empty.
    """
    if not os.path.exists(INDEX_DIR):
        return ["No indices found"]
    return [
        entry for entry in os.listdir(INDEX_DIR)
        if os.path.isdir(os.path.join(INDEX_DIR, entry))
    ]
19
+
20
def evaluate_keywords(index_name, target_demo, seed_words_str, generated_words_str):
    """Run the full evaluation for one submission.

    Parses the comma-separated keyword strings, computes BM25 lift metrics
    for the generated keywords against `target_demo`, computes BERT-subspace
    similarity between the seed and generated keyword sets, and returns
    (lift_markdown, similarity_markdown, top-10 hits DataFrame). Any
    exception is caught and surfaced as an error string in the first slot.
    """
    try:
        seed_words = [token.strip() for token in seed_words_str.split(",") if token.strip()]
        generated_words = [token.strip() for token in generated_words_str.split(",") if token.strip()]

        index_path = os.path.join(INDEX_DIR, index_name)

        # 1. BM25 lifts for the GENERATED words over a 1000-doc pool.
        df_results = query_bm25_index(index_path, generated_words, doc_count=1000)

        lift_lines = []
        for label, cutoff in (("Lift@100", 100), ("Lift@5%", 0.05)):
            lift_val = lift_at_k(df_results, target_demo, k=cutoff)
            pval, ci_lo, ci_hi = lift_ci(df_results, target_demo, k=cutoff)
            lift_lines.append(
                f"**{label}:** {lift_val:.3f} (p={pval:.4f}, 95% CI: [{ci_lo:.3f}, {ci_hi:.3f}])"
            )
        lift_text = "\n".join(lift_lines)

        # 2. BERT subspace similarity between seed and generated keywords.
        sim_metrics = compute_keyword_similarity(seed_words, generated_words, device='cpu')
        sim_text = "\n".join(
            f"**{metric}:** {sim_metrics[metric]:.4f}"
            for metric in ("Precision", "Recall", "F-Score")
        )

        # 3. Preview the top 10 retrieved hits.
        top_hits = df_results.head(10)[['id', 'score', 'demographic', 'content']]

        return lift_text, sim_text, top_hits

    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"Error: {str(e)}", "", pd.DataFrame()
57
+
58
# Gradio Interface
# Layout: left column = inputs (index selector, target demographic, seed
# and generated keyword lists) plus a submit button; right column = the
# three metric outputs. `demo` is created at import time so a Spaces
# runtime can discover it; launch() only runs when executed as a script.
with gr.Blocks(title="BM25 Splits Demo") as demo:
    gr.Markdown("# 🚀 BM25 Target Demographic Evaluation Demo")
    gr.Markdown("Test retrieved demographic splits against predefined seed keywords and BERT Subspace metrics.")

    with gr.Row():
        with gr.Column():
            # Indices discovered under the mounted storage bucket.
            index_dropdown = gr.Dropdown(choices=get_available_indices(), label="Select BM25 Index")
            target_demo_input = gr.Textbox(label="Target Demographic (e.g., 'jewish', 'black')", value="jewish")

            seed_words_input = gr.Textbox(
                label="Target Demographic Seed Words (Comma separated)",
                value="the, be, to, of, and, a, in, that, have, I, it, for, not, on, with, he, as, you, do, at"
            )
            generated_words_input = gr.Textbox(
                label="Your Subspace/Generated Keywords (Comma separated)",
                value="church, jesus, christ, prayer"
            )

            submit_btn = gr.Button("Run Compute", variant="primary")

        with gr.Column():
            gr.Markdown("### 📊 Similarity Metrics (BERT-Score)")
            sim_output = gr.Markdown("Waiting to run...")

            gr.Markdown("### 📈 Lift Metrics (BM25)")
            lift_output = gr.Markdown("Waiting to run...")

            gr.Markdown("### 🔍 Top 10 Retrieved Hits")
            table_output = gr.Dataframe()

    # Wire the button: evaluate_keywords returns (lift_md, sim_md, df),
    # matching the three outputs in order.
    submit_btn.click(
        fn=evaluate_keywords,
        inputs=[index_dropdown, target_demo_input, seed_words_input, generated_words_input],
        outputs=[lift_output, sim_output, table_output]
    )

if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard Hugging Face Spaces binding.
    demo.launch(server_name="0.0.0.0", server_port=7860)
core_logic.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import hashlib
4
+ import random
5
+ import pandas as pd
6
+ from typing import Union, Tuple
7
+ import torch
8
+
9
+ from pyserini.search.lucene import LuceneSearcher
10
+ from pyserini.index.lucene import Document
11
+ from pyserini.analysis import get_lucene_analyzer
12
+ from pyserini.pyclass import autoclass
13
+ from scipy.stats import hypergeom
14
+ from subspace.tool import SubspaceBERTScore
15
+
16
+ # ==============================================================================
17
+ # BM25 Search and Query Building
18
+ # ==============================================================================
19
+
20
def get_standard_query(query: str, field: str = "contents", analyzer=None):
    """Parse `query` with Lucene's flexible StandardQueryParser.

    Args:
        query: raw Lucene query string (phrases, OR, etc.).
        field: default field the query terms apply to.
        analyzer: Lucene analyzer to use; defaults to the standard analyzer.
    Returns:
        A parsed Lucene Query object usable with LuceneSearcher.search.
    """
    JStandardQueryParser = autoclass(
        'org.apache.lucene.queryparser.flexible.standard.StandardQueryParser'
    )
    parser = JStandardQueryParser()
    parser.setAnalyzer(analyzer if analyzer is not None else get_lucene_analyzer())
    return parser.parse(query, field)
32
+
33
def _row_from_doc(jd: dict, score) -> dict:
    """Convert one parsed index document (a JSON dict) into a result row.

    Merges the optional JSON-encoded 'metadata' field into the row so
    fields stored there (e.g. 'demographic') become top-level columns.
    """
    row = {
        'id': jd.get("id"),
        'content': jd.get("contents", ""),
        'score': score
    }
    if "metadata" in jd and jd["metadata"]:
        # metadata is a JSON-encoded string nested inside the doc JSON
        row.update(json.loads(jd["metadata"]))
    return row


def query_bm25_index(index_path: str, keywords: list, doc_count: int = 1000) -> pd.DataFrame:
    """Load index, run a BM25 phrase search with the custom HuggingFace
    analyzer, and return a DataFrame of exactly `doc_count` rows.

    Retrieved hits come first (with BM25 scores). If fewer than
    `doc_count` documents match, the result is padded with a
    deterministic (query-seeded) random sample of unretrieved documents
    with score=None, so downstream lift metrics always see a fixed-size
    pool. (Fixes: removed an unused `returned_ids` set and consolidated
    the duplicated row-building code into _row_from_doc.)
    """
    # 1. Load searcher
    searcher = LuceneSearcher(index_path)

    # 2. Analyzer must match the tokenization used at indexing time.
    analyzer = get_lucene_analyzer(
        language='hgf_tokenizer',
        huggingFaceTokenizer='bert-base-uncased'
    )

    # 3. e.g. '"jesus" OR "christ"' -- quoted so each keyword is a phrase.
    query_string = " OR ".join(f'"{kw}"' for kw in keywords)

    # 4. Build the Lucene query via the flexible StandardQueryParser.
    phrase_q = get_standard_query(query_string, analyzer=analyzer)

    # 5. Search
    hits = searcher.search(phrase_q, doc_count)

    # 6. Parse results
    results = [
        _row_from_doc(json.loads(Document(hit.lucene_document).raw()), hit.score)
        for hit in hits
    ]
    returned_ext_ids = {r['id'] for r in results}

    # Pad with random unretrieved items so the pool is always doc_count.
    if len(results) < doc_count:
        needed = doc_count - len(results)
        total = searcher.num_docs

        # Internal docnums whose external ID wasn't already returned.
        pool = []
        for docnum in range(total):
            jd = json.loads(Document(searcher.doc(docnum)).raw())
            if jd.get("id") not in returned_ext_ids:
                pool.append(docnum)

        # Deterministically shuffle, seeded by the query string, so the
        # same query always produces the same padding sample.
        md5 = hashlib.md5(query_string.encode("utf-8")).hexdigest()
        rng = random.Random(int(md5, 16) % 2**32)
        rng.shuffle(pool)

        # Pull 'needed' more docs with no score attached.
        for docnum in pool[:needed]:
            jd = json.loads(Document(searcher.doc(docnum)).raw())
            results.append(_row_from_doc(jd, None))

    return pd.DataFrame(results)
117
+
118
+ # ==============================================================================
119
+ # Evaluation Metrics (Precision/Lift)
120
+ # ==============================================================================
121
+
122
+ def _resolve_k(df, k):
123
+ """Convert float percentages to absolute k or return k as an int."""
124
+ if isinstance(k, float) and 0.0 < k <= 1.0:
125
+ return int(len(df) * k)
126
+ return int(k)
127
+
128
def precision_at_k(df: pd.DataFrame, correct_demographic: str, k: Union[int, float]) -> float:
    """Fraction of the top-k rows whose 'demographic' equals the target.

    k may be an absolute count or a fraction of the frame (see _resolve_k).
    Assumes df rows are already ordered by retrieval score. Returns 0.0
    when the resolved cutoff is non-positive.
    """
    cutoff = _resolve_k(df, k)
    if cutoff <= 0:
        return 0.0
    matches = (df['demographic'].iloc[:cutoff] == correct_demographic).sum()
    return matches / float(cutoff)
135
+
136
def lift_at_k(df: pd.DataFrame, correct_demographic: str, k: Union[int, float]) -> float:
    """Lift@k: precision@k divided by the target's base rate in the pool.

    Returns 0.0 for an empty frame, a non-positive cutoff, or a zero
    base rate (no relevant rows at all).
    """
    cutoff = _resolve_k(df, k)
    total = len(df)
    if cutoff <= 0 or total == 0:
        return 0.0

    base_rate = (df['demographic'] == correct_demographic).sum() / float(total)
    if base_rate == 0:
        return 0.0

    return precision_at_k(df, correct_demographic, k) / base_rate
150
+
151
def hypergeometric_significance_test(df: pd.DataFrame, correct_demographic: str, k: Union[int, float], alpha: float = 0.05) -> Tuple[float, Tuple[int, int], Tuple[float, float]]:
    """Hypergeometric significance test for the top-n overlap.

    Under the null of a random ranking, the number of relevant documents
    in the top n follows Hypergeom(N, K, n). Returns
    (p_value, (L, U), (L/n, U/n)) where p_value = P[X >= observed] and
    [L, U] is the central 1 - alpha interval of the null distribution.
    Degenerate inputs (no relevant docs or n <= 0) yield zeros.
    """
    n = _resolve_k(df, k)
    N = len(df)

    relevant = (df['demographic'] == correct_demographic)
    K = int(relevant.sum())
    observed = int(relevant.iloc[:n].sum())

    if K == 0 or n <= 0:
        return 0.0, (0, 0), (0.0, 0.0)

    # Survival function at observed - 1 gives P[X >= observed].
    p_value = hypergeom.sf(observed - 1, N, K, n)
    lo = int(hypergeom.ppf(alpha / 2, N, K, n))
    hi = int(hypergeom.isf(alpha / 2, N, K, n))

    return p_value, (lo, hi), (lo / n, hi / n)
168
+
169
def lift_ci(df: pd.DataFrame, correct_demographic: str, k: Union[int, float], alpha: float = 0.05) -> Tuple[float, float, float]:
    """Confidence interval for lift@k via the hypergeometric null.

    Returns (p_value, lower_lift, upper_lift). Degenerate inputs (empty
    frame, no relevant docs, non-positive cutoff) return (0.0, 0.0, 0.0).

    Fix: the original computed K / len(df) before its degenerate-input
    guard, raising ZeroDivisionError on an empty DataFrame; the empty
    frame is now rejected before any division.
    """
    N = len(df)
    if N == 0:
        return 0.0, 0.0, 0.0

    n = _resolve_k(df, k)
    rel = (df['demographic'] == correct_demographic).astype(int)
    K = rel.sum()
    overall_proportion = K / float(N)

    if K == 0 or n <= 0 or overall_proportion == 0:
        return 0.0, 0.0, 0.0

    pval, (L, U), _ = hypergeometric_significance_test(df, correct_demographic, k, alpha)
    # Scale the null-interval precisions into lift units.
    lower_bound_lift = (L / n) / overall_proportion
    upper_bound_lift = (U / n) / overall_proportion

    return pval, lower_bound_lift, upper_bound_lift
186
+
187
+ # ==============================================================================
188
+ # Keyword Similarity (SubspaceBERTScore)
189
+ # ==============================================================================
190
+
191
def compute_keyword_similarity(set1: list, set2: list, device: str = None) -> dict:
    """BERT-subspace precision/recall/F-score between two keyword sets.

    Each keyword list is joined into one comma-separated "sentence" and
    scored with SubspaceBERTScore. Device defaults to CUDA when
    available, otherwise CPU. NOTE(review): the model is re-loaded on
    every call; cache the scorer if this becomes a hot path.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print(f"Initializing BERT model on {device}...")
    scorer = SubspaceBERTScore(device=device, model_name_or_path='bert-base-uncased')

    precision, recall, fscore = scorer([", ".join(set1)], [", ".join(set2)])

    return {
        'Precision': precision.item(),
        'Recall': recall.item(),
        'F-Score': fscore.item()
    }
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ default-jre
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pyserini==0.44.0
2
+ pandas==2.2.3
3
+ scipy==1.15.2
4
+ transformers==4.53.2
5
+ torch
6
+ gradio
subspace/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from .similarity import subspace_johnson
2
+ from .similarity import vanilla_bert_score
3
+ from .similarity import subspace_bert_score
4
+
5
+ # Other metrics
6
+ from .fuzzy import *
7
+ from .symbolic import *
8
+ #from .optimal_transport import *
9
+ #from .grassmannian import *
subspace/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (322 Bytes). View file
 
subspace/__pycache__/fuzzy.cpython-310.pyc ADDED
Binary file (1.15 kB). View file
 
subspace/__pycache__/similarity.cpython-310.pyc ADDED
Binary file (5.61 kB). View file
 
subspace/__pycache__/symbolic.cpython-310.pyc ADDED
Binary file (1.09 kB). View file
 
subspace/__pycache__/tool.cpython-310.pyc ADDED
Binary file (2.84 kB). View file
 
subspace/fuzzy.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2018 Babylon Partners. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ import numpy as np
17
+
18
+
19
def fuzzify(s, u):
    """Sentence fuzzifier.

    Computes the membership vector of sentence S with respect to the
    universe U: for each universe word, the highest non-negative
    dot-product similarity achieved by any word of the sentence.

    :param s: word-embedding matrix for the sentence, shape (n, d)
    :param u: universe matrix U, shape (K, d)
    :return: membership vector of shape (K,)
    """
    similarities = s @ u.T
    membership = similarities.max(axis=0)
    # Clamp negatives to zero: membership degrees are non-negative.
    return np.maximum(membership, 0)
32
+
33
+
34
def dynamax_jaccard(x, y):
    """DynaMax-Jaccard similarity between two sentences.

    The universe is the union of both sentences' word vectors; each
    sentence is fuzzified against it and the fuzzy Jaccard index
    (sum of mins over sum of maxes) is returned.

    :param x: word-embedding matrix for the first sentence
    :param y: word-embedding matrix for the second sentence
    :return: similarity score between the two sentences
    """
    universe = np.vstack((x, y))
    mx = fuzzify(x, universe)
    my = fuzzify(y, universe)

    intersection = np.sum(np.minimum(mx, my))
    union = np.sum(np.maximum(mx, my))
    return intersection / union
subspace/grassmannian.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import scipy
3
+
4
+
5
def grassmann_distance(U, V):
    """Squared geodesic (Grassmann) distance between two subspaces.

    Args:
        U, V: matrices whose rows are bases of each linear subspace
    Return:
        Sum of squared canonical angles between the subspaces
    See Also:
        scipy.linalg.subspace_angles
    Example:
        >>> U = np.array([[1,0,0], [1,1,1]])
        >>> V = np.array([[0,1,0], [1,1,1]])
        >>> grassmann_distance(U, V)
    """
    # Canonical (principal) angles; subspace_angles expects column bases.
    angles = scipy.linalg.subspace_angles(U.T, V.T)
    return (angles * angles).sum()
23
+
24
+
25
def grassmann_similarity(x, y):
    """Similarity as the negated Grassmann distance (larger = closer)."""
    return -grassmann_distance(x, y)
subspace/legacy_operations/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .operations import *
subspace/legacy_operations/operations.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from scipy.linalg import orth
3
+
4
def subspace_np(A):
    """ Compute orthonormal bases of the subspace
    Args:
        A: bases of the linear subspace (n_bases, dim)
    Return:
        Orthonormal bases spanning the same row space as A
    Example:
        >>> A = np.random.random_sample((10, 300))
        >>> subspace_np(A)
    """
    # scipy's orth works on column spaces, so transpose in and back out.
    return orth(A.T).T
15
+
16
+
17
def intersection_np(SA, SB, threshold=1e-2):
    """ Compute bases of the intersection of two subspaces.

    Canonical angles near zero (cosine near 1) mark shared directions;
    the corresponding left singular vectors, mapped back through SA,
    span the intersection.

    Args:
        SA, SB: bases of the linear subspace (n_bases, dim)
    Return:
        Bases of intersection
    Example:
        >>> A = np.random.random_sample((10, 300))
        >>> B = np.random.random_sample((20, 300))
        >>> intersection_np(A, B)
    """
    assert threshold > 1e-6

    # Recurse so the smaller subspace comes first.
    if SA.shape[0] > SB.shape[0]:
        return intersection_np(SB, SA, threshold)

    ortho_a = subspace_np(SA)
    ortho_b = subspace_np(SB)

    # Singular values of the overlap matrix are cos(canonical angles).
    left, cosines, _ = np.linalg.svd(ortho_a @ ortho_b.T)

    shared = left[:, np.abs(cosines - 1.0) < threshold]
    return (ortho_a.T @ shared).T
43
+
44
+
45
def sum_space_np(SA, SB):
    """ Compute bases of the sum space span(SA) + span(SB).

    Args:
        SA, SB: bases of the linear subspace (n_bases, dim)
    Return:
        Bases of sum space
    Example:
        >>> A = np.random.random_sample((10, 300))
        >>> B = np.random.random_sample((20, 300))
        >>> sum_space_np(A, B)
    """
    # Stack both generating sets and orthonormalize the union.
    return subspace_np(np.concatenate([SA, SB], axis=0))
58
+
59
+
60
def orthogonal_complement_np(SA, threshold=1e-2):
    """ Compute bases of the orthogonal complement of span(SA).

    Args:
        SA: bases of the linear subspace (n_bases, dim)
    Return:
        Bases of the orthogonal complement, shape (dim - rank, dim)
    Example:
        >>> A = np.random.random_sample((10, 300))
        >>> orthogonal_complement_np(A)
    """
    assert threshold > 1e-6
    left, singular_values, _ = np.linalg.svd(SA.T)
    # Columns of `left` beyond the numerical rank span the complement.
    rank = (singular_values > threshold).sum()
    return left[:, rank:].T
75
+
76
+
77
def soft_membership_np(A, v):
    """ Compute membership degree of the vector v for the subspace A.

    The singular values of the product of the orthonormalized bases are
    the cosines of the canonical angles; the largest (clamped to 1 to
    absorb numerical noise) is the membership degree.

    Args:
        A: bases of the linear subspace (n_bases, dim)
        v: vector (dim,)
    Return:
        soft membership degree
    Example:
        >>> A = np.array([[1,0,0], [0,1,0]])
        >>> v = np.array([1,0,0])
        >>> soft_membership_np(A, v)
        1.0
        >>> A = np.array([[1,0,0], [0,1,0]])
        >>> v = np.array([0,0,1])
        >>> soft_membership_np(A, v)
        0.0
    """
    unit_v = subspace_np(v.reshape(1, len(v)))
    basis = subspace_np(A)

    # The cosines of the canonical angles are the singular values.
    _, cosines, _ = np.linalg.svd(basis @ unit_v.T)
    return np.max(np.minimum(cosines, 1))
subspace/operations.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def subspace(A: torch.Tensor) -> torch.Tensor:
    """
    Compute orthonormal bases of the subspace
    Args:
        A: bases of the linear subspace (n_bases, dim)
    Return:
        Orthonormal bases spanning the same row space as A
    Example:
        >>> A = torch.rand(10, 300)
        >>> subspace(A)
    """
    # QR on the transpose orthonormalizes the row space.
    # NOTE(review): torch.linalg.qr requires a floating-point tensor, so
    # integer inputs (as in some sibling doctests) would raise — confirm
    # callers always pass floats.
    return torch.linalg.qr(A.t()).Q.t()
15
+
16
+
17
def intersection(SA: torch.Tensor, SB: torch.Tensor, threshold: float = 1e-2) -> torch.Tensor:
    """
    Compute bases of the intersection of two subspaces.

    Singular values of the orthonormalized overlap matrix are cosines of
    the canonical angles; values near 1 mark shared directions.

    Args:
        SA, SB: bases of the linear subspace (n_bases, dim)
    Return:
        Bases of intersection
    Example:
        >>> A = torch.rand(10, 300)
        >>> B = torch.rand(20, 300)
        >>> intersection(A, B)
    """
    assert threshold > 1e-6

    # Recurse so the smaller subspace comes first.
    if SA.shape[0] > SB.shape[0]:
        return intersection(SB, SA, threshold)

    ortho_a = subspace(SA)
    ortho_b = subspace(SB)

    left, cosines, _ = torch.linalg.svd(ortho_a @ ortho_b.t())

    shared = left[:, (cosines - 1.0).abs() < threshold]
    return (ortho_a.t() @ shared).t()
44
+
45
+
46
def sum_space(SA: torch.Tensor, SB: torch.Tensor) -> torch.Tensor:
    """
    Compute bases of the sum space span(SA) + span(SB).

    Args:
        SA, SB: bases of the linear subspace (n_bases, dim)
    Return:
        Bases of sum space
    Example:
        >>> A = torch.rand(10, 300)
        >>> B = torch.rand(20, 300)
        >>> sum_space(A, B)
    """
    # Stack both generating sets and orthonormalize the union.
    return subspace(torch.cat([SA, SB], dim=0))
60
+
61
+
62
def orthogonal_complement(SA: torch.Tensor, threshold: float = 1e-2) -> torch.Tensor:
    """
    Compute bases of the orthogonal complement of span(SA).

    Args:
        SA: bases of the linear subspace (n_bases, dim)
    Return:
        Bases of the orthogonal complement, shape (dim - rank, dim)
    Example:
        >>> A = torch.rand(10, 300)
        >>> orthogonal_complement(A)
    """
    assert threshold > 1e-6
    left, singular_values, _ = torch.linalg.svd(SA.t())
    # Columns of `left` beyond the numerical rank span the complement.
    rank = (singular_values > threshold).sum()
    return left[:, rank:].T
78
+
79
+
80
def soft_membership(A: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """
    Compute membership degree of the vector v for the subspace A.

    The singular values of the orthonormalized product are cosines of
    the canonical angles; the largest (clamped to 1 to absorb numerical
    noise) is returned.

    NOTE(review): the doctest uses integer tensors, but torch.linalg.qr
    (inside `subspace`) requires floats — use float tensors in practice.

    Args:
        A: bases of the linear subspace (n_bases, dim)
        v: vector (dim,)
    Return:
        soft membership degree
    Example:
        >>> A = torch.tensor([[1,0,0], [0,1,0]])
        >>> v = torch.tensor([1,0,0])
        >>> soft_membership(A, v)
        1.0
        >>> A = torch.tensor([[1,0,0], [0,1,0]])
        >>> v = torch.tensor([0,0,1])
        >>> soft_membership(A, v)
        0.0
    """
    unit_v = subspace(v.reshape(1, len(v)))
    basis = subspace(A)

    # The cosines of the canonical angles are the singular values.
    _, cosines, _ = torch.linalg.svd(basis @ unit_v.t())
    return torch.max(torch.clamp(cosines, max=1.0))
108
+
subspace/optimal_transport.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://kexue.fm/archives/7388
2
+
3
+ import numpy as np
4
+ from scipy.optimize import linprog
5
+
6
+
7
def wasserstein_distance(p, q, D):
    """Solve the discrete optimal-transport LP and return its cost.

    Minimizes <T, D> over transport plans T whose row sums equal p and
    column sums equal q. One redundant equality constraint is dropped
    ([:-1]) since the constraints are linearly dependent.

    :param p: source distribution, shape (n,)
    :param q: target distribution, shape (m,)
    :param D: cost matrix, shape (n, m)
    :return: optimal transport cost
    """
    constraints = []
    # Row-sum constraints: mass leaving source i equals p[i].
    for i in range(len(p)):
        row = np.zeros_like(D)
        row[i, :] = 1
        constraints.append(row.reshape(-1))
    # Column-sum constraints: mass entering target j equals q[j].
    for j in range(len(q)):
        col = np.zeros_like(D)
        col[:, j] = 1
        constraints.append(col.reshape(-1))

    A_eq = np.array(constraints)
    b_eq = np.concatenate([p, q])
    result = linprog(D.reshape(-1), A_eq=A_eq[:-1], b_eq=b_eq[:-1])
    return result.fun
22
+
23
+
24
def word_rotator_distance(x, y):
    """Word Rotator's Distance.

    Optimal transport between the two sentences where each word's mass
    is proportional to its embedding norm and the cost is cosine
    distance between normalized embeddings.
    """
    x_norm = (x ** 2).sum(axis=1, keepdims=True) ** 0.5
    y_norm = (y ** 2).sum(axis=1, keepdims=True) ** 0.5

    mass_x = x_norm[:, 0] / x_norm.sum()
    mass_y = y_norm[:, 0] / y_norm.sum()

    cost = 1 - (x / x_norm) @ (y / y_norm).T
    return wasserstein_distance(mass_x, mass_y, cost)
31
+
32
+
33
def word_mover_distance(x, y):
    """Word Mover's Distance.

    Optimal transport with uniform mass on every word and a
    root-mean-square per-dimension cost between word pairs.
    """
    mass_x = np.ones(x.shape[0]) / x.shape[0]
    mass_y = np.ones(y.shape[0]) / y.shape[0]

    cost = np.sqrt(np.square(x[:, None] - y[None, :]).mean(axis=2))
    return wasserstein_distance(mass_x, mass_y, cost)
38
+
39
+
40
def word_rotator_similarity(x, y):
    """Similarity form of Word Rotator's Distance (1 - distance)."""
    return 1 - word_rotator_distance(x, y)
42
+
43
+
44
def word_mover_similarity(x, y):
    """Similarity form of Word Mover's Distance (1 - distance)."""
    return 1 - word_mover_distance(x, y)
46
+
47
+
subspace/similarity.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def get_weights(A, B, weight):
    """Per-token weights for two batches of token embeddings.

    weight: "L2" (vector norms), "L1" (absolute-sum norms) or "no"
    (uniform ones). Output shapes are (batch, num_tokens).
    Raises NotImplementedError for any other scheme.
    """
    if weight == "L2":
        return torch.linalg.norm(A, dim=2), torch.linalg.norm(B, dim=2)
    if weight == "L1":
        return (torch.linalg.norm(A, dim=2, ord=1),
                torch.linalg.norm(B, dim=2, ord=1))
    if weight == "no":
        return (torch.ones(A.size(0), A.size(1)).to(A.device),
                torch.ones(B.size(0), B.size(1)).to(B.device))
    raise NotImplementedError
17
+
18
+
19
def pairwise_cosine_matrix(matrix1, matrix2):
    """Batched pairwise cosine similarity.

    matrix1: (batch, n, d), matrix2: (batch, m, d) -> (batch, n, m).
    NOTE(review): rows with zero norm produce NaN (0/0) — callers must
    avoid all-zero embeddings.
    """
    numerator = torch.matmul(matrix1, matrix2.transpose(1, 2))
    norms1 = torch.norm(matrix1, dim=-1, keepdim=True)
    norms2 = torch.norm(matrix2, dim=-1, keepdim=True)
    denominator = torch.matmul(norms1, norms2.transpose(1, 2))
    return numerator / denominator
25
+
26
+
27
def subspace_batch(A):
    """ Orthonormalize each matrix of a batch via QR decomposition.

    Arg:
        A: Bases of a linear subspace (batchsize, num_bases, emb_dim)
    Return:
        S: Orthonormalized bases (batchsize, num_bases, emb_dim)
    Example:
        >>> A = torch.randn(5, 4, 300)
        >>> subspace_batch(A)
    """
    # QR on the transposed batch orthonormalizes each row space.
    Q, _ = torch.linalg.qr(torch.transpose(A, 1, 2))
    return torch.transpose(Q, 1, 2)
40
+
41
+
42
@torch.jit.script
def soft_membership_batch(S, v):
    """ Compute soft membership degree between a subspace and a vector for a batch of vectors

    The singular values of S @ v_normalized are the cosines of the
    canonical angles between the subspace and the vector; their mean
    over the singular-value dimension is returned.

    Args:
        S: Orthonormalized bases of a linear subspace (batchsize, num_bases, emb_dim)
        v: vector (batchsize, emb_dim)
    Return:
        soft_membership degree (batchsize,)
    Example:
        >>> S = torch.randn(5, 4, 300)
        >>> v = torch.randn(5, 300)
        >>> soft_membership_batch(S, v)
    """
    # normalize each vector to unit length so singular values are cosines
    v = torch.nn.functional.normalize(v)
    v = v.view(v.size(0), v.size(1), 1)

    # compute SVD for cos(theta)
    m = torch.matmul(S, v)
    s = torch.linalg.svdvals(m.float()) # s is the sequence of cos(theta_i)
    return torch.mean(s, 1)
64
+
65
+
66
def subspace_johnson(A, B, weight="L2"):
    """ Symmetrized subspace similarity between two batches of sentences.

    Each side's token vectors are scored by soft membership in the other
    side's orthonormalized subspace, combined with per-token weights,
    and the two normalized directed scores are summed.

    Args:
        A: Matrix of word embeddings for the first sentence
           (batchsize, num_bases, dim)
        B: Matrix of word embeddings for the second sentence
           (batchsize, num_bases, dim)
    Return:
        similarity between A and B (batchsize,)
    Example:
        >>> A = torch.randn(5, 3, 300)
        >>> B = torch.randn(5, 4, 300)
        >>> subspace_johnson(A, B)
    """
    def directed_score(tokens, basis, weights):
        # tokens: word embeddings; basis: orthonormalized bases
        memberships = torch.stack([
            soft_membership_batch(basis, vec)
            for vec in torch.transpose(tokens, 0, 1)
        ])
        memberships = torch.transpose(memberships, 0, 1)
        return torch.sum(memberships * weights, 1)

    weights_A, weights_B = get_weights(A, B, weight)

    forward = directed_score(A, subspace_batch(B), weights_A) / torch.sum(weights_A, 1)
    backward = directed_score(B, subspace_batch(A), weights_B) / torch.sum(weights_B, 1)
    return forward + backward
97
+
98
+
99
+
100
def subspace_bert_score(A, B, weight="L2"):
    """ BERTScore-style precision/recall/F using subspace soft membership.

    Args:
        A: Matrix of word embeddings for the first sentence
           (batchsize, num_bases, dim)
        B: Matrix of word embeddings for the second sentence
           (batchsize, num_bases, dim)
    Return:
        (P, R, F) tensors, each of shape (batchsize,)
    Example:
        >>> A = torch.randn(5, 3, 300)
        >>> B = torch.randn(5, 4, 300)
        >>> subspace_bert_score(A, B)
    """
    def directed_score(tokens, basis, weights):
        # tokens: word embeddings; basis: orthonormalized bases
        memberships = torch.stack([
            soft_membership_batch(basis, vec)
            for vec in torch.transpose(tokens, 0, 1)
        ])
        memberships = torch.transpose(memberships, 0, 1)
        return torch.sum(memberships * weights, 1)

    weights_A, weights_B = get_weights(A, B, weight)

    # Recall: A's tokens against B's subspace (left term of
    # SubspaceJohnson); Precision: the reverse (right term).
    R = directed_score(A, subspace_batch(B), weights_A) / torch.sum(weights_A, 1)
    P = directed_score(B, subspace_batch(A), weights_B) / torch.sum(weights_B, 1)
    F = (2 * P * R) / (P + R)
    return P, R, F
132
+
133
+
134
def vanilla_bert_score(A, B, weight="L2"):
    """ Standard greedy-matching BERTScore precision/recall/F.

    Args:
        A: Matrix of word embeddings for the first sentence
           (batchsize, num_bases, dim)
        B: Matrix of word embeddings for the second sentence
           (batchsize, num_bases, dim)
    Return:
        (P, R, F) tensors, each of shape (batchsize,)
    Example:
        >>> A = torch.randn(5, 3, 300)
        >>> B = torch.randn(5, 4, 300)
        >>> vanilla_bert_score(A, B)
    """
    def weighted_best_match(pairwise_cos, dim, weights):
        best, _ = pairwise_cos.max(dim=dim)
        return torch.sum(best * weights, 1)

    weights_A, weights_B = get_weights(A, B, weight)

    pairwise_cos = pairwise_cosine_matrix(A, B)

    # R: each token of A matched to its best token in B (left term of
    # SubspaceJohnson); P: each token of B matched to its best token in
    # A (right term).
    R = weighted_best_match(pairwise_cos, 2, weights_A) / torch.sum(weights_A, 1)
    P = weighted_best_match(pairwise_cos, 1, weights_B) / torch.sum(weights_B, 1)
    F = (2 * P * R) / (P + R)
    return P, R, F
subspace/symbolic.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def symbolic_johnson(x, y):
    """
    Classical Johnson similarity measure between two sets.

    Computed on the deduplicated sets as |x∩y|/|x| + |x∩y|/|y|.
    Empty input on either side yields 0.0.

    :param x: list of words (strings) for the first sentence
    :param y: list of words (strings) for the second sentence
    :return: similarity score between two sentences
    """
    if not x or not y:
        return 0.0
    xs, ys = set(x), set(y)
    overlap = len(xs & ys)
    return overlap / len(xs) + overlap / len(ys)
14
+
15
+
16
def symbolic_jaccard(x, y):
    """
    Classical Jaccard similarity measure between two sets.

    Computed on the deduplicated sets as |x∩y| / |x∪y|.
    Empty input on either side yields 0.0.

    :param x: list of words (strings) for the first sentence
    :param y: list of words (strings) for the second sentence
    :return: similarity score between two sentences
    """
    if not x or not y:
        return 0.0
    xs, ys = set(x), set(y)
    return len(xs & ys) / len(xs | ys)
subspace/tool.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import transformers
3
+ from transformers import AutoTokenizer, AutoModel
4
+ from numpy import ndarray
5
+ import numpy as np
6
+ from .similarity import subspace_johnson, subspace_bert_score, vanilla_bert_score
7
+
8
+
9
class MySimilarity:
    """Base class wrapping a HuggingFace encoder.

    Subclasses implement __call__ to score a pair of sentence batches;
    the base class provides model loading and the `encode` helper.
    (Fix: removed the dead `single_sentence` flag that was computed but
    never read.)
    """

    def __init__(self, device='cpu', model_name_or_path='bert-base-uncased'):
        # Load tokenizer + encoder once; eval mode, pinned to `device`.
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        self.model.eval()
        self.model.to(device)
        # Truncation limit applied by the tokenizer in encode().
        self.max_length = 128

    def __call__(self, sentence1, sentence2, weight="L2"):
        # Subclasses implement the actual similarity; base is a no-op.
        pass

    def encode(self, sentence, return_numpy=False, batch_size=12):
        """Encode sentence(s) to token embeddings (last hidden state).

        Accepts a single string or a list of strings. Returns a tensor
        of shape (n_sentences, seq_len, hidden) on CPU, or an ndarray
        when return_numpy is True.

        NOTE(review): batches are padded independently, so torch.cat
        assumes every batch pads to the same sequence length — confirm
        inputs, or encode in a single batch.
        """
        if isinstance(sentence, str):
            sentence = [sentence]

        embedding_list = []
        with torch.no_grad():
            # ceil(len / batch_size) batches
            total_batch = -(-len(sentence) // batch_size)
            for batch_id in range(total_batch):
                batch = sentence[batch_id * batch_size:(batch_id + 1) * batch_size]
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors="pt"
                )
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                outputs = self.model(**inputs, return_dict=True)

                # Move off-device immediately to free accelerator memory.
                embedding_list.append(outputs.last_hidden_state.cpu())

        embeddings = torch.cat(embedding_list, 0)

        if return_numpy and not isinstance(embeddings, ndarray):
            return embeddings.numpy()
        return embeddings
52
+
53
+
54
class SubspaceJohnsonSimilarity(MySimilarity):
    """Scores sentence pairs with the symmetrized subspace Johnson metric."""

    def __call__(self, sentence1, sentence2, weight="L2"):
        embedded1 = self.encode(sentence1)
        embedded2 = self.encode(sentence2)
        return subspace_johnson(embedded1, embedded2, weight)
59
+
60
+
61
class SubspaceBERTScore(MySimilarity):
    """Scores sentence pairs with subspace-based BERTScore (P, R, F)."""

    def __call__(self, sentence1, sentence2, weight="L2"):
        embedded1 = self.encode(sentence1)
        embedded2 = self.encode(sentence2)
        return subspace_bert_score(embedded1, embedded2, weight)
66
+
67
+
68
class VanillaBERTScore(MySimilarity):
    """Scores sentence pairs with standard greedy-matching BERTScore."""

    def __call__(self, sentence1, sentence2, weight="L2"):
        embedded1 = self.encode(sentence1)
        embedded2 = self.encode(sentence2)
        return vanilla_bert_score(embedded1, embedded2, weight)