YAML Metadata Warning: empty or missing YAML metadata in the repo card.

Check out the documentation for more information.

BGE-M3 PropTech Retriever

Korean property domain-specific retrieval model based on BGE-M3 light version

Dimensionality Reduction for a Lightweight Version of BGE-M3 Embeddings

  • ๊ณ ์ •๋œ bge embedding์— MLP ๋ ˆ์ด์–ด ์ ์šฉ โ†’ query์— pair๋œ positive ๋ฌธ์„œ์™€ negative ๋ฌธ์„œ ๊ฐ„ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ โ†’ ์ƒ์œ„ ๋ฌธ์„œ ์ถ”์ฒœ
  • Frozen Base Model + Trainable MLP Parameters: Triplet Loss for Fine-Tuning only fc layers

Dataset: ํ”„๋กญํ…Œํฌ ๋„๋ฉ”์ธ query-document ์Œ ๋ฐ์ดํ„ฐ์…‹

from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np

# Load dataset
def load_data():
    dataset = load_dataset("crjoya/korean-proptech-retrieval")
    data_list = list(dataset['train'])
    train_data, eval_data = train_test_split(
            data_list, 
            test_size=0.2, 
            random_state=42
        )
    return train_data, eval_data

Usage

For testing simple document

from datasets import load_dataset
from huggingface_hub import hf_hub_download
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import json


# 1. Load models and data
def load_model(repo_id="crjoya/bge-m3-proptech-retrieval", device="cpu"):
    """Load the frozen BGE-M3 encoder and the fine-tuned MLP head.

    Args:
        repo_id: Hugging Face repo holding the trained MLP weights.
        device: torch device string for the MLP head (e.g. "cpu", "cuda").

    Returns:
        (bge_model, mlp_model): the base encoder and the MLP head in eval mode.
    """
    model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")

    # Load the base bge_model in FP32 so embeddings match the dtype the
    # MLP head expects downstream (inference casts to float32).
    bge_model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=False)  # Set to FP32
    mlp_model = BGE_M3_FineTuner().to(device).eval()
    # map_location lets a checkpoint saved on GPU load on CPU-only hosts;
    # without it torch.load fails when CUDA is unavailable.
    mlp_model.load_state_dict(torch.load(model_path, map_location=device))

    return bge_model, mlp_model


def inference(query, documents, bge_model, mlp_model, device="cpu"):
    """Score each candidate document against the query.

    Args:
        query: query string.
        documents: iterable of candidate document strings.
        bge_model: encoder exposing ``encode(text)["dense_vecs"]``.
        mlp_model: trained projection head applied to every embedding.
        device: torch device string the tensors are moved to.

    Returns:
        list[float]: cosine similarity per document, in input order.
    """
    # Bug fix: ``F`` was used but never imported anywhere in the visible
    # file, so the original raised NameError at the similarity call.
    import torch.nn.functional as F

    # Pure inference — disabling autograd saves memory, values unchanged.
    with torch.no_grad():
        # Get query embedding and project it through the MLP head.
        query_emb = bge_model.encode(query)["dense_vecs"]
        query_tensor = torch.tensor(query_emb, dtype=torch.float32).to(device)
        query_vec = mlp_model(query_tensor)

        # Score every document against the projected query vector.
        similarities = []
        for doc in documents:
            doc_emb = bge_model.encode(doc)["dense_vecs"]
            doc_tensor = torch.tensor(doc_emb, dtype=torch.float32).to(device)
            # pass reduced dimension vector through MLP layers
            doc_vec = mlp_model(doc_tensor)
            # assumes encode() yields a 1-D vector per string, hence dim=0
            sim = F.cosine_similarity(query_vec, doc_vec, dim=0)
            similarities.append(sim.item())

    return similarities


# Infer Usage
# Load the frozen encoder + fine-tuned MLP head once, then rank candidate
# documents for a Korean PropTech query.
# Bug fix: the sample strings were mojibake (UTF-8 Korean mis-decoded as
# Latin-1); restored to valid Korean text.
bge_model, mlp_model = load_model()

query = "서울 아파트 매매 시세 알아보고 싶어요"
documents = [
    "서울 강남구 아파트 매매가는 최근 평당 1억 원을 돌파했습니다.",
    "서울 아파트 전세 가격이 안정세를 보이고 있습니다.",
]

similarities = inference(query, documents, bge_model, mlp_model)
Downloads last month
1
Inference Providers NEW
This model isn't deployed by any Inference Provider. ๐Ÿ™‹ Ask for provider support