Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

.DS_Store +0 -0
.gitattributes +1 -0
1_Pooling/config.json +30 -0
1_Pooling/config_sentence_transformers.json +14 -0
1_Pooling/modules.json +14 -0
1_Pooling/sentence_bert_config.json +10 -0
1_Pooling/tokenizer_config.json +23 -0
README.md +117 -0
model.safetensors +3 -0
tokenizer.json +3 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "add_cross_attention": false,
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": null,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "is_decoder": false,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.4",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 250037
+}

1_Pooling/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "__version__": {
+    "pytorch": "2.11.0",
+    "sentence_transformers": "5.4.1",
+    "transformers": "5.5.4"
+  },
+  "default_prompt_name": null,
+  "model_type": "SentenceTransformer",
+  "prompts": {
+    "document": "",
+    "query": ""
+  },
+  "similarity_fn_name": "cosine"
+}

1_Pooling/modules.json ADDED Viewed

	@@ -0,0 +1,14 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.base.modules.transformer.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.sentence_transformer.modules.pooling.Pooling"
+  }
+]

1_Pooling/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "transformer_task": "feature-extraction",
+    "modality_config": {
+        "text": {
+            "method": "forward",
+            "method_output_name": "last_hidden_state"
+        }
+    },
+    "module_output_name": "token_embeddings"
+}

1_Pooling/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "do_lower_case": true,
+  "eos_token": "</s>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "max_length": 128,
+  "model_max_length": 128,
+  "pad_to_multiple_of": null,
+  "pad_token": "<pad>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "</s>",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "TokenizersBackend",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<unk>"
+}

README.md ADDED Viewed

	@@ -0,0 +1,117 @@

+---
+language:
+- en
+- ru
+- fr
+- de
+- es
+- it
+- pt
+license: apache-2.0
+tags:
+- sentence-transformers
+- sentence-similarity
+- feature-extraction
+- literary
+- semantic-search
+- multilingual
+base_model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
+datasets:
+- rafaelui/literary-text-pairs
+pipeline_tag: sentence-similarity
+---
+# literary-minilm
+A multilingual semantic search model fine-tuned for **literary text** — novels, short stories, and other fiction. Built on top of [`paraphrase-multilingual-MiniLM-L12-v2`](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2), this model is optimized to understand narrative language, character descriptions, plot dynamics, and thematic queries across 7 languages.
+Developed for use in [Impulse](https://apps.apple.com/us/app/impulse-writers-studio/id6761264842?mt=12) — a macOS writing app for authors.
+## Model Details
+| Property | Value |
+|---|---|
+| Base model | paraphrase-multilingual-MiniLM-L12-v2 |
+| Architecture | BERT (12 layers, 384 hidden size) |
+| Max sequence length | 128 tokens |
+| Languages | English, Russian, French, German, Spanish, Italian, Portuguese |
+| Training pairs | ~134,000 |
+| Output dimension | 384 |
+| License | Apache 2.0 |
+## Why literary-minilm?
+General-purpose multilingual embeddings are trained on a broad mix of content: Wikipedia, Reddit, StackOverflow, scientific papers, and web crawls. This works well for factual retrieval but poorly for fiction — where meaning is conveyed through metaphor, subtext, character voice, and narrative context.
+**literary-minilm** is domain-adapted exclusively on fiction. The result is a model that understands queries like:
+- *"scene where the hero doubts himself"*
+- *"description of a mysterious city at night"*
+- *"character who sacrifices everything for love"*
+## Training Data
+The model was fine-tuned on a custom dataset of ~134,000 literary text pairs across 7 languages, generated from:
+- **English**: Project Gutenberg (via `emozilla/pg19`) and `manu/project_gutenberg`
+- **Russian**: RusLit corpus (classical Russian prose) and `cointegrated/taiga_stripped_proza`
+- **French, German, Spanish, Italian, Portuguese**: OPUS Books (`Helsinki-NLP/opus_books`) and `manu/project_gutenberg`
+Each training example consists of:
+- `anchor` — a passage of literary text (up to 256 tokens)
+- `semantic_phrase` — a short natural-language search query describing the passage (5–10 words)
+- `paraphrase` — a rephrasing of the anchor in different words
+Training pairs were generated using a combination of YandexGPT, GPT-4.1-nano, and Qwen3 235B, then filtered for quality.
+## Usage
+### With sentence-transformers
+```python
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer("rafaelui/literary-minilm")
+query = "hero says goodbye to a friend before war"
+passages = [
+    "He embraced his friend and held on for a long time, knowing he would never see him again.",
+    "The sun was bright, birds sang in the garden.",
+    "She closed the book and sat thinking about what she had read."
+]
+query_emb = model.encode(query)
+passage_embs = model.encode(passages)
+from sentence_transformers.util import cos_sim
+scores = cos_sim(query_emb, passage_embs)[0]
+for passage, score in zip(passages, scores):
+    print(f"{score:.3f}: {passage}")
+```
+Output:
+```
+0.621: He embraced his friend and held on for a long time, knowing he would never see him again.
+-0.082: The sun was bright, birds sang in the garden.
+0.275: She closed the book and sat thinking about what she had read.
+```
+### CoreML (iOS / macOS)
+A compiled `.mlpackage` is available for direct use in Apple platform apps. See the [Files and versions](https://huggingface.co/rafaelui/literary-minilm/tree/main) tab of the model repository.
+## Limitations
+- Optimized for **fiction only** — performance on factual, technical, or conversational text may be lower than the base model
+- Context window is limited to **128 tokens** — longer passages should be chunked
+- East Asian languages (Chinese, Japanese, Korean) were not included in fine-tuning; for these, the model relies on the base model's multilingual capabilities
+## Author
+**Alexei Goncharov** — [ImpulseLeap](https://www.impulseleap.com)
+Built for [Impulse](https://apps.apple.com/us/app/impulse-writers-studio/id6761264842?mt=12), a macOS app for writers.
+## License
+Apache 2.0

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b20cb0c384f0e394db15dd6ff520ef92d5919d290d1fa7e8d75f58508b159832
+size 470637392

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cad551d5600a84242d0973327029452a1e3672ba6313c2a3c3d69c4310e12719
+size 17082987