hackersgame committed on
Commit
30e273c
·
verified ·
1 Parent(s): 6132608

Upload fle.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. fle.py +152 -0
fle.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Free Language Embeddings — load and query V34 word vectors.
3
+
4
+ Usage:
5
+ python fle.py # interactive mode
6
+ python fle.py king - man + woman # single query
7
+ python fle.py --similar cat # nearest neighbors
8
+
9
+ Requires: fle_v34.npz (download from GitHub releases)
10
+ """
11
+
12
+ import numpy as np
13
+ import sys
14
+ import os
15
+
16
+ EMBEDDINGS_FILE = os.path.join(os.path.dirname(__file__), "fle_v34.npz")
17
+
18
+
19
class FLE:
    """Free Language Embeddings — 100K words, 300d, V34 dynamic masking word2vec.

    Loads an ``.npz`` archive containing an ``embeddings`` matrix (one row per
    word) and a parallel ``words`` array, and answers nearest-neighbour,
    analogy and vector-arithmetic queries using cosine similarity.
    """

    def __init__(self, path=None):
        """Load embeddings and vocabulary from *path*.

        Args:
            path: Location of the ``.npz`` archive. ``None`` (the default)
                resolves to the module-level ``EMBEDDINGS_FILE`` at call time.
        """
        if path is None:
            path = EMBEDDINGS_FILE
        # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays,
        # and unpickling can execute arbitrary code. Only load archives that
        # come from a trusted source.
        # The context manager closes the underlying zip file once both arrays
        # have been read into memory.
        with np.load(path, allow_pickle=True) as data:
            self.embeddings = data["embeddings"]  # expected (100000, 300) float32
            self.words = data["words"].tolist()
        self.word2id = {w: i for i, w in enumerate(self.words)}
        self._normed = None  # lazily built cache of unit-length rows

    @property
    def normed(self):
        """Row-wise L2-normalised copy of ``embeddings`` (computed once, then cached)."""
        if self._normed is None:
            norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
            # Clamp the divisor so all-zero rows do not produce NaNs.
            self._normed = self.embeddings / np.maximum(norms, 1e-8)
        return self._normed

    def __contains__(self, word):
        """Return True when *word* is in the vocabulary."""
        return word in self.word2id

    def __getitem__(self, word):
        """Return the raw (un-normalised) vector for *word*; raises KeyError if unknown."""
        return self.embeddings[self.word2id[word]]

    def _nearest(self, vec, exclude, n):
        """Return up to *n* ``(word, similarity)`` pairs closest to *vec*.

        Words listed in *exclude* are skipped outright instead of being masked
        with a sentinel score, so they can never leak into the results even
        when *n* is at least the vocabulary size.
        """
        if n <= 0:
            return []
        skip = {self.word2id[w] for w in exclude}
        sims = self.normed @ vec
        results = []
        # Stable sort keeps the ordering of tied scores deterministic.
        for i in np.argsort(-sims, kind="stable"):
            idx = int(i)
            if idx in skip:
                continue
            results.append((self.words[idx], float(sims[idx])))
            if len(results) == n:
                break
        return results

    def similar(self, word, n=10):
        """Find the *n* most similar words to *word*.

        Returns a list of ``(word, cosine_similarity)`` pairs, best first, or
        an empty list when *word* is not in the vocabulary. The query word
        itself is never part of the result.
        """
        if word not in self.word2id:
            return []
        return self._nearest(self.normed[self.word2id[word]], (word,), n)

    def analogy(self, a, b, c, n=5):
        """a is to b as c is to ? (b - a + c)

        Returns up to *n* ``(word, cosine_similarity)`` pairs, best first,
        excluding *a*, *b* and *c*. Returns an empty list when any of the
        three words is missing from the vocabulary.
        """
        if any(w not in self.word2id for w in (a, b, c)):
            return []
        unit = self.normed
        vec = unit[self.word2id[b]] - unit[self.word2id[a]] + unit[self.word2id[c]]
        vec = vec / (np.linalg.norm(vec) + 1e-8)
        return self._nearest(vec, (a, b, c), n)

    def similarity(self, a, b):
        """Cosine similarity between two words, or None if either is unknown."""
        if a not in self.word2id or b not in self.word2id:
            return None
        return float(self.normed[self.word2id[a]] @ self.normed[self.word2id[b]])

    def query(self, expression, n=10):
        """Evaluate a vector arithmetic expression like 'king - man + woman'.

        Tokens are whitespace separated. ``+`` and ``-`` set the sign applied
        to the next word; a word without a preceding operator is added.

        Args:
            expression: The expression to evaluate.
            n: Maximum number of neighbours to return (default 10).

        Returns:
            Up to *n* ``(word, cosine_similarity)`` pairs, best first, with the
            words used in the expression excluded. An unknown token yields a
            single ``("'<token>' not in vocabulary", 0.0)`` entry. An
            expression that contains no words, or whose terms cancel to the
            zero vector, yields an empty list because no direction is defined.
        """
        tokens = expression.strip().split()
        if not tokens:
            return []

        vec = np.zeros(self.embeddings.shape[1])
        sign = 1.0
        used = set()
        for token in tokens:
            if token == '+':
                sign = 1.0
            elif token == '-':
                sign = -1.0
            elif token in self.word2id:
                vec += sign * self.normed[self.word2id[token]]
                used.add(token)
                sign = 1.0  # an operator applies to one word only
            else:
                return [(f"'{token}' not in vocabulary", 0.0)]

        norm = np.linalg.norm(vec)
        # A zero vector has no direction; ranking against it would return
        # arbitrary words with a score of 0.
        if not used or norm < 1e-8:
            return []
        return self._nearest(vec / norm, used, n)
98
+
99
+
100
def main():
    """Command-line entry point.

    With arguments, runs a single query and returns:
    ``--similar WORD`` lists nearest neighbours (WORD defaults to "cat"),
    anything else is joined and evaluated as a vector arithmetic expression.
    Without arguments, starts an interactive prompt that ends on EOF or
    Ctrl-C. Exits with status 1 when the embeddings file is missing.
    """
    if not os.path.exists(EMBEDDINGS_FILE):
        print(f"Error: {EMBEDDINGS_FILE} not found.")
        print("Download from: https://github.com/ruapotato/Free-Language-Embeddings/releases")
        sys.exit(1)

    fle = FLE()
    print(f"Loaded {len(fle.words):,} words, {fle.embeddings.shape[1]}d")

    # CLI mode
    if len(sys.argv) > 1:
        if sys.argv[1] == "--similar":
            word = sys.argv[2] if len(sys.argv) > 2 else "cat"
            results = fle.similar(word, 15)
            # Report unknown words instead of printing nothing, matching the
            # behaviour of the interactive "similar" command below.
            if not results:
                print(f" '{word}' not in vocabulary")
            for w, s in results:
                print(f" {w:<20} {s:.4f}")
        else:
            expr = " ".join(sys.argv[1:])
            print(f" {expr}")
            for w, s in fle.query(expr):
                print(f" → {w:<20} {s:.4f}")
        return

    # Interactive mode
    print("\nExamples:")
    print(" king - man + woman")
    print(" similar cat")
    print(" paris - france + germany")
    print()

    while True:
        try:
            line = input("fle> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Finish the prompt line cleanly before leaving.
            print()
            break

        if not line:
            continue

        if line.startswith("similar "):
            # The line is already stripped, so a second token always exists.
            word = line.split()[1]
            results = fle.similar(word, 15)
            if not results:
                print(f" '{word}' not in vocabulary")
            for w, s in results:
                print(f" {w:<20} {s:.4f}")
        else:
            for w, s in fle.query(line):
                print(f" {w:<20} {s:.4f}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()