# Snapshot of revision 03faf26 (file size: 8,296 bytes).
import re
from typing import Iterable, List, Set
# Alias table: normalized skill key -> canonical display name.
#
# canonicalize_skill() looks entries up using the output of _normalize_key(),
# so keys are expected in that form: lowercase, single-spaced, with "." kept.
# Because _normalize_key() turns "/" and "-" into spaces, inputs such as
# "CI/CD" or "scikit-learn" reach this table as "ci cd" / "scikit learn";
# the "ci/cd" and "scikit-learn" keys are therefore never hit through
# canonicalize_skill().
_SKILL_ALIASES = {
    # JavaScript / TypeScript ecosystem
    "node": "Node.js",
    "nodejs": "Node.js",
    "node.js": "Node.js",
    "express": "Express.js",
    "expressjs": "Express.js",
    "express.js": "Express.js",
    "react": "React",
    "reactjs": "React",
    "react.js": "React",
    "next": "Next.js",
    "nextjs": "Next.js",
    "next.js": "Next.js",
    "js": "JavaScript",
    "javascript": "JavaScript",
    "ts": "TypeScript",
    "typescript": "TypeScript",
    "py": "Python",
    # Databases
    "mongo": "MongoDB",
    "mongodb": "MongoDB",
    "postgres": "PostgreSQL",
    "postgresql": "PostgreSQL",
    "mysql": "MySQL",
    # Cloud, delivery and APIs
    "aws": "AWS",
    "gcp": "GCP",
    "azure": "Azure",
    "ci/cd": "CI/CD",
    "ci cd": "CI/CD",
    "rest": "REST API",
    "rest api": "REST API",
    "fastapi": "FastAPI",
    # LLM tooling
    "langchain": "LangChain",
    "langgraph": "LangGraph",
    "langsmith": "LangSmith",
    "rag": "RAG",
    "rag pipeline": "RAG Pipelines",
    "rag pipelines": "RAG Pipelines",
    "chromadb": "ChromaDB",
    # ML / DL libraries, models and techniques
    "scikit learn": "Scikit-learn",
    "scikit-learn": "Scikit-learn",
    "pytorch": "PyTorch",
    "llama": "Llama",
    "llama 4": "Llama 4",
    "gemini api": "Gemini API",
    "sentence transformers": "Sentence Transformers",
    "e5 multilingual embeddings": "E5 Multilingual Embeddings",
    # Plural acronym forms collapse onto the singular display name.
    "cnn": "CNN",
    "cnns": "CNN",
    "rnn": "RNN",
    "rnns": "RNN",
    "gan": "GAN",
    "gans": "GAN",
    "bert": "BERT",
    "bert fine tuning": "BERT Fine-tuning",
    "ocr": "OCR",
    "ocr based extraction": "OCR Based Extraction",
    "k means": "K-Means",
    "cross validation": "Cross-validation",
    "oop": "OOP",
    "ml": "Machine Learning",
}
# Ordered (cluster name, rule phrases) pairs consumed by _classify_cluster().
#
# Rule phrases are written in _normalize_key() form (lowercase, with "-" and
# "/" already turned into spaces, e.g. "ci cd", "scikit learn").
# _classify_cluster() walks this list top to bottom and returns the first
# cluster that has a matching rule, so earlier clusters take precedence:
# "sentence transformers" is claimed by the "transformers" rule under
# "Deep Learning" before the "LLM and GenAI" rules are ever consulted.
_SKILL_CLUSTER_RULES = [
    (
        "Deep Learning",
        ["cnn", "rnn", "lstm", "gru", "gan", "transformers", "bert", "pytorch", "tensorflow", "encoder decoder"],
    ),
    (
        "Machine Learning",
        [
            "machine learning",
            "random forest",
            "svm",
            "logistic regression",
            "linear regression",
            "k means",
            "model evaluation",
            "cross validation",
            "scikit learn",
        ],
    ),
    (
        "LLM and GenAI",
        [
            "langchain",
            "langgraph",
            "langsmith",
            "prompt engineering",
            "rag",
            "rag pipeline",
            "rag pipelines",
            "semantic search",
            "gemini api",
            "llama",
            "embedding models",
            "e5 multilingual embeddings",
            "sentence transformers",
        ],
    ),
    (
        "Data and Databases",
        ["sql", "mysql", "postgresql", "mongodb", "pinecone", "chromadb", "vector similarity search"],
    ),
    (
        "Backend and APIs",
        ["python", "java", "javascript", "typescript", "fastapi", "django", "flask", "node", "express", "rest api"],
    ),
    (
        "Cloud and DevOps",
        ["docker", "kubernetes", "aws", "gcp", "azure", "git", "github", "ci cd"],
    ),
    (
        "Document AI and OCR",
        ["ocr", "ocr based extraction", "document extraction"],
    ),
]
def _normalize_key(value: str) -> str:
value = value.strip().lower()
value = re.sub(r"[\u2010-\u2015]", "-", value)
value = value.replace("&", " and ")
value = re.sub(r"[^a-z0-9+#.\-/ ]+", " ", value)
value = value.replace("/", " ")
value = value.replace("-", " ")
value = re.sub(r"\s+", " ", value).strip()
return value
def canonicalize_skill(skill: str) -> str:
    """Return the display form of a single skill name.

    Resolution order:
      1. Non-string or blank input -> "".
      2. The normalized key is looked up in ``_SKILL_ALIASES``.
      3. Short all-uppercase input (at most 6 characters after stripping)
         is returned as typed, so acronyms such as SQL or API stay readable.
      4. Otherwise every space-separated word of the normalized key is
         capitalized.
    """
    stripped = skill.strip() if isinstance(skill, str) else ""
    if not stripped:
        return ""

    lookup_key = _normalize_key(stripped)
    alias = _SKILL_ALIASES.get(lookup_key)
    if alias is not None:
        return alias

    looks_like_acronym = len(stripped) <= 6 and stripped.isupper()
    if looks_like_acronym:
        return stripped

    words = lookup_key.split(" ")
    return " ".join(word.capitalize() for word in words)
def _split_skill_chunks(skill: str) -> List[str]:
if not isinstance(skill, str):
return []
parts = re.split(r",|\||;", skill)
chunks = []
for part in parts:
candidate = part.strip()
if not candidate:
continue
chunks.append(candidate)
return chunks
def normalize_skill_list(skills: Iterable[str], limit: int = 80) -> List[str]:
    """Canonicalize and de-duplicate skills, preserving first-seen order.

    Every raw entry is split on ",", "|" and ";", each piece is passed through
    ``canonicalize_skill``, and pieces whose normalized key was already seen
    are dropped.

    Args:
        skills: Raw skill entries. ``None`` or an empty iterable gives ``[]``.
            A bare string is treated as one raw entry rather than being
            iterated character by character.
        limit: Maximum number of skills to return. Values <= 0 give ``[]``.

    Returns:
        Up to ``limit`` canonical skill names.
    """
    # Without this guard the first skill would be appended before the limit
    # check ran, so limit=0 returned one item instead of none.
    if limit <= 0:
        return []
    # A str is itself an Iterable[str]; looping over it would yield single
    # characters, each of which would become a bogus one-letter "skill".
    if isinstance(skills, str):
        skills = [skills]

    unique: List[str] = []
    seen: Set[str] = set()
    for raw in skills or []:
        for token in _split_skill_chunks(raw):
            canon = canonicalize_skill(token)
            if not canon:
                continue
            # De-duplicate on the normalized key so "nodejs" and "Node.js"
            # count as the same skill.
            key = _normalize_key(canon)
            if key in seen:
                continue
            seen.add(key)
            unique.append(canon)
            if len(unique) >= limit:
                return unique
    return unique
def _classify_cluster(skill: str) -> str | None:
    """Return the first cluster in ``_SKILL_CLUSTER_RULES`` matching *skill*.

    The skill is reduced with ``_normalize_key``. A rule matches when the
    rule phrase occurs inside the key, or the key occurs inside the rule
    phrase, as a whole phrase: the occurrence must not be directly preceded
    or followed by a letter, digit, "+" or "#".

    Plain substring tests were too permissive here: "C" and "R" were filed
    under Deep Learning (inside "cnn" / "rnn"), "Go" under Data and Databases
    (inside "mongodb"), and "Cloud Storage" under LLM and GenAI ("rag" inside
    "storage"). "." still counts as a separator, so "node.js" continues to
    match the "node" rule.

    Returns:
        The cluster name, or ``None`` when the key is empty or nothing matches.
    """
    key = _normalize_key(skill)
    if not key:
        return None

    def contains_phrase(haystack: str, phrase: str) -> bool:
        # Lookarounds enforce the phrase boundary; re.escape keeps "+", "#"
        # and "." in skill names literal.
        pattern = rf"(?<![a-z0-9+#]){re.escape(phrase)}(?![a-z0-9+#])"
        return re.search(pattern, haystack) is not None

    for cluster_name, rules in _SKILL_CLUSTER_RULES:
        for rule in rules:
            if contains_phrase(key, rule) or contains_phrase(rule, key):
                return cluster_name
    return None
def cluster_skills(skills: Iterable[str], max_members_per_cluster: int = 4) -> List[dict]:
    """Group skills into named clusters for UI display and prompting.

    Skills are normalized with ``normalize_skill_list`` and assigned with
    ``_classify_cluster``; skills that match no cluster are left out.

    Returns:
        One dict per non-empty cluster, largest cluster first (ties keep
        first-seen order), with keys:
          * "cluster": the cluster name
          * "members": every skill assigned to the cluster
          * "label":   "<cluster> (<first max_members_per_cluster members>)"
          * "count":   number of members
    """
    buckets: dict[str, list[str]] = {}
    for name in normalize_skill_list(skills):
        cluster = _classify_cluster(name)
        if not cluster:
            continue
        bucket = buckets.setdefault(cluster, [])
        if name not in bucket:
            bucket.append(name)

    # Denser clusters first; the sort is stable, so equally sized clusters
    # stay in the order they were first encountered.
    by_density = sorted(buckets.items(), key=lambda pair: -len(pair[1]))

    return [
        {
            "cluster": cluster,
            "members": bucket,
            "label": f"{cluster} ({', '.join(bucket[:max_members_per_cluster])})",
            "count": len(bucket),
        }
        for cluster, bucket in by_density
    ]
def build_interview_focus_skills(skills: Iterable[str], max_clusters: int = 6, max_extras: int = 2) -> List[str]:
    """Build a compact, cluster-aware skill list for interview question generation.

    Args:
        skills: Raw skill entries; normalized with ``normalize_skill_list``.
        max_clusters: Maximum number of cluster labels to include. Negative
            values are treated as 0.
        max_extras: Maximum number of skills that belong to no cluster to
            append after the labels. Negative values are treated as 0.

    Returns:
        Cluster labels (densest first) followed by unclustered skills. If
        that combination is empty, the first ``max_clusters + max_extras``
        normalized skills are returned instead.
    """
    # Clamp so a negative cap cannot slice from the end of the list.
    cluster_cap = max(max_clusters, 0)
    extra_cap = max(max_extras, 0)

    normalized = normalize_skill_list(skills)
    grouped = cluster_skills(normalized)
    focus = [g["label"] for g in grouped[:cluster_cap]]

    # Add a couple of non-clustered items so niche tools are not ignored.
    # Members of every cluster are excluded, including clusters beyond
    # cluster_cap. Slicing (rather than append-then-check) makes
    # max_extras=0 yield no extras instead of one.
    clustered_members = {m for g in grouped for m in g["members"]}
    extras = [s for s in normalized if s not in clustered_members][:extra_cap]

    combined = focus + extras
    return combined if combined else normalized[: cluster_cap + extra_cap]
def skill_match(candidate_skill: str, required_skill: str) -> bool:
    """Return True when a candidate skill satisfies a required skill.

    Both inputs are canonicalized and normalized. They match when the keys
    are equal, or when one key occurs inside the other as a whole phrase
    (not directly preceded or followed by a letter, digit, "+" or "#"),
    e.g. "AWS" vs "AWS Lambda" or "Vue" vs "Vue.js".

    Plain substring containment is deliberately not used: it made "Java"
    match "JavaScript" and "C" match any skill containing the letter "c".

    Returns:
        False when either input is blank, non-string, or normalizes to an
        empty key.
    """
    c_key = _normalize_key(canonicalize_skill(candidate_skill))
    r_key = _normalize_key(canonicalize_skill(required_skill))
    if not c_key or not r_key:
        return False
    if c_key == r_key:
        return True

    def contains_phrase(haystack: str, phrase: str) -> bool:
        # Lookarounds enforce the phrase boundary; re.escape keeps "+", "#"
        # and "." in skill names literal.
        pattern = rf"(?<![a-z0-9+#]){re.escape(phrase)}(?![a-z0-9+#])"
        return re.search(pattern, haystack) is not None

    # Soft phrase matching in either direction.
    return contains_phrase(r_key, c_key) or contains_phrase(c_key, r_key)
def find_matching_skills(candidate_skills: Iterable[str], required_skills: Iterable[str]) -> List[str]:
    """Return the required skills that at least one candidate skill satisfies.

    Args:
        candidate_skills: Skills the candidate has; ``None`` is treated as empty.
        required_skills: Skills being asked for; ``None`` is treated as empty.

    Returns:
        Canonical names of the matched required skills, de-duplicated by
        ``normalize_skill_list`` and in required-skill order.
    """
    # Materialize once: the candidates are scanned for every required skill,
    # and a one-shot iterator (e.g. a generator) would be exhausted after the
    # first scan, silently turning every later requirement into a miss.
    candidates = list(candidate_skills or [])
    matched = [
        canonicalize_skill(req)
        for req in required_skills or []
        if any(skill_match(cand, req) for cand in candidates)
    ]
    return normalize_skill_list(matched)
def find_missing_skills(candidate_skills: Iterable[str], required_skills: Iterable[str]) -> List[str]:
    """Return the required skills that no candidate skill satisfies.

    Args:
        candidate_skills: Skills the candidate has; ``None`` is treated as empty.
        required_skills: Skills being asked for; ``None`` is treated as empty.

    Returns:
        Canonical names of the unmatched required skills, de-duplicated by
        ``normalize_skill_list`` and in required-skill order.
    """
    # Materialize once: the candidates are scanned for every required skill,
    # and a one-shot iterator (e.g. a generator) would be exhausted after the
    # first scan, wrongly reporting every later requirement as missing.
    candidates = list(candidate_skills or [])
    missing = [
        canonicalize_skill(req)
        for req in required_skills or []
        if not any(skill_match(cand, req) for cand in candidates)
    ]
    return normalize_skill_list(missing)