Spaces:
Sleeping
Sleeping
| import re | |
| from typing import Iterable, List, Set | |
| _SKILL_ALIASES = { | |
| "node": "Node.js", | |
| "nodejs": "Node.js", | |
| "node.js": "Node.js", | |
| "express": "Express.js", | |
| "expressjs": "Express.js", | |
| "express.js": "Express.js", | |
| "react": "React", | |
| "reactjs": "React", | |
| "react.js": "React", | |
| "next": "Next.js", | |
| "nextjs": "Next.js", | |
| "next.js": "Next.js", | |
| "js": "JavaScript", | |
| "javascript": "JavaScript", | |
| "ts": "TypeScript", | |
| "typescript": "TypeScript", | |
| "py": "Python", | |
| "mongo": "MongoDB", | |
| "mongodb": "MongoDB", | |
| "postgres": "PostgreSQL", | |
| "postgresql": "PostgreSQL", | |
| "mysql": "MySQL", | |
| "aws": "AWS", | |
| "gcp": "GCP", | |
| "azure": "Azure", | |
| "ci/cd": "CI/CD", | |
| "ci cd": "CI/CD", | |
| "rest": "REST API", | |
| "rest api": "REST API", | |
| "fastapi": "FastAPI", | |
| "langchain": "LangChain", | |
| "langgraph": "LangGraph", | |
| "langsmith": "LangSmith", | |
| "rag": "RAG", | |
| "rag pipeline": "RAG Pipelines", | |
| "rag pipelines": "RAG Pipelines", | |
| "chromadb": "ChromaDB", | |
| "scikit learn": "Scikit-learn", | |
| "scikit-learn": "Scikit-learn", | |
| "pytorch": "PyTorch", | |
| "llama": "Llama", | |
| "llama 4": "Llama 4", | |
| "gemini api": "Gemini API", | |
| "sentence transformers": "Sentence Transformers", | |
| "e5 multilingual embeddings": "E5 Multilingual Embeddings", | |
| "cnn": "CNN", | |
| "cnns": "CNN", | |
| "rnn": "RNN", | |
| "rnns": "RNN", | |
| "gan": "GAN", | |
| "gans": "GAN", | |
| "bert": "BERT", | |
| "bert fine tuning": "BERT Fine-tuning", | |
| "ocr": "OCR", | |
| "ocr based extraction": "OCR Based Extraction", | |
| "k means": "K-Means", | |
| "cross validation": "Cross-validation", | |
| "oop": "OOP", | |
| "ml": "Machine Learning", | |
| } | |
| _SKILL_CLUSTER_RULES = [ | |
| ( | |
| "Deep Learning", | |
| ["cnn", "rnn", "lstm", "gru", "gan", "transformers", "bert", "pytorch", "tensorflow", "encoder decoder"], | |
| ), | |
| ( | |
| "Machine Learning", | |
| [ | |
| "machine learning", | |
| "random forest", | |
| "svm", | |
| "logistic regression", | |
| "linear regression", | |
| "k means", | |
| "model evaluation", | |
| "cross validation", | |
| "scikit learn", | |
| ], | |
| ), | |
| ( | |
| "LLM and GenAI", | |
| [ | |
| "langchain", | |
| "langgraph", | |
| "langsmith", | |
| "prompt engineering", | |
| "rag", | |
| "rag pipeline", | |
| "rag pipelines", | |
| "semantic search", | |
| "gemini api", | |
| "llama", | |
| "embedding models", | |
| "e5 multilingual embeddings", | |
| "sentence transformers", | |
| ], | |
| ), | |
| ( | |
| "Data and Databases", | |
| ["sql", "mysql", "postgresql", "mongodb", "pinecone", "chromadb", "vector similarity search"], | |
| ), | |
| ( | |
| "Backend and APIs", | |
| ["python", "java", "javascript", "typescript", "fastapi", "django", "flask", "node", "express", "rest api"], | |
| ), | |
| ( | |
| "Cloud and DevOps", | |
| ["docker", "kubernetes", "aws", "gcp", "azure", "git", "github", "ci cd"], | |
| ), | |
| ( | |
| "Document AI and OCR", | |
| ["ocr", "ocr based extraction", "document extraction"], | |
| ), | |
| ] | |
| def _normalize_key(value: str) -> str: | |
| value = value.strip().lower() | |
| value = re.sub(r"[\u2010-\u2015]", "-", value) | |
| value = value.replace("&", " and ") | |
| value = re.sub(r"[^a-z0-9+#.\-/ ]+", " ", value) | |
| value = value.replace("/", " ") | |
| value = value.replace("-", " ") | |
| value = re.sub(r"\s+", " ", value).strip() | |
| return value | |
def canonicalize_skill(skill: str) -> str:
    """Return the display form of a single skill name.

    Resolution order:
      1. a known alias from ``_SKILL_ALIASES`` (looked up by normalized key);
      2. the trimmed input unchanged when it is all upper-case and at most six
         characters long, so acronyms such as SQL, API or OOP stay readable;
      3. the normalized key with each space-separated word capitalized.

    Args:
        skill: Raw skill text.

    Returns:
        The canonical skill name, or ``""`` for non-string or blank input.
    """
    if not isinstance(skill, str):
        return ""

    trimmed = skill.strip()
    if trimmed == "":
        return ""

    lookup_key = _normalize_key(trimmed)
    alias = _SKILL_ALIASES.get(lookup_key)
    if alias is not None:
        return alias

    is_short_acronym = len(trimmed) <= 6 and trimmed.isupper()
    if is_short_acronym:
        return trimmed

    return " ".join(map(str.capitalize, lookup_key.split(" ")))
| def _split_skill_chunks(skill: str) -> List[str]: | |
| if not isinstance(skill, str): | |
| return [] | |
| parts = re.split(r",|\||;", skill) | |
| chunks = [] | |
| for part in parts: | |
| candidate = part.strip() | |
| if not candidate: | |
| continue | |
| chunks.append(candidate) | |
| return chunks | |
def normalize_skill_list(skills: Iterable[str], limit: int = 80) -> List[str]:
    """Canonicalize and de-duplicate a collection of raw skill entries.

    Each entry is split on ``,``, ``|`` and ``;``, every piece is passed
    through ``canonicalize_skill``, and duplicates (compared by normalized
    key) are dropped while the first-seen order is kept.

    Args:
        skills: Raw skill entries.  ``None`` is treated as empty, and a single
            string is treated as one entry rather than being iterated
            character by character.  Non-string entries are skipped.
        limit: Maximum number of skills to return.  Values of zero or less
            yield an empty list.

    Returns:
        Up to *limit* canonical skill names, in first-seen order.
    """
    # The limit is enforced after each append below, so a non-positive limit
    # has to be rejected up front or one item would slip through.
    if limit <= 0:
        return []

    # A bare string is itself iterable; wrap it so "python, java" is split
    # into skills instead of single characters.
    if isinstance(skills, str):
        skills = [skills]

    unique: List[str] = []
    seen: Set[str] = set()
    for raw in skills or []:
        for token in _split_skill_chunks(raw):
            canon = canonicalize_skill(token)
            if not canon:
                continue
            key = _normalize_key(canon)
            if key in seen:
                continue
            seen.add(key)
            unique.append(canon)
            if len(unique) >= limit:
                return unique
    return unique
# Characters that count as part of a word inside a normalized skill key.
# "." is deliberately excluded so "node.js" still begins with the word "node".
_CLUSTER_WORD_CHARS = "a-z0-9+#"


def _rule_starts_word(rule: str, key: str) -> bool:
    """Return True when *rule* occurs in *key* starting at a word boundary."""
    pattern = rf"(?<![{_CLUSTER_WORD_CHARS}]){re.escape(rule)}"
    return re.search(pattern, key) is not None


def _key_is_whole_phrase(key: str, rule: str) -> bool:
    """Return True when *key* (or *key* + "s") is a whole-word phrase of *rule*."""
    pattern = (
        rf"(?<![{_CLUSTER_WORD_CHARS}]){re.escape(key)}s?"
        rf"(?![{_CLUSTER_WORD_CHARS}])"
    )
    return re.search(pattern, rule) is not None


def _classify_cluster(skill: str) -> str | None:
    """Return the name of the first cluster whose rules match *skill*.

    Clusters in ``_SKILL_CLUSTER_RULES`` are tried in order and the first hit
    wins.  A rule phrase matches the normalized skill key when either:

    * the rule begins at a word boundary inside the key, so "python" matches
      "python3" and "git" matches "gitlab", but "gan" does not match
      "organization" and "rag" does not match "test coverage"; or
    * the key is a whole-word phrase of the rule, optionally followed by "s",
      so "transformer" matches "transformers" and "regression" matches
      "linear regression", but "c" does not match "cnn", "r" does not match
      "rnn" and "go" does not match "mongodb".

    Plain substring containment is intentionally not used: it files
    single-letter or short skill names under whichever rule happens to
    contain those letters.

    Args:
        skill: A skill name (raw or canonical).

    Returns:
        The cluster name, or ``None`` when the key is empty or nothing matches.
    """
    key = _normalize_key(skill)
    if not key:
        return None
    for cluster_name, rules in _SKILL_CLUSTER_RULES:
        for rule in rules:
            if _rule_starts_word(rule, key) or _key_is_whole_phrase(key, rule):
                return cluster_name
    return None
def cluster_skills(skills: Iterable[str], max_members_per_cluster: int = 4) -> List[dict]:
    """Group skills into named clusters with compact labels for UI and prompting.

    Skills are normalized first; any skill that ``_classify_cluster`` cannot
    place is left out of the result.

    Args:
        skills: Raw skill entries.
        max_members_per_cluster: How many member names are listed in each
            cluster's ``label``.  ``members`` and ``count`` always cover the
            whole cluster.

    Returns:
        One dict per non-empty cluster with the keys ``cluster``, ``members``,
        ``label`` and ``count``, ordered from the largest cluster to the
        smallest.  Clusters of equal size keep first-seen order.
    """
    buckets: dict[str, list[str]] = {}
    for name in normalize_skill_list(skills):
        cluster = _classify_cluster(name)
        if cluster:
            members = buckets.setdefault(cluster, [])
            if name not in members:
                members.append(name)

    # Denser clusters come first; the sort is stable, so ties keep the order
    # in which the clusters were first encountered.
    ranked = list(buckets.items())
    ranked.sort(key=lambda pair: len(pair[1]), reverse=True)

    return [
        {
            "cluster": cluster,
            "members": members,
            "label": f"{cluster} ({', '.join(members[:max_members_per_cluster])})",
            "count": len(members),
        }
        for cluster, members in ranked
    ]
def build_interview_focus_skills(skills: Iterable[str], max_clusters: int = 6, max_extras: int = 2) -> List[str]:
    """Build a compact, cluster-aware skill list for interview question generation.

    The result starts with the labels of the largest clusters (as produced by
    ``cluster_skills``) and is followed by a few skills that did not land in
    any cluster, so niche tools are not ignored.

    Args:
        skills: Raw skill entries.
        max_clusters: Maximum number of cluster labels to include.  Negative
            values are treated as zero.
        max_extras: Maximum number of non-clustered skills to append.  Zero or
            a negative value adds none.

    Returns:
        Cluster labels followed by extra skills.  If that would be empty, the
        first ``max_clusters + max_extras`` normalized skills are returned
        instead.
    """
    # Clamp so a negative limit cannot turn the slices below into
    # "everything except the last N".
    cluster_cap = max(max_clusters, 0)
    extras_cap = max(max_extras, 0)

    normalized = normalize_skill_list(skills)
    grouped = cluster_skills(normalized)
    focus = [group["label"] for group in grouped[:cluster_cap]]

    clustered_members = {member for group in grouped for member in group["members"]}
    unclustered = [skill for skill in normalized if skill not in clustered_members]
    # Slicing, unlike append-then-check, honours a cap of zero.
    extras = unclustered[:extras_cap]

    combined = focus + extras
    return combined if combined else normalized[: cluster_cap + extras_cap]
# Characters that count as part of a word inside a normalized skill key.
_MATCH_WORD_CHARS = "a-z0-9+#"


def _has_whole_phrase(phrase: str, text: str) -> bool:
    """Return True when *phrase* appears in *text* as whole words.

    A trailing "s" is tolerated for phrases longer than two characters, so
    "rest api" is found in "rest apis".  Shorter phrases get no plural
    allowance, which keeps "c" from matching "cs".
    """
    plural = "s?" if len(phrase) > 2 else ""
    pattern = (
        rf"(?<![{_MATCH_WORD_CHARS}]){re.escape(phrase)}{plural}"
        rf"(?![{_MATCH_WORD_CHARS}])"
    )
    return re.search(pattern, text) is not None


def skill_match(candidate_skill: str, required_skill: str) -> bool:
    """Report whether a candidate skill satisfies a required skill.

    Both names are canonicalized and normalized before comparison.  They match
    when the keys are equal, or when one key appears inside the other as a
    whole-word phrase ("python" vs "python programming", "react" vs
    "react native", "rest api" vs "rest apis").

    Matching is done on whole words rather than raw substrings, so "java"
    does not match "javascript", "c" does not match "css" and "sql" does not
    match "mysql".

    Args:
        candidate_skill: Skill listed by the candidate.
        required_skill: Skill asked for by the role.

    Returns:
        ``True`` on a match; ``False`` otherwise, including when either name
        normalizes to an empty key.
    """
    c_key = _normalize_key(canonicalize_skill(candidate_skill))
    r_key = _normalize_key(canonicalize_skill(required_skill))
    if not c_key or not r_key:
        return False
    if c_key == r_key:
        return True
    return _has_whole_phrase(c_key, r_key) or _has_whole_phrase(r_key, c_key)
def find_matching_skills(candidate_skills: Iterable[str], required_skills: Iterable[str]) -> List[str]:
    """Return the required skills that the candidate covers.

    Args:
        candidate_skills: Skills listed by the candidate; ``None`` is treated
            as empty.  Any iterable is accepted, including one-shot
            generators.
        required_skills: Skills asked for by the role; ``None`` is treated as
            empty.

    Returns:
        Canonical names of the required skills for which ``skill_match`` finds
        at least one candidate skill, de-duplicated by
        ``normalize_skill_list``.
    """
    # Materialize once: the candidates are scanned again for every required
    # skill, which would exhaust a generator after the first pass.
    candidates = list(candidate_skills or [])
    matched = [
        canonicalize_skill(req)
        for req in required_skills or []
        if any(skill_match(cand, req) for cand in candidates)
    ]
    return normalize_skill_list(matched)
def find_missing_skills(candidate_skills: Iterable[str], required_skills: Iterable[str]) -> List[str]:
    """Return the required skills that the candidate does not cover.

    Args:
        candidate_skills: Skills listed by the candidate; ``None`` is treated
            as empty.  Any iterable is accepted, including one-shot
            generators.
        required_skills: Skills asked for by the role; ``None`` is treated as
            empty.

    Returns:
        Canonical names of the required skills for which ``skill_match`` finds
        no candidate skill, de-duplicated by ``normalize_skill_list``.
    """
    # Materialize once: the candidates are scanned again for every required
    # skill, and an exhausted generator would make every later requirement
    # look missing.
    candidates = list(candidate_skills or [])
    missing = [
        canonicalize_skill(req)
        for req in required_skills or []
        if not any(skill_match(cand, req) for cand in candidates)
    ]
    return normalize_skill_list(missing)