| |
|
|
| import os |
| |
|
|
|
|
| import streamlit as st |
|
|
| import google.generativeai as genai |
| from PIL import Image |
| import fitz |
| from docx import Document |
| import pytesseract |
| import io |
| import json |
| from pathlib import Path |
| from datetime import datetime |
| import re |
|
|
| |
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF, with OCR fallback for scanned pages.

    Args:
        pdf_file: File-like object (e.g. a Streamlit upload) positioned at the
            start of the PDF bytes.

    Returns:
        str: Concatenated per-page text ("" on any extraction failure; the
        error is surfaced via st.error instead of raising).
    """
    text_content = []
    pdf_bytes = pdf_file.read()
    try:
        # Context manager ensures the fitz document is closed even on error
        # (the original leaked the document handle).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No text layer on this page — likely a scanned image;
                    # render the page to a PNG and OCR it.
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                text_content.append(page_text)
        return "\n".join(text_content)
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
|
|
def extract_text_from_docx(docx_file):
    """Return all paragraph text from an uploaded .docx file, one paragraph
    per line ("" on failure; the error is reported via st.error)."""
    try:
        document = Document(docx_file)
        lines = [paragraph.text for paragraph in document.paragraphs]
        return "\n".join(lines)
    except Exception as exc:
        st.error(f"DOCX extraction error: {str(exc)}")
        return ""
|
|
def extract_text_from_image(image_file):
    """OCR an uploaded image file and return the recognized text
    ("" on failure; the error is reported via st.error)."""
    try:
        return pytesseract.image_to_string(Image.open(image_file))
    except Exception as exc:
        st.error(f"Image extraction error: {str(exc)}")
        return ""
|
|
| |
def parse_date(date_str):
    """Parse a loosely formatted resume date string into a datetime.

    Accepts "Present"/"Current"/"Now" (returns the current time), several
    explicit year/month formats, and finally falls back to any 4-digit
    year (19xx or 20xx) found anywhere in the string.

    Args:
        date_str: Raw date text, possibly padded with whitespace; may be
            empty or None.

    Returns:
        datetime | None: Parsed date, or None when nothing usable is found.
    """
    if not date_str:
        return None
    # Strip BEFORE the keyword check so " Present " is still recognized
    # (the original checked first and missed padded values).
    date_str = date_str.strip()
    if date_str.lower() in ("present", "current", "now"):
        return datetime.now()
    formats = ["%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m"]
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:  # narrow: only "doesn't match this format"
            continue
    # Last resort: pull out any plausible 4-digit year. The original only
    # matched 20xx, silently dropping 19xx graduation/work dates.
    year_match = re.search(r"\b(?:19|20)\d{2}\b", date_str)
    if year_match:
        return datetime.strptime(year_match.group(), "%Y")
    return None
|
|
def calculate_experience(work_history):
    """Sum total years of experience from parsed work-history entries.

    Args:
        work_history: List of dicts, each optionally holding a "duration"
            string such as "Jan 2020 - Present" or "2018 to 2021".

    Returns:
        float: Total years, month-precision, rounded to 1 decimal place.
        Unparseable or malformed durations are skipped; negative spans
        are clamped to 0.
    """
    total_exp = 0.0
    for job in work_history:
        duration = job.get("duration", "")
        if not duration:
            continue
        # Prefer a SPACED dash (hyphen/en/em) or the word "to" as the range
        # separator, so hyphenated dates like "01-2020 to 03-2021" are not
        # torn into four pieces (the original's "\s*-\s*" split did exactly
        # that and silently dropped the entry).
        parts = re.split(r"\s+[-\u2013\u2014]\s+|\s+to\s+", duration,
                         flags=re.IGNORECASE)
        if len(parts) != 2:
            # Fallback for compact ranges like "2020-2021".
            parts = re.split(r"\s*-\s*", duration)
        if len(parts) != 2:
            continue
        start, end = parse_date(parts[0]), parse_date(parts[1])
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            total_exp += max(0, years)
    return round(total_exp, 1)
|
|
| |
def parse_resume(file_uploaded, api_key):
    """Extract structured resume data from an uploaded file via Gemini.

    Routes the upload (PDF/DOCX/image) to the matching text extractor,
    sends the text to gemini-1.5-flash with a JSON-schema prompt, and
    parses the model's reply.

    Args:
        file_uploaded: Streamlit UploadedFile (has .name and .read()).
        api_key: Gemini API key.

    Returns:
        dict | None: Parsed fields plus a computed
        "total_years_experience" key, or None on any failure (errors are
        shown via st.error).
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    prompt = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL

Return as JSON:
{
"summary": "", "name": "", "email": "", "phone": "",
"education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
"work_experience": [{"company": "", "position": "", "duration": ""}],
"skills": [], "linkedin": ""
}"""

    # Dispatch on extension; each extractor returns "" on failure.
    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in [".docx", ".doc"]:
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in [".jpg", ".jpeg", ".png"]:
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None

    if not text_content.strip():
        st.error("No text found in resume.")
        return None

    try:
        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)

        # The model often wraps JSON in prose or markdown fences; take the
        # outermost {...} span. Guard the "no braces at all" case, which
        # previously produced a nonsense slice from find() == -1.
        json_start = response_text.find("{")
        json_end = response_text.rfind("}")
        if json_start == -1 or json_end < json_start:
            st.error("Model response did not contain a JSON object.")
            return None
        result = json.loads(response_text[json_start:json_end + 1])

        # Derived field: total experience computed from the work history.
        result["total_years_experience"] = calculate_experience(result.get("work_experience", []))
        return result
    except Exception as e:
        st.error(f"Error parsing resume: {str(e)}")
        return None
|
|
| |
def format_education(edu):
    """Render one education record as a single human-readable line.

    Only truthy fields are included, in the fixed order
    degree, field, institution, year, GPA — e.g.
    "BS in CS from MIT (2020) - GPA: 3.9".
    """
    field_templates = [
        ("degree", "{}"),
        ("field", "in {}"),
        ("institution", "from {}"),
        ("year", "({})"),
        ("gpa", "- GPA: {}"),
    ]
    return " ".join(
        template.format(edu[key])
        for key, template in field_templates
        if edu.get(key)
    )
|
|
| |
def main():
    """Streamlit entry point: collect an API key and a resume upload, run
    the Gemini-based parser, and render every extracted field."""
    st.title("Resume Parser (PDF/DOCX/Image)")
    # Prefer the environment variable; fall back to an interactive field.
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf","docx","doc","jpg","jpeg","png"])

    if uploaded_file and api_key:
        with st.spinner("Analyzing resume..."):
            result = parse_resume(uploaded_file, api_key)

        if result:
            st.subheader("Extracted Information")
            # The model may emit JSON null for any field, in which case
            # .get(key, "") still returns None — guard with `or` so the
            # string concatenations below cannot raise TypeError.
            st.text_area("Summary", result.get("summary") or "", height=100)

            col1, col2, col3 = st.columns(3)
            col1.write("**Name:** " + (result.get("name") or ""))
            col2.write("**Email:** " + (result.get("email") or ""))
            col3.write("**Phone:** " + (result.get("phone") or ""))

            exp = result.get("total_years_experience", 0) or 0
            # Sub-year totals read better as months ("6 months", not "0.5 years").
            exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
            st.write("**Total Experience:**", exp_text)

            st.subheader("Education")
            for edu in result.get("education") or []:
                st.write("- " + format_education(edu))

            st.subheader("Work Experience")
            for w in result.get("work_experience") or []:
                dur = f" ({w.get('duration','')})" if w.get("duration") else ""
                st.write(f"- {w.get('position','')} at {w.get('company','')}{dur}")

            st.subheader("Skills")
            for s in result.get("skills") or []:
                st.write("- " + str(s))

            st.write("**LinkedIn:**", result.get("linkedin") or "")
|
|
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()
|
|