| |
|
|
| import os |
| |
|
|
|
|
| import streamlit as st |
|
|
| import google.generativeai as genai |
| from PIL import Image |
| import fitz |
| from docx import Document |
| import pytesseract |
| import io |
| import json |
| from pathlib import Path |
| from datetime import datetime |
| import re |
|
|
| |
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF, with OCR fallback for scanned pages.

    Args:
        pdf_file: File-like object (e.g. a Streamlit upload) positioned at the
            start of the PDF bytes.

    Returns:
        str: Concatenated per-page text ("" on any extraction failure; the
        error is surfaced via st.error instead of raising).
    """
    text_content = []
    pdf_bytes = pdf_file.read()
    try:
        # Context manager ensures the fitz document is closed even on error
        # (the original leaked the document handle).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No text layer on this page — likely a scanned image;
                    # render the page to a PNG and OCR it.
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                text_content.append(page_text)
        return "\n".join(text_content)
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
|
|
def extract_text_from_docx(docx_file):
    """Return all paragraph text from an uploaded .docx file, one paragraph
    per line ("" on failure; the error is reported via st.error)."""
    try:
        document = Document(docx_file)
        lines = [paragraph.text for paragraph in document.paragraphs]
        return "\n".join(lines)
    except Exception as exc:
        st.error(f"DOCX extraction error: {str(exc)}")
        return ""
|
|
def extract_text_from_image(image_file):
    """OCR an uploaded image file and return the recognized text
    ("" on failure; the error is reported via st.error)."""
    try:
        return pytesseract.image_to_string(Image.open(image_file))
    except Exception as exc:
        st.error(f"Image extraction error: {str(exc)}")
        return ""
|
|
| |
def parse_date(date_str):
    """Parse a loosely formatted resume date string into a datetime.

    Accepts "Present"/"Current"/"Now" (returns the current time), several
    explicit year/month formats, and finally falls back to any 4-digit
    year (19xx or 20xx) found anywhere in the string.

    Args:
        date_str: Raw date text, possibly padded with whitespace; may be
            empty or None.

    Returns:
        datetime | None: Parsed date, or None when nothing usable is found.
    """
    if not date_str:
        return None
    # Strip BEFORE the keyword check so " Present " is still recognized
    # (the original checked first and missed padded values).
    date_str = date_str.strip()
    if date_str.lower() in ("present", "current", "now"):
        return datetime.now()
    formats = ["%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m"]
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:  # narrow: only "doesn't match this format"
            continue
    # Last resort: pull out any plausible 4-digit year. The original only
    # matched 20xx, silently dropping 19xx graduation/work dates.
    year_match = re.search(r"\b(?:19|20)\d{2}\b", date_str)
    if year_match:
        return datetime.strptime(year_match.group(), "%Y")
    return None
|
|
def calculate_experience(work_history):
    """Sum total years of experience from parsed work-history entries.

    Args:
        work_history: List of dicts, each optionally holding a "duration"
            string such as "Jan 2020 - Present" or "2018 to 2021".

    Returns:
        float: Total years, month-precision, rounded to 1 decimal place.
        Unparseable or malformed durations are skipped; negative spans
        are clamped to 0.
    """
    total_exp = 0.0
    for job in work_history:
        duration = job.get("duration", "")
        if not duration:
            continue
        # Prefer a SPACED dash (hyphen/en/em) or the word "to" as the range
        # separator, so hyphenated dates like "01-2020 to 03-2021" are not
        # torn into four pieces (the original's "\s*-\s*" split did exactly
        # that and silently dropped the entry).
        parts = re.split(r"\s+[-\u2013\u2014]\s+|\s+to\s+", duration,
                         flags=re.IGNORECASE)
        if len(parts) != 2:
            # Fallback for compact ranges like "2020-2021".
            parts = re.split(r"\s*-\s*", duration)
        if len(parts) != 2:
            continue
        start, end = parse_date(parts[0]), parse_date(parts[1])
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            total_exp += max(0, years)
    return round(total_exp, 1)
|
|
| |
def parse_resume(file_uploaded, api_key):
    """Extract structured resume data from an uploaded file via Gemini.

    Routes the upload (PDF/DOCX/image) to the matching text extractor,
    sends the text to gemini-1.5-flash with a JSON-schema prompt, and
    parses the model's reply.

    Args:
        file_uploaded: Streamlit UploadedFile (has .name and .read()).
        api_key: Gemini API key.

    Returns:
        dict | None: Parsed fields plus a computed
        "total_years_experience" key, or None on any failure (errors are
        shown via st.error).
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    prompt = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL

Return as JSON:
{
"summary": "", "name": "", "email": "", "phone": "",
"education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
"work_experience": [{"company": "", "position": "", "duration": ""}],
"skills": [], "linkedin": ""
}"""

    # Dispatch on extension; each extractor returns "" on failure.
    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in [".docx", ".doc"]:
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in [".jpg", ".jpeg", ".png"]:
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None

    if not text_content.strip():
        st.error("No text found in resume.")
        return None

    try:
        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)

        # The model often wraps JSON in prose or markdown fences; take the
        # outermost {...} span. Guard the "no braces at all" case, which
        # previously produced a nonsense slice from find() == -1.
        json_start = response_text.find("{")
        json_end = response_text.rfind("}")
        if json_start == -1 or json_end < json_start:
            st.error("Model response did not contain a JSON object.")
            return None
        result = json.loads(response_text[json_start:json_end + 1])

        # Derived field: total experience computed from the work history.
        result["total_years_experience"] = calculate_experience(result.get("work_experience", []))
        return result
    except Exception as e:
        st.error(f"Error parsing resume: {str(e)}")
        return None
|
|
| |
def format_education(edu):
    """Render one education record as a single human-readable line.

    Only truthy fields are included, in the fixed order
    degree, field, institution, year, GPA — e.g.
    "BS in CS from MIT (2020) - GPA: 3.9".
    """
    field_templates = [
        ("degree", "{}"),
        ("field", "in {}"),
        ("institution", "from {}"),
        ("year", "({})"),
        ("gpa", "- GPA: {}"),
    ]
    return " ".join(
        template.format(edu[key])
        for key, template in field_templates
        if edu.get(key)
    )
|
|
| |
def main():
    """Streamlit entry point: collect an API key and a resume upload, run
    the Gemini-based parser, and render every extracted field."""
    st.title("Resume Parser (PDF/DOCX/Image)")
    # Prefer the environment variable; fall back to an interactive field.
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf","docx","doc","jpg","jpeg","png"])

    if uploaded_file and api_key:
        with st.spinner("Analyzing resume..."):
            result = parse_resume(uploaded_file, api_key)

        if result:
            st.subheader("Extracted Information")
            # The model may emit JSON null for any field, in which case
            # .get(key, "") still returns None — guard with `or` so the
            # string concatenations below cannot raise TypeError.
            st.text_area("Summary", result.get("summary") or "", height=100)

            col1, col2, col3 = st.columns(3)
            col1.write("**Name:** " + (result.get("name") or ""))
            col2.write("**Email:** " + (result.get("email") or ""))
            col3.write("**Phone:** " + (result.get("phone") or ""))

            exp = result.get("total_years_experience", 0) or 0
            # Sub-year totals read better as months ("6 months", not "0.5 years").
            exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
            st.write("**Total Experience:**", exp_text)

            st.subheader("Education")
            for edu in result.get("education") or []:
                st.write("- " + format_education(edu))

            st.subheader("Work Experience")
            for w in result.get("work_experience") or []:
                dur = f" ({w.get('duration','')})" if w.get("duration") else ""
                st.write(f"- {w.get('position','')} at {w.get('company','')}{dur}")

            st.subheader("Skills")
            for s in result.get("skills") or []:
                st.write("- " + str(s))

            st.write("**LinkedIn:**", result.get("linkedin") or "")
|
|
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()
|
|