Spaces:

Robzy
/

jobbert_knowledge_extraction

Paused

App Files Files Community

jobbert_knowledge_extraction / tag-posting.py

Robzy

corrected tagging pipeline + updated README

113c4ac over 1 year ago

raw

history blame contribute delete

7.34 kB

	import spacy
	import re
	from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer
	import torch
	from typing import List
	import os
	from datetime import datetime


	### Parsing job posting

	def split_text_recursively(text):
	    """Split ``text`` on newline characters and return the pieces as a list.

	    The name is kept for backward compatibility; the split itself is no
	    longer recursive. The previous implementation used one stack frame per
	    line, so an input with more lines than the interpreter's recursion
	    limit raised ``RecursionError``.

	    Args:
	        text: The string to split.

	    Returns:
	        A list of the substrings between ``'\\n'`` characters, in order.
	        Empty pieces are kept, and a string without any newline yields a
	        single-element list.
	    """
	    return text.split('\n')

	def _get_nlp():
	    """Return the spaCy pipeline, loading it from disk only on first use."""
	    # Cached on the function object so repeated parse_post() calls (one per
	    # job file in the tagging loops) do not reload the model every time.
	    if not hasattr(_get_nlp, "_pipeline"):
	        _get_nlp._pipeline = spacy.load("en_core_web_sm")
	    return _get_nlp._pipeline


	def parse_post(path):
	    """Read a job posting and split it into sentences.

	    The file is split into lines, each line is stripped and empty lines are
	    dropped; every remaining line is then segmented into sentences with the
	    spaCy ``en_core_web_sm`` pipeline. Each sentence is printed as it is
	    found.

	    Args:
	        path: Path of the text file to read (decoded as UTF-8).

	    Returns:
	        A list with the text of every sentence, in document order.
	    """
	    nlp = _get_nlp()

	    # Explicit encoding so the result does not depend on the platform locale.
	    with open(path, 'r', encoding='utf-8') as file:
	        text = file.read()

	    # Line-level split first, so a sentence never spans a line break.
	    lines = [line.strip() for line in split_text_recursively(text)]
	    lines = [line for line in lines if line]

	    sents = []
	    for line in lines:
	        for sent in nlp(line).sents:
	            print(f"{sent.text}")
	            sents.append(sent.text)

	    return sents


	### Model inference

	from torch.utils.data import DataLoader
	import torch.nn as nn
	from transformers import DataCollatorForTokenClassification
	from typing import List, Tuple

	# Tokenizer and token-classification model are loaded once, at import time,
	# and shared as module-level globals by the functions below.
	# NOTE(review): the tokenizer is pulled from the "jjzha" repository while the
	# weights come from the "Robzy" one — presumably both use the same
	# vocabulary; confirm.
	tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
	model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")

	# Label name <-> class id mappings as stored in the model configuration.
	id2label = model.config.id2label
	label2id = model.config.label2id

	def pad(list_of_lists, pad_value=0):
	    """Right-pad every list to the longest length and build matching masks.

	    Args:
	        list_of_lists: Lists of values to pad.
	        pad_value: Filler appended to the shorter lists.

	    Returns:
	        A ``(padded, mask)`` pair of tensors; ``mask`` holds 1 at original
	        positions and 0 at padded ones.
	    """
	    lengths = [len(seq) for seq in list_of_lists]
	    width = max(lengths)

	    padded_rows = []
	    mask_rows = []
	    for seq, length in zip(list_of_lists, lengths):
	        gap = width - length
	        padded_rows.append(seq + [pad_value] * gap)
	        mask_rows.append([1] * length + [0] * gap)

	    return torch.tensor(padded_rows), torch.tensor(mask_rows)

	def collate_fn(batch: List[List[torch.Tensor]]):
	    """Collate examples into padded id, label and attention-mask tensors.

	    Each example is indexed with the ``'tokens'`` and ``'tags_knowledge'``
	    keys. Tokens are mapped to ids with the module-level ``tokenizer`` and
	    tags to ids with ``label2id``; both are padded with ``pad`` using its
	    default fill value.

	    Returns:
	        A dict with ``"input_ids"``, ``"tags_knowledge"`` and
	        ``"attention_mask"`` tensors.
	    """
	    token_id_rows = [
	        tokenizer.convert_tokens_to_ids(example['tokens']) for example in batch
	    ]
	    input_ids, attention_mask = pad(token_id_rows)

	    label_id_rows = [
	        [label2id[tag] for tag in example['tags_knowledge']] for example in batch
	    ]
	    # The mask derived from the labels is discarded; the token mask is used.
	    tags_knowledge, _ = pad(label_id_rows)

	    return {
	        "input_ids": input_ids,
	        "tags_knowledge": tags_knowledge,
	        "attention_mask": attention_mask,
	    }

	def extract_spans(B_mask, I_mask, token_ids, tokenizer):
	    """
	    Extract text spans for 2D tensors (batch of sequences).

	    A span starts at every position where ``B_mask`` is 1 and is extended by
	    the following positions where ``I_mask`` is 1. Any other position closes
	    the span in progress. An ``I_mask`` position with no open span is ignored.

	    Args:
	        B_mask: 2D tensor, 1 where a span begins.
	        I_mask: 2D tensor, 1 where a span continues.
	        token_ids: 2D tensor of token ids, indexed like the masks.
	        tokenizer: Object whose ``decode(ids, skip_special_tokens=True)``
	            turns a list of ids into text.

	    Returns:
	        One list of decoded span strings per sequence that contains at least
	        one span. Sequences without spans are omitted, so the result is not
	        aligned with the batch index.
	    """
	    all_spans = []

	    # Convert each tensor to nested Python lists once instead of calling
	    # ``.item()`` on every cell.
	    rows = zip(B_mask.tolist(), I_mask.tolist(), token_ids.tolist())

	    for b_row, i_row, id_row in rows:
	        spans = []
	        current_span = []

	        for is_begin, is_inside, token_id in zip(b_row, i_row, id_row):
	            if is_begin == 1:  # Begin a new span, closing any open one
	                if current_span:
	                    spans.append(current_span)
	                current_span = [token_id]
	            elif is_inside == 1 and current_span:  # Continue the current span
	                current_span.append(token_id)
	            elif current_span:  # Outside any entity: close the open span
	                spans.append(current_span)
	                current_span = []

	        if current_span:  # Save the last span if it exists
	            spans.append(current_span)

	        # Decode spans for this sequence; skip sequences with no spans
	        decoded_spans = [
	            tokenizer.decode(span, skip_special_tokens=True) for span in spans
	        ]
	        if decoded_spans:
	            all_spans.append(decoded_spans)

	    return all_spans


	def concat_subtokens(tokens):
	    """Merge ``##``-prefixed sub-tokens into the token that precedes them.

	    Args:
	        tokens: Iterable of token strings; a leading ``'##'`` marks a
	            continuation of the previous token.

	    Returns:
	        A new list of merged tokens with the ``'##'`` markers removed. A
	        continuation piece that has no preceding token (the first element
	        starts with ``'##'``) begins a new token instead of raising
	        ``IndexError``.
	    """
	    result = []

	    for token in tokens:
	        if token.startswith('##'):
	            piece = token[2:]  # Drop the '##' marker
	            if result:
	                # Concatenate sub-token to the last token in result
	                result[-1] += piece
	            else:
	                # Orphan continuation: nothing to attach it to yet
	                result.append(piece)
	        else:
	            # If it's a new token, add it to result
	            result.append(token)

	    return result

	def merge_spans(batch_spans, tokenizer):
	    """Flatten per-sequence spans into one list, gluing ``##`` fragments.

	    Within each sequence, a span that starts with ``'##'`` is appended
	    (marker removed) to the span before it.

	    Args:
	        batch_spans: Iterable of lists of span strings, one list per
	            sequence.
	        tokenizer: Unused; kept so existing callers keep working.

	    Returns:
	        A flat list of merged span strings across all sequences.
	    """
	    batch_decoded_spans = []

	    for spans in batch_spans:
	        # An empty list has nothing to contribute (and would otherwise fail
	        # on spans[0]).
	        # NOTE(review): a sequence whose FIRST span starts with '##' is
	        # skipped entirely, including its later spans — existing behaviour,
	        # preserved here; confirm it is intended.
	        if not spans or spans[0].startswith('##'):
	            continue

	        merged = []
	        for token in spans:
	            if token.startswith('##'):
	                # Concatenate the fragment to the previous span
	                merged[-1] += token[2:]  # Remove '##' and append the continuation
	            else:
	                # A regular span starts a new entry
	                merged.append(token)

	        batch_decoded_spans.extend(merged)

	    return batch_decoded_spans


	def extract_skills(batch_sentences: List[str]):
	    """Run the token-classification model on sentences and return skill spans.

	    All sentences are tokenized (padded and truncated) and classified as a
	    single batch with the module-level ``tokenizer`` and ``model``.
	    Predicted class 0 is treated as the start of a span and class 1 as its
	    continuation; padded positions are excluded.
	    NOTE(review): the 0/1 class ids are hard-coded — presumably they match
	    the begin/inside entries of ``model.config.id2label``; confirm.

	    Args:
	        batch_sentences: Sentences of one job posting.

	    Returns:
	        A flat list of extracted span strings; an empty list when there are
	        no sentences.
	    """
	    print('Extracting skills from job posting...')

	    # An empty posting has nothing to tag; skip the tokenizer and model,
	    # which are not meant to be called with an empty batch.
	    if not batch_sentences:
	        return []

	    # Tokenize
	    batch = tokenizer(batch_sentences, padding=True, truncation=True)
	    batch_tokens = torch.tensor(batch['input_ids'])
	    batch_attention_masks = torch.tensor(batch['attention_mask'])

	    model.eval()
	    with torch.no_grad():
	        output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks)

	    # Post-process: take the most likely class per token and overwrite
	    # padded positions with -100 so they match neither mask below.
	    pred = output.logits.argmax(-1)
	    pred = torch.where(batch_attention_masks==0, torch.tensor(-100), pred)

	    b_mask = torch.where(pred==0, 1, 0)
	    i_mask = torch.where(pred==1, 1, 0)

	    spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer)
	    decoded_spans = merge_spans(spans, tokenizer)

	    return decoded_spans

	def skills_save(path,skills):
	    """Write ``skills`` to ``path``, one per line, without a final newline.

	    The file is created or overwritten; an empty ``skills`` leaves it empty.
	    """
	    with open(path, 'w') as handle:
	        # Joining puts the separator only between entries, which is what
	        # special-casing the last item achieves.
	        handle.write("\n".join(f"{skill}" for skill in skills))


	def backfill():
	    """Tag every job posting stored under ``job-postings/<date>/``.

	    Each sub-directory of ``<cwd>/job-postings`` is treated as a date and
	    handed to ``tag_date``, which writes the extracted skills to
	    ``<cwd>/tags/<date>/<job>``. Entries that are not directories are
	    skipped rather than crashing the run when they are listed.
	    """
	    job_dir = os.path.join(os.getcwd(), 'job-postings')

	    for date in os.listdir(job_dir):
	        if not os.path.isdir(os.path.join(job_dir, date)):
	            # Stray files next to the date directories are not postings.
	            continue

	        print(f"Processing date directory: {date}")

	        # Per-date work is the same as a daily run, so reuse it instead of
	        # keeping a second copy of the loop.
	        tag_date(date)

	def tag_date(date):
	    """Tag every job posting found under ``job-postings/<date>/``.

	    For each file in ``<cwd>/job-postings/<date>`` the posting is parsed,
	    skills are extracted and the result is saved under the same file name in
	    ``<cwd>/tags/<date>``. The output directory is created when the first
	    posting is processed.

	    Args:
	        date: Name of the date sub-directory to process.
	    """
	    base = os.getcwd()
	    source_dir = os.path.join(base, 'job-postings', date)
	    target_dir = os.path.join(base, 'tags', date)

	    for filename in os.listdir(source_dir):
	        job_path = os.path.join(source_dir, filename)
	        tag_path = os.path.join(target_dir, filename)

	        print(f"Processing job file: {job_path}")

	        target_missing = not os.path.exists(target_dir)
	        if target_missing:
	            os.makedirs(target_dir)
	            print(f"Created directory: {target_dir}")

	        # parse -> extract -> save, chained for a single posting
	        skills_save(tag_path, extract_skills(parse_post(job_path)))

	        print(f"Saved skills to: {tag_path}")

	if __name__ == '__main__':
	    # To re-tag every stored posting instead, call backfill() here.

	    # Default run: tag the postings filed under today's date (MM-DD-YYYY).
	    tag_date(datetime.today().strftime('%m-%d-%Y'))