import os
import json
import time
from datetime import datetime
from typing import List, Tuple

import spacy
from dotenv import load_dotenv
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from tqdm import tqdm
from transformers import AutoTokenizer
|
|
|
|
# Load environment variables (.env must define OPENAI_API_KEY) and the spaCy
# English pipeline used for sentence segmentation.
load_dotenv(".env")
nlp = spacy.load("en_core_web_sm")
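# en_core_web_sm is not bundled with spaCy itself; install it once with
#     python -m spacy download en_core_web_sm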
|
|
def split_text_recursively(text):
    """Split text into a list of lines, one recursive call per newline."""
    if '\n' not in text:
        return [text]
    parts = text.split('\n', 1)
    return [parts[0]] + split_text_recursively(parts[1])
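# Caveat: each line consumes one level of recursion, so inputs longer than
# Python's default recursion limit (roughly 1000 lines) will raise
# RecursionError. If that ever bites, text.split('\n') is an iterative
# one-liner that produces the same list.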
|
|
|
|
def tokenize_to_sent(path):
    """Read a text file and segment it into sentences with spaCy."""
    print(f"Tokenizing {path} to sentences...")

    with open(path, 'r') as file:
        text = file.read()

    # Split into lines, strip whitespace, and drop empty lines before
    # sentence segmentation.
    str_list = split_text_recursively(text)
    str_list = [i.strip() for i in str_list]
    str_list = list(filter(None, str_list))

    sents = []
    for line in str_list:
        doc = nlp(line)
        for sent in doc.sents:
            sents.append(sent.text)

    print(f"Tokenization completed. {len(sents)} sentences found.")
    return sents
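# Calling nlp(line) once per line is simple but slow on long postings; spaCy's
# nlp.pipe batches documents through the pipeline. An equivalent sketch:
#
#     sents = [sent.text for doc in nlp.pipe(str_list) for sent in doc.sents]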
|
|
|
|
|
|
class TokenTaggingResult(BaseModel):
    """Token/tag pairs for a single sentence."""
    tokens: List[str]
    tags_knowledge: List[str]


class Results(BaseModel):
    """Tagging results for a batch of sentences."""
    results: List[TokenTaggingResult]


model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
parser = JsonOutputParser(pydantic_object=Results)
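# An alternative worth noting: ChatOpenAI also supports
# model.with_structured_output(Results), which binds the schema to the model
# and returns a validated Results instance instead of a raw dict, e.g.
#
#     structured_model = model.with_structured_output(Results)
#     result = structured_model.invoke("<prompt text>")  # -> Results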
|
|
|
|
skill_definition = """
Skill means the ability to apply knowledge and use know-how to complete tasks and solve problems.
"""
# Note: skill_definition is currently not interpolated into the prompt below;
# only knowledge_definition is.

knowledge_definition = """
Knowledge means the outcome of the assimilation of information through learning. Knowledge is the body of facts, principles, theories and practices that is related to a field of work or study.
"""

with open('few-shot.txt', 'r') as file:
    few_shot_examples = file.read()
|
|
prompt = PromptTemplate(
    template="""You are an expert in tagging tokens with knowledge labels. Use the following definitions to tag the input tokens:
Knowledge definition:{knowledge_definition}
Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
    input_variables=["input"],
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
        "few_shot_examples": few_shot_examples,
        "knowledge_definition": knowledge_definition,
    },
)
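# The {input} slot receives lists of WordPiece tokens from the JobBERT
# tokenizer (BERT-style tokenizers mark word continuations with "##"), so the
# few-shot examples are what establish the expected token/tag output format.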
|
|
def extract_tags(sents: List[str], tokenize: bool = True) -> Tuple[List[List[str]], dict]:
    """Tokenize sentences with the JobBERT tokenizer and tag them via the LLM."""
    print("Extracting tags...")
    print(f"Tokenizing {len(sents)} sentences...")

    start_time = time.time()

    if tokenize:
        tokens = [tokenizer.tokenize(t) for t in sents]
    else:
        # Assume the caller has already tokenized the input.
        tokens = sents

    prompt_and_model = prompt | model
    output = prompt_and_model.invoke({"input": tokens})
    output = parser.invoke(output)

    time_taken = time.time() - start_time
    print(f"Tags extracted in {time_taken:.1f} seconds.")

    return tokens, output
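# JsonOutputParser returns a plain dict shaped like Results rather than a
# Results instance, which is why callers index output['results'] below. If a
# validated model is ever needed, Results(**output) converts it (assuming the
# LLM's JSON matches the schema).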
|
|
|
|
def tag_posting(job_path, output_path):
    """Tag a single job posting and write the results as JSON Lines."""
    sents = tokenize_to_sent(job_path)
    tokens, output = extract_tags(sents, tokenize=True)

    # One JSON object per sentence.
    with open(output_path, "w") as file:
        for entry in output['results']:
            json.dump(entry, file)
            file.write("\n")
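# Example usage (hypothetical paths):
#
#     tag_posting('./job-postings/14-06-2024/0.txt', './data/tags-single.jsonl')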
|
|
def tag_all_today():
    """Tag every job posting scraped today, appending results to one JSONL file."""
    date = datetime.today().strftime('%d-%m-%Y')

    jobs = os.listdir(f'./job-postings/{date}')
    output_path = f'./data/tags-{date}.jsonl'
    count = 0

    for job in tqdm(jobs, desc="Tagging job postings"):
        job_path = f'./job-postings/{date}/{job}'

        sents = tokenize_to_sent(job_path)
        tokens, output = extract_tags(sents, tokenize=True)

        # Append so results from every posting accumulate in the same file.
        with open(output_path, "a") as file:
            for entry in output['results']:
                json.dump(entry, file)
                file.write("\n")

        count += 1
        if count > 2:
            # Cap at three postings per run.
            break

    print(f"Tagging completed. Output saved to {output_path}")
|
|
if __name__ == "__main__":
    tag_all_today()