check
Browse files- .python-version +1 -0
- main.py +98 -0
- XLM-RoBERTa.ipynb → models/XLM-RoBERTa.ipynb +0 -0
- mBERT.ipynb → models/mBERT.ipynb +0 -0
- push_to_HF.py → models/push_to_HF.py +0 -0
- requirements.txt +23 -1
- static/app.js +42 -0
- static/style.css +124 -0
- templates/index.html +27 -0
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12.0
|
main.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Request, Form
|
| 2 |
+
from fastapi.responses import HTMLResponse
|
| 3 |
+
from fastapi.staticfiles import StaticFiles
|
| 4 |
+
from fastapi.templating import Jinja2Templates
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
from pathlib import Path

# Resolve asset directories relative to this file rather than the current
# working directory, so the app starts no matter where uvicorn is launched.
BASE_DIR = Path(__file__).resolve().parent

app = FastAPI()

# Serve static files like CSS and JavaScript
app.mount("/static", StaticFiles(directory=BASE_DIR / "static"), name="static")

# Set up Jinja2 templates
templates = Jinja2Templates(directory=BASE_DIR / "templates")

# Load the Hugging Face model and tokenizer once, at import time, so every
# request reuses the same in-memory pipeline.
model_name = "IsmatS/xlm-roberta-az-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# With aggregation_strategy="simple" each result exposes the "entity_group"
# and "word" keys that predict_ner reads.
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
| 22 |
+
|
| 23 |
+
# Human-readable names for the model's generic "LABEL_<n>" classes.
# The position of each name in the tuple is its class index <n>.
label_mapping = {
    f"LABEL_{index}": name
    for index, name in enumerate(
        (
            "Other",
            "Person",
            "Location",
            "Organization",
            "Date",
            "Time",
            "Money",
            "Percentage",
            "Facility",
            "Product",
            "Event",
            "Art",
            "Law",
            "Language",
            "Government",
            "Nationality or Religion",
            "Ordinal",
            "Cardinal",
            "Disease",
            "Contact",
            "Proverb or Saying",
            "Quantity",
            "Miscellaneous",
            "Position",
            "Project",
        )
    )
}
|
| 50 |
+
|
| 51 |
+
def convert_numpy_types(obj):
    """Recursively replace NumPy scalars with plain Python numbers.

    Makes nested model output JSON-serialisable. Handles every NumPy
    boolean, integer and floating scalar width (not only ``float32`` and
    ``int32``) and descends into lists, tuples and dicts.

    Args:
        obj: Any value; containers are walked, everything else is checked
            against the NumPy scalar types.

    Returns:
        A structure of the same shape with NumPy scalars converted to
        ``bool`` / ``int`` / ``float``. Unrecognised values are returned
        unchanged.
    """
    # np.bool_ is not a subclass of np.integer, so it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    return obj
|
| 62 |
+
|
| 63 |
+
@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Render the single-page UI from ``index.html``.

    Args:
        request: Incoming request, required by the template response.

    Returns:
        The rendered HTML page.
    """
    # Starlette >= 0.29 expects the request as the first positional argument;
    # the older ``TemplateResponse(name, {"request": request})`` form is
    # deprecated and emits a DeprecationWarning on each call.
    return templates.TemplateResponse(request, "index.html")
|
| 66 |
+
|
| 67 |
+
@app.post("/predict/")
async def predict_ner(text: str = Form(...)):
    """Run NER over the submitted text and group the entities by type.

    Args:
        text: Raw input taken from the ``text`` form field.

    Returns:
        ``{"entities": {<type>: [<word>, ...]}}``. Types with no matches are
        absent, and spans labelled "Other" are dropped.
    """
    import asyncio  # local import keeps this handler self-contained

    # Nothing to analyse: skip the model call entirely.
    if not text.strip():
        return {"entities": {}}

    # Inference is a blocking call; running it in a worker thread keeps the
    # event loop free to serve other requests in the meantime.
    ner_results = await asyncio.to_thread(nlp_ner, text)

    entities_by_type = {}
    for entity in ner_results:
        group = entity["entity_group"]
        # Fall back to the raw label when it has no human-readable name.
        entity_type = label_mapping.get(group, group)

        # "Other" marks non-entity spans.
        if entity_type == "Other":
            continue

        entities_by_type.setdefault(entity_type, []).append(entity["word"])

    return {"entities": entities_by_type}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# Run with uvicorn main:app --reload
|
| 94 |
+
# curl -X POST "http://127.0.0.1:8000/predict/" \
|
| 95 |
+
# -H "Content-Type: application/x-www-form-urlencoded" \
|
| 96 |
+
# --data-urlencode "text=Bakı şəhərində Azərbaycan Respublikasının prezidenti İlham Əliyev."
|
| 97 |
+
|
| 98 |
+
# 2014 - cu ilde Azərbaycan Respublikasının prezidenti İlham Əliyev Salyanda olub.
|
XLM-RoBERTa.ipynb → models/XLM-RoBERTa.ipynb
RENAMED
|
File without changes
|
mBERT.ipynb → models/mBERT.ipynb
RENAMED
|
File without changes
|
push_to_HF.py → models/push_to_HF.py
RENAMED
|
File without changes
|
requirements.txt
CHANGED
|
@@ -1,13 +1,35 @@
|
|
|
|
|
|
|
|
| 1 |
certifi==2024.8.30
|
| 2 |
charset-normalizer==3.4.0
|
|
|
|
|
|
|
| 3 |
filelock==3.16.1
|
| 4 |
fsspec==2024.10.0
|
|
|
|
| 5 |
huggingface-hub==0.26.2
|
| 6 |
idna==3.10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
packaging==24.1
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
PyYAML==6.0.2
|
|
|
|
| 10 |
requests==2.32.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
tqdm==4.66.6
|
|
|
|
| 12 |
typing_extensions==4.12.2
|
| 13 |
urllib3==2.2.3
|
|
|
|
|
|
| 1 |
+
annotated-types==0.7.0
|
| 2 |
+
anyio==4.6.2.post1
|
| 3 |
certifi==2024.8.30
|
| 4 |
charset-normalizer==3.4.0
|
| 5 |
+
click==8.1.7
|
| 6 |
+
fastapi==0.115.4
|
| 7 |
filelock==3.16.1
|
| 8 |
fsspec==2024.10.0
|
| 9 |
+
h11==0.14.0
|
| 10 |
huggingface-hub==0.26.2
|
| 11 |
idna==3.10
|
| 12 |
+
Jinja2==3.1.4
|
| 13 |
+
MarkupSafe==3.0.2
|
| 14 |
+
mpmath==1.3.0
|
| 15 |
+
networkx==3.4.2
|
| 16 |
+
numpy==2.1.3
|
| 17 |
packaging==24.1
|
| 18 |
+
pydantic==2.9.2
|
| 19 |
+
pydantic_core==2.23.4
|
| 20 |
+
python-multipart==0.0.17
|
| 21 |
PyYAML==6.0.2
|
| 22 |
+
regex==2024.9.11
|
| 23 |
requests==2.32.3
|
| 24 |
+
safetensors==0.4.5
|
| 25 |
+
setuptools==75.3.0
|
| 26 |
+
sniffio==1.3.1
|
| 27 |
+
starlette==0.41.2
|
| 28 |
+
sympy==1.13.1
|
| 29 |
+
tokenizers==0.20.1
|
| 30 |
+
torch==2.5.1
|
| 31 |
tqdm==4.66.6
|
| 32 |
+
transformers==4.46.1
|
| 33 |
typing_extensions==4.12.2
|
| 34 |
urllib3==2.2.3
|
| 35 |
+
uvicorn==0.32.0
|
static/app.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Submit the text to /predict/ without reloading the page, then render the
// grouped entities or a readable error message.
document.getElementById("nerForm").addEventListener("submit", async function (e) {
    e.preventDefault();
    const text = document.getElementById("textInput").value;

    try {
        const response = await fetch("/predict/", {
            method: "POST",
            headers: { "Content-Type": "application/x-www-form-urlencoded" },
            body: new URLSearchParams({ text })
        });

        // An error reply (e.g. a validation failure) carries no `entities`
        // key, so stop before it reaches the renderer.
        if (!response.ok) {
            throw new Error(`Request failed with status ${response.status}`);
        }

        const result = await response.json();

        // Display results in a structured format
        displayResults(result.entities);
    } catch (err) {
        // textContent, not innerHTML: the message must not be parsed as markup.
        document.getElementById("resultsContent").textContent =
            `Could not analyze the text: ${err.message}`;
    }
});
|
| 14 |
+
|
| 15 |
+
/**
 * Render the entities, grouped by type, into #resultsContent.
 *
 * @param {Object<string, string[]>} entities - entity type mapped to the
 *     list of matched words for that type.
 */
function displayResults(entities) {
    const container = document.getElementById("resultsContent");
    container.innerHTML = ""; // Drop whatever the previous run rendered

    const groups = Object.entries(entities);

    // Nothing detected: show a short notice instead of an empty panel
    if (groups.length === 0) {
        container.innerHTML = "<p>No high-confidence entities found.</p>";
        return;
    }

    // One card per entity type: a heading followed by one line per word
    for (const [entityType, words] of groups) {
        const card = document.createElement("div");
        card.classList.add("entity-group");

        const heading = document.createElement("h3");
        heading.textContent = entityType;
        card.appendChild(heading);

        for (const word of words) {
            const line = document.createElement("p");
            line.textContent = word;
            card.appendChild(line);
        }

        container.appendChild(card);
    }
}
|
static/style.css
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Basic reset: border-box sizing and zero margin/padding on every element */
* {
    box-sizing: border-box;
    margin: 0;
    padding: 0;
}

/* Body: flex container that centres its content on both axes within at
   least the full viewport height */
body {
    font-family: Arial, sans-serif;
    display: flex;
    justify-content: center;
    align-items: center;
    min-height: 100vh;
    background-color: #f4f4f9;
    margin: 0; /* repeats the universal reset above */
    padding: 20px;
}

/* Container: white card, full width up to 600px */
.container {
    width: 100%;
    max-width: 600px;
    text-align: center;
    background: white;
    border-radius: 8px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    padding: 20px;
}

/* Title and description */
h1 {
    font-size: 24px;
    color: #333;
    margin-bottom: 10px;
}

/* Applies to every <p>; .entity-group p below overrides size and colour */
p {
    font-size: 16px;
    color: #666;
    margin-bottom: 20px;
}

/* Form and button styling */
textarea {
    width: 100%;
    height: 100px;
    padding: 10px;
    font-size: 16px;
    border: 1px solid #ddd;
    border-radius: 5px;
    margin-bottom: 10px;
    resize: vertical; /* user may resize vertically only */
}

button {
    padding: 10px 20px;
    font-size: 16px;
    color: #fff;
    background-color: #007bff;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s;
}

/* Darker blue on hover, animated by the transition declared above */
button:hover {
    background-color: #0056b3;
}

/* Results section: left-aligned inside the centred container */
.results-section {
    margin-top: 20px;
    text-align: left;
}

.results-section h2 {
    font-size: 20px;
    color: #333;
    margin-bottom: 10px;
}

/* Entity group: bordered box per group (class added in static/app.js) */
.entity-group {
    margin-bottom: 15px;
    padding: 10px;
    border: 1px solid #ddd;
    border-radius: 5px;
    background-color: #fafafa;
}

.entity-group h3 {
    font-size: 18px;
    color: #007bff;
    margin-bottom: 8px;
}

.entity-group p {
    font-size: 16px;
    color: #555;
}

/* Responsive design: overrides for viewports up to 600px wide */
@media (max-width: 600px) {
    .container {
        width: 90%;
    }

    h1 {
        font-size: 20px;
    }

    button {
        font-size: 14px;
    }

    .results-section h2, .entity-group h3 {
        font-size: 18px;
    }

    textarea {
        height: 80px;
    }
}
|
templates/index.html
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Named Entity Recognition</title>
    <link rel="stylesheet" href="/static/style.css">
</head>
<body>
    <div class="container">
        <h1>Named Entity Recognition</h1>
        <p>Enter your text below to analyze entities and see their types, such as Date, Person, or Location.</p>

        <!-- static/app.js intercepts the submit event and posts the text to /predict/ -->
        <form id="nerForm">
            <!-- Pre-filled sample sentence so the form can be submitted as-is -->
            <textarea id="textInput" name="text">2014 - cu ilde Azərbaycan Respublikasının prezidenti İlham Əliyev Salyanda olub.</textarea>
            <button type="submit">Analyze</button>
        </form>

        <div id="results" class="results-section">
            <h2>Results</h2>
            <div id="resultsContent"></div> <!-- Filled by static/app.js with one block per entity type -->
        </div>
    </div>
    <script src="/static/app.js"></script>
</body>
</html>
|