# boffire's picture
# Update app.py
# b1e769a verified
#!/usr/bin/env python3
"""
Kabyle-letter checker – Hugging Face Space edition
Run locally: python app.py
On Spaces the container starts this file automatically.
"""
import html
import io
import re
import unicodedata
import uuid
from collections import Counter
from pathlib import Path

from flask import (Flask, abort, flash, redirect, render_template_string,
                   request, send_file, url_for)
from tqdm import tqdm
# ------------------------------------------------------------------ constants
# Letters accepted by the checker: basic Latin a-z plus the extra Kabyle
# letters.  Stored lowercase, one element per space-separated token.
_LATIN_BASE = "a b c d e f g h i j k l m n o p q r s t u v w x y z "
_KABYLE_EXTRA = "č ḍ ǧ ḥ ɣ ṛ ṣ ṭ ɛ ẓ"
ALLOWED = set((_LATIN_BASE + _KABYLE_EXTRA).split())

# Working directories (relative to the current working directory); they are
# created at import time if they do not exist yet.
UPLOAD_DIR, CSV_DIR = Path("uploads"), Path("csv")
for _folder in (UPLOAD_DIR, CSV_DIR):
    _folder.mkdir(exist_ok=True)

# Number of sentences shown per page by the paginated sentence view.
PER_PAGE = 50
# ------------------------------------------------------------------ Flask app
app = Flask(__name__)
# The session-signing key is regenerated on every process start, so it is
# random on every restart.
app.config["SECRET_KEY"] = uuid.uuid4().hex
# ------------------------------------------------------------------ helpers
def _is_ok(ch):
    """Return True when *ch* is alphabetic and its lowercase form is in ALLOWED."""
    if not ch.isalpha():
        return False
    return ch.lower() in ALLOWED
def analyse(path: Path):
    """
    Scan a UTF-8 text file, line by line, for letters that fail _is_ok().

    Args:
        path: file to read; every line is treated as one sentence.

    Returns:
        counter   – Counter mapping each illegal char to the number of lines
                    it appears in (a char is counted once per line)
        sentences – list[dict], one per offending line, with keys
                    line_nr, sentence, bad_chars (set), highlights (HTML)

    Raises:
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    non_space = re.compile(r"\S")

    def _mark(match):
        # Illegal letters get a red span; every other character is
        # HTML-escaped so file content cannot inject markup into "highlights".
        ch = match.group(0)
        if ch.isalpha() and not _is_ok(ch):
            return f'<span style="color:red">{ch}</span>'
        return html.escape(ch)

    counter, sentences = Counter(), []
    with path.open(encoding="utf-8") as fh:
        progress = tqdm(fh, unit=" lines", desc="Analysing")
        for line_nr, sentence in enumerate(progress, 1):
            bad = {ch for ch in sentence if ch.isalpha() and not _is_ok(ch)}
            if not bad:
                continue
            sentences.append({
                "line_nr": line_nr,
                "sentence": sentence.rstrip(),
                "bad_chars": bad,
                "highlights": non_space.sub(_mark, sentence).rstrip(),
            })
            # `bad` is a set, so each char adds at most 1 per line.
            counter.update(bad)
    return counter, sentences
# ------------------------------------------------------------------ routes
@app.route("/", methods=["GET", "POST"])
def index():
    """
    Upload form (GET) and analysis of an uploaded file (POST).

    On POST the upload is saved as UPLOAD_DIR/<uid>.txt and analysed; the
    per-character statistics are written to CSV_DIR/<uid>.csv and the
    offending sentences to CSV_DIR/<uid>_sentences.txt, then the result page
    is rendered.  A missing file or a file that is not valid UTF-8 flashes a
    message and redirects back to the form.
    """
    if request.method != "POST":
        return render_template_string(TEMPLATE_INDEX)

    upload = request.files.get("file")
    if not upload or upload.filename == "":
        flash("No file selected")
        return redirect(request.url)

    uid = uuid.uuid4().hex
    fpath = UPLOAD_DIR / f"{uid}.txt"
    upload.save(fpath)

    try:
        counter, sentences = analyse(fpath)
    except UnicodeDecodeError:
        # Not UTF-8 text: report it to the user instead of failing with a
        # server error, and drop the unusable upload.
        fpath.unlink()
        flash("The uploaded file is not valid UTF-8 text")
        return redirect(request.url)

    ranked = counter.most_common()

    # CSV
    csv_lines = ["char,codepoint,count,unicode_name\n"]
    csv_lines.extend(
        f"{ch},U+{ord(ch):04X},{freq},{unicodedata.name(ch, '')}\n"
        for ch, freq in ranked
    )
    (CSV_DIR / f"{uid}.csv").write_text("".join(csv_lines), encoding="utf-8")

    # store sentences list for /show/<uid>
    # (one record per line: right-aligned line number, one space, sentence)
    (CSV_DIR / f"{uid}_sentences.txt").write_text(
        "\n".join(f'{s["line_nr"]:>6} {s["sentence"]}' for s in sentences),
        encoding="utf-8",
    )

    table_rows = [
        {"char": ch, "code": f"U+{ord(ch):04X}", "count": freq,
         "name": unicodedata.name(ch, "")}
        for ch, freq in ranked
    ]

    # Count lines inside a context manager so the file handle is closed.
    with fpath.open(encoding="utf-8") as fh:
        total_lines = sum(1 for _ in fh)

    return render_template_string(
        TEMPLATE_RESULT,
        table=table_rows,
        total_lines=total_lines,
        csv_url=url_for("download", uid=uid),
        show_url=url_for("show_sentences", uid=uid),
        sent_count=len(sentences),
    )
@app.route("/csv/<uid>")
def download(uid):
    """
    Send the CSV report of analysis run *uid* as an attachment.

    Responds with 404 when no report exists for *uid*.
    """
    # The report was written relative to the current working directory, but
    # Flask's send_file resolves relative paths against the application root;
    # an absolute path makes both agree.
    csv_path = (CSV_DIR / f"{uid}.csv").resolve()
    if not csv_path.is_file():
        abort(404)
    return send_file(csv_path, as_attachment=True,
                     download_name="unsupported_stats.csv")
@app.route("/show/<uid>")
def show_sentences(uid):
    """
    Display affected sentences with red highlights (paginated).

    Reads CSV_DIR/<uid>_sentences.txt, where each line holds a right-aligned
    line number, one space, and the sentence.  The page is selected with the
    ``page`` query argument and clamped to the valid range.  Responds with
    404 when no sentence file exists for *uid*.
    """
    try:
        stored = (CSV_DIR / f"{uid}_sentences.txt").read_text(encoding="utf-8")
    except FileNotFoundError:
        abort(404)

    non_space = re.compile(r"\S")

    def _mark(match):
        # Illegal letters get a bold red span; everything else is
        # HTML-escaped because the template renders the result with |safe.
        ch = match.group(0)
        if ch.isalpha() and not _is_ok(ch):
            return f'<span style="color:red;font-weight:bold">{ch}</span>'
        return html.escape(ch)

    # recover all sentences from disk
    all_sentences = []
    # Split on "\n" only: str.splitlines() would also break a sentence at
    # characters such as U+2028 that the line-based file reader kept.
    for raw in stored.split("\n"):
        if not raw:
            continue
        # Drop the left padding, then cut at the first space.  This keeps the
        # full sentence and works for line numbers of any width.
        line_nr, _, sentence = raw.lstrip(" ").partition(" ")
        all_sentences.append({
            "line_nr": line_nr,
            "highlights": non_space.sub(_mark, sentence),
            "sentence": sentence,
        })

    # ----- pagination logic -----
    total = len(all_sentences)
    total_pages = (total + PER_PAGE - 1) // PER_PAGE if total else 1
    requested = request.args.get("page", 1, type=int)
    page = min(max(requested, 1), total_pages)
    start = (page - 1) * PER_PAGE
    sentences = all_sentences[start:start + PER_PAGE]
    prev_url = (url_for("show_sentences", uid=uid, page=page - 1)
                if page > 1 else None)
    next_url = (url_for("show_sentences", uid=uid, page=page + 1)
                if page < total_pages else None)
    # ---------------------------

    return render_template_string(
        TEMPLATE_SHOW,
        sentences=sentences,
        back_url=url_for("index"),
        page=page,
        total_pages=total_pages,
        prev_url=prev_url,
        next_url=next_url,
        total=total,
        per_page=PER_PAGE,
    )
# ------------------------------------------------------------------ templates
# Landing page: upload form plus the first flashed message, if any.
TEMPLATE_INDEX = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
<div class="container py-5">
<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
<p class="lead mb-4">Upload Kabyle sentences from Tatoeba and spot letters/symbols that are <strong>not</strong> part of the official Kabyle alphabet (CLDR). Download a CSV or view highlighted sentences ready for correction.</p>
<form method="post" enctype="multipart/form-data" class="mb-4">
<div class="input-group">
<input type="file" name="file" class="form-control" required>
<button class="btn btn-primary" type="submit">Upload & analyse</button>
</div>
</form>
{% with msgs = get_flashed_messages() %}
{% if msgs %}<div class="alert alert-warning">{{ msgs[0] }}</div>{% endif %}
{% endwith %}
</div>
</body>
</html>
"""
# Result page shown after an upload has been analysed.
# Template variables:
#   total_lines – number of lines in the analysed file
#   sent_count  – number of sentences containing non-standard letters
#   table       – rows with the attributes char, code, count and name
#   csv_url     – link target of the "Download CSV" button
#   show_url    – link target of the "Show affected sentences" button
TEMPLATE_RESULT = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
<div class="container py-5">
<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
<div class="alert alert-info">Analysed {{ total_lines }} lines.
Found {{ sent_count }} sentence(s) with non-standard letters.</div>
<h2>Result table</h2>
<div class="table-responsive">
<table class="table table-sm table-striped align-middle">
<thead class="table-light">
<tr><th>Char</th><th>Codepoint</th><th>Count</th><th>Unicode name</th></tr>
</thead>
<tbody>
{% for row in table %}
<tr>
<td style="font-size:1.5rem">{{ row.char }}</td>
<td><code>{{ row.code }}</code></td>
<td>{{ row.count }}</td>
<td><small>{{ row.name }}</small></td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<a class="btn btn-outline-success" href="{{ csv_url }}">⬇ Download CSV</a>
<a class="btn btn-outline-primary ms-2" href="{{ show_url }}">👁 Show affected sentences</a>
</div>
</body>
</html>
"""
# Paginated list of affected sentences.
# Template variables:
#   sentences   – items of the current page (highlights, sentence, line_nr)
#   total       – number of affected sentences over all pages
#   page        – current page number (1-based)
#   total_pages – number of pages
#   prev_url / next_url – neighbouring page links, falsy when there is none
#   back_url    – link target of the "Back" button
#   per_page    – optional page size used to number the list; falls back
#                 to 50 when it is not supplied
# "highlights" is rendered with |safe, so it must already be safe HTML.
TEMPLATE_SHOW = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
<div class="container py-5">
<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
<a class="btn btn-outline-secondary mb-3" href="{{ back_url }}">← Back</a>
<p class="text-muted">Showing {{ sentences|length }} of {{ total }} affected sentences — Page {{ page }} of {{ total_pages }}</p>
<ol start="{{ (page - 1) * (per_page|default(50)) + 1 }}">
{% for s in sentences %}
<li class="mb-2">
<pre style="display:inline; margin:0; padding:0; border:none; background:none;">{{ s.highlights|safe }}</pre>
<small class="text-muted">(line {{ s.line_nr }})</small>
<a class="btn btn-sm btn-outline-primary ms-2"
href="https://tatoeba.org/en/sentences/search?from=kab&query={{ s.sentence|urlencode }}&to="
target="_blank" title="Search on Tatoeba" aria-label="Search">🔍</a>
</li>
{% endfor %}
</ol>
<!-- Pagination -->
<nav aria-label="Sentence pages" class="mt-4">
<ul class="pagination justify-content-center">
<li class="page-item {% if not prev_url %}disabled{% endif %}">
<a class="page-link" href="{{ prev_url or '#' }}">← Previous</a>
</li>
<li class="page-item disabled">
<span class="page-link">Page {{ page }} of {{ total_pages }}</span>
</li>
<li class="page-item {% if not next_url %}disabled{% endif %}">
<a class="page-link" href="{{ next_url or '#' }}">Next →</a>
</li>
</ul>
</nav>
</div>
</body>
</html>
"""
# ------------------------------------------------------------------ entrypoint
if __name__ == "__main__":
    # Hugging Face Spaces requires the server to listen on 0.0.0.0:7860.
    app.run(
        host="0.0.0.0",
        port=7860,
        debug=False,
    )