#!/usr/bin/env python3
"""
Kabyle-letter checker – Hugging Face Space edition

Run locally:  python app.py
On Spaces the container starts this file automatically.
"""
import csv
import html
import io
import re
import secrets
import unicodedata
import uuid
from collections import Counter
from pathlib import Path

from flask import (Flask, abort, flash, redirect, render_template_string,
                   request, send_file, url_for)
from tqdm import tqdm

# ------------------------------------------------------------------ constants
ALLOWED = set(
    "a b c d e f g h i j k l m n o p q r s t u v w x y z "
    "č ḍ ǧ ḥ ɣ ṛ ṣ ṭ ɛ ẓ".split()
)

UPLOAD_DIR = Path("uploads")
CSV_DIR = Path("csv")
UPLOAD_DIR.mkdir(exist_ok=True)
CSV_DIR.mkdir(exist_ok=True)

PER_PAGE = 50  # sentences per page for pagination

# Upload ids are always ``uuid.uuid4().hex``; anything else in a URL is bogus.
_UID_RE = re.compile(r"[0-9a-f]{32}")

# ------------------------------------------------------------------ Flask app
app = Flask(__name__)
app.secret_key = secrets.token_hex(32)  # random on every restart


# ------------------------------------------------------------------ helpers
def _is_ok(ch):
    """Return True if *ch* is a letter whose lowercase form is in ALLOWED."""
    return ch.isalpha() and (ch.lower() in ALLOWED)


def _is_bad(ch):
    """Return True if *ch* is a letter that is NOT in ALLOWED.

    Non-letters (digits, punctuation, whitespace, combining marks) are never
    reported: only alphabetic characters are checked against the alphabet.
    """
    return ch.isalpha() and ch.lower() not in ALLOWED


def _highlight(sentence):
    """Return *sentence* as HTML with every non-standard letter in a red span.

    Every character is HTML-escaped first: the result is rendered with
    ``|safe`` in the template and the text comes from an uploaded file, so
    emitting it raw would allow markup/script injection.
    """
    pieces = []
    for ch in sentence:
        escaped = html.escape(ch)
        if _is_bad(ch):
            pieces.append(f'<span class="bad">{escaped}</span>')
        else:
            pieces.append(escaped)
    return "".join(pieces)


def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*."""
    with path.open(encoding="utf-8") as fh:
        return sum(1 for _ in fh)


def _stored_file(uid, suffix):
    """Return the path of the result file for *uid*, or abort with 404.

    *uid* comes straight from the URL, so it is validated against the format
    produced at upload time before it is used to build a filesystem path.
    """
    if not _UID_RE.fullmatch(uid):
        abort(404)
    path = CSV_DIR / f"{uid}{suffix}"
    if not path.is_file():
        abort(404)
    return path


def analyse(path: Path):
    """
    Scan the UTF-8 text file at *path* for letters outside ALLOWED.

    Returns:
        counter   – Counter mapping each illegal char to the number of
                    lines it occurs in (a char is counted once per line)
        sentences – list[dict] with keys
                    line_nr, sentence, bad_chars (set), highlights (HTML)

    Raises:
        UnicodeDecodeError – if the file is not valid UTF-8.
    """
    counter, sentences = Counter(), []
    with path.open(encoding="utf-8") as fh:
        for line_nr, line in enumerate(tqdm(fh, unit=" lines", desc="Analysing"), 1):
            bad = {ch for ch in line if _is_bad(ch)}
            if not bad:
                continue
            sentence = line.rstrip()
            sentences.append({
                "line_nr": line_nr,
                "sentence": sentence,
                "bad_chars": bad,
                "highlights": _highlight(sentence),
            })
            counter.update(bad)
    return counter, sentences


# ------------------------------------------------------------------ routes
@app.route("/", methods=["GET", "POST"])
def index():
    """Show the upload form (GET) or analyse an uploaded file (POST)."""
    if request.method != "POST":
        return render_template_string(TEMPLATE_INDEX)

    file = request.files.get("file")
    if not file or file.filename == "":
        flash("No file selected")
        return redirect(request.url)

    uid = uuid.uuid4().hex
    fpath = UPLOAD_DIR / f"{uid}.txt"
    file.save(fpath)

    try:
        counter, sentences = analyse(fpath)
        total_lines = _count_lines(fpath)
    except UnicodeDecodeError:
        # Not a UTF-8 text file: report it instead of answering with a 500.
        fpath.unlink(missing_ok=True)
        flash("The uploaded file is not valid UTF-8 text")
        return redirect(request.url)

    table_rows = [
        {"char": ch,
         "code": f"U+{ord(ch):04X}",
         "count": freq,
         "name": unicodedata.name(ch, "")}
        for ch, freq in counter.most_common()
    ]

    # CSV
    csv_io = io.StringIO()
    writer = csv.writer(csv_io, lineterminator="\n")
    writer.writerow(["char", "codepoint", "count", "unicode_name"])
    for row in table_rows:
        writer.writerow([row["char"], row["code"], row["count"], row["name"]])
    (CSV_DIR / f"{uid}.csv").write_text(csv_io.getvalue(), encoding="utf-8")

    # Store the affected sentences for /show/<uid>, one "<line_nr>\t<sentence>"
    # record per line.  A tab separator (rather than fixed-width columns)
    # keeps the sentence intact whatever the size of the line number.
    (CSV_DIR / f"{uid}_sentences.txt").write_text(
        "\n".join(f'{s["line_nr"]}\t{s["sentence"]}' for s in sentences),
        encoding="utf-8",
    )

    return render_template_string(
        TEMPLATE_RESULT,
        table=table_rows,
        total_lines=total_lines,
        csv_url=url_for("download", uid=uid),
        show_url=url_for("show_sentences", uid=uid),
        sent_count=len(sentences),
    )


@app.route("/csv/<uid>")
def download(uid):
    """Send the statistics CSV for upload *uid* as an attachment."""
    # Absolute path: Flask resolves relative paths against app.root_path,
    # which need not be the working directory CSV_DIR is relative to.
    return send_file(_stored_file(uid, ".csv").resolve(),
                     as_attachment=True,
                     download_name="unsupported_stats.csv")


@app.route("/show/<uid>")
def show_sentences(uid):
    """Display affected sentences with red highlights (paginated)."""
    stored = _stored_file(uid, "_sentences.txt").read_text(encoding="utf-8")
    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may legitimately occur inside a sentence.
    records = [raw for raw in stored.split("\n") if raw]

    # ----- pagination logic -----
    total = len(records)
    total_pages = max(1, (total + PER_PAGE - 1) // PER_PAGE)
    page = request.args.get("page", 1, type=int)
    page = min(max(page, 1), total_pages)
    start = (page - 1) * PER_PAGE

    # Only the sentences on the requested page are highlighted.
    sentences = []
    for raw in records[start:start + PER_PAGE]:
        line_nr, _, sentence = raw.partition("\t")
        sentences.append({
            "line_nr": line_nr,
            "highlights": _highlight(sentence),
            "sentence": sentence,
        })

    prev_url = url_for("show_sentences", uid=uid, page=page - 1) if page > 1 else None
    next_url = url_for("show_sentences", uid=uid, page=page + 1) if page < total_pages else None
    # ---------------------------

    return render_template_string(
        TEMPLATE_SHOW,
        sentences=sentences,
        back_url=url_for("index"),
        page=page,
        total_pages=total_pages,
        prev_url=prev_url,
        next_url=next_url,
        total=total,
    )


# ------------------------------------------------------------------ templates
TEMPLATE_INDEX = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Tatoeba Kabyle Corpus Standardisation Checker</title>
  <style>
    .flash { color: #b00020; font-weight: bold; }
  </style>
</head>
<body>
  <h1>Tatoeba Kabyle Corpus Standardisation Checker</h1>

  <p>Upload Kabyle sentences from Tatoeba and spot letters/symbols that are not part of the official Kabyle alphabet (CLDR). Download a CSV or view highlighted sentences ready for correction.</p>

  {% with msgs = get_flashed_messages() %}
    {% if msgs %}
      <div class="flash">{{ msgs[0] }}</div>
    {% endif %}
  {% endwith %}

  <form method="post" enctype="multipart/form-data">
    <input type="file" name="file" required>
    <button type="submit">Analyse</button>
  </form>
</body>
</html>
"""

TEMPLATE_RESULT = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Tatoeba Kabyle Corpus Standardisation Checker</title>
</head>
<body>
  <h1>Tatoeba Kabyle Corpus Standardisation Checker</h1>

  <p>Analysed {{ total_lines }} lines. Found {{ sent_count }} sentence(s) with non-standard letters.</p>

  <h2>Result table</h2>

  <table border="1" cellpadding="4">
    <thead>
      <tr><th>Char</th><th>Codepoint</th><th>Count</th><th>Unicode name</th></tr>
    </thead>
    <tbody>
      {% for row in table %}
      <tr>
        <td>{{ row.char }}</td>
        <td>{{ row.code }}</td>
        <td>{{ row.count }}</td>
        <td>{{ row.name }}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>

  <p>
    <a href="{{ csv_url }}">⬇ Download CSV</a>
    <a href="{{ show_url }}">👁 Show affected sentences</a>
  </p>
</body>
</html>
"""

TEMPLATE_SHOW = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Tatoeba Kabyle Corpus Standardisation Checker</title>
  <style>
    .bad { color: red; font-weight: bold; }
  </style>
</head>
<body>
  <h1>Tatoeba Kabyle Corpus Standardisation Checker</h1>

  <p><a href="{{ back_url }}">← Back</a></p>

  <p>Showing {{ sentences|length }} of {{ total }} affected sentences — Page {{ page }} of {{ total_pages }}</p>

  <ol start="{{ (page - 1) * 50 + 1 }}">
    {% for s in sentences %}
    <li>
      {{ s.highlights|safe }}
      {# NOTE(review): the original link target was lost; a Tatoeba search for the sentence is assumed. #}
      (line {{ s.line_nr }})
      <a href="https://tatoeba.org/en/sentences/search?from=kab&amp;query={{ s.sentence|urlencode }}"
         target="_blank" rel="noopener">🔍</a>
    </li>
    {% endfor %}
  </ol>

  <p>
    {% if prev_url %}<a href="{{ prev_url }}">← Previous</a>{% endif %}
    {% if next_url %}<a href="{{ next_url }}">Next →</a>{% endif %}
  </p>
</body>
</html>
"""

# ------------------------------------------------------------------ entrypoint
if __name__ == "__main__":
    # 0.0.0.0:7860 is mandatory for Hugging Face Spaces
    app.run(host="0.0.0.0", port=7860, debug=False)