Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
Kabyle-letter checker – Hugging Face Space edition
Run locally: python app.py
On Spaces the container starts this file automatically.
"""
import html
import io
import re
import unicodedata
import uuid
from collections import Counter
from pathlib import Path

from flask import (
    Flask,
    abort,
    flash,
    redirect,
    render_template_string,
    request,
    send_file,
    url_for,
)
from tqdm import tqdm
# ------------------------------------------------------------------ constants
# Letters accepted as standard: the basic Latin letters a-z plus the extended
# letters on the second line. Everything is stored in lower case.
ALLOWED = set(
    (
        "a b c d e f g h i j k l m n o p q r s t u v w x y z "
        "č ḍ ǧ ḥ ɣ ṛ ṣ ṭ ɛ ẓ"
    ).split()
)

# Working directories (relative to the current working directory); both are
# created at import time when they do not exist yet.
UPLOAD_DIR = Path("uploads")
CSV_DIR = Path("csv")
for _directory in (UPLOAD_DIR, CSV_DIR):
    _directory.mkdir(exist_ok=True)

PER_PAGE = 50  # sentences per page for pagination
# ------------------------------------------------------------------ Flask app
app = Flask(__name__)
# The key is generated at import time, so it is different after every restart.
app.config["SECRET_KEY"] = uuid.uuid4().hex
# ------------------------------------------------------------------ helpers
def _is_ok(ch):
    """Return True when *ch* is a letter whose lower-case form is in ALLOWED.

    Non-letters (digits, punctuation, whitespace) yield False, so callers
    that only care about letters must test ``ch.isalpha()`` themselves.
    """
    if not ch.isalpha():
        return False
    return ch.lower() in ALLOWED
def analyse(path: Path):
    """
    Scan a UTF-8 text file line by line for letters outside ALLOWED.

    Args:
        path – file to read; every line is treated as one sentence

    Returns:
        counter   – Counter mapping each illegal letter to the number of
                    lines it occurs on (a letter counts once per line)
        sentences – list[dict], one entry per affected line, with keys
                    line_nr (1-based), sentence (trailing whitespace removed),
                    bad_chars (set), highlights (HTML)

    Raises:
        UnicodeDecodeError – if the file is not valid UTF-8
    """
    counter, sentences = Counter(), []
    with path.open(encoding="utf-8") as fh:
        for line_nr, raw in enumerate(tqdm(fh, unit=" lines", desc="Analysing"), 1):
            # only letters can be illegal; digits, punctuation and
            # whitespace are never reported
            bad = {ch for ch in raw if ch.isalpha() and not _is_ok(ch)}
            if not bad:
                continue

            text = raw.rstrip()
            # Escape every character before it goes into the markup: the
            # text comes from an uploaded file and must not be able to
            # inject tags of its own. Illegal letters get a red <span>.
            highlights = "".join(
                f'<span style="color:red">{html.escape(ch)}</span>'
                if ch in bad
                else html.escape(ch)
                for ch in text
            )
            sentences.append({
                "line_nr": line_nr,
                "sentence": text,
                "bad_chars": bad,
                "highlights": highlights,
            })
            # bad is a set, so each letter is counted once for this line
            counter.update(bad)
    return counter, sentences
# ------------------------------------------------------------------ routes
@app.route("/", methods=["GET", "POST"])
def index():
    """
    Show the upload form (GET) or analyse an uploaded file (POST).

    On POST the upload is saved as ``UPLOAD_DIR/<uid>.txt`` and analysed, and
    two files are written to CSV_DIR: ``<uid>.csv`` with the per-letter
    statistics and ``<uid>_sentences.txt`` with one affected sentence per
    line. The result page is then rendered with links to both.

    A missing file or a file that is not valid UTF-8 produces a flash
    message and a redirect back to the form.
    """
    if request.method != "POST":
        return render_template_string(TEMPLATE_INDEX)

    upload = request.files.get("file")
    if not upload or upload.filename == "":
        flash("No file selected")
        return redirect(request.url)

    uid = uuid.uuid4().hex
    fpath = UPLOAD_DIR / f"{uid}.txt"
    upload.save(fpath)

    try:
        counter, sentences = analyse(fpath)
    except UnicodeDecodeError:
        # Not a UTF-8 text file: drop it and tell the user instead of
        # failing with a server error.
        fpath.unlink(missing_ok=True)
        flash("The uploaded file is not valid UTF-8 text")
        return redirect(request.url)

    # One row per illegal letter, most frequent first; the same rows feed
    # both the HTML table and the CSV download.
    table_rows = [
        {"char": ch, "code": f"U+{ord(ch):04X}", "count": freq,
         "name": unicodedata.name(ch, "")}
        for ch, freq in counter.most_common()
    ]

    # CSV
    csv_lines = ["char,codepoint,count,unicode_name"]
    csv_lines.extend(
        f'{row["char"]},{row["code"]},{row["count"]},{row["name"]}'
        for row in table_rows
    )
    (CSV_DIR / f"{uid}.csv").write_text("\n".join(csv_lines) + "\n", encoding="utf-8")

    # store sentences list for /show/<uid>
    # (line number right-aligned in 6 columns, one space, then the sentence)
    (CSV_DIR / f"{uid}_sentences.txt").write_text(
        "\n".join(f'{s["line_nr"]:>6} {s["sentence"]}' for s in sentences),
        encoding="utf-8",
    )

    # count the lines with the file properly closed afterwards
    with fpath.open(encoding="utf-8") as fh:
        total_lines = sum(1 for _ in fh)

    return render_template_string(
        TEMPLATE_RESULT,
        table=table_rows,
        total_lines=total_lines,
        csv_url=url_for("download", uid=uid),
        show_url=url_for("show_sentences", uid=uid),
        sent_count=len(sentences),
    )
@app.route("/download/<uid>")
def download(uid):
    """
    Send the statistics CSV of analysis *uid* as a file download.

    Responds with 404 when *uid* is not a 32-digit lower-case hex string
    (the form produced by ``uuid.uuid4().hex``) or when no CSV exists for it.
    """
    if not re.fullmatch(r"[0-9a-f]{32}", uid):
        abort(404)
    # Resolve against the current working directory, which is where the
    # relative CSV_DIR path is created and written to.
    csv_path = (CSV_DIR / f"{uid}.csv").resolve()
    if not csv_path.is_file():
        abort(404)
    return send_file(csv_path, as_attachment=True,
                     download_name="unsupported_stats.csv")
@app.route("/show/<uid>")
def show_sentences(uid):
    """
    Display affected sentences with red highlights (paginated).

    Reads ``CSV_DIR/<uid>_sentences.txt`` (one record per line: line number,
    a space, the sentence), rebuilds the highlighted HTML and renders the
    page selected by the ``page`` query parameter (default 1, clamped to the
    valid range). Responds with 404 for a malformed or unknown *uid*.
    """
    listing = CSV_DIR / f"{uid}_sentences.txt"
    # uids are 32 lower-case hex digits (uuid4().hex); anything else cannot
    # name a stored result
    if not re.fullmatch(r"[0-9a-f]{32}", uid) or not listing.is_file():
        abort(404)

    # recover all sentences from disk
    # Records are joined with "\n" only, so split on exactly that; an empty
    # file means there are no records at all.
    text = listing.read_text(encoding="utf-8")
    all_sentences = []
    for raw in (text.split("\n") if text else []):
        # drop the left padding of the number, then cut at the single
        # separating space so the sentence keeps its first character
        line_nr, _, sentence = raw.lstrip().partition(" ")
        # Escape every character: the text originates from an upload and is
        # rendered unescaped by the template.
        highlighted = "".join(
            f'<span style="color:red;font-weight:bold">{html.escape(ch)}</span>'
            if ch.isalpha() and not _is_ok(ch)
            else html.escape(ch)
            for ch in sentence
        )
        all_sentences.append({
            "line_nr": line_nr,
            "highlights": highlighted,
            "sentence": sentence,
        })

    # ----- pagination logic -----
    total = len(all_sentences)
    total_pages = max(1, (total + PER_PAGE - 1) // PER_PAGE)
    # a missing or non-integer ?page= falls back to 1
    requested = request.args.get('page', 1, type=int)
    page = min(max(requested, 1), total_pages)
    start = (page - 1) * PER_PAGE
    sentences = all_sentences[start:start + PER_PAGE]
    prev_url = url_for('show_sentences', uid=uid, page=page-1) if page > 1 else None
    next_url = url_for('show_sentences', uid=uid, page=page+1) if page < total_pages else None
    # ---------------------------

    return render_template_string(
        TEMPLATE_SHOW,
        sentences=sentences,
        back_url=url_for("index"),
        page=page,
        total_pages=total_pages,
        prev_url=prev_url,
        next_url=next_url,
        total=total,
        # lets a template number the items without hard-coding the page size
        per_page=PER_PAGE,
    )
# ------------------------------------------------------------------ templates
# Upload form. Shows the first flashed message, if any, below the form.
TEMPLATE_INDEX = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
<div class="container py-5">
<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
<p class="lead mb-4">Upload Kabyle sentences from Tatoeba and spot letters/symbols that are <strong>not</strong> part of the official Kabyle alphabet (CLDR). Download a CSV or view highlighted sentences ready for correction.</p>
<form method="post" enctype="multipart/form-data" class="mb-4">
<div class="input-group">
<input type="file" name="file" class="form-control" required>
<button class="btn btn-primary" type="submit">Upload & analyse</button>
</div>
</form>
{% with msgs = get_flashed_messages() %}
{% if msgs %}<div class="alert alert-warning">{{ msgs[0] }}</div>{% endif %}
{% endwith %}
</div>
</body>
</html>
"""
# Result page. Context variables used below: total_lines, sent_count,
# table (rows with char / code / count / name), csv_url and show_url.
TEMPLATE_RESULT = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
<div class="container py-5">
<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
<div class="alert alert-info">Analysed {{ total_lines }} lines.
Found {{ sent_count }} sentence(s) with non-standard letters.</div>
<h2>Result table</h2>
<div class="table-responsive">
<table class="table table-sm table-striped align-middle">
<thead class="table-light">
<tr><th>Char</th><th>Codepoint</th><th>Count</th><th>Unicode name</th></tr>
</thead>
<tbody>
{% for row in table %}
<tr>
<td style="font-size:1.5rem">{{ row.char }}</td>
<td><code>{{ row.code }}</code></td>
<td>{{ row.count }}</td>
<td><small>{{ row.name }}</small></td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<a class="btn btn-outline-success" href="{{ csv_url }}">⬇ Download CSV</a>
<a class="btn btn-outline-primary ms-2" href="{{ show_url }}">👁 Show affected sentences</a>
</div>
</body>
</html>
"""
# Paginated list of affected sentences. Context variables used below:
# sentences (items with highlights / line_nr / sentence), total, page,
# total_pages, prev_url, next_url, back_url and, optionally, per_page.
# s.highlights is inserted with |safe, so it must already be escaped HTML.
# The list numbering continues across pages: it starts at
# (page - 1) * per_page + 1, falling back to 50 per page when per_page is
# not supplied.
TEMPLATE_SHOW = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
<div class="container py-5">
<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
<a class="btn btn-outline-secondary mb-3" href="{{ back_url }}">← Back</a>
<p class="text-muted">Showing {{ sentences|length }} of {{ total }} affected sentences — Page {{ page }} of {{ total_pages }}</p>
<ol start="{{ (page - 1) * (per_page | default(50)) + 1 }}">
{% for s in sentences %}
<li class="mb-2">
<pre style="display:inline; margin:0; padding:0; border:none; background:none;">{{ s.highlights|safe }}</pre>
<small class="text-muted">(line {{ s.line_nr }})</small>
<a class="btn btn-sm btn-outline-primary ms-2"
href="https://tatoeba.org/en/sentences/search?from=kab&query={{ s.sentence|urlencode }}&to="
target="_blank" title="Search on Tatoeba" aria-label="Search">🔍</a>
</li>
{% endfor %}
</ol>
<!-- Pagination -->
<nav aria-label="Sentence pages" class="mt-4">
<ul class="pagination justify-content-center">
<li class="page-item {% if not prev_url %}disabled{% endif %}">
<a class="page-link" href="{{ prev_url or '#' }}">← Previous</a>
</li>
<li class="page-item disabled">
<span class="page-link">Page {{ page }} of {{ total_pages }}</span>
</li>
<li class="page-item {% if not next_url %}disabled{% endif %}">
<a class="page-link" href="{{ next_url or '#' }}">Next →</a>
</li>
</ul>
</nav>
</div>
</body>
</html>
"""
# ------------------------------------------------------------------ entrypoint
if __name__ == "__main__":
    # Hugging Face Spaces requires the server to listen on 0.0.0.0:7860.
    run_options = {"host": "0.0.0.0", "port": 7860, "debug": False}
    app.run(**run_options)