kabyle-tatoeba-checker

Sleeping

App Files Files Community

kabyle-tatoeba-checker / app.py

boffire

Update app.py

b1e769a verified 4 days ago

raw

history blame contribute delete

10.2 kB

	#!/usr/bin/env python3
	"""
	Kabyle-letter checker – Hugging Face Space edition
	Run locally: python app.py
	On Spaces the container starts this file automatically.
	"""

	import html
	import io
	import re
	import unicodedata
	import uuid
	from collections import Counter
	from pathlib import Path

	from flask import (Flask, abort, flash, redirect, render_template_string,
	                   request, send_file, url_for)
	from tqdm import tqdm

	# ------------------------------------------------------------------ constants
	# Letters accepted as standard: the basic Latin alphabet plus the extended
	# letters listed on the second line. Entries are lower-case.
	ALLOWED = {
	    letter
	    for letter in (
	        "a b c d e f g h i j k l m n o p q r s t u v w x y z "
	        "č ḍ ǧ ḥ ɣ ṛ ṣ ṭ ɛ ẓ"
	    ).split()
	}

	# Working directories (relative paths), created at import time if missing.
	UPLOAD_DIR, CSV_DIR = Path("uploads"), Path("csv")
	UPLOAD_DIR.mkdir(exist_ok=True)
	CSV_DIR.mkdir(exist_ok=True)

	PER_PAGE = 50  # number of sentences shown on one results page

	# ------------------------------------------------------------------ Flask app
	app = Flask(__name__)
	# A new key is generated every time the module is loaded.
	app.config["SECRET_KEY"] = uuid.uuid4().hex

	# ------------------------------------------------------------------ helpers
	def _is_ok(ch): # quick reusable test
	return ch.isalpha() and (ch.lower() in ALLOWED)

	def analyse(path: Path):
	    """
	    Scan a UTF-8 text file line by line for letters rejected by ``_is_ok``.

	    Only alphabetic characters can be reported; digits, punctuation and
	    whitespace are ignored.

	    Args:
	        path: file to read, one sentence per line.

	    Returns:
	        counter   – Counter mapping each illegal char to the number of lines
	                    containing it (a char counts once per line)
	        sentences – list[dict], one per affected line, with keys
	                    line_nr (1-based), sentence (trailing whitespace removed),
	                    bad_chars (set), highlights (HTML-escaped text in which
	                    the illegal chars are wrapped in red <span> tags)

	    Raises:
	        UnicodeDecodeError: if the file content is not valid UTF-8.
	    """
	    counter, sentences = Counter(), []
	    with path.open(encoding="utf-8") as fh:
	        for line_nr, sentence in enumerate(tqdm(fh, unit=" lines", desc="Analysing"), 1):
	            bad = {ch for ch in sentence if ch.isalpha() and not _is_ok(ch)}
	            if not bad:
	                continue
	            text = sentence.rstrip()
	            # Escape every character so markup contained in the upload can
	            # never become live HTML; only the generated <span> is markup.
	            highlighted = "".join(
	                f'<span style="color:red">{html.escape(ch)}</span>'
	                if ch in bad
	                else html.escape(ch)
	                for ch in text
	            )
	            sentences.append({
	                "line_nr": line_nr,
	                "sentence": text,
	                "bad_chars": bad,
	                "highlights": highlighted,
	            })
	            counter.update(bad)  # +1 per distinct illegal char on this line
	    return counter, sentences

	# ------------------------------------------------------------------ routes
	@app.route("/", methods=["GET", "POST"])
	def index():
	    """
	    Serve the upload form (GET) and analyse an uploaded file (POST).

	    On POST the upload is saved as uploads/<uid>.txt and analysed. Two files
	    are then written to csv/: <uid>.csv (per-character statistics) and
	    <uid>_sentences.txt (affected sentences, read back by show_sentences).
	    A missing file or a file that is not valid UTF-8 flashes a message and
	    redirects back to the form.
	    """
	    if request.method != "POST":
	        return render_template_string(TEMPLATE_INDEX)

	    file = request.files.get("file")
	    if not file or file.filename == "":
	        flash("No file selected")
	        return redirect(request.url)

	    uid = uuid.uuid4().hex
	    fpath = UPLOAD_DIR / f"{uid}.txt"
	    file.save(fpath)

	    try:
	        counter, sentences = analyse(fpath)
	    except UnicodeDecodeError:
	        # Not a UTF-8 text file: drop the useless upload and tell the user
	        # instead of answering with an internal server error.
	        fpath.unlink()
	        flash("The uploaded file is not valid UTF-8 text")
	        return redirect(request.url)

	    # One row per illegal character, most frequent first; shared by the
	    # HTML table and the CSV export.
	    table_rows = [
	        {"char": ch, "code": f"U+{ord(ch):04X}", "count": freq,
	         "name": unicodedata.name(ch, "")}
	        for ch, freq in counter.most_common()
	    ]

	    # CSV
	    csv_lines = ["char,codepoint,count,unicode_name\n"]
	    csv_lines.extend(
	        f'{row["char"]},{row["code"]},{row["count"]},{row["name"]}\n'
	        for row in table_rows
	    )
	    (CSV_DIR / f"{uid}.csv").write_text("".join(csv_lines), encoding="utf-8")

	    # store sentences list for /show/<uid>
	    (CSV_DIR / f"{uid}_sentences.txt").write_text(
	        "\n".join(f'{s["line_nr"]:>6} {s["sentence"]}' for s in sentences),
	        encoding="utf-8",
	    )

	    # Count lines inside a context manager so the handle is closed.
	    with fpath.open(encoding="utf-8") as fh:
	        total_lines = sum(1 for _ in fh)

	    return render_template_string(
	        TEMPLATE_RESULT,
	        table=table_rows,
	        total_lines=total_lines,
	        csv_url=url_for("download", uid=uid),
	        show_url=url_for("show_sentences", uid=uid),
	        sent_count=len(sentences),
	    )

	@app.route("/csv/<uid>")
	def download(uid):
	    """Send the statistics CSV written for upload *uid* as an attachment.

	    Responds with 404 when *uid* is not a 32-character lower-case hex id
	    (the form produced by ``uuid.uuid4().hex``) or when no CSV exists for it.
	    """
	    csv_path = CSV_DIR / f"{uid}.csv"
	    if not re.fullmatch(r"[0-9a-f]{32}", uid) or not csv_path.is_file():
	        abort(404)
	    # Pass an absolute path so the file found by is_file() above is the one
	    # that gets sent, however send_file resolves relative paths.
	    return send_file(csv_path.resolve(), as_attachment=True,
	                     download_name="unsupported_stats.csv")

	@app.route("/show/<uid>")
	def show_sentences(uid):
	    """Display affected sentences with red highlights (paginated).

	    Reads csv/<uid>_sentences.txt, where each line is a right-aligned line
	    number, one space, and the sentence. The page is chosen with the
	    ``page`` query parameter (1-based, clamped to the valid range).
	    Responds with 404 when *uid* is not a 32-character lower-case hex id or
	    when no sentence file exists for it.
	    """
	    listing = CSV_DIR / f"{uid}_sentences.txt"
	    if not re.fullmatch(r"[0-9a-f]{32}", uid) or not listing.is_file():
	        abort(404)

	    # recover all sentences from disk
	    all_sentences = []
	    # split("\n"), not splitlines(): records are separated by "\n" only, and
	    # splitlines() would also break on characters such as U+2028 or U+0085
	    # that may occur inside a sentence.
	    for raw in listing.read_text(encoding="utf-8").split("\n"):
	        if not raw:
	            continue
	        # Drop the alignment padding, then cut at the first space. This keeps
	        # the whole sentence and works for line numbers of any width.
	        line_nr, _, sentence = raw.lstrip().partition(" ")
	        # Escape every character: the template renders this value with
	        # |safe, so only the generated <span> may be real markup.
	        highlighted = "".join(
	            f'<span style="color:red;font-weight:bold">{html.escape(ch)}</span>'
	            if ch.isalpha() and not _is_ok(ch)
	            else html.escape(ch)
	            for ch in sentence
	        )
	        all_sentences.append({
	            "line_nr": line_nr,
	            "highlights": highlighted,
	            "sentence": sentence,
	        })

	    # ----- pagination logic -----
	    total = len(all_sentences)
	    total_pages = (total + PER_PAGE - 1) // PER_PAGE if total else 1
	    # clamp the requested page into 1..total_pages
	    page = min(max(request.args.get('page', 1, type=int), 1), total_pages)

	    start = (page - 1) * PER_PAGE
	    sentences = all_sentences[start:start + PER_PAGE]

	    prev_url = url_for('show_sentences', uid=uid, page=page-1) if page > 1 else None
	    next_url = url_for('show_sentences', uid=uid, page=page+1) if page < total_pages else None
	    # ---------------------------

	    return render_template_string(
	        TEMPLATE_SHOW,
	        sentences=sentences,
	        back_url=url_for("index"),
	        page=page,
	        total_pages=total_pages,
	        prev_url=prev_url,
	        next_url=next_url,
	        total=total,
	    )

	# ------------------------------------------------------------------ templates
	# Landing page: a multipart upload form with a single "file" field, followed
	# by the first flashed message (if any) in a warning alert.
	TEMPLATE_INDEX = """
	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
	<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
	</head>
	<body class="bg-light">
	<div class="container py-5">
	<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
	<p class="lead mb-4">Upload Kabyle sentences from Tatoeba and spot letters/symbols that are <strong>not</strong> part of the official Kabyle alphabet (CLDR). Download a CSV or view highlighted sentences ready for correction.</p>

	<form method="post" enctype="multipart/form-data" class="mb-4">
	<div class="input-group">
	<input type="file" name="file" class="form-control" required>
	<button class="btn btn-primary" type="submit">Upload & analyse</button>
	</div>
	</form>

	{% with msgs = get_flashed_messages() %}
	{% if msgs %}<div class="alert alert-warning">{{ msgs[0] }}</div>{% endif %}
	{% endwith %}
	</div>
	</body>
	</html>
	"""

	# Result page shown after an upload. Template variables:
	#   total_lines – number of lines in the uploaded file
	#   sent_count  – number of sentences containing non-standard letters
	#   table       – rows with the keys char, code, count and name
	#   csv_url     – link to the CSV download
	#   show_url    – link to the highlighted-sentences view
	TEMPLATE_RESULT = """
	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
	<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
	</head>
	<body class="bg-light">
	<div class="container py-5">
	<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>

	<div class="alert alert-info">Analysed {{ total_lines }} lines.
	Found {{ sent_count }} sentence(s) with non-standard letters.</div>

	<h2>Result table</h2>
	<div class="table-responsive">
	<table class="table table-sm table-striped align-middle">
	<thead class="table-light">
	<tr><th>Char</th><th>Codepoint</th><th>Count</th><th>Unicode name</th></tr>
	</thead>
	<tbody>
	{% for row in table %}
	<tr>
	<td style="font-size:1.5rem">{{ row.char }}</td>
	<td><code>{{ row.code }}</code></td>
	<td>{{ row.count }}</td>
	<td><small>{{ row.name }}</small></td>
	</tr>
	{% endfor %}
	</tbody>
	</table>
	</div>

	<a class="btn btn-outline-success" href="{{ csv_url }}">⬇ Download CSV</a>
	<a class="btn btn-outline-primary ms-2" href="{{ show_url }}">👁 Show affected sentences</a>
	</div>
	</body>
	</html>
	"""

	# Paginated list of affected sentences. Template variables: sentences (items
	# with highlights, sentence and line_nr), total, page, total_pages, back_url,
	# prev_url and next_url. s.highlights is rendered with |safe, so it must
	# already be HTML-escaped by the caller.
	# NOTE: the 50 in <ol start=...> has to stay equal to PER_PAGE.
	TEMPLATE_SHOW = """
	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<title>Tatoeba Kabyle Corpus Standardisation Checker</title>
	<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
	</head>
	<body class="bg-light">
	<div class="container py-5">
	<h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
	<a class="btn btn-outline-secondary mb-3" href="{{ back_url }}">← Back</a>

	<p class="text-muted">Showing {{ sentences|length }} of {{ total }} affected sentences — Page {{ page }} of {{ total_pages }}</p>

	<ol start="{{ (page - 1) * 50 + 1 }}">
	{% for s in sentences %}
	<li class="mb-2">
	<pre style="display:inline; margin:0; padding:0; border:none; background:none;">{{ s.highlights|safe }}</pre>
	<small class="text-muted">(line {{ s.line_nr }})</small>
	<a class="btn btn-sm btn-outline-primary ms-2"
	href="https://tatoeba.org/en/sentences/search?from=kab&query={{ s.sentence|urlencode }}&to="
	target="_blank" rel="noopener noreferrer" title="Search on Tatoeba" aria-label="Search">🔍</a>
	</li>
	{% endfor %}
	</ol>

	<!-- Pagination -->
	<nav aria-label="Sentence pages" class="mt-4">
	<ul class="pagination justify-content-center">
	<li class="page-item {% if not prev_url %}disabled{% endif %}">
	<a class="page-link" href="{{ prev_url or '#' }}">← Previous</a>
	</li>
	<li class="page-item disabled">
	<span class="page-link">Page {{ page }} of {{ total_pages }}</span>
	</li>
	<li class="page-item {% if not next_url %}disabled{% endif %}">
	<a class="page-link" href="{{ next_url or '#' }}">Next →</a>
	</li>
	</ul>
	</nav>
	</div>
	</body>
	</html>
	"""

	# ------------------------------------------------------------------ entrypoint
	if __name__ == "__main__":
	    # Hugging Face Spaces requires the server to listen on 0.0.0.0:7860.
	    app.run(debug=False, port=7860, host="0.0.0.0")