boffire committed on
Commit
5c3dba5
·
verified ·
1 Parent(s): 93daa90

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +241 -0
app.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Kabyle-letter checker – Hugging Face Space edition
4
+ Run locally: python app.py
5
+ On Spaces the container starts this file automatically.
6
+ """
7
+
8
import html
import io
import re
import unicodedata
import uuid
from collections import Counter
from pathlib import Path

from flask import (Flask, abort, flash, redirect, render_template_string,
                   request, send_file, url_for)
from tqdm import tqdm
15
+
16
+ # ------------------------------------------------------------------ constants
17
# Lowercase letters accepted by the checker: the 26 basic Latin letters plus
# the ten extended letters used in Kabyle.  Membership is tested against the
# lowercased character, so only lowercase forms are listed.  Every entry must
# be exactly one code point: a mis-encoded (mojibake) entry turns into a
# multi-character token that can never equal a single input character, which
# silently makes that letter "illegal".
ALLOWED = set(
    "a b c d e f g h i j k l m n o p q r s t u v w x y z "
    "č ḍ ǧ ḥ ɣ ṛ ṣ ṭ ɛ ẓ".split()
)
21
# Working directories (relative paths): raw uploads and the generated
# per-upload reports.  Both are created at import time if they are missing.
UPLOAD_DIR, CSV_DIR = Path("uploads"), Path("csv")
CSV_DIR.mkdir(exist_ok=True)
UPLOAD_DIR.mkdir(exist_ok=True)
25
+
26
+ # ------------------------------------------------------------------ Flask app
27
app = Flask(__name__)
# A fresh signing key is generated on every start, so anything stored in the
# session (the flashed messages) does not survive a restart.
app.config["SECRET_KEY"] = uuid.uuid4().hex
29
+
30
+ # ------------------------------------------------------------------ helpers
31
def _is_ok(ch):
    """Return True when *ch* is alphabetic and its lowercase form is in ALLOWED."""
    if not ch.isalpha():
        return False
    return ch.lower() in ALLOWED
33
+
34
def analyse(path: Path):
    """
    Scan a UTF-8 text file, one sentence per line, for letters outside ALLOWED.

    Args:
        path: file to read.

    Returns:
        counter   – Counter mapping each illegal letter to the number of lines
                    it occurs in (a letter is counted once per line)
        sentences – list[dict], one entry per line that contains at least one
                    illegal letter, with keys
                    line_nr (1-based), sentence (trailing whitespace removed),
                    bad_chars (set), highlights (HTML-escaped sentence with the
                    illegal letters wrapped in red <span> elements)

    Raises:
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    def _mark(match):
        # Escape every character so text taken from the file can never be
        # interpreted as markup; only the <span> wrapper below is real HTML.
        ch = match.group(0)
        safe = html.escape(ch)
        if ch.isalpha() and not _is_ok(ch):
            return f'<span style="color:red">{safe}</span>'
        return safe

    counter, sentences = Counter(), []
    with path.open(encoding="utf-8") as fh:
        for line_nr, sentence in enumerate(tqdm(fh, unit=" lines", desc="Analysing"), 1):
            bad = {ch for ch in sentence if ch.isalpha() and not _is_ok(ch)}
            if not bad:
                continue
            sentences.append({
                "line_nr": line_nr,
                "sentence": sentence.rstrip(),
                "bad_chars": bad,
                "highlights": re.sub(r"\S", _mark, sentence).rstrip(),
            })
            # `bad` is a set, so each letter adds at most 1 per line.
            counter.update(bad)
    return counter, sentences
63
+
64
+ # ------------------------------------------------------------------ routes
65
@app.route("/", methods=["GET", "POST"])
def index():
    """
    Serve the upload form (GET) or analyse an uploaded file (POST).

    On POST the upload is stored as UPLOAD_DIR/<uid>.txt and analysed, and two
    report files are written for the follow-up routes:
      * CSV_DIR/<uid>.csv            – per-letter statistics
      * CSV_DIR/<uid>_sentences.txt  – the affected sentences, one per line

    Returns:
        The result page on success; a redirect back to the form, with a
        flashed message, when no file was selected or the upload is not valid
        UTF-8 text.
    """
    if request.method != "POST":
        return render_template_string(TEMPLATE_INDEX)

    file = request.files.get("file")
    if not file or file.filename == "":
        flash("No file selected")
        return redirect(request.url)

    uid = uuid.uuid4().hex
    fpath = UPLOAD_DIR / f"{uid}.txt"
    file.save(fpath)

    try:
        counter, sentences = analyse(fpath)
    except UnicodeDecodeError:
        # A binary or wrongly encoded upload is a user error, not a server
        # error: drop the useless file and tell the user why.
        fpath.unlink()
        flash("The uploaded file is not valid UTF-8 text")
        return redirect(request.url)

    # One row per illegal letter, most frequent first; shared by the CSV file
    # and the HTML table so both always agree.
    table_rows = [
        {"char": ch, "code": f"U+{ord(ch):04X}", "count": freq,
         "name": unicodedata.name(ch, "")}
        for ch, freq in counter.most_common()
    ]

    # CSV
    csv_lines = ["char,codepoint,count,unicode_name\n"]
    csv_lines.extend(
        f'{row["char"]},{row["code"]},{row["count"]},{row["name"]}\n'
        for row in table_rows
    )
    (CSV_DIR / f"{uid}.csv").write_text("".join(csv_lines), encoding="utf-8")

    # store sentences list for /show/<uid>: the line number right-aligned in
    # six columns, one space, then the sentence
    (CSV_DIR / f"{uid}_sentences.txt").write_text(
        "\n".join(f'{s["line_nr"]:>6} {s["sentence"]}' for s in sentences),
        encoding="utf-8",
    )

    # Close the handle deterministically instead of leaking it until GC.
    with fpath.open(encoding="utf-8") as fh:
        total_lines = sum(1 for _ in fh)

    return render_template_string(
        TEMPLATE_RESULT,
        table=table_rows,
        total_lines=total_lines,
        csv_url=url_for("download", uid=uid),
        show_url=url_for("show_sentences", uid=uid),
        sent_count=len(sentences),
    )
107
+
108
@app.route("/csv/<uid>")
def download(uid):
    """
    Send the statistics CSV of a previous upload as a file download.

    Args:
        uid: identifier of the upload, taken from the URL.

    Returns:
        The CSV as an attachment named "unsupported_stats.csv".  Responds with
        404 when *uid* is not a 32-digit lowercase hex string (the form
        produced by uuid4().hex) or no report exists for it.
    """
    csv_path = CSV_DIR / f"{uid}.csv"
    if re.fullmatch(r"[0-9a-f]{32}", uid) is None or not csv_path.is_file():
        abort(404)
    # Send an absolute path so the file served is the one just checked,
    # independent of how relative paths are resolved by send_file.
    return send_file(csv_path.resolve(), as_attachment=True,
                     download_name="unsupported_stats.csv")
112
+
113
@app.route("/show/<uid>")
def show_sentences(uid):
    """
    Display affected sentences with red highlights.

    Reads CSV_DIR/<uid>_sentences.txt, where each record is a right-aligned
    line number, a single space, and the sentence.

    Args:
        uid: identifier of the upload, taken from the URL.

    Returns:
        The rendered sentence list.  Responds with 404 when *uid* is not a
        32-digit lowercase hex string (the form produced by uuid4().hex) or no
        sentence list exists for it.
    """
    listing = CSV_DIR / f"{uid}_sentences.txt"
    if re.fullmatch(r"[0-9a-f]{32}", uid) is None or not listing.is_file():
        abort(404)

    def _mark(match):
        # The template renders `highlights` with |safe, so every character
        # coming from the uploaded text has to be escaped here; only the
        # <span> wrapper is real markup.
        ch = match.group(0)
        safe = html.escape(ch)
        if ch.isalpha() and not _is_ok(ch):
            return f'<span style="color:red;font-weight:bold">{safe}</span>'
        return safe

    sentences = []
    # Records are joined with "\n" only; str.splitlines() would also split on
    # characters such as U+2028 that may legitimately occur inside a sentence.
    for raw in listing.read_text(encoding="utf-8").split("\n"):
        if not raw:  # empty listing, nothing to show for this record
            continue
        # Drop the padding, then cut at the first space.  Unlike fixed-width
        # slicing this keeps the first character of the sentence and works for
        # line numbers of any width.
        line_nr, _, sentence = raw.lstrip(" ").partition(" ")
        sentences.append({
            "line_nr": line_nr,
            "highlights": re.sub(r"\S", _mark, sentence),
            "sentence": sentence,
        })
    return render_template_string(
        TEMPLATE_SHOW,
        sentences=sentences,
        back_url=url_for("index"),
    )
138
+
139
+ # ------------------------------------------------------------------ templates
140
# Landing page: the upload form, followed by the first flashed message (if any).
TEMPLATE_INDEX = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Tatoeba Kabyle Corpus Standardisation Checker</title>
  <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
  <div class="container py-5">
    <h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
    <p class="lead mb-4">Upload Kabyle sentences from Tatoeba and spot letters/symbols that are <strong>not</strong> part of the official Kabyle alphabet (CLDR). Download a CSV or view highlighted sentences ready for correction.</p>

    <form method="post" enctype="multipart/form-data" class="mb-4">
      <div class="input-group">
        <input type="file" name="file" class="form-control" required>
        <button class="btn btn-primary" type="submit">Upload & analyse</button>
      </div>
    </form>

    {% with msgs = get_flashed_messages() %}
      {% if msgs %}<div class="alert alert-warning">{{ msgs[0] }}</div>{% endif %}
    {% endwith %}
  </div>
</body>
</html>
"""
167
+
168
# Result page.  Context variables: total_lines, sent_count, table (rows with
# char / code / count / name), csv_url and show_url.
TEMPLATE_RESULT = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Tatoeba Kabyle Corpus Standardisation Checker</title>
  <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
  <div class="container py-5">
    <h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>

    <div class="alert alert-info">Analysed {{ total_lines }} lines.
      Found {{ sent_count }} sentence(s) with non-standard letters.</div>

    <h2>Result table</h2>
    <div class="table-responsive">
      <table class="table table-sm table-striped align-middle">
        <thead class="table-light">
          <tr><th>Char</th><th>Codepoint</th><th>Count</th><th>Unicode name</th></tr>
        </thead>
        <tbody>
          {% for row in table %}
          <tr>
            <td style="font-size:1.5rem">{{ row.char }}</td>
            <td><code>{{ row.code }}</code></td>
            <td>{{ row.count }}</td>
            <td><small>{{ row.name }}</small></td>
          </tr>
          {% endfor %}
        </tbody>
      </table>
    </div>

    <a class="btn btn-outline-success" href="{{ csv_url }}">⬇ Download CSV</a>
    <a class="btn btn-outline-primary ms-2" href="{{ show_url }}">👁 Show affected sentences</a>
  </div>
</body>
</html>
"""
208
+
209
# Sentence list page.  Context variables: back_url and sentences (items with
# highlights / line_nr / sentence).  `highlights` is emitted with |safe, so
# the view must pass already-escaped HTML.
TEMPLATE_SHOW = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Tatoeba Kabyle Corpus Standardisation Checker</title>
  <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body class="bg-light">
  <div class="container py-5">
    <h1 class="mb-3">Tatoeba Kabyle Corpus Standardisation Checker</h1>
    <a class="btn btn-outline-secondary mb-3" href="{{ back_url }}">← Back</a>

    <ol>
      {% for s in sentences %}
      <li class="mb-2">
        <pre style="display:inline; margin:0; padding:0; border:none; background:none;">{{ s.highlights|safe }}</pre>
        <small class="text-muted">(line {{ s.line_nr }})</small>
        <a class="btn btn-sm btn-outline-primary ms-2"
           href="https://tatoeba.org/en/sentences/search?from=kab&query={{ s.sentence|urlencode }}&to="
           target="_blank" rel="noopener" title="Search on Tatoeba" aria-label="Search">🔍</a>
      </li>
      {% endfor %}
    </ol>
  </div>
</body>
</html>
"""
237
+
238
+ # ------------------------------------------------------------------ entrypoint
239
if __name__ == "__main__":
    # Hugging Face Spaces requires the server to listen on 0.0.0.0:7860.
    app.run(debug=False, port=7860, host="0.0.0.0")