| |
| |
|
|
| import os |
| import subprocess |
| import gradio as gr |
|
|
| |
# Set W&B's WANDB_MODE switch to "disabled" for this process.
os.environ["WANDB_MODE"] = "disabled"

# Resolve the MeCab executable: an explicit MECAB_BIN environment variable
# wins; otherwise use /usr/bin/mecab when that path exists, else the bare
# command name "mecab".
_default_mecab = "mecab"
if os.path.exists("/usr/bin/mecab"):
    _default_mecab = "/usr/bin/mecab"
MECAB_BIN = os.environ.get("MECAB_BIN", _default_mecab)
# Write the resolved value back so later os.getenv("MECAB_BIN") reads see it.
os.environ["MECAB_BIN"] = MECAB_BIN

# Model cache filled in by _ensure_model(); both stay None until a load succeeds.
_model = None
_exp_info = None
|
|
|
|
def _ensure_model():
    """Load the model on first use and cache it in the module globals.

    Does nothing when ``_model`` is already set. Otherwise calls
    ``infer.load_model()`` and unpacks its two-item result into ``_model``
    and ``_exp_info``.

    Raises:
        RuntimeError: If ``load_model()`` returns ``None``.
    """
    global _model, _exp_info
    if _model is not None:
        return

    # Deferred import: ``infer`` is only touched once a load is actually needed.
    from infer import load_model

    loaded = load_model()
    if loaded is None:
        raise RuntimeError(
            "Model could not be loaded. Ensure sample_model/ exists with config.yaml and model.pt."
        )
    _model, _exp_info = loaded
|
|
|
|
| def _to_mecab_lines(results, optimal_morphemes=None) -> str: |
| |
| def mecab_features(m): |
| pos = m.get("pos", "*") |
| pos1 = m.get("pos_detail1", "*") |
| pos2 = m.get("pos_detail2", "*") |
| ctype = m.get("inflection_type", "*") |
| cform = m.get("inflection_form", "*") |
| base = m.get("base_form", m.get("lemma", "*")) or "*" |
| |
| reading = m.get("reading", "*") or "*" |
| return f"{pos},{pos1},{pos2},{ctype},{cform},{base},{reading}" |
|
|
| items = ( |
| optimal_morphemes |
| if optimal_morphemes |
| else [ |
| { |
| "surface": r.get("surface", ""), |
| "pos": r.get("pos", "*"), |
| "pos_detail1": "*", |
| "pos_detail2": "*", |
| "inflection_type": "*", |
| "inflection_form": "*", |
| "base_form": r.get("surface", ""), |
| "reading": r.get("reading", "*"), |
| } |
| for r in results |
| ] |
| ) |
|
|
| lines = [f"{m.get('surface','')}\t{mecab_features(m)}" for m in items] |
| lines.append("EOS") |
| return "\n".join(lines) |
|
|
|
|
def mecab_plain(text: str) -> str:
    """Run the system MeCab binary on *text* and return a trimmed parse.

    The binary comes from the ``MECAB_BIN`` environment variable, falling
    back to ``MeCabAnalyzer().mecab_bin``. When the analyzer's
    ``jumandic_path`` is an existing directory it is passed with ``-d``.

    Each ``surface<TAB>features`` line of MeCab's stdout is reduced to its
    first six comma-separated feature columns (padded with ``*`` when there
    are fewer). Blank lines and ``EOS`` lines become ``EOS``, other lines are
    kept verbatim, and the result always ends with ``EOS``.

    Args:
        text: Text piped to MeCab on stdin.

    Returns:
        The trimmed parse on success. On a non-zero exit status, MeCab's
        stdout and stderr combined (or ``mecab error rc=N`` when both are
        empty). On any exception, a human-readable error message; this
        function never raises.
    """
    try:
        from mecari.analyzers.mecab import MeCabAnalyzer

        analyzer = MeCabAnalyzer()
        mecab_bin = os.getenv("MECAB_BIN", analyzer.mecab_bin)
        args = [mecab_bin]
        if isinstance(analyzer.jumandic_path, str) and os.path.isdir(analyzer.jumandic_path):
            args += ["-d", analyzer.jumandic_path]
        p = subprocess.run(args, input=text, text=True, capture_output=True)
        stdout = p.stdout or ""
        if p.returncode != 0:
            # Only a failed run reports stderr; it is the diagnostic there.
            combined = stdout + ("\n" + p.stderr if p.stderr else "")
            return combined.strip() or f"mecab error rc={p.returncode}"

        # Parse stdout alone: mixing stderr in would turn warnings into bogus
        # token lines and the separating newline into a spurious EOS.
        lines = []
        for line in stdout.splitlines():
            if not line or line.strip() == "EOS":
                lines.append("EOS")
                continue
            if "\t" in line:
                surface, feats = line.split("\t", 1)
                parts = [s.strip() for s in feats.split(",")]
                trimmed = parts[:6]
                while len(trimmed) < 6:
                    trimmed.append("*")
                lines.append(f"{surface}\t{','.join(trimmed)}")
            else:
                lines.append(line)

        # Guarantee a terminating EOS even when MeCab printed none.
        if not lines or lines[-1] != "EOS":
            lines.append("EOS")
        return "\n".join(lines)
    except FileNotFoundError:
        return "MeCabバイナリが見つかりません(MECAB_BINやpackages.txtを確認)。"
    except Exception as e:
        return f"mecab実行時エラー: {e}"
|
|
|
|
def analyze(text: str):
    """Analyze *text* with Mecari and with system MeCab for side-by-side display.

    Args:
        text: Raw user input; surrounding whitespace is stripped before use.

    Returns:
        A ``(mecari_output, mecab_output)`` pair of strings. Blank or missing
        input yields two empty strings. Failures are not raised: they are
        reported as message text in the first element, with an empty second
        element (except a falsy prediction, which still shows MeCab's output).
    """
    if not (text and text.strip()):
        return "", ""

    try:
        _ensure_model()
        from infer import predict_morphemes_from_text

        cleaned = text.strip()
        prediction = predict_morphemes_from_text(cleaned, _model, _exp_info, silent=True)
        if not prediction:
            return "推論に失敗しました。", mecab_plain(cleaned)

        results, optimal_morphemes = prediction
        mecari_out = _to_mecab_lines(results, optimal_morphemes)
        return mecari_out, mecab_plain(cleaned)
    except FileNotFoundError:
        hint = (
            "MeCabが見つかりません。Spaceのpackages.txtに 'mecab' と 'mecab-jumandic-utf8' を含めてビルドし直すか、\n"
            "変数 MECAB_BIN=/usr/bin/mecab を設定してください。"
        )
        return hint, ""
    except Exception as e:
        import traceback

        # Show the full traceback in the first output box.
        return f"エラー: {e}\n\n{traceback.format_exc()}", ""
|
|
|
|
# Global stylesheet passed to gr.Blocks(css=...): applies a system sans-serif
# font stack (with !important) to the page body, the Gradio container, and
# text inputs, textareas and buttons.
FONT_CSS = """
/* Prefer common system fonts for Latin text */
body, .gradio-container, .prose, textarea, input, button,
.gr-text-input input, .gr-text-input textarea, .gr-textbox textarea {
font-family: system-ui, -apple-system, 'Segoe UI', Roboto, 'Noto Sans',
'Helvetica Neue', Arial, 'Apple Color Emoji', 'Segoe UI Emoji',
sans-serif !important;
}
"""
|
|
# Build the demo page: header, input box + button, two result boxes, two
# groups of clickable examples, and the click handler. Statement order is the
# on-page order, so it is kept as in the original.
with gr.Blocks(theme=gr.themes.Soft(), css=FONT_CSS) as demo:
    gr.Markdown(
        """
        # Mecari Morpheme Analyzer

        形態素解析器"Mecari"のデモです。Googleが発表した手法の非公式再現実装です。GitHub: https://github.com/zbller/Mecari
        """
    )

    with gr.Row():
        inp = gr.Textbox(
            label="テキスト入力",
            value="外国人参政権",
            placeholder="とうきょうに行った",
            lines=3,
        )
    # NOTE(review): the source's indentation was lost; the button is placed
    # directly after the input row here -- confirm it was not inside that row.
    btn = gr.Button("解析する")
    with gr.Row():
        out_mecari = gr.Textbox(label="Mecari", lines=10)
        out_mecab = gr.Textbox(label="MeCab(Jumandic)", lines=10)

    # Two example groups wired identically; clicking an example runs analyze().
    _example_groups = (
        (
            "Good examples",
            [
                ["とうきょうに行った"],
                ["吾輩わがはいは猫である。名前はまだ無い。"],
            ],
        ),
        (
            "Bad examples",
            [
                ["すもももももももものうち"],
                ["こちら葛飾区亀有公園前派出所"],
            ],
        ),
    )
    for _group_label, _samples in _example_groups:
        gr.Examples(
            examples=_samples,
            inputs=inp,
            outputs=[out_mecari, out_mecab],
            fn=analyze,
            label=_group_label,
            run_on_click=True,
            cache_examples=False,
        )

    btn.click(fn=analyze, inputs=inp, outputs=[out_mecari, out_mecab])
|
|
| |
| def _warmup(): |
| try: |
| _ensure_model() |
| except Exception: |
| pass |
|
|
| _warmup() |
|
|
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    _port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=_port)
|
|