Spaces:
Running
Running
| """ | |
| Native Code Submission Portal — Language, Decoded / Expedition Tiny Aya | |
| ======================================================================= | |
| A Gradio app for collecting natively-written Legesher code from | |
| native speakers of Chinese, Spanish, and Urdu. | |
| Contributors write Python in their native language using Legesher, | |
| run it to verify it works, then submit it for research use. | |
| HF Space secrets required: | |
| HF_TOKEN — write token for pushing submissions to the dataset repo | |
| HF_DATASET_ID — target dataset repo (e.g. "legesher-research/native-code-submissions") | |
| """ | |
| import json | |
| import logging | |
| import os | |
| import subprocess | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| # --------------------------------------------------------------------------- | |
| # Install bundled Legesher wheels (not on PyPI) — runs on HF Spaces startup | |
| # --------------------------------------------------------------------------- | |
def _install_legesher_wheels():
    """Pip-install every ``*.whl`` found in the ``wheels`` folder beside this file.

    Does nothing when the folder is missing or holds no wheels. A failed
    install is only reported on stdout (last 300 characters of pip's stderr);
    it never raises.
    """
    bundle_dir = Path(__file__).parent / "wheels"
    wheels = sorted(bundle_dir.glob("*.whl")) if bundle_dir.exists() else []
    if not wheels:
        return

    print(f"Installing {len(wheels)} Legesher wheel(s)...")
    pip_command = [sys.executable, "-m", "pip", "install", "--quiet", *map(str, wheels)]
    result = subprocess.run(pip_command, capture_output=True, text=True)
    if result.returncode != 0:
        print("Wheel install warning:", result.stderr[-300:])
        return
    print("Legesher wheels installed.")


# Runs at import time so the wheels are in place before Legesher is imported.
_install_legesher_wheels()
| import uuid | |
| from datetime import datetime, timezone | |
| import gradio as gr | |
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Legesher setup
# ---------------------------------------------------------------------------
# Legesher is optional: when it cannot be imported the app still starts, with
# LEGESHER_OK recording that running/translating code is unavailable.
try:
    from legesher_core import TokenTranslator
except ImportError as e:
    # Bind the name even on failure so that later references to it (such as
    # annotations evaluated at import time) cannot raise NameError and defeat
    # this graceful fallback.
    TokenTranslator = None
    LEGESHER_OK = False
    logger.warning("Legesher not available: %s. Run/translate disabled.", e)
else:
    LEGESHER_OK = True
    logger.info("Legesher loaded successfully.")
# Cache translators so we don't reload on every button click.
# The annotations below are strings on purpose: TokenTranslator comes from an
# optional import, and an eagerly evaluated annotation would raise NameError
# at import time whenever that import failed.
_translators: "dict[str, TokenTranslator]" = {}


def _get_native_to_en(lang: str) -> "TokenTranslator":
    """Return the cached translator for ``lang``, creating it on first use.

    The translator is obtained by loading the language pack for ``lang`` and
    reversing it; the result is stored in ``_translators`` so later calls for
    the same language reuse it.

    Args:
        lang: Language code passed to ``TokenTranslator.from_language_pack``.

    Returns:
        The reversed translator for ``lang``.

    Note:
        Requires Legesher to have been imported successfully; callers are
        expected to check availability before calling.
    """
    if lang not in _translators:
        en_to_native = TokenTranslator.from_language_pack(lang)
        _translators[lang] = en_to_native.reverse()
    return _translators[lang]
| # --------------------------------------------------------------------------- | |
| # Exercises | |
| # --------------------------------------------------------------------------- | |
# Exercise catalogue stored next to this module; parsed once at import time.
EXERCISES_PATH = Path(__file__).parent / "exercises.json"
_ALL_EXERCISES: dict = json.loads(EXERCISES_PATH.read_text(encoding="utf-8"))
# Keys used to look up each tier inside the exercise catalogue.
TIER_KEYS = [f"tier{number}" for number in (1, 2, 3)]

# Radio-button labels, index-aligned with TIER_KEYS.
TIER_LABELS = [
    "Tier 1 — Basic (10–20 lines)",
    "Tier 2 — Applied (20–50 lines)",
    "Tier 3 — Domain (50–100 lines)",
]

# Language code -> label shown in the language dropdown.
LANGUAGES = dict(
    zh="中文 (Chinese)",
    es="Español (Spanish)",
    ur="اردو (Urdu)",
)

# Reverse map: display label -> code (handles Gradio 6 returning label instead of value)
_LABEL_TO_CODE = {label: code for code, label in LANGUAGES.items()}
| # --------------------------------------------------------------------------- | |
| # Keyword reference — loaded from REFERENCE.md files per language | |
| # --------------------------------------------------------------------------- | |
_REFERENCES_DIR = Path(__file__).parent / "references"


def load_reference(lang: str) -> str:
    """Return the Markdown keyword reference stored as ``references/<lang>.md``.

    When no such file exists, a short italic Markdown placeholder naming the
    language is returned instead of raising.
    """
    reference_file = _REFERENCES_DIR / f"{lang}.md"
    if not reference_file.exists():
        return f"_Reference file not found for `{lang}`._"
    return reference_file.read_text(encoding="utf-8")
def normalize_lang(lang_input: str) -> str:
    """Resolve a language code or its dropdown label to the language code.

    Known codes are returned unchanged and known display labels are mapped
    back to their code. Anything else is logged as a warning and treated as
    ``"zh"``.
    """
    if lang_input in LANGUAGES:
        return lang_input
    try:
        return _LABEL_TO_CODE[lang_input]
    except KeyError:
        logger.warning(f"Unknown lang input: {lang_input!r}, falling back to zh")
        return "zh"
def get_exercise_choices(lang: str, tier_idx: int) -> list[str]:
    """List the exercise titles available for a language and tier index.

    Returns an empty list when the catalogue has no entry for the language or
    the tier. ``tier_idx`` indexes ``TIER_KEYS``.
    """
    tier_key = TIER_KEYS[tier_idx]
    per_language = _ALL_EXERCISES.get(lang, {})
    tier_entries = per_language.get(tier_key, [])
    return [entry["title"] for entry in tier_entries]
def get_exercise(lang: str, tier_idx: int, title: str) -> dict | None:
    """Find the exercise with the given title in a language's tier.

    Returns the first catalogue entry whose ``"title"`` equals ``title``, or
    ``None`` when the language, tier, or title has no match.
    """
    tier_key = TIER_KEYS[tier_idx]
    candidates = _ALL_EXERCISES.get(lang, {}).get(tier_key, [])
    return next((entry for entry in candidates if entry["title"] == title), None)
| # --------------------------------------------------------------------------- | |
| # HuggingFace dataset submission | |
| # --------------------------------------------------------------------------- | |
# Credentials and target repo come from the environment (HF Space secrets).
HF_TOKEN = os.getenv("HF_TOKEN", "")
HF_DATASET_ID = os.getenv("HF_DATASET_ID", "legesher-research/native-code-submissions")


def push_submission(row: dict) -> bool:
    """Upload one submission to the HF dataset repo as its own JSON file.

    The row is serialised as pretty-printed UTF-8 JSON and committed to
    ``submissions/<language>/<id>.json`` in ``HF_DATASET_ID``.

    Args:
        row: Submission record; must contain ``"language"``, ``"id"`` and
            ``"exercise_id"``.

    Returns:
        ``True`` when the upload succeeded. ``False`` when ``HF_TOKEN`` is
        unset or anything in the upload path raised (the error is logged).
    """
    if not HF_TOKEN:
        logger.warning("HF_TOKEN not set — submission not saved to HF.")
        return False

    try:
        # Imported lazily so the dependency is only needed when uploading.
        from huggingface_hub import HfApi

        client = HfApi(token=HF_TOKEN)
        language, submission_id = row["language"], row["id"]
        target_path = f"submissions/{language}/{submission_id}.json"
        payload = json.dumps(row, ensure_ascii=False, indent=2).encode("utf-8")
        client.upload_file(
            path_or_fileobj=payload,
            path_in_repo=target_path,
            repo_id=HF_DATASET_ID,
            repo_type="dataset",
            commit_message=f"[submission] {language} {row['exercise_id']} {submission_id[:8]}",
        )
    except Exception as e:
        logger.error(f"HF upload failed: {e}")
        return False
    return True
| # --------------------------------------------------------------------------- | |
| # Core actions | |
| # --------------------------------------------------------------------------- | |
# Environment variables whose names contain any of these markers are withheld
# from the subprocess that runs contributor code (e.g. HF_TOKEN).
_SECRET_ENV_MARKERS = ("TOKEN", "SECRET", "PASSWORD")


def _child_env() -> dict[str, str]:
    """Build the environment for the user-code subprocess, minus credentials."""
    env = {
        key: value
        for key, value in os.environ.items()
        if not any(marker in key.upper() for marker in _SECRET_ENV_MARKERS)
    }
    # Force UTF-8 I/O so native-language output survives the pipe.
    env.update(PYTHONIOENCODING="utf-8", PYTHONUTF8="1")
    return env


def run_code(code: str, lang: str, stdin: str = "") -> str:
    """Translate native-language code to English Python, run it, and return its output.

    The translated source is written to a temporary ``.py`` file and executed
    in a separate Python process with a 10-second timeout. The temporary file
    is always removed afterwards, including on timeout or failure.

    NOTE(review): this is NOT a sandbox. The child process runs with this
    app's user and filesystem access; the only protections are the timeout
    and the removal of credential-like environment variables.

    Args:
        code: Source written with Legesher keywords.
        lang: Language code or display label; normalised internally.
        stdin: Text fed to the program's standard input. When blank, no input
            is supplied.

    Returns:
        The program's stripped stdout, followed by an "Errors" section when
        stderr is non-empty, or a human-readable message when there is nothing
        to run, Legesher is unavailable, translation fails, the run times out,
        or execution fails.
    """
    # Cheapest check first; also tolerates a None value from the editor.
    if not code or not code.strip():
        return "Write some code first."
    lang = normalize_lang(lang)
    if not LEGESHER_OK:
        return "Legesher is not available in this environment. Code execution disabled."

    try:
        translator = _get_native_to_en(lang)
        english_code = translator.translate_code(code)
    except Exception as e:
        return f"Translation error: {e}"

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as tmp:
            tmp_path = tmp.name
            tmp.write(english_code)
        result = subprocess.run(
            [sys.executable, "-X", "utf8", tmp_path],
            input=stdin if stdin and stdin.strip() else None,
            capture_output=True,
            text=True,
            timeout=10,
            encoding="utf-8",
            env=_child_env(),
        )
    except subprocess.TimeoutExpired:
        return "Timed out after 10 seconds. Check for infinite loops or missing input."
    except Exception as e:
        return f"Execution error: {e}"
    finally:
        # Clean up on every path; previously a timeout left the file behind.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                logger.warning("Could not remove temporary file %s", tmp_path)

    out = result.stdout.strip()
    err = result.stderr.strip()
    if err:
        out = (out + "\n\n⚠️ Errors:\n" + err).strip()
    return out or "(no output — did you call your function at the bottom?)"
def submit_code(
    code: str,
    lang: str,  # normalized below
    tier_idx: int,
    exercise_title: str,
    time_spent: int,
    consent: bool,
    profile: gr.OAuthProfile | None,
) -> str:
    """Validate a contribution, store it, and return a status message.

    Checks, in order: the user is signed in, consent is ticked, the code is
    non-empty, and an exercise is selected. A valid submission is uploaded via
    ``push_submission``; if that fails the record is written to a
    ``local_submissions`` folder beside this file instead.

    Args:
        code: The contributor's source code.
        lang: Language code or display label.
        tier_idx: Zero-based tier index (stored as ``tier_idx + 1``).
        exercise_title: Title of the selected exercise.
        time_spent: Minutes reported by the contributor.
        consent: Whether the consent checkbox is ticked.
        profile: Signed-in HuggingFace profile, or ``None`` when not signed in.

    Returns:
        A message describing the validation failure or where the submission
        was saved, including the first 8 characters of its ID.
    """
    if profile is None:
        return "Please sign in with your HuggingFace account (button above) before submitting."
    lang = normalize_lang(lang)
    if not consent:
        return "Please tick the consent checkbox before submitting."
    if not code.strip():
        return "Code is empty — nothing to submit."
    if not exercise_title:
        return "Select an exercise before submitting."

    exercise = get_exercise(lang, tier_idx, exercise_title)
    if exercise:
        exercise_id = exercise["id"]
    else:
        exercise_id = f"{lang}-unknown"

    submission_id = str(uuid.uuid4())
    short_id = submission_id[:8]
    row = {
        "id": submission_id,
        "code": code,
        "language": lang,
        "exercise_id": exercise_id,
        "exercise_title": exercise_title,
        "tier": tier_idx + 1,
        "time_spent_minutes": time_spent,
        "submitted_at": datetime.now(timezone.utc).isoformat(),
        "consent_given": True,
        "legesher_version": "0.7.3",
        "hf_username": profile.username,
    }

    if push_submission(row):
        return f"Submitted! Your contribution ID: `{short_id}`. Thank you for contributing to Language, Decoded."

    # Fallback: save locally so nothing is lost
    fallback_dir = Path(__file__).parent / "local_submissions"
    fallback_dir.mkdir(exist_ok=True)
    fallback_file = fallback_dir / f"{submission_id}.json"
    fallback_file.write_text(json.dumps(row, ensure_ascii=False, indent=2), encoding="utf-8")
    return f"Saved locally (HF upload unavailable). ID: `{short_id}`."
| # --------------------------------------------------------------------------- | |
| # UI helpers | |
| # --------------------------------------------------------------------------- | |
def update_exercise_dropdown(lang: str, tier_idx: int):
    """Build a dropdown update listing a tier's exercises, selecting the first.

    The selected value is ``None`` when the tier has no exercises.
    """
    titles = get_exercise_choices(lang, tier_idx)
    selected = next(iter(titles), None)
    return gr.update(choices=titles, value=selected)
def update_prompt(lang: str, tier_idx: int, exercise_title: str) -> str:
    """Return the Markdown prompt for an exercise, prefixed with its time estimate.

    Yields an empty string when no title is given or the exercise cannot be
    found for the language and tier.
    """
    exercise = get_exercise(lang, tier_idx, exercise_title) if exercise_title else None
    if not exercise:
        return ""
    minutes = exercise["time_estimate_min"]
    return f"*Estimated time: ~{minutes} minutes*\n\n" + exercise["prompt"]
def update_on_lang_change(lang: str, tier_idx: int):
    """Return a refreshed exercise dropdown and prompt for a language and tier.

    The first exercise of the tier is selected; with no exercises the
    dropdown value is ``None`` and the prompt is empty.
    """
    titles = get_exercise_choices(lang, tier_idx)
    if titles:
        selected = titles[0]
        prompt = update_prompt(lang, tier_idx, selected)
    else:
        selected, prompt = None, ""
    return gr.Dropdown(choices=titles, value=selected), prompt
| # --------------------------------------------------------------------------- | |
| # Gradio UI | |
| # --------------------------------------------------------------------------- | |
# Markdown rendered at the very top of the page: title, project blurb, and the
# four-step contribution workflow.
HEADER_MD = """
# Native Code Contribution Portal
### Language, Decoded — Expedition Tiny Aya
Write Python code in your native language using [Legesher](https://legesher.io) and contribute to our research on native-language programming.
**How it works:**
1. Select your language and an exercise
2. Write your solution using Legesher (native keywords + native variable names)
3. Click **Run** to verify your code works
4. Fill in the metadata and click **Submit**
---
"""
# Label of the consent checkbox; submissions are rejected unless it is ticked.
CONSENT_TEXT = (
    "I confirm I am a native or fluent speaker of the selected language and wrote this code myself. "
    "I grant permission for this code to be used in the Language, Decoded research project and "
    "released as part of an open dataset under Apache 2.0 / CC-BY-4.0. "
    "I retain copyright over my submission. My HuggingFace username will be stored alongside my "
    "contribution for attribution and credit — no other personal information is collected."
)
# Markdown shown in the left column under the exercise prompt: what does and
# does not qualify as native code, with a link to the full criteria.
SIDEBAR_MD = """
### What counts as native code?
- Variables, functions, and classes named in your language
- Written from scratch — not translated from an English solution
- Reflects how *you* would naturally think about the problem
- Uses Legesher keywords throughout
### What doesn't qualify?
- Word-for-word translations of English code
- AI-generated code
- English variable names with only keywords swapped
Need help? Read the full [criteria document](https://linear.app/legesher-research/document/native-code-criteria-qualification-disqualification-and-quality-rubric-5705fc83d6e8).
"""
def build_app() -> gr.Blocks:
    """Assemble the Gradio UI and wire its event handlers.

    Layout: header and login row on top; a left column with language, tier
    and exercise selectors, the exercise prompt, the qualification notes and
    the keyword reference; a right column with the code editor, run controls,
    and the submission form.

    Every handler reads the language straight from the language dropdown and
    normalises it, and prompt refreshes derive the tier from the tier radio,
    so no handler depends on a state value that may lag behind the widgets.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    default_lang = "zh"
    # Resolve the initial exercise once, tolerating an empty catalogue.
    initial_choices = get_exercise_choices(default_lang, 0)
    initial_title = initial_choices[0] if initial_choices else None
    initial_prompt = update_prompt(default_lang, 0, initial_title) if initial_title else ""

    with gr.Blocks(title="Native Code Submission — Language, Decoded") as app:
        gr.Markdown(HEADER_MD)
        # HF OAuth login — must be at top level of Blocks for the redirect to work
        with gr.Row():
            with gr.Column(scale=0, min_width=200):
                gr.LoginButton()
            with gr.Column(scale=1):
                gr.Markdown("_Sign in with your HuggingFace account so we can credit your contribution._")
        with gr.Row():
            # ---- Left column: config + exercise ----
            with gr.Column(scale=1):
                lang_dropdown = gr.Dropdown(
                    label="Your language",
                    choices=[(v, k) for k, v in LANGUAGES.items()],
                    value=default_lang,
                )
                tier_radio = gr.Radio(
                    label="Exercise tier",
                    choices=TIER_LABELS,
                    value=TIER_LABELS[0],
                )
                exercise_dropdown = gr.Dropdown(
                    label="Exercise",
                    choices=initial_choices,
                    value=initial_title,
                )
                exercise_prompt = gr.Markdown(
                    value=initial_prompt,
                    label="Exercise prompt",
                )
                gr.Markdown(SIDEBAR_MD)
                with gr.Accordion("Legesher keyword reference", open=False):
                    cheatsheet = gr.Markdown(
                        value=load_reference(default_lang),
                    )
            # ---- Right column: editor + run + submit ----
            with gr.Column(scale=2):
                gr.Markdown(
                    "_Write your solution using Legesher — use native-language variable and function names_",
                )
                code_editor = gr.Code(
                    label="Your Legesher code",
                    language="python",
                    lines=20,
                    value="",
                )
                with gr.Row():
                    run_btn = gr.Button("Run code", variant="secondary")
                    clear_btn = gr.Button("Clear", variant="stop")
                stdin_input = gr.Textbox(
                    label="Program input (stdin) — one value per line",
                    placeholder="e.g. 15\n(leave blank if your code needs no input)",
                    lines=2,
                )
                run_output = gr.Textbox(
                    label="Output",
                    lines=6,
                    placeholder="Run your code to see output here...",
                    interactive=False,
                )
                gr.Markdown("---")
                gr.Markdown("### Submit your solution")
                time_slider = gr.Slider(
                    label="Time spent (minutes)",
                    minimum=1,
                    maximum=120,
                    step=1,
                    value=15,
                )
                consent_checkbox = gr.Checkbox(
                    label=CONSENT_TEXT,
                    value=False,
                )
                submit_btn = gr.Button("Submit contribution", variant="primary")
                submit_status = gr.Textbox(
                    label="Submission status",
                    interactive=False,
                )

        # ---- State ----
        # Numeric tier index handed to submit_code; refreshed on tier change.
        tier_idx_state = gr.State(value=0)

        # ---- Event wiring ----
        def tier_to_idx(tier_label: str) -> int:
            """Map a tier radio label to its index; unknown labels map to 0."""
            return TIER_LABELS.index(tier_label) if tier_label in TIER_LABELS else 0

        def first_exercise(lang_code: str, idx: int):
            """Return (dropdown update, prompt) selecting the tier's first exercise."""
            choices = get_exercise_choices(lang_code, idx)
            value = choices[0] if choices else None
            prompt = update_prompt(lang_code, idx, value) if value else ""
            return gr.update(choices=choices, value=value), prompt

        def on_tier_change(tier_label: str, lang_raw: str):
            """Refresh the tier index, exercise list and prompt for a new tier."""
            try:
                idx = tier_to_idx(tier_label)
                dropdown_update, prompt = first_exercise(normalize_lang(lang_raw), idx)
                return idx, dropdown_update, prompt
            except Exception as e:
                logger.error(f"on_tier_change error: {e!r} | tier_label={tier_label!r} lang={lang_raw!r}")
                return 0, gr.update(), ""

        def on_lang_change(lang_raw: str, tier_label: str):
            """Refresh the exercise list, prompt and keyword reference for a new language."""
            try:
                lang_code = normalize_lang(lang_raw)
                logger.info(f"on_lang_change: raw={lang_raw!r} -> code={lang_code!r} tier={tier_label!r}")
                dropdown_update, prompt = first_exercise(lang_code, tier_to_idx(tier_label))
                sheet = load_reference(lang_code)
                logger.info("on_lang_change OK")
                return dropdown_update, prompt, sheet
            except Exception as e:
                logger.error(f"on_lang_change error: {e!r} | lang_raw={lang_raw!r}")
                return gr.update(), "", load_reference("zh")

        def on_exercise_change(exercise_title: str, lang_raw: str, tier_label: str):
            """Show the prompt of the newly selected exercise."""
            # Read language and tier from the widgets themselves (always
            # current) rather than from state, which can be stale when this
            # fires as a consequence of another handler's update.
            try:
                lang_code = normalize_lang(lang_raw)
                return update_prompt(lang_code, tier_to_idx(tier_label), exercise_title)
            except Exception as e:
                logger.error(f"on_exercise_change error: {e!r}")
                return ""

        lang_dropdown.change(
            on_lang_change,
            inputs=[lang_dropdown, tier_radio],
            outputs=[exercise_dropdown, exercise_prompt, cheatsheet],
        )
        tier_radio.change(
            on_tier_change,
            inputs=[tier_radio, lang_dropdown],
            outputs=[tier_idx_state, exercise_dropdown, exercise_prompt],
        )
        exercise_dropdown.change(
            on_exercise_change,
            inputs=[exercise_dropdown, lang_dropdown, tier_radio],
            outputs=[exercise_prompt],
        )
        # run_code and submit_code normalise the language themselves, so the
        # dropdown value (code or label) can be passed as-is.
        run_btn.click(
            fn=run_code,
            inputs=[code_editor, lang_dropdown, stdin_input],
            outputs=[run_output],
        )
        clear_btn.click(
            fn=lambda: ("", "", ""),
            outputs=[code_editor, stdin_input, run_output],
        )
        # submit_code also receives the signed-in profile via its
        # gr.OAuthProfile-annotated parameter, which is not listed in inputs.
        submit_btn.click(
            fn=submit_code,
            inputs=[
                code_editor,
                lang_dropdown,
                tier_idx_state,
                exercise_dropdown,
                time_slider,
                consent_checkbox,
            ],
            outputs=[submit_status],
        )
    return app


if __name__ == "__main__":
    app = build_app()
    app.launch(theme=gr.themes.Soft(primary_hue="blue"))