TranTruongMMCII committed
Commit 8b5d27e · 1 Parent(s): b11a23b
app.py CHANGED
@@ -1,70 +1,139 @@
-import os
 import re
 import gc
+import hashlib
 from pathlib import Path
 
 import torch
 import gradio as gr
+from huggingface_hub import snapshot_download
 
 from model_utils import load_model_and_tokenizer, generate_completion
 
 
 # ============================================================
-# Path config
+# CONFIG
 # ============================================================
 
-BASE_DIR = Path(__file__).parent
+REMOTE_MODEL_REPO = "TranTruongMMCII/UIT.CS2229.Generator"
 
-MODEL_PATHS = {
-    "Generator - Baseline": BASE_DIR / "checkpoint-best" / "baseline",
-    "Generator - EOL": BASE_DIR / "checkpoint-best" / "eol",
+# Mapping: dropdown label → folder in the model repo
+MODEL_VARIANTS = {
+    "Generator - Baseline": "baseline",
+    "Generator - EOL": "eol",
 }
 
+# Behaviour at app startup
+PRE_DOWNLOAD_MODELS = True   # download the models into the cache right at startup
+WARMUP_DEFAULT_MODEL = True  # preload the baseline model into RAM
+DEFAULT_MODEL_NAME = "Generator - Baseline"
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+
+# ============================================================
+# GLOBAL CACHE (SESSION-LIFETIME)
+# ============================================================
+
+_model_paths_cache = {}  # cache of already-downloaded model paths
 _current_model_name = None
 _current_tokenizer = None
 _current_model = None
+_current_model_path = None
 
 
 # ============================================================
-# Model loading
+# UTILS
 # ============================================================
 
-def get_model(model_name: str):
+def file_fingerprint(path: Path) -> str:
+    """Short SHA256 fingerprint to verify model identity."""
+    if not path.exists():
+        return "missing"
+
+    h = hashlib.sha256()
+    with open(path, "rb") as f:
+        for chunk in iter(lambda: f.read(1024 * 1024), b""):
+            h.update(chunk)
+    return h.hexdigest()[:16]
+
+
+def resolve_remote_model_path(model_name: str) -> Path:
     """
-    Lazily load selected model.
-    Only one model is kept in memory at a time.
+    Download model folder from remote HF model repo.
+    Download happens once per runtime and is cached.
     """
 
-    global _current_model_name, _current_tokenizer, _current_model
+    if model_name in _model_paths_cache:
+        return _model_paths_cache[model_name]
 
-    if model_name not in MODEL_PATHS:
+    if model_name not in MODEL_VARIANTS:
         raise ValueError(f"Unknown model option: {model_name}")
 
-    model_path = MODEL_PATHS[model_name]
+    variant = MODEL_VARIANTS[model_name]
+    remote_subdir = f"checkpoint-best/{variant}"
+
+    local_repo_dir = snapshot_download(
+        repo_id=REMOTE_MODEL_REPO,
+        repo_type="model",
+        allow_patterns=[f"{remote_subdir}/*"],
+    )
+
+    model_path = Path(local_repo_dir) / remote_subdir
 
     if not model_path.exists():
+        raise FileNotFoundError(f"Missing model folder: {model_path}")
+
+    if not (model_path / "model.safetensors").exists():
         raise FileNotFoundError(
-            f"Model path not found: {model_path}\n"
-            f"Expected structure: checkpoint-best/baseline and checkpoint-best/eol"
+            f"model.safetensors not found in {model_path}"
         )
 
-    # Reuse current loaded model
+    _model_paths_cache[model_name] = model_path
+    return model_path
+
+
+def preload_model_folders():
+    """Download all model folders into HF cache (no RAM load)."""
+    print("Pre-downloading model folders...")
+    for name in MODEL_VARIANTS:
+        try:
+            path = resolve_remote_model_path(name)
+            print(f"✔ Cached {name}: {path}")
+        except Exception as e:
+            print(f"⚠ Failed to preload {name}: {e}")
+
+
+# ============================================================
+# MODEL LOADING (RAM)
+# ============================================================
+
+def get_model(model_name: str):
+    """
+    Load selected model into RAM.
+    Only ONE model is kept in memory at a time.
+    """
+
+    global _current_model_name, _current_tokenizer, _current_model, _current_model_path
+
     if _current_model_name == model_name and _current_model is not None:
-        return _current_tokenizer, _current_model, model_path
+        return _current_tokenizer, _current_model, _current_model_path
 
-    # Unload old model if switching
+    # unload old model
     if _current_model is not None:
         del _current_model
         del _current_tokenizer
         _current_model = None
         _current_tokenizer = None
+        _current_model_path = None
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
-    print(f"Loading model: {model_name} from {model_path}")
+    model_path = resolve_remote_model_path(model_name)
+
+    print(f"Loading model: {model_name}")
+    print(f"Path: {model_path}")
+    print(f"SHA: {file_fingerprint(model_path / 'model.safetensors')}")
 
     tokenizer, model = load_model_and_tokenizer(str(model_path))
     model.to(device)
@@ -73,84 +142,37 @@ def get_model(model_name: str):
     _current_model_name = model_name
     _current_tokenizer = tokenizer
     _current_model = model
+    _current_model_path = model_path
 
     return tokenizer, model, model_path
 
 
 # ============================================================
-# Soft normalization adapters
+# SOFT NORMALIZATION
 # ============================================================
 
 def normalize_line(line: str) -> str:
-    """
-    Soft-normalize one line to be closer to training token style.
-    Example:
-        def add(a, b):
-    becomes:
-        def add ( a , b ) :
-    """
-
-    # Put spaces around common Python punctuation/operators
     line = re.sub(r"([()\[\]{}:,.=+\-*/<>])", r" \1 ", line)
-
-    # Collapse spaces
    line = re.sub(r"\s+", " ", line)
-
     return line.strip()
 
 
 def context_to_tokens(code: str) -> str:
-    """
-    Convert normal-looking code into training-style token text.
-
-    Important:
-    - Preserve line boundaries as <EOL>
-    - Do not fake <STR_LIT> / <NUM_LIT>
-    """
-
-    code = code.replace("\t", " ")
-    lines = code.splitlines()
-
-    normalized_lines = []
-    for line in lines:
-        norm = normalize_line(line)
-        if norm:
-            normalized_lines.append(norm)
-
-    return " <EOL> ".join(normalized_lines).strip()
+    lines = code.replace("\t", " ").splitlines()
+    tokens = [normalize_line(l) for l in lines if l.strip()]
+    return " <EOL> ".join(tokens)
 
 
 def tokens_to_readable(code: str) -> str:
-    """
-    Convert generated token text back to readable form.
-    This is demo-level detokenization, not a perfect Python formatter.
-    """
-
     code = code.replace("<EOL>", "\n")
-
-    # Remove spaces before punctuation
     code = re.sub(r"\s+([)\]\}:,])", r"\1", code)
-
-    # Remove spaces after opening punctuation
     code = re.sub(r"([(\[\{])\s+", r"\1", code)
-
-    # Compact common binary operators mildly
-    code = re.sub(r"\s*=\s*", " = ", code)
-    code = re.sub(r"\s*\+\s*", " + ", code)
-    code = re.sub(r"\s*-\s*", " - ", code)
-    code = re.sub(r"\s*\*\s*", " * ", code)
-    code = re.sub(r"\s*/\s*", " / ", code)
-    code = re.sub(r"\s*<\s*", " < ", code)
-    code = re.sub(r"\s*>\s*", " > ", code)
-
-    # Clean repeated spaces
-    code = re.sub(r"[ \t]+", " ", code)
-
+    code = re.sub(r"\s+", " ", code)
     return code.strip()
 
 
 # ============================================================
-# Inference
+# INFERENCE
 # ============================================================
 
 def run_demo(model_name: str, context: str):
@@ -158,13 +180,10 @@ def run_demo(model_name: str, context: str):
 
     token_context = context_to_tokens(context)
 
-    # No retriever for now
-    token_retrieved = ""
-
     token_output = generate_completion(
         model=model,
         tokenizer=tokenizer,
-        retrieved=token_retrieved,
+        retrieved="",
         context=token_context,
         device=device,
         max_length=256,
@@ -178,50 +197,56 @@
     logs = (
         "=== DEMO LOGS ===\n\n"
         f"[Selected model]\n{model_name}\n\n"
-        f"[Model path]\n{model_path}\n\n"
-        "[Raw Context]\n"
-        f"{context}\n\n"
+        f"[Model repo]\n{REMOTE_MODEL_REPO}\n\n"
+        f"[Local cache path]\n{model_path}\n\n"
+        f"[Model fingerprint]\n{file_fingerprint(model_path / 'model.safetensors')}\n\n"
+        f"[Device]\n{device}\n\n"
         "[Context → Tokens]\n"
         f"{token_context}\n\n"
-        "[Retrieved → Tokens]\n"
-        f"{token_retrieved}\n\n"
-        "[Generator Output → Tokens]\n"
-        f"{token_output}\n\n"
-        "[Prediction]\n"
-        f"{prediction}\n"
+        "[Output → Tokens]\n"
+        f"{token_output}\n"
     )
 
     return prediction, logs
 
 
 # ============================================================
-# Gradio UI
+# GRADIO UI
 # ============================================================
 
 demo = gr.Interface(
     fn=run_demo,
     inputs=[
         gr.Dropdown(
-            choices=["Generator - Baseline", "Generator - EOL"],
-            value="Generator - Baseline",
+            choices=list(MODEL_VARIANTS.keys()),
+            value=DEFAULT_MODEL_NAME,
             label="Model",
         ),
         gr.Textbox(
-            lines=12,
+            lines=10,
             label="Context",
-            placeholder="def add(a, b):\n return",
+            placeholder="def sum(a, b):\n return",
         ),
     ],
    outputs=[
-        gr.Textbox(lines=8, label="Prediction"),
-        gr.Textbox(lines=14, label="Logs"),
+        gr.Textbox(lines=6, label="Prediction"),
+        gr.Textbox(lines=16, label="Logs"),
    ],
     title="ReACC Generator Demo",
-    description=(
-        "Compare Generator baseline and Generator + EOL. "
-        "Retriever integration will be added later."
-    ),
+    description="Compare Generator Baseline vs Generator + EOL (model loaded from external HF repo).",
 )
 
+
+# ============================================================
+# STARTUP
+# ============================================================
+
+if PRE_DOWNLOAD_MODELS:
+    preload_model_folders()
+
+if WARMUP_DEFAULT_MODEL:
+    print(f"Warming up default model: {DEFAULT_MODEL_NAME}")
+    get_model(DEFAULT_MODEL_NAME)
+
 if __name__ == "__main__":
     demo.launch()
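Note: app.py imports load_model_and_tokenizer and generate_completion from model_utils, which this commit does not touch. Below is a hedged, minimal sketch of the interface those call sites imply; the parameter names (retrieved, context, device, max_length) come from app.py, but the AutoTokenizer/AutoModelForCausalLM bodies and the greedy decode are illustrative assumptions, not the repository's actual implementation.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_model_and_tokenizer(model_path: str):
    """Load a tokenizer and causal LM from a local checkpoint folder (hypothetical body)."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    model.eval()
    return tokenizer, model


def generate_completion(model, tokenizer, retrieved, context, device, max_length=256):
    """Concatenate optional retrieved code with the context, then decode a completion (hypothetical body)."""
    prompt = f"{retrieved} {context}".strip() if retrieved else context
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=max_length,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Return only the newly generated tokens as token-style text (still contains <EOL>).
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=False).strip()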
checkpoint-best/baseline/config.json DELETED
@@ -1,37 +0,0 @@
-{
-  "_num_labels": 2,
-  "activation_function": "gelu_new",
-  "add_cross_attention": false,
-  "architectures": [
-    "GPT2LMHeadModel"
-  ],
-  "attn_pdrop": 0.1,
-  "bos_token_id": 0,
-  "dtype": "float32",
-  "embd_pdrop": 0.1,
-  "eos_token_id": 2,
-  "initializer_range": 0.02,
-  "layer_norm_epsilon": 1e-05,
-  "model_type": "gpt2",
-  "n_ctx": 1024,
-  "n_embd": 768,
-  "n_head": 12,
-  "n_inner": null,
-  "n_layer": 12,
-  "n_positions": 1024,
-  "output_past": true,
-  "pad_token_id": 1,
-  "reorder_and_upcast_attn": false,
-  "resid_pdrop": 0.1,
-  "scale_attn_by_inverse_layer_idx": false,
-  "scale_attn_weights": true,
-  "summary_activation": null,
-  "summary_first_dropout": 0.1,
-  "summary_proj_to_labels": true,
-  "summary_type": "cls_index",
-  "summary_use_proj": true,
-  "tie_word_embeddings": true,
-  "transformers_version": "5.0.0",
-  "use_cache": true,
-  "vocab_size": 50007
-}
checkpoint-best/baseline/generation_config.json DELETED
@@ -1,7 +0,0 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
-  "pad_token_id": 1,
-  "transformers_version": "5.0.0"
-}
checkpoint-best/baseline/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-best/baseline/tokenizer_config.json DELETED
@@ -1,21 +0,0 @@
-{
-  "add_prefix_space": false,
-  "backend": "tokenizers",
-  "bos_token": "<s>",
-  "eos_token": "</s>",
-  "errors": "replace",
-  "extra_special_tokens": [
-    "<RET>",
-    "</RET>",
-    "<CTX>",
-    "</CTX>",
-    "<GEN>"
-  ],
-  "full_tokenizer_file": null,
-  "is_local": false,
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<pad>",
-  "sep_token": "<EOL>",
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|UNKNOWN|>"
-}
checkpoint-best/eol/config.json DELETED
@@ -1,37 +0,0 @@
-{
-  "_num_labels": 2,
-  "activation_function": "gelu_new",
-  "add_cross_attention": false,
-  "architectures": [
-    "GPT2LMHeadModel"
-  ],
-  "attn_pdrop": 0.1,
-  "bos_token_id": 0,
-  "dtype": "float32",
-  "embd_pdrop": 0.1,
-  "eos_token_id": 2,
-  "initializer_range": 0.02,
-  "layer_norm_epsilon": 1e-05,
-  "model_type": "gpt2",
-  "n_ctx": 1024,
-  "n_embd": 768,
-  "n_head": 12,
-  "n_inner": null,
-  "n_layer": 12,
-  "n_positions": 1024,
-  "output_past": true,
-  "pad_token_id": 1,
-  "reorder_and_upcast_attn": false,
-  "resid_pdrop": 0.1,
-  "scale_attn_by_inverse_layer_idx": false,
-  "scale_attn_weights": true,
-  "summary_activation": null,
-  "summary_first_dropout": 0.1,
-  "summary_proj_to_labels": true,
-  "summary_type": "cls_index",
-  "summary_use_proj": true,
-  "tie_word_embeddings": true,
-  "transformers_version": "5.0.0",
-  "use_cache": true,
-  "vocab_size": 50007
-}
checkpoint-best/eol/generation_config.json DELETED
@@ -1,7 +0,0 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
-  "pad_token_id": 1,
-  "transformers_version": "5.0.0"
-}
checkpoint-best/eol/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-best/eol/tokenizer_config.json DELETED
@@ -1,21 +0,0 @@
-{
-  "add_prefix_space": false,
-  "backend": "tokenizers",
-  "bos_token": "<s>",
-  "eos_token": "</s>",
-  "errors": "replace",
-  "extra_special_tokens": [
-    "<RET>",
-    "</RET>",
-    "<CTX>",
-    "</CTX>",
-    "<GEN>"
-  ],
-  "full_tokenizer_file": null,
-  "is_local": false,
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<pad>",
-  "sep_token": "<EOL>",
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|UNKNOWN|>"
-}
requirements.txt CHANGED
@@ -1,5 +1,6 @@
-torch
-transformers
-gradio
-tqdm
-numpy
+torch
+transformers
+gradio
+tqdm
+numpy
+huggingface_hub
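huggingface_hub is the only new dependency; app.py uses its snapshot_download with allow_patterns to fetch a single checkpoint folder instead of the deleted local files. A minimal standalone sketch of that call, reusing the repo id and folder layout from app.py (the print is illustrative):

from pathlib import Path

from huggingface_hub import snapshot_download

# Download only the baseline checkpoint folder into the local HF cache.
local_repo_dir = snapshot_download(
    repo_id="TranTruongMMCII/UIT.CS2229.Generator",
    repo_type="model",
    allow_patterns=["checkpoint-best/baseline/*"],
)
print(Path(local_repo_dir) / "checkpoint-best" / "baseline")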
retriever_stub.py DELETED
@@ -1,19 +0,0 @@
-def retrieve_code_stub(context: str) -> str:
-    """
-    Mock retriever for demo purposes.
-    Later, replace this with real retriever logic.
-    """
-
-    # Simple heuristic demo (hardcoded or rule-based)
-    if "pytest" in context:
-        return (
-            "def data(): <EOL>"
-            " tmpdir = py.test.ensuretemp('<STR_LIT>') <EOL>"
-            " return tmpdir"
-        )
-
-    if "def add" in context:
-        return "def add(a, b): <EOL> return a + b"
-
-    # default fallback
-    return ""