# Property Labeler Pro — Hugging Face Space entrypoint (Gradio app).
| import os | |
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| import csv | |
| import json | |
| import threading | |
| import random | |
| from io import BytesIO | |
| from PIL import Image | |
| from datetime import datetime, timedelta | |
| from filelock import FileLock | |
| from huggingface_hub import HfApi, hf_hub_download | |
# HF dataset repo used as persistent storage for the annotation CSVs.
DATASET_REPO_ID = os.environ.get("DATASET_REPO_ID", "fast-stager/property-labels")
HF_TOKEN = os.environ.get("HF_TOKEN")  # write token; push is skipped when absent
CACHE_DIR = "/tmp/data"  # local scratch dir (ephemeral on Spaces restarts)
os.makedirs(CACHE_DIR, exist_ok=True)
URL_FILE = "new_urls.json"  # source list of image URLs, grouped per property
LABEL_FILE = os.path.join(CACHE_DIR, "annotations.csv")
VERIFY_FILE = os.path.join(CACHE_DIR, "verifications.csv")
SKIP_FILE = os.path.join(CACHE_DIR, "skipped.csv")
LOCK_FILE = os.path.join(CACHE_DIR, "data.lock")  # FileLock guarding all CSV writes
LEASE_FILE = os.path.join(CACHE_DIR, "leases.csv")  # per-user group leases
LEASE_DURATION_SECONDS = 600  # a lease expires after 10 minutes without renewal
FETCH_SIZE = "w480_h360"  # size token substituted into image URLs when fetching
# Property group ids manually excluded from labeling/verification.
MANUAL_EXCLUDE = {"075c8bb8a73c45d71788e711edd9e8d5l", "07a0544f217db88fe2b06fd5d38f02a6l", "6bf16112723de3318c44641958638a56l"}
THUMB_SIZE = (350, 350)  # max thumbnail dimensions shown in the workspace
PAGE_SIZE = 50  # catalog rows per page
# ── sync ──────────────────────────────────────────────
def sync_pull():
    """Pull the latest annotation CSVs from the HF dataset repo into CACHE_DIR.

    Best-effort: any per-file failure (file missing on the hub, network
    error, bad token) is ignored so the app can still start from empty
    local files created by init_files().
    """
    token = HF_TOKEN if HF_TOKEN and len(HF_TOKEN) > 5 else None
    for filename in ["annotations.csv", "verifications.csv", "skipped.csv"]:
        try:
            local_path = os.path.join(CACHE_DIR, filename)
            # Remove any stale copy so hf_hub_download writes a fresh file.
            if os.path.exists(local_path):
                os.remove(local_path)
            hf_hub_download(repo_id=DATASET_REPO_ID, filename=filename,
                            repo_type="dataset", local_dir=CACHE_DIR,
                            token=token, force_download=True)
        except Exception:
            # Narrowed from a bare `except:` which would also swallow
            # KeyboardInterrupt / SystemExit.
            pass
def sync_push_background(local_path, remote_filename):
    """Upload a local CSV to the HF dataset repo on a background thread.

    No-op when no usable token is configured. The upload is best-effort:
    the data is still on disk and will be re-pushed on the next save.
    """
    token = HF_TOKEN if HF_TOKEN and len(HF_TOKEN) > 5 else None
    if not token:
        return

    def _push():
        try:
            api = HfApi(token=token)
            api.upload_file(path_or_fileobj=local_path, path_in_repo=remote_filename,
                            repo_id=DATASET_REPO_ID, repo_type="dataset")
        except Exception:
            # Narrowed from a bare `except:`; failures are intentionally silent.
            pass

    # daemon=True so a pending upload can never block process shutdown.
    threading.Thread(target=_push, daemon=True).start()
# ── init ──────────────────────────────────────────────
def init_files():
    """Pull remote state, then create any missing local CSVs with their headers."""
    sync_pull()
    schema = {
        LABEL_FILE: ["timestamp", "user", "group_id", "url", "score", "label"],
        VERIFY_FILE: ["timestamp", "user", "group_id", "url", "is_correct", "corrected_label", "corrected_score"],
        SKIP_FILE: ["timestamp", "user", "group_id"],
    }
    for path, cols in schema.items():
        if not os.path.exists(path):
            pd.DataFrame(columns=cols).to_csv(path, index=False)
    if not os.path.exists(LEASE_FILE):
        pd.DataFrame(columns=["user_id", "group_id", "mode", "leased_at", "expires_at"]).to_csv(LEASE_FILE, index=False)
init_files()
# ── data loading ──────────────────────────────────────
def load_all_urls():
    """Return the flat list of image URLs from URL_FILE.

    The JSON layout is {"groups": [{"images": [url, ...]}, ...]}.
    Returns an empty list when the file is missing or unparseable.
    """
    if not os.path.exists(URL_FILE):
        return []
    try:
        with open(URL_FILE, 'r') as f:
            data = json.load(f)
        return [img for g in data.get("groups", []) for img in g.get("images", [])]
    except Exception:
        # Narrowed from a bare `except:`; a corrupt file yields an empty list.
        return []
_ORDERED_GROUPS_CACHE = None
def get_ordered_groups():
    """Return property group ids in first-appearance order (cached in a module global).

    The group id is the last path segment of the URL before its "-m" suffix.
    """
    global _ORDERED_GROUPS_CACHE
    if _ORDERED_GROUPS_CACHE is not None:
        return _ORDERED_GROUPS_CACHE
    # dict preserves insertion order (Python 3.7+) => ordered dedup.
    seen = {}
    for u in load_all_urls():
        try:
            gid = u.split("-m")[0].split("/")[-1]
        except Exception:  # narrowed from bare except; guards non-string entries
            gid = "unknown"
        seen.setdefault(gid, None)
    _ORDERED_GROUPS_CACHE = list(seen)
    return _ORDERED_GROUPS_CACHE
_ALL_PAIRS_CACHE = None
def get_all_image_pairs():
    """Return [(group_id, url)] for every URL whose group is not manually excluded.

    Cached in a module global after the first call; the ordering follows
    the URL file.
    """
    global _ALL_PAIRS_CACHE
    if _ALL_PAIRS_CACHE is not None:
        return _ALL_PAIRS_CACHE
    pairs = []
    for u in load_all_urls():
        try:
            gid = u.split("-m")[0].split("/")[-1]
        except Exception:  # narrowed from bare except; guards non-string entries
            gid = "unknown"
        if gid not in MANUAL_EXCLUDE:
            pairs.append((gid, u))
    _ALL_PAIRS_CACHE = pairs
    return _ALL_PAIRS_CACHE
def get_clean_df(filepath):
    """Load an annotation CSV, normalize its columns, and dedupe by URL.

    Normalization:
      - label columns are stripped and lower-cased
      - score columns are coerced to int (unparseable values become 0)
      - only the most recent row per URL is kept (the files are append-only
        logs, so the last write wins)
    Returns an empty DataFrame when the file is missing or unreadable.
    """
    if not os.path.exists(filepath):
        return pd.DataFrame()
    try:
        df = pd.read_csv(filepath)
        if df.empty:
            return df
        for col in ('label', 'corrected_label'):
            if col in df.columns:
                df[col] = df[col].astype(str).str.strip().str.lower()
        for col in ('score', 'corrected_score'):
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
        return df.drop_duplicates(subset=['url'], keep='last')
    except Exception:
        # Narrowed from a bare `except:`; a corrupt file yields an empty frame.
        return pd.DataFrame()
# ── leases ────────────────────────────────────────────
def _read_leases():
    """Read the lease table; 'expires_at' is parsed to datetime.

    Returns an empty, correctly-columned frame when the file is absent,
    zero-length, or unreadable.
    """
    empty = pd.DataFrame(columns=["user_id", "group_id", "mode", "leased_at", "expires_at"])
    if not os.path.exists(LEASE_FILE) or os.path.getsize(LEASE_FILE) == 0:
        return empty
    try:
        leases = pd.read_csv(LEASE_FILE)
        leases['expires_at'] = pd.to_datetime(leases['expires_at'], errors='coerce')
        return leases
    except:
        return empty
def acquire_lease(user_id, group_id, mode):
    """Try to lease *group_id* for *user_id*; return True on success.

    Runs entirely under the global file lock. Expired leases are pruned
    first; acquisition fails only when another user currently holds an
    unexpired lease on the same group. A user holds at most one lease, so
    acquiring a new one implicitly drops their previous lease.
    """
    now = datetime.now()
    expires = now + timedelta(seconds=LEASE_DURATION_SECONDS)
    with FileLock(LOCK_FILE):
        df = _read_leases()
        # Prune expired leases before checking for conflicts.
        df = df[df['expires_at'] > now]
        # Conflict: someone else holds an active lease on this group.
        existing = df[(df['group_id'] == group_id) & (df['user_id'] != user_id)]
        if not existing.empty:
            df.to_csv(LEASE_FILE, index=False)  # persist the pruning even on failure
            return False
        # One lease per user: replace whatever this user held before.
        df = df[df['user_id'] != user_id]
        new_row = pd.DataFrame([{"user_id": user_id, "group_id": group_id, "mode": mode,
                                 "leased_at": now.isoformat(), "expires_at": expires.isoformat()}])
        df = pd.concat([df, new_row], ignore_index=True)
        df.to_csv(LEASE_FILE, index=False)
        return True
def release_lease(user_id):
    """Drop every lease held by *user_id* (no-op if they hold none)."""
    with FileLock(LOCK_FILE):
        leases = _read_leases()
        remaining = leases[leases['user_id'] != user_id]
        remaining.to_csv(LEASE_FILE, index=False)
def get_leased_group_ids(exclude_user=None):
    """Return the set of group ids with an active (unexpired) lease.

    Side effect: persists the pruned lease table back to disk. When
    *exclude_user* is given, that user's own leases are omitted from the
    returned set (but still persisted).
    """
    now = datetime.now()
    with FileLock(LOCK_FILE):
        leases = _read_leases()
        if leases.empty:
            return set()
        active = leases[leases['expires_at'] > now]
        active.to_csv(LEASE_FILE, index=False)
        if exclude_user:
            active = active[active['user_id'] != exclude_user]
        return set(active['group_id'].unique())
def renew_lease(user_id):
    """Push *user_id*'s lease expiry LEASE_DURATION_SECONDS into the future."""
    if not user_id:
        return
    new_expires = (datetime.now() + timedelta(seconds=LEASE_DURATION_SECONDS)).isoformat()
    with FileLock(LOCK_FILE):
        leases = _read_leases()
        owned = leases['user_id'] == user_id
        if owned.any():
            leases.loc[owned, 'expires_at'] = new_expires
            leases.to_csv(LEASE_FILE, index=False)
# ── helpers ───────────────────────────────────────────
def thumb_url(url, size=None):
    """Rewrite a full-resolution image URL to a smaller rendition.

    Replaces the "w2048_h1536" size token with *size* (default: the
    module-level FETCH_SIZE). URLs without the token are returned unchanged.
    """
    return url.replace("w2048_h1536", size or FETCH_SIZE)
def get_stats_text():
    """One-line markdown summary of overall labeling/verification progress."""
    total = len(get_all_image_pairs())
    labeled_df = get_clean_df(LABEL_FILE)
    verified_df = get_clean_df(VERIFY_FILE)
    labeled = 0 if labeled_df.empty else len(labeled_df['url'].unique())
    verified = 0 if verified_df.empty else len(verified_df['url'].unique())
    return f"**Images:** {total} | **Labeled:** {labeled} | **Verified:** {verified}"
# ── core: one image at a time ─────────────────────────
def render_workspace(mode, history, user_id="user", specific_index=None, move_back=False):
    """Pick the next image to show and return the component-update dict.

    mode: "label" or "verify". history: list of previously shown URLs,
    mutated in place. specific_index: 0-based index into the ordered group
    list (jump from the catalog). move_back: revisit the previous URL.
    Returns a dict keyed by the Gradio components in ALL_IO.
    """
    all_pairs = get_all_image_pairs()
    target_url = None
    target_gid = None
    if move_back and len(history) > 1:
        # Drop the current URL and revisit the one before it.
        history.pop()
        target_url = history[-1]
        try: target_gid = target_url.split("-m")[0].split("/")[-1]
        except: target_gid = "unknown"
    elif specific_index is not None:
        # Jump to a specific property group chosen from the catalog.
        all_ordered = get_ordered_groups()
        if 0 <= specific_index < len(all_ordered):
            target_gid = all_ordered[specific_index]
            group_urls = [u for g, u in all_pairs if g == target_gid]
            df_mode = get_clean_df(LABEL_FILE if mode == "label" else VERIFY_FILE)
            done = set(df_mode['url'].unique()) if not df_mode.empty else set()
            undone = [u for u in group_urls if u not in done]
            # Prefer an unprocessed image from the group; otherwise any image.
            target_url = random.choice(undone) if undone else (random.choice(group_urls) if group_urls else None)
    else:
        # Default flow: pick a random not-yet-done image for this mode.
        df_mode = get_clean_df(LABEL_FILE if mode == "label" else VERIFY_FILE)
        done = set(df_mode['url'].unique()) if not df_mode.empty else set()
        if mode == "label":
            candidates = [u for _, u in all_pairs if u not in done]
        else:
            # Verify mode only offers images that have already been labeled.
            df_l = get_clean_df(LABEL_FILE)
            labeled = set(df_l['url'].unique()) if not df_l.empty else set()
            candidates = [u for _, u in all_pairs if u in labeled and u not in done]
        if candidates:
            target_url = random.choice(candidates)
    # Derive the group id when the branch above didn't set it.
    if target_url and not target_gid:
        try: target_gid = target_url.split("-m")[0].split("/")[-1]
        except: target_gid = "unknown"
    if not target_url:
        # Nothing left to process in this mode: return to the menu screen.
        return {screen_menu: gr.update(visible=True), screen_work: gr.update(visible=False),
                log_box: "Done! All images processed for this mode."}
    if not history or history[-1] != target_url:
        history.append(target_url)
    # Fetch the single thumbnail (best-effort; UI shows an empty slot on failure).
    try:
        res = requests.get(thumb_url(target_url), timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
        img = Image.open(BytesIO(res.content)); img.thumbnail(THUMB_SIZE)
    except:
        img = None
    # Prefill widgets from any previously saved row for this URL.
    saved = {}
    df_check = get_clean_df(LABEL_FILE if mode == "label" else VERIFY_FILE)
    if not df_check.empty:
        match = df_check[df_check['url'] == target_url]
        if not match.empty:
            r = match.iloc[-1]
            if mode == "label":
                saved = {"score": int(r['score'])}
            else:
                saved = {"is_correct": r['is_correct'], "score": int(r['corrected_score'])}
    v_sc = int(saved.get('score', 5))  # slider defaults to 5 when nothing is saved
    done_count = len(set(df_check['url'].unique())) if not df_check.empty else 0
    total = len(all_pairs)
    updates = {
        screen_menu: gr.update(visible=False), screen_work: gr.update(visible=True),
        header_md: f"# {mode.upper()} β {done_count} / {total} done",
        state_urls: [target_url], state_hist: history, state_idx: 0,
        top_stats: get_stats_text(), log_box: f"Property: {target_gid}",
        img_display: gr.update(value=img, visible=True),
        score_slider: gr.update(visible=True, value=v_sc, interactive=True),
    }
    # The correctness checkbox only applies in verify mode.
    if mode == "label":
        updates[verify_checkbox] = gr.update(visible=False)
    else:
        updates[verify_checkbox] = gr.update(visible=True, value=True)
    return updates
def save_data(mode, history, urls, user_id, score, is_correct):
    """Append one annotation/verification row, push it, then render the next image."""
    if not urls:
        return render_workspace(mode, history, user_id)
    url = urls[0]
    try:
        gid = url.split("-m")[0].split("/")[-1]
    except:
        gid = "unknown"
    ts = datetime.now().isoformat()
    # Row layout matches the header written by init_files for each file.
    if mode == "label":
        target_file = LABEL_FILE
        row = [ts, user_id, gid, url, int(score), ""]
    else:
        target_file = VERIFY_FILE
        row = [ts, user_id, gid, url, is_correct, "", int(score)]
    with FileLock(LOCK_FILE):
        with open(target_file, "a", newline="") as f:
            csv.writer(f).writerow(row)
    sync_push_background(target_file, os.path.basename(target_file))
    release_lease(user_id)
    return render_workspace(mode, history, user_id)
def render_catalog(filter_val="All", page=1):
    """Build the catalog tab: summary cards, paginated property table, page text.

    filter_val: "All", "Pending", "Labeled", or "Done". page is clamped to
    the valid range. Returns (summary_html, table_html, page_text, page),
    matching the CAT_IO output list.
    """
    all_pairs = get_all_image_pairs()
    all_gids = get_ordered_groups()
    df_l = get_clean_df(LABEL_FILE)
    df_v = get_clean_df(VERIFY_FILE)
    l_urls = set(df_l['url'].unique()) if not df_l.empty else set()
    v_urls = set(df_v['url'].unique()) if not df_v.empty else set()
    # Per-group tallies of total / labeled / verified images.
    group_total, group_labeled, group_verified = {}, {}, {}
    for gid, url in all_pairs:
        group_total[gid] = group_total.get(gid, 0) + 1
        if url in l_urls: group_labeled[gid] = group_labeled.get(gid, 0) + 1
        if url in v_urls: group_verified[gid] = group_verified.get(gid, 0) + 1
    entries = []
    counts = {"all": 0, "pending": 0, "labeled": 0, "done": 0}
    for i, gid in enumerate(all_gids):
        tot = group_total.get(gid, 0)
        lab = group_labeled.get(gid, 0)
        ver = group_verified.get(gid, 0)
        # Status: fully verified -> done; any label -> labeled; else pending.
        if ver == tot and tot > 0: status = "done"
        elif lab > 0: status = "labeled"
        else: status = "pending"
        counts[status] += 1; counts["all"] += 1
        entries.append((i+1, gid, status, tot, lab, ver))
    # Summary cards: one per status, highlighting the active filter.
    cards = [("All", counts["all"], "#607D8B"), ("Pending", counts["pending"], "#9E9E9E"),
             ("Labeled", counts["labeled"], "#FFC107"), ("Done", counts["done"], "#4CAF50")]
    summary = "<div style='display:flex;gap:10px;margin-bottom:12px;'>"
    for lbl, count, color in cards:
        bold = "font-weight:bold;box-shadow:0 2px 8px rgba(0,0,0,0.15);" if filter_val == lbl else ""
        summary += (f"<div style='flex:1;background:#f0f0f0;padding:14px;border-radius:8px;"
                    f"text-align:center;border-left:4px solid {color};cursor:pointer;{bold}'>"
                    f"<div style='font-size:22px;font-weight:bold;color:#222;'>{count}</div>"
                    f"<div style='color:#555;font-size:13px;'>{lbl}</div></div>")
    summary += "</div>"
    # Apply the status filter (after counting, so the cards show global totals).
    if filter_val != "All":
        entries = [e for e in entries if e[2] == filter_val.lower()]
    # Paginate, clamping the requested page to the valid range.
    total_entries = len(entries)
    total_pages = max(1, (total_entries + PAGE_SIZE - 1) // PAGE_SIZE)
    page = max(1, min(int(page), total_pages))
    page_entries = entries[(page-1)*PAGE_SIZE : page*PAGE_SIZE]
    status_style = {"pending": ("#9E9E9E","Pending"), "labeled": ("#FFC107","Labeled"),
                    "done": ("#4CAF50","Done")}
    html = """<style>
.ct{width:100%;border-collapse:collapse;font-size:14px;color:#222}
.ct th{background:#2d2d2d;color:#fff;padding:10px 14px;text-align:left;font-weight:600;border-bottom:2px solid #444}
.ct td{padding:10px 14px;border-bottom:1px solid #ddd;background:#f9f9f9!important;color:#222!important}
.ct tr:hover td{background:#e8f0fe!important;color:#111!important}
.sb{padding:4px 12px;border-radius:12px;color:#fff;font-size:12px;font-weight:500;display:inline-block}
.pb{background:#e0e0e0;border-radius:4px;height:8px;width:100px;display:inline-block;overflow:hidden}
.pf{height:100%;border-radius:4px}
</style>"""
    html += "<table class='ct'><tr><th>#</th><th>Property ID</th><th>Status</th><th>Progress</th></tr>"
    for num, gid, st, tot, lab, ver in page_entries:
        color, slbl = status_style[st]
        # Progress bar reflects labeling progress (not verification).
        pct = int(lab/tot*100) if tot > 0 else 0
        html += (f"<tr><td>{num}</td><td><code style='font-size:12px;color:#222;background:#e8e8e8;padding:2px 6px;border-radius:4px'>{gid}</code></td>"
                 f"<td><span class='sb' style='background:{color}'>{slbl}</span></td>"
                 f"<td><span class='pb'><span class='pf' style='width:{pct}%;background:{color}'></span>"
                 f"</span> {lab}/{tot}</td></tr>")
    html += "</table>"
    if not page_entries:
        html = "<p style='text-align:center;color:#999;padding:40px'>No properties match this filter.</p>"
    page_text = f"**Page {page} of {total_pages}** ({total_entries} properties)"
    return summary, html, page_text, page
def do_login(name):
    """Validate the entered name; on success hide the login screen, show the menu.

    Returns updates for (screen_login, screen_menu, login_error, state_user).
    """
    name = str(name).strip().lower()
    if name:
        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), name
    return gr.update(), gr.update(), gr.update(visible=True, value="Please enter your name."), ""
def renew_lease_handler(user_id):
    """Gradio event shim: any widget activity keeps the user's lease alive."""
    renew_lease(user_id)
# ── UI ────────────────────────────────────────────────
# NOTE(review): the original source lost its indentation in extraction; the
# component nesting below is a reconstruction — confirm against the live app.
with gr.Blocks(theme=gr.themes.Soft(), title="Labeler Pro") as demo:
    # Per-session state: mode, shown-URL history, current URL(s), index, user id.
    state_mode, state_hist, state_urls, state_idx = gr.State("label"), gr.State([]), gr.State([]), gr.State(0)
    state_user = gr.State("")
    with gr.Row():
        top_stats = gr.Markdown("Loading...")
        btn_home = gr.Button("π Home", size="sm", scale=0)
    with gr.Tabs():
        with gr.Tab("Workspace"):
            # Screen 1: login.
            with gr.Group() as screen_login:
                gr.Markdown("# Property Labeler Pro\n### Enter your name to start")
                user_input = gr.Textbox(label="Your Name / ID", placeholder="e.g., alice")
                b_login = gr.Button("Start Labeling", variant="primary")
                login_error = gr.Markdown("", visible=False)
            # Screen 2: mode selection menu.
            with gr.Group(visible=False) as screen_menu:
                gr.Markdown("# Property Labeler Pro")
                with gr.Row():
                    b_start_l = gr.Button("Label", variant="primary")
                    b_start_v = gr.Button("Verify")
            # Screen 3: the labeling workspace.
            with gr.Group(visible=False) as screen_work:
                header_md = gr.Markdown()
                with gr.Row():
                    with gr.Column(scale=2):
                        img_display = gr.Image(interactive=False, height=400)
                    with gr.Column(scale=1):
                        score_slider = gr.Slider(-10, 10, step=1, value=5, label="Score")
                        verify_checkbox = gr.Checkbox(label="Correct?", visible=False)
                        with gr.Row():
                            b_back = gr.Button("β¬ Back")
                            b_save = gr.Button("πΎ Save & Next", variant="primary")
                log_box = gr.Textbox(label="Status", interactive=False)
        with gr.Tab("Catalog") as tab_catalog:
            cat_page = gr.State(1)
            cat_summary = gr.HTML()
            cat_filter = gr.Radio(["All", "Pending", "Labeled", "Done"], value="All", label="Filter")
            cat_html = gr.HTML()
            with gr.Row():
                cat_prev = gr.Button("β Prev", size="sm", scale=1)
                cat_page_info = gr.Markdown("Page 1 of 1", scale=2)
                cat_next = gr.Button("Next β", size="sm", scale=1)
            with gr.Row():
                num_in = gr.Number(value=1, label="Go to Prop #", precision=0)
                b_go_l = gr.Button("Go Label")
                b_go_v = gr.Button("Go Verify")
    # Components written by render_workspace / save_data, in dict-key order.
    ALL_IO = [screen_menu, screen_work, header_md, state_urls, state_hist, state_idx, top_stats, log_box,
              img_display, score_slider, verify_checkbox]
    input_objs = [score_slider, verify_checkbox]
    # Event wiring: mode buttons set state_mode first, then render the workspace.
    b_login.click(do_login, [user_input], [screen_login, screen_menu, login_error, state_user])
    b_start_l.click(lambda: "label", None, state_mode).then(render_workspace, [state_mode, state_hist, state_user], ALL_IO)
    b_start_v.click(lambda: "verify", None, state_mode).then(render_workspace, [state_mode, state_hist, state_user], ALL_IO)
    b_save.click(save_data, [state_mode, state_hist, state_urls, state_user] + input_objs, ALL_IO)
    b_back.click(lambda m, h, u: render_workspace(m, h, u, move_back=True), [state_mode, state_hist, state_user], ALL_IO)
    def go_home(user_id):
        """Leave the workspace: release the user's lease and clear history."""
        release_lease(user_id)
        return gr.update(visible=True), gr.update(visible=False), []
    btn_home.click(go_home, [state_user], [screen_menu, screen_work, state_hist])
    # Catalog "Go" buttons jump to a 1-based property number (0-based index).
    b_go_l.click(lambda: "label", None, state_mode).then(lambda n,m,h,u: render_workspace(m,h,u,int(n)-1), [num_in, state_mode, state_hist, state_user], ALL_IO)
    b_go_v.click(lambda: "verify", None, state_mode).then(lambda n,m,h,u: render_workspace(m,h,u,int(n)-1), [num_in, state_mode, state_hist, state_user], ALL_IO)
    # Any slider movement counts as activity and renews the lease.
    score_slider.change(renew_lease_handler, [state_user], None)
    CAT_IO = [cat_summary, cat_html, cat_page_info, cat_page]
    # Changing the filter resets to page 1 before re-rendering.
    cat_filter.change(lambda: 1, None, cat_page).then(render_catalog, [cat_filter, cat_page], CAT_IO)
    cat_prev.click(lambda p: max(1, int(p) - 1), [cat_page], [cat_page]).then(render_catalog, [cat_filter, cat_page], CAT_IO)
    cat_next.click(lambda p: int(p) + 1, [cat_page], [cat_page]).then(render_catalog, [cat_filter, cat_page], CAT_IO)
    tab_catalog.select(render_catalog, [cat_filter, cat_page], CAT_IO).then(get_stats_text, None, top_stats)
    demo.load(lambda: render_catalog("All", 1), None, CAT_IO).then(get_stats_text, None, top_stats)
demo.queue().launch(server_name="0.0.0.0", server_port=7860)