data-collection / app.py
Nightfury16's picture
Removed class label option
8095305
import os
import gradio as gr
import pandas as pd
import requests
import csv
import json
import threading
import random
from io import BytesIO
from PIL import Image
from datetime import datetime, timedelta
from filelock import FileLock
from huggingface_hub import HfApi, hf_hub_download
# Hub dataset repo that sync_pull / sync_push_background read from and write to;
# overridable through the DATASET_REPO_ID environment variable.
DATASET_REPO_ID = os.environ.get("DATASET_REPO_ID", "fast-stager/property-labels")
# Hub access token; the sync helpers treat a missing or very short value as "no token".
HF_TOKEN = os.environ.get("HF_TOKEN")
# Local working directory for every CSV and the lock file below.
CACHE_DIR = "/tmp/data"
os.makedirs(CACHE_DIR, exist_ok=True)
# JSON file (relative path) read by load_all_urls for the image URL list.
URL_FILE = "new_urls.json"
LABEL_FILE = os.path.join(CACHE_DIR, "annotations.csv")
VERIFY_FILE = os.path.join(CACHE_DIR, "verifications.csv")
SKIP_FILE = os.path.join(CACHE_DIR, "skipped.csv")
# Single lock file shared by the annotation writers and the lease helpers.
LOCK_FILE = os.path.join(CACHE_DIR, "data.lock")
LEASE_FILE = os.path.join(CACHE_DIR, "leases.csv")
# Lifetime of a lease, in seconds, as used by acquire_lease / renew_lease.
LEASE_DURATION_SECONDS = 600
# Size token that thumb_url substitutes for "w2048_h1536" in image URLs.
FETCH_SIZE = "w480_h360"
# Group ids that get_all_image_pairs leaves out of the image list.
MANUAL_EXCLUDE = {"075c8bb8a73c45d71788e711edd9e8d5l", "07a0544f217db88fe2b06fd5d38f02a6l", "6bf16112723de3318c44641958638a56l"}
# Bounding box passed to PIL's Image.thumbnail in render_workspace.
THUMB_SIZE = (350, 350)
# Number of catalog rows shown per page by render_catalog.
PAGE_SIZE = 50
# ── sync ──────────────────────────────────────────────
def sync_pull():
    """Refresh the local annotation CSVs from the Hub dataset repo.

    For each of ``annotations.csv``, ``verifications.csv`` and ``skipped.csv``
    the local copy in ``CACHE_DIR`` is removed and the remote file is
    downloaded in its place. The operation is best-effort: a failure for one
    file is reported on stdout and the loop continues with the next file.
    """
    # A missing or very short token is treated as "anonymous access".
    token = HF_TOKEN if HF_TOKEN and len(HF_TOKEN) > 5 else None
    for filename in ["annotations.csv", "verifications.csv", "skipped.csv"]:
        local_path = os.path.join(CACHE_DIR, filename)
        try:
            if os.path.exists(local_path):
                os.remove(local_path)
            hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=filename,
                repo_type="dataset",
                local_dir=CACHE_DIR,
                token=token,
                force_download=True,
            )
        except Exception as exc:
            # Still best-effort, but no longer silent, and Ctrl-C / SystemExit
            # are no longer swallowed as they were by the bare ``except``.
            print(f"[sync_pull] could not fetch {filename}: {exc!r}")
def sync_push_background(local_path, remote_filename):
    """Upload ``local_path`` to the Hub dataset repo on a background thread.

    Args:
        local_path: Path of the local file to upload.
        remote_filename: Destination path inside the dataset repo.

    Does nothing when no usable ``HF_TOKEN`` is configured. Upload errors are
    reported on stdout instead of being raised, so the caller never blocks or
    fails because of the network.
    """
    token = HF_TOKEN if HF_TOKEN and len(HF_TOKEN) > 5 else None
    if not token:
        return

    def _push():
        try:
            api = HfApi(token=token)
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=remote_filename,
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
            )
        except Exception as exc:
            # Previously a bare ``except: pass``: a failed push left no trace.
            print(f"[sync_push] upload of {remote_filename} failed: {exc!r}")

    threading.Thread(target=_push).start()
# ── init ──────────────────────────────────────────────
def init_files():
    """Pull remote data, then create any missing local CSV with its header row."""
    sync_pull()
    # Header layout for every CSV the app works with, in creation order.
    schemas = {
        LABEL_FILE: ["timestamp", "user", "group_id", "url", "score", "label"],
        VERIFY_FILE: ["timestamp", "user", "group_id", "url", "is_correct", "corrected_label", "corrected_score"],
        SKIP_FILE: ["timestamp", "user", "group_id"],
        LEASE_FILE: ["user_id", "group_id", "mode", "leased_at", "expires_at"],
    }
    for path, columns in schemas.items():
        if os.path.exists(path):
            continue
        pd.DataFrame(columns=columns).to_csv(path, index=False)
# Runs at import time: pull remote CSVs and create any missing local files
# before the data helpers and UI below are defined.
init_files()
# ── data loading ──────────────────────────────────────
def load_all_urls(path=None):
    """Return every image URL listed in the URL JSON file, in file order.

    The file is expected to look like ``{"groups": [{"images": [...]}, ...]}``;
    groups without an ``images`` key contribute nothing.

    Args:
        path: JSON file to read. Defaults to the module-level ``URL_FILE``.

    Returns:
        A flat list of the entries found under each group's ``images`` key,
        or ``[]`` when the file is missing, unreadable, not valid JSON, or
        not shaped as described above.
    """
    url_file = URL_FILE if path is None else path
    if not os.path.exists(url_file):
        return []
    try:
        # Explicit encoding: JSON is UTF-8 regardless of the host locale.
        with open(url_file, 'r', encoding="utf-8") as f:
            data = json.load(f)
        return [img for g in data.get("groups", []) for img in g.get("images", [])]
    except (OSError, ValueError, AttributeError, TypeError):
        # OSError: unreadable file; ValueError: bad JSON / bad encoding;
        # AttributeError / TypeError: JSON of an unexpected shape.
        return []
_ORDERED_GROUPS_CACHE = None


def get_ordered_groups():
    """Return the distinct group ids in first-appearance order.

    The group id is the last path segment of the part of the URL that
    precedes the first ``"-m"``. The result is computed once from
    ``load_all_urls()`` and cached in ``_ORDERED_GROUPS_CACHE`` for the
    lifetime of the process.
    """
    global _ORDERED_GROUPS_CACHE
    if _ORDERED_GROUPS_CACHE is not None:
        return _ORDERED_GROUPS_CACHE
    groups, seen = [], set()
    for u in load_all_urls():
        try:
            gid = u.split("-m")[0].split("/")[-1]
        except (AttributeError, TypeError):
            # Non-string entry in the URL file; a bare ``except`` here would
            # also have hidden KeyboardInterrupt / SystemExit.
            gid = "unknown"
        if gid not in seen:
            groups.append(gid)
            seen.add(gid)
    _ORDERED_GROUPS_CACHE = groups
    return groups
_ALL_PAIRS_CACHE = None


def get_all_image_pairs():
    """Return ``(group_id, url)`` for every URL whose group is not excluded.

    Group ids are derived from the URL as the last path segment of the part
    preceding the first ``"-m"``. Groups listed in ``MANUAL_EXCLUDE`` are
    dropped. The list is built once and cached in ``_ALL_PAIRS_CACHE``.
    """
    global _ALL_PAIRS_CACHE
    if _ALL_PAIRS_CACHE is not None:
        return _ALL_PAIRS_CACHE
    pairs = []
    for u in load_all_urls():
        try:
            gid = u.split("-m")[0].split("/")[-1]
        except (AttributeError, TypeError):
            # Only the errors ``.split`` can raise on a non-string entry.
            gid = "unknown"
        if gid not in MANUAL_EXCLUDE:
            pairs.append((gid, u))
    _ALL_PAIRS_CACHE = pairs
    return pairs
def get_clean_df(filepath):
    """Load an annotation/verification CSV and normalise it.

    Normalisation applied when the respective column exists:
      * ``label`` / ``corrected_label``: stripped and lower-cased; missing
        values become ``""`` (not the literal string ``"nan"``).
      * ``score`` / ``corrected_score``: coerced to int, non-numeric -> 0.
    Rows are then de-duplicated on ``url``, keeping the last occurrence.

    Args:
        filepath: Path to the CSV file.

    Returns:
        The cleaned DataFrame. An empty DataFrame is returned when the file
        does not exist or cannot be parsed/cleaned (for example when it has
        no ``url`` column).
    """
    if not os.path.exists(filepath):
        return pd.DataFrame()
    try:
        df = pd.read_csv(filepath)
        if df.empty:
            return df
        for col in ('label', 'corrected_label'):
            if col in df.columns:
                raw = df[col]
                text = raw.astype(str).str.strip().str.lower()
                # astype(str) renders NaN as "nan"; restore those to "".
                df[col] = text.where(raw.notna(), "")
        for col in ('score', 'corrected_score'):
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
        return df.drop_duplicates(subset=['url'], keep='last')
    except Exception:
        # Deliberately best-effort (callers treat "empty" as "no data"),
        # but no longer traps KeyboardInterrupt / SystemExit.
        return pd.DataFrame()
# ── leases ────────────────────────────────────────────
def _read_leases():
    """Load the lease table from ``LEASE_FILE``.

    Returns:
        A DataFrame with columns ``user_id``, ``group_id``, ``mode``,
        ``leased_at`` and ``expires_at``. ``expires_at`` is parsed to
        datetimes (unparseable values become ``NaT``). An empty frame with
        those columns is returned when the file is missing, zero-length or
        unreadable.
    """
    columns = ["user_id", "group_id", "mode", "leased_at", "expires_at"]
    if not os.path.exists(LEASE_FILE) or os.path.getsize(LEASE_FILE) == 0:
        return pd.DataFrame(columns=columns)
    try:
        # Keep ids as text: with dtype inference a user called "42" is read
        # back as the integer 42 and never equals the string id the callers
        # compare it with.
        df = pd.read_csv(LEASE_FILE, dtype={"user_id": str, "group_id": str})
        df['expires_at'] = pd.to_datetime(df['expires_at'], errors='coerce')
        return df
    except Exception as exc:
        # Best-effort fallback, but reported rather than silently ignored.
        print(f"[leases] could not read {LEASE_FILE}: {exc!r}")
        return pd.DataFrame(columns=columns)
def acquire_lease(user_id, group_id, mode):
    """Try to reserve ``group_id`` for ``user_id``.

    Expired leases are purged first. If another user still holds an
    unexpired lease on the group, nothing is granted. Otherwise any lease the
    user already holds is replaced by a new one lasting
    ``LEASE_DURATION_SECONDS``.

    Args:
        user_id: Identifier of the requesting user.
        group_id: Group (property) id to reserve.
        mode: Stored verbatim in the ``mode`` column.

    Returns:
        ``True`` if the lease was granted, ``False`` if another user holds it.
    """
    now = datetime.now()
    expires = now + timedelta(seconds=LEASE_DURATION_SECONDS)
    with FileLock(LOCK_FILE):
        df = _read_leases()
        df = df[df['expires_at'] > now]
        existing = df[(df['group_id'] == group_id) & (df['user_id'] != user_id)]
        if not existing.empty:
            # Persist the purge of expired rows even though we are refusing.
            df.to_csv(LEASE_FILE, index=False)
            return False
        df = df[df['user_id'] != user_id]
        new_row = pd.DataFrame([{
            "user_id": user_id,
            "group_id": group_id,
            "mode": mode,
            "leased_at": now.isoformat(),
            # Store a datetime, not an ISO string: the existing rows hold
            # parsed timestamps, and mixing the two made the CSV carry two
            # different timestamp formats in one column, which can cause
            # rows to be coerced to NaT (i.e. "expired") on the next read.
            "expires_at": expires,
        }])
        df = pd.concat([df, new_row], ignore_index=True)
        df.to_csv(LEASE_FILE, index=False)
    return True
def release_lease(user_id):
    """Drop every lease row owned by ``user_id`` and rewrite the lease file."""
    with FileLock(LOCK_FILE):
        leases = _read_leases()
        not_mine = leases['user_id'] != user_id
        leases[not_mine].to_csv(LEASE_FILE, index=False)
def get_leased_group_ids(exclude_user=None):
    """Return the set of group ids that currently have an unexpired lease.

    Expired rows are removed from the lease file as a side effect. When
    ``exclude_user`` is truthy, leases held by that user are left out of the
    returned set (they remain in the file).
    """
    cutoff = datetime.now()
    with FileLock(LOCK_FILE):
        leases = _read_leases()
        if leases.empty:
            return set()
        still_valid = leases['expires_at'] > cutoff
        leases = leases[still_valid]
        leases.to_csv(LEASE_FILE, index=False)
    if exclude_user:
        others = leases['user_id'] != exclude_user
        leases = leases[others]
    return set(leases['group_id'].unique())
def renew_lease(user_id):
    """Push the expiry of ``user_id``'s lease rows out by the lease duration.

    No-op for a falsy ``user_id`` or when the user holds no lease.
    """
    if not user_id:
        return
    extended_until = datetime.now() + timedelta(seconds=LEASE_DURATION_SECONDS)
    stamp = extended_until.isoformat()
    with FileLock(LOCK_FILE):
        leases = _read_leases()
        owned = leases['user_id'] == user_id
        if not owned.any():
            return
        leases.loc[owned, 'expires_at'] = stamp
        leases.to_csv(LEASE_FILE, index=False)
# ── helpers ───────────────────────────────────────────
def thumb_url(url):
    """Return ``url`` with the full-size token replaced by ``FETCH_SIZE``."""
    full_size_token = "w2048_h1536"
    return url.replace(full_size_token, FETCH_SIZE)
def get_stats_text():
    """Build the Markdown summary line: total, labeled and verified image counts."""

    def _distinct_urls(path):
        # Number of different URLs recorded in the given CSV (0 when empty).
        frame = get_clean_df(path)
        return 0 if frame.empty else len(frame['url'].unique())

    total = len(get_all_image_pairs())
    l_count = _distinct_urls(LABEL_FILE)
    v_count = _distinct_urls(VERIFY_FILE)
    return f"**Images:** {total} | **Labeled:** {l_count} | **Verified:** {v_count}"
# ── core: one image at a time ─────────────────────────
def _group_id_from_url(url):
    """Return the group id embedded in an image URL, or "unknown" if it cannot be derived."""
    try:
        return url.split("-m")[0].split("/")[-1]
    except (AttributeError, TypeError):
        return "unknown"


def _saved_flag(value, default=True):
    """Interpret an ``is_correct`` cell read back from CSV as a bool."""
    if isinstance(value, str):
        return value.strip().lower() in ("true", "1", "yes")
    if value is None or value != value:  # None or NaN -> nothing stored
        return default
    return bool(value)


def render_workspace(mode, history, user_id="user", specific_index=None, move_back=False):
    """Pick the image to show and return the Gradio updates for the work screen.

    Selection order:
      1. ``move_back`` with at least two history entries: drop the current
         entry and show the previous one.
      2. ``specific_index`` given: a random not-yet-done image of that group
         (0-based position in ``get_ordered_groups()``), or any image of the
         group if all are done.
      3. Otherwise: a random image not yet done in this mode; in verify mode
         only images that already have a label are eligible.

    Args:
        mode: ``"label"`` or anything else (treated as verify).
        history: List of previously shown URLs; mutated in place.
        user_id: Accepted so event handlers can pass it; not used here.
        specific_index: Optional 0-based group index.
        move_back: Whether to step back in ``history``.

    Returns:
        A dict mapping Gradio components to their new values. When no image
        can be selected, only the menu/work visibility and the status box
        are updated.
    """
    all_pairs = get_all_image_pairs()
    # One read of this mode's CSV serves candidate selection, the saved-value
    # lookup and the progress counter (previously re-read for each of them).
    df_mode = get_clean_df(LABEL_FILE if mode == "label" else VERIFY_FILE)
    done = set(df_mode['url'].unique()) if not df_mode.empty else set()

    target_url = None
    target_gid = None
    if move_back and len(history) > 1:
        history.pop()
        target_url = history[-1]
    elif specific_index is not None:
        all_ordered = get_ordered_groups()
        if 0 <= specific_index < len(all_ordered):
            target_gid = all_ordered[specific_index]
            group_urls = [u for g, u in all_pairs if g == target_gid]
            undone = [u for u in group_urls if u not in done]
            pool = undone or group_urls
            target_url = random.choice(pool) if pool else None
    else:
        if mode == "label":
            candidates = [u for _, u in all_pairs if u not in done]
        else:
            df_l = get_clean_df(LABEL_FILE)
            labeled = set(df_l['url'].unique()) if not df_l.empty else set()
            candidates = [u for _, u in all_pairs if u in labeled and u not in done]
        if candidates:
            target_url = random.choice(candidates)

    if target_url and not target_gid:
        target_gid = _group_id_from_url(target_url)

    if not target_url:
        return {screen_menu: gr.update(visible=True), screen_work: gr.update(visible=False),
                log_box: "Done! All images processed for this mode."}

    if not history or history[-1] != target_url:
        history.append(target_url)

    # Fetch the single image; any download/decoding problem shows an empty
    # image rather than failing the whole handler.
    try:
        res = requests.get(thumb_url(target_url), timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
        img = Image.open(BytesIO(res.content))
        img.thumbnail(THUMB_SIZE)
    except Exception:
        img = None

    # Restore what was previously saved for this image, if anything.
    saved_score = 5
    saved_correct = True
    if not df_mode.empty:
        match = df_mode[df_mode['url'] == target_url]
        if not match.empty:
            r = match.iloc[-1]
            if mode == "label":
                saved_score = int(r['score'])
            else:
                saved_score = int(r['corrected_score'])
                # The stored verdict used to be loaded but never applied.
                saved_correct = _saved_flag(r['is_correct'])

    done_count = len(done)
    total = len(all_pairs)
    updates = {
        screen_menu: gr.update(visible=False), screen_work: gr.update(visible=True),
        # Em dash restored; the literal was mojibake ("β€”").
        header_md: f"# {mode.upper()} — {done_count} / {total} done",
        state_urls: [target_url], state_hist: history, state_idx: 0,
        top_stats: get_stats_text(), log_box: f"Property: {target_gid}",
        img_display: gr.update(value=img, visible=True),
        score_slider: gr.update(visible=True, value=saved_score, interactive=True),
    }
    if mode == "label":
        updates[verify_checkbox] = gr.update(visible=False)
    else:
        updates[verify_checkbox] = gr.update(visible=True, value=saved_correct)
    return updates
def save_data(mode, history, urls, user_id, score, is_correct):
    """Append the current annotation/verification and advance to the next image.

    Args:
        mode: ``"label"`` writes to the label CSV; anything else writes to
            the verification CSV.
        history: URL history list, forwarded to ``render_workspace``.
        urls: Currently displayed URLs; only the first one is saved.
        user_id: Name of the annotator, stored in the row.
        score: Slider value, stored as an int.
        is_correct: Checkbox value, stored in verification rows only.

    Returns:
        The update dict produced by ``render_workspace`` for the next image.
    """
    if not urls:
        return render_workspace(mode, history, user_id)
    url = urls[0]
    try:
        gid = url.split("-m")[0].split("/")[-1]
    except (AttributeError, TypeError):
        gid = "unknown"
    ts = datetime.now().isoformat()
    if mode == "label":
        row = [ts, user_id, gid, url, int(score), ""]
        target_file = LABEL_FILE
    else:
        row = [ts, user_id, gid, url, is_correct, "", int(score)]
        target_file = VERIFY_FILE
    with FileLock(LOCK_FILE):
        # UTF-8 explicitly: pandas reads these files back as UTF-8, whereas
        # open() without an encoding follows the host locale.
        with open(target_file, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(row)
    sync_push_background(target_file, os.path.basename(target_file))
    release_lease(user_id)
    return render_workspace(mode, history, user_id)
def render_catalog(filter_val="All", page=1):
    """Render the catalog tab: summary cards, one page of the property table, pager text.

    A property is "done" when it has images and all of them are verified,
    "labeled" when at least one image is labeled, otherwise "pending".

    Args:
        filter_val: "All" or a status name ("Pending", "Labeled", "Done").
        page: Requested 1-based page; clamped to the valid range.

    Returns:
        ``(summary_html, table_html, page_markdown, page)`` where ``page`` is
        the clamped page number.
    """
    pairs = get_all_image_pairs()
    ordered_gids = get_ordered_groups()
    labels_df = get_clean_df(LABEL_FILE)
    verify_df = get_clean_df(VERIFY_FILE)
    labeled_urls = set() if labels_df.empty else set(labels_df['url'].unique())
    verified_urls = set() if verify_df.empty else set(verify_df['url'].unique())

    # Per-group tallies as [total, labeled, verified].
    tallies = {}
    for gid, url in pairs:
        tally = tallies.setdefault(gid, [0, 0, 0])
        tally[0] += 1
        if url in labeled_urls:
            tally[1] += 1
        if url in verified_urls:
            tally[2] += 1

    entries = []
    for position, gid in enumerate(ordered_gids, start=1):
        tot, lab, ver = tallies.get(gid, (0, 0, 0))
        if tot > 0 and ver == tot:
            status = "done"
        elif lab > 0:
            status = "labeled"
        else:
            status = "pending"
        entries.append((position, gid, status, tot, lab, ver))

    counts = {"all": len(entries)}
    for key in ("pending", "labeled", "done"):
        counts[key] = sum(1 for entry in entries if entry[2] == key)

    # Summary cards.
    cards = [("All", counts["all"], "#607D8B"), ("Pending", counts["pending"], "#9E9E9E"),
             ("Labeled", counts["labeled"], "#FFC107"), ("Done", counts["done"], "#4CAF50")]
    card_chunks = ["<div style='display:flex;gap:10px;margin-bottom:12px;'>"]
    for lbl, count, color in cards:
        if filter_val == lbl:
            bold = "font-weight:bold;box-shadow:0 2px 8px rgba(0,0,0,0.15);"
        else:
            bold = ""
        card_chunks.append(
            f"<div style='flex:1;background:#f0f0f0;padding:14px;border-radius:8px;"
            f"text-align:center;border-left:4px solid {color};cursor:pointer;{bold}'>"
            f"<div style='font-size:22px;font-weight:bold;color:#222;'>{count}</div>"
            f"<div style='color:#555;font-size:13px;'>{lbl}</div></div>"
        )
    card_chunks.append("</div>")
    summary = "".join(card_chunks)

    # Filter by status.
    if filter_val != "All":
        wanted = filter_val.lower()
        entries = [entry for entry in entries if entry[2] == wanted]

    # Paginate.
    total_entries = len(entries)
    full_pages, remainder = divmod(total_entries, PAGE_SIZE)
    total_pages = max(1, full_pages + (1 if remainder else 0))
    page = max(1, min(int(page), total_pages))
    start = (page - 1) * PAGE_SIZE
    page_entries = entries[start:start + PAGE_SIZE]

    status_style = {"pending": ("#9E9E9E", "Pending"), "labeled": ("#FFC107", "Labeled"),
                    "done": ("#4CAF50", "Done")}

    if page_entries:
        css = (
            "<style>\n"
            ".ct{width:100%;border-collapse:collapse;font-size:14px;color:#222}\n"
            ".ct th{background:#2d2d2d;color:#fff;padding:10px 14px;text-align:left;font-weight:600;border-bottom:2px solid #444}\n"
            ".ct td{padding:10px 14px;border-bottom:1px solid #ddd;background:#f9f9f9!important;color:#222!important}\n"
            ".ct tr:hover td{background:#e8f0fe!important;color:#111!important}\n"
            ".sb{padding:4px 12px;border-radius:12px;color:#fff;font-size:12px;font-weight:500;display:inline-block}\n"
            ".pb{background:#e0e0e0;border-radius:4px;height:8px;width:100px;display:inline-block;overflow:hidden}\n"
            ".pf{height:100%;border-radius:4px}\n"
            "</style>"
        )
        parts = [css, "<table class='ct'><tr><th>#</th><th>Property ID</th><th>Status</th><th>Progress</th></tr>"]
        for num, gid, st, tot, lab, ver in page_entries:
            color, slbl = status_style[st]
            pct = int(lab/tot*100) if tot > 0 else 0
            parts.append(
                f"<tr><td>{num}</td><td><code style='font-size:12px;color:#222;background:#e8e8e8;padding:2px 6px;border-radius:4px'>{gid}</code></td>"
                f"<td><span class='sb' style='background:{color}'>{slbl}</span></td>"
                f"<td><span class='pb'><span class='pf' style='width:{pct}%;background:{color}'></span>"
                f"</span> {lab}/{tot}</td></tr>"
            )
        parts.append("</table>")
        html = "".join(parts)
    else:
        html = "<p style='text-align:center;color:#999;padding:40px'>No properties match this filter.</p>"

    page_text = f"**Page {page} of {total_pages}** ({total_entries} properties)"
    return summary, html, page_text, page
def do_login(name):
    """Validate the entered name and switch from the login screen to the menu.

    Returns updates for (login screen, menu screen, error message) plus the
    normalised (stripped, lower-cased) user name; the name is ``""`` and the
    error is shown when nothing was entered.
    """
    cleaned = str(name).strip().lower()
    if cleaned:
        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), cleaned
    return gr.update(), gr.update(), gr.update(visible=True, value="Please enter your name."), ""
def renew_lease_handler(user_id):
    """Event-handler wrapper: extend the lease held by ``user_id``."""
    renew_lease(user_id)
# ── UI ────────────────────────────────────────────────
# Layout and event wiring for the whole app. Component variables defined here
# are referenced as module globals by render_workspace.
with gr.Blocks(theme=gr.themes.Soft(), title="Labeler Pro") as demo:
    # Per-session state.
    state_mode, state_hist, state_urls, state_idx = gr.State("label"), gr.State([]), gr.State([]), gr.State(0)
    state_user = gr.State("")

    with gr.Row():
        top_stats = gr.Markdown("Loading...")
        btn_home = gr.Button("🏠 Home", size="sm", scale=0)

    with gr.Tabs():
        with gr.Tab("Workspace"):
            with gr.Group() as screen_login:
                gr.Markdown("# Property Labeler Pro\n### Enter your name to start")
                user_input = gr.Textbox(label="Your Name / ID", placeholder="e.g., alice")
                b_login = gr.Button("Start Labeling", variant="primary")
                login_error = gr.Markdown("", visible=False)
            with gr.Group(visible=False) as screen_menu:
                gr.Markdown("# Property Labeler Pro")
                with gr.Row():
                    b_start_l = gr.Button("Label", variant="primary")
                    b_start_v = gr.Button("Verify")
            with gr.Group(visible=False) as screen_work:
                header_md = gr.Markdown()
                with gr.Row():
                    with gr.Column(scale=2):
                        img_display = gr.Image(interactive=False, height=400)
                    with gr.Column(scale=1):
                        score_slider = gr.Slider(-10, 10, step=1, value=5, label="Score")
                        verify_checkbox = gr.Checkbox(label="Correct?", visible=False)
                with gr.Row():
                    # Button glyphs restored; the literals were mojibake.
                    b_back = gr.Button("⬅ Back")
                    b_save = gr.Button("💾 Save & Next", variant="primary")
            # NOTE(review): placed outside screen_work so the "Done!" message,
            # which render_workspace emits while hiding screen_work, stays
            # visible — confirm against the intended layout.
            log_box = gr.Textbox(label="Status", interactive=False)

        with gr.Tab("Catalog") as tab_catalog:
            cat_page = gr.State(1)
            cat_summary = gr.HTML()
            cat_filter = gr.Radio(["All", "Pending", "Labeled", "Done"], value="All", label="Filter")
            cat_html = gr.HTML()
            with gr.Row():
                cat_prev = gr.Button("← Prev", size="sm", scale=1)
                cat_page_info = gr.Markdown("Page 1 of 1", scale=2)
                cat_next = gr.Button("Next →", size="sm", scale=1)
            with gr.Row():
                num_in = gr.Number(value=1, label="Go to Prop #", precision=0)
                b_go_l = gr.Button("Go Label")
                b_go_v = gr.Button("Go Verify")

    # Every component render_workspace / save_data may update.
    ALL_IO = [screen_menu, screen_work, header_md, state_urls, state_hist, state_idx, top_stats, log_box,
              img_display, score_slider, verify_checkbox]
    input_objs = [score_slider, verify_checkbox]

    b_login.click(do_login, [user_input], [screen_login, screen_menu, login_error, state_user])
    b_start_l.click(lambda: "label", None, state_mode).then(render_workspace, [state_mode, state_hist, state_user], ALL_IO)
    b_start_v.click(lambda: "verify", None, state_mode).then(render_workspace, [state_mode, state_hist, state_user], ALL_IO)
    b_save.click(save_data, [state_mode, state_hist, state_urls, state_user] + input_objs, ALL_IO)
    b_back.click(lambda m, h, u: render_workspace(m, h, u, move_back=True), [state_mode, state_hist, state_user], ALL_IO)

    def go_home(user_id):
        """Release the user's lease, show the menu and clear the history."""
        release_lease(user_id)
        return gr.update(visible=True), gr.update(visible=False), []

    btn_home.click(go_home, [state_user], [screen_menu, screen_work, state_hist])

    def go_to_property(n, m, h, u):
        """Open the 1-based property number ``n``; an empty box falls back to the normal pick."""
        # ``int(None)`` used to raise when the number box had been cleared.
        index = int(n) - 1 if n is not None else None
        return render_workspace(m, h, u, index)

    b_go_l.click(lambda: "label", None, state_mode).then(go_to_property, [num_in, state_mode, state_hist, state_user], ALL_IO)
    b_go_v.click(lambda: "verify", None, state_mode).then(go_to_property, [num_in, state_mode, state_hist, state_user], ALL_IO)
    # Moving the slider counts as activity and extends the user's lease.
    score_slider.change(renew_lease_handler, [state_user], None)

    CAT_IO = [cat_summary, cat_html, cat_page_info, cat_page]
    cat_filter.change(lambda: 1, None, cat_page).then(render_catalog, [cat_filter, cat_page], CAT_IO)
    cat_prev.click(lambda p: max(1, int(p) - 1), [cat_page], [cat_page]).then(render_catalog, [cat_filter, cat_page], CAT_IO)
    # No upper bound needed here: render_catalog clamps and returns the page.
    cat_next.click(lambda p: int(p) + 1, [cat_page], [cat_page]).then(render_catalog, [cat_filter, cat_page], CAT_IO)
    tab_catalog.select(render_catalog, [cat_filter, cat_page], CAT_IO).then(get_stats_text, None, top_stats)
    demo.load(lambda: render_catalog("All", 1), None, CAT_IO).then(get_stats_text, None, top_stats)
# Enable Gradio's request queue and serve on all interfaces, port 7860.
demo.queue().launch(server_name="0.0.0.0", server_port=7860)