Spaces:

InternScience
/

ResearchClawBench-Task-Submit

Sleeping

App Files Files Community

CoCoOne committed 22 days ago

Commit

1259d99

1 Parent(s): 7df30c5

Harden submission cleanup and ID allocation

Browse files

Files changed (3) hide show

app.py +187 -27
repo_ops.py +10 -0
validator.py +93 -1

app.py CHANGED Viewed

@@ -5,29 +5,54 @@ import os
 from pathlib import Path
 import gradio as gr
 try:
-    from .repo_ops import DEFAULT_REPO_ID, allocate_next_task_id, create_dataset_pr, list_existing_task_ids, load_hf_token
     from .validator import (
         DOMAINS,
         PreparedSubmission,
         SubmissionMetadata,
         ValidationError,
         build_public_report,
         cleanup_work_dir,
         normalize_domain_token,
         validate_and_prepare_submission,
     )
 except ImportError:
-    from repo_ops import DEFAULT_REPO_ID, allocate_next_task_id, create_dataset_pr, list_existing_task_ids, load_hf_token
     from validator import (
         DOMAINS,
         PreparedSubmission,
         SubmissionMetadata,
         ValidationError,
         build_public_report,
         cleanup_work_dir,
         normalize_domain_token,
         validate_and_prepare_submission,
     )
@@ -36,6 +61,18 @@ SPACE_TITLE = 'ResearchClawBench Task Submission'
 GITHUB_REPO_URL = 'https://github.com/InternScience/ResearchClawBench'
 DATASET_URL = f'https://huggingface.co/datasets/{DEFAULT_REPO_ID}'
 SPACE_URL = 'https://huggingface.co/spaces/InternScience/ResearchClawBench-Task-Submit'
 CSS = """
 @import url('https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;600;700;800&display=swap');
@@ -469,11 +506,26 @@ def resolve_domain(selected_domain: str, custom_domain: str) -> str:
     return normalized
-def handle_archive_upload(archive_path: str | None):
     if not archive_path:
         return '', 'No ZIP file selected yet.'
-    filename = Path(archive_path).name
-    return archive_path, f'Selected ZIP: `{filename}`'
 def build_validation_markdown(prepared: PreparedSubmission) -> str:
@@ -498,6 +550,36 @@ def build_failure_markdown(message: str) -> str:
     return f'## Validation failed\n\n{bullets}'
 def validate_submission(
     archive_path: str,
     suggested_domain: str,
@@ -513,7 +595,16 @@ def validate_submission(
         cleanup_work_dir(current_state.get('work_dir'))
     if not archive_path:
-        return None, '', '## Validation failed\n\n- Please upload a zip file.', '{}', gr.update(interactive=False), ''
     domain = resolve_domain(suggested_domain, custom_domain)
     token = load_hf_token()
@@ -533,57 +624,100 @@ def validate_submission(
         pr_ready = bool(token)
         return (
             prepared.to_state(),
             prepared.assigned_task_id,
             build_validation_markdown(prepared),
             json.dumps(build_public_report(prepared), indent=2, ensure_ascii=False),
             gr.update(interactive=pr_ready),
             '' if pr_ready else 'Validation passed, but PR creation is disabled until a write token is configured.',
         )
     except ValidationError as exc:
         return (
             None,
             '',
             build_failure_markdown(str(exc)),
             json.dumps({'status': 'error', 'errors': str(exc).splitlines()}, indent=2, ensure_ascii=False),
             gr.update(interactive=False),
             '',
         )
     except Exception as exc:
         return (
             None,
             '',
             build_failure_markdown(str(exc)),
             json.dumps({'status': 'error', 'errors': [str(exc)]}, indent=2, ensure_ascii=False),
             gr.update(interactive=False),
             '',
         )
-def create_pr(state: dict | None):
     if not state:
-        return None, gr.update(interactive=False), '## PR creation failed\n\n- Validate a submission first.'
     prepared = PreparedSubmission.from_state(state)
     token = load_hf_token()
-    try:
-        commit_info = create_dataset_pr(prepared, repo_id=DEFAULT_REPO_ID, token=token)
-        pr_url = commit_info.pr_url or commit_info.commit_url
-        message = '\n'.join([
-            '## PR created',
-            '',
-            f'- Task ID: `{prepared.assigned_task_id}`',
-            f'- PR: {pr_url}',
-        ])
-        return None, gr.update(interactive=False), message
-    except Exception as exc:
-        return None, gr.update(interactive=False), build_failure_markdown(str(exc).strip() or 'Unknown PR creation error')
-    finally:
-        cleanup_work_dir(prepared.work_dir)
 with gr.Blocks(title=SPACE_TITLE, fill_width=True) as demo:
-    state = gr.State(None)
-    archive_state = gr.State('')
     gr.HTML(build_hero_html())
@@ -674,7 +808,11 @@ with gr.Blocks(title=SPACE_TITLE, fill_width=True) as demo:
             with gr.Column(scale=1, min_width=0, elem_classes=['shell-spacer']):
                 gr.HTML('')
-    archive.upload(fn=handle_archive_upload, inputs=[archive], outputs=[archive_state, archive_notice])
     validate_btn.click(
         fn=validate_submission,
@@ -689,9 +827,31 @@ with gr.Blocks(title=SPACE_TITLE, fill_width=True) as demo:
             notes,
             state,
         ],
-        outputs=[state, assigned_task_id, validation_md, validation_report, create_pr_btn, pr_md],
     )
-    create_pr_btn.click(fn=create_pr, inputs=[state], outputs=[state, create_pr_btn, pr_md])
 if __name__ == '__main__':

 from pathlib import Path
 import gradio as gr
+from huggingface_hub.utils import HfHubHTTPError
 try:
+    from .repo_ops import (
+        DEFAULT_REPO_ID,
+        allocate_next_task_id,
+        create_dataset_pr,
+        get_repo_head_sha,
+        list_existing_task_ids,
+        load_hf_token,
+    )
     from .validator import (
         DOMAINS,
         PreparedSubmission,
         SubmissionMetadata,
         ValidationError,
         build_public_report,
+        cleanup_stale_managed_files,
+        cleanup_submission_state,
+        cleanup_uploaded_archive,
         cleanup_work_dir,
         normalize_domain_token,
+        persist_uploaded_archive,
+        stage_submission,
         validate_and_prepare_submission,
     )
 except ImportError:
+    from repo_ops import (
+        DEFAULT_REPO_ID,
+        allocate_next_task_id,
+        create_dataset_pr,
+        get_repo_head_sha,
+        list_existing_task_ids,
+        load_hf_token,
+    )
     from validator import (
         DOMAINS,
         PreparedSubmission,
         SubmissionMetadata,
         ValidationError,
         build_public_report,
+        cleanup_stale_managed_files,
+        cleanup_submission_state,
+        cleanup_uploaded_archive,
         cleanup_work_dir,
         normalize_domain_token,
+        persist_uploaded_archive,
+        stage_submission,
         validate_and_prepare_submission,
     )
 GITHUB_REPO_URL = 'https://github.com/InternScience/ResearchClawBench'
 DATASET_URL = f'https://huggingface.co/datasets/{DEFAULT_REPO_ID}'
 SPACE_URL = 'https://huggingface.co/spaces/InternScience/ResearchClawBench-Task-Submit'
+# Lifetime in seconds for the per-session gr.State objects; override with the
+# RCB_SPACE_STATE_TTL_SECONDS environment variable (default: 3600).
+STATE_TTL_SECONDS = int(os.environ.get('RCB_SPACE_STATE_TTL_SECONDS', '3600'))
+# Age in seconds beyond which managed temp entries are swept at startup.
+# Defaults to the larger of twice the state TTL and 24 hours; override with
+# RCB_SPACE_STALE_WORK_DIR_TTL_SECONDS.
+STALE_WORK_DIR_TTL_SECONDS = int(
+    os.environ.get('RCB_SPACE_STALE_WORK_DIR_TTL_SECONDS', str(max(STATE_TTL_SECONDS * 2, 24 * 3600)))
+)
+# Import-time side effect: sweep managed work dirs / upload archives older than
+# the TTL above, and log only when something was actually removed.
+_removed_stale_managed_files = cleanup_stale_managed_files(STALE_WORK_DIR_TTL_SECONDS)
+if _removed_stale_managed_files:
+    print(
+        f'[startup] Removed {_removed_stale_managed_files} stale managed submission file(s) '
+        f'older than {STALE_WORK_DIR_TTL_SECONDS}s.',
+        flush=True,
+    )
 CSS = """
 @import url('https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;600;700;800&display=swap');
     return normalized
+def handle_archive_upload(archive_path: str | None, current_archive_path: str | None):
+    """Copy a freshly uploaded ZIP into managed temp storage and report it.
+
+    Args:
+        archive_path: Path of the file the upload widget just received, or a
+            falsy value when nothing is selected.
+        current_archive_path: Managed archive path held in session state from
+            a previous upload, if any.
+
+    Returns:
+        A ``(managed_path, notice_markdown)`` pair. ``managed_path`` is an
+        empty string when nothing is selected or the upload was rejected.
+    """
+    # A different upload supersedes the previous managed archive.
+    if current_archive_path and current_archive_path != archive_path:
+        cleanup_uploaded_archive(current_archive_path)
     if not archive_path:
         return '', 'No ZIP file selected yet.'
+    try:
+        managed_archive_path = persist_uploaded_archive(archive_path)
+    except ValidationError as exc:
+        # The previous archive may already have been deleted above, so reset the
+        # stored path instead of letting the handler abort and leave the session
+        # state pointing at a file that no longer exists.
+        return '', f'Upload rejected: {exc}'
+    original_path = Path(archive_path)
+    managed_name = managed_archive_path.name
+    # Drop the original upload once the managed copy exists (best effort).
+    if managed_archive_path.resolve() != original_path.resolve():
+        try:
+            original_path.unlink()
+        except OSError:
+            pass
+    return str(managed_archive_path), f'Selected ZIP: `{managed_name}`'
+def archive_notice_text(archive_path: str | None) -> str:
+    """Return the notice line describing the currently selected ZIP, if any."""
+    if archive_path:
+        return f'Selected ZIP: `{Path(archive_path).name}`'
+    return 'No ZIP file selected yet.'
 def build_validation_markdown(prepared: PreparedSubmission) -> str:
     return f'## Validation failed\n\n{bullets}'
+def refresh_prepared_submission_for_pr(
+    prepared: PreparedSubmission,
+    *,
+    repo_id: str,
+    token: str | None,
+) -> tuple[PreparedSubmission, bool, str]:
+    """Re-check the assigned task ID against the repo right before opening a PR.
+
+    Fetches the repo head SHA and the existing task IDs. If the ID assigned to
+    ``prepared`` is already taken, a new ID is allocated for the same domain,
+    ``prepared`` is updated in place, and the submission is staged again under
+    the new ID.
+
+    Returns:
+        ``(prepared, reassigned, head_sha)``: the same (possibly mutated)
+        object, whether the task ID changed, and the head SHA fetched here.
+    """
+    # NOTE(review): the head SHA is read before the ID listing; presumably so a
+    # push landing in between shows up as a parent-commit mismatch - confirm.
+    head_sha = get_repo_head_sha(repo_id=repo_id, token=token)
+    existing_ids = list_existing_task_ids(repo_id=repo_id, token=token)
+    reassigned = False
+    final_task_id = prepared.assigned_task_id
+    if final_task_id in existing_ids:
+        final_task_id = allocate_next_task_id(prepared.metadata.domain, existing_ids)
+        prepared.assigned_task_id = final_task_id
+        # Stage again so the staged directory matches the newly allocated ID.
+        prepared.staged_task_dir = str(
+            stage_submission(prepared.uploaded_task_dir, final_task_id, prepared.work_dir)
+        )
+        reassigned = True
+    return prepared, reassigned, head_sha
+def is_retryable_pr_error(exc: Exception) -> bool:
+    """Tell whether ``exc`` is an HfHubHTTPError worth retrying.
+
+    True when the response status is 409 or 412, or when the lower-cased
+    message mentions 'parent commit', 'conflict' or 'stale'.
+    """
+    if isinstance(exc, HfHubHTTPError):
+        response = getattr(exc, 'response', None)
+        text = str(exc).lower()
+        if getattr(response, 'status_code', None) in {409, 412}:
+            return True
+        return any(marker in text for marker in ('parent commit', 'conflict', 'stale'))
+    return False
 def validate_submission(
     archive_path: str,
     suggested_domain: str,
         cleanup_work_dir(current_state.get('work_dir'))
     if not archive_path:
+        return (
+            None,
+            '',
+            '',
+            '## Validation failed\n\n- Please upload a zip file.',
+            '{}',
+            gr.update(interactive=False),
+            '',
+            archive_notice_text(None),
+        )
     domain = resolve_domain(suggested_domain, custom_domain)
     token = load_hf_token()
         pr_ready = bool(token)
         return (
             prepared.to_state(),
+            archive_path,
             prepared.assigned_task_id,
             build_validation_markdown(prepared),
             json.dumps(build_public_report(prepared), indent=2, ensure_ascii=False),
             gr.update(interactive=pr_ready),
             '' if pr_ready else 'Validation passed, but PR creation is disabled until a write token is configured.',
+            archive_notice_text(archive_path),
         )
     except ValidationError as exc:
+        cleanup_uploaded_archive(archive_path)
         return (
             None,
             '',
+            '',
             build_failure_markdown(str(exc)),
             json.dumps({'status': 'error', 'errors': str(exc).splitlines()}, indent=2, ensure_ascii=False),
             gr.update(interactive=False),
             '',
+            archive_notice_text(None),
         )
     except Exception as exc:
+        cleanup_uploaded_archive(archive_path)
         return (
             None,
             '',
+            '',
             build_failure_markdown(str(exc)),
             json.dumps({'status': 'error', 'errors': [str(exc)]}, indent=2, ensure_ascii=False),
             gr.update(interactive=False),
             '',
+            archive_notice_text(None),
         )
+def create_pr(state: dict | None, archive_path: str | None):
+    """Open the dataset PR for a validated submission, retrying once on conflict.
+
+    Returns a 5-tuple matching the click handler outputs, in order: state,
+    archive_state, create_pr_btn, pr_md, archive_notice.
+    """
     if not state:
+        return (
+            None,
+            '',
+            gr.update(interactive=False),
+            '## PR creation failed\n\n- Validate a submission first.',
+            'No ZIP file selected yet.',
+        )
     prepared = PreparedSubmission.from_state(state)
     token = load_hf_token()
+    reassigned = False
+    # At most two attempts: the second runs only after a retryable error.
+    for attempt in range(2):
+        try:
+            # Re-check the task ID and read the head SHA on every attempt.
+            prepared, was_reassigned, head_sha = refresh_prepared_submission_for_pr(
+                prepared,
+                repo_id=DEFAULT_REPO_ID,
+                token=token,
+            )
+            # Remember a reassignment from either attempt for the final message.
+            reassigned = reassigned or was_reassigned
+            commit_info = create_dataset_pr(
+                prepared,
+                repo_id=DEFAULT_REPO_ID,
+                token=token,
+                parent_commit=head_sha,
+            )
+            pr_url = commit_info.pr_url or commit_info.commit_url
+            lines = [
+                '## PR created',
+                '',
+                f'- Task ID: `{prepared.assigned_task_id}`',
+                f'- PR: {pr_url}',
+            ]
+            if reassigned:
+                # Index 3 places the note between the Task ID and PR bullets.
+                lines.insert(3, '- The task ID was reassigned at PR time because the previously validated ID is no longer available on the dataset main branch.')
+            message = '\n'.join(lines)
+            # Success: remove the work dir and archive, and clear session state.
+            cleanup_work_dir(prepared.work_dir)
+            cleanup_uploaded_archive(archive_path)
+            return None, '', gr.update(interactive=False), message, archive_notice_text(None)
+        except Exception as exc:
+            if attempt == 0 and is_retryable_pr_error(exc):
+                continue
+            message = str(exc).strip() or 'Unknown PR creation error'
+            if is_retryable_pr_error(exc):
+                message += '\nPlease click "Create Dataset PR" again. The dataset main branch changed while your PR was being created.'
+            # Failure: return the state and archive path unchanged (nothing is
+            # cleaned up); the button stays enabled only when a token is set.
+            return (
+                prepared.to_state(),
+                archive_path or '',
+                gr.update(interactive=bool(token)),
+                build_failure_markdown(message),
+                archive_notice_text(archive_path),
+            )
 with gr.Blocks(title=SPACE_TITLE, fill_width=True) as demo:
+    state = gr.State(None, time_to_live=STATE_TTL_SECONDS, delete_callback=cleanup_submission_state)
+    archive_state = gr.State('', time_to_live=STATE_TTL_SECONDS, delete_callback=cleanup_uploaded_archive)
     gr.HTML(build_hero_html())
             with gr.Column(scale=1, min_width=0, elem_classes=['shell-spacer']):
                 gr.HTML('')
+    archive.upload(
+        fn=handle_archive_upload,
+        inputs=[archive, archive_state],
+        outputs=[archive_state, archive_notice],
+    )
     validate_btn.click(
         fn=validate_submission,
             notes,
             state,
         ],
+        outputs=[
+            state,
+            archive_state,
+            assigned_task_id,
+            validation_md,
+            validation_report,
+            create_pr_btn,
+            pr_md,
+            archive_notice,
+        ],
+    )
+    create_pr_btn.click(
+        fn=create_pr,
+        inputs=[
+            state,
+            archive_state,
+        ],
+        outputs=[
+            state,
+            archive_state,
+            create_pr_btn,
+            pr_md,
+            archive_notice,
+        ],
     )
 if __name__ == '__main__':

repo_ops.py CHANGED Viewed

@@ -39,6 +39,14 @@ def list_existing_task_ids(repo_id: str = DEFAULT_REPO_ID, token: str | None = N
     return task_ids
 def allocate_next_task_id(domain: str, existing_task_ids: Iterable[str]) -> str:
     domain = normalize_domain_token(domain)
     if not DOMAIN_TOKEN_RE.fullmatch(domain):
@@ -80,6 +88,7 @@ def create_dataset_pr(
     *,
     repo_id: str = DEFAULT_REPO_ID,
     token: str | None = None,
 ):
     token = token or load_hf_token()
     if not token:
@@ -111,4 +120,5 @@ def create_dataset_pr(
         token=token,
         create_pr=True,
         revision='main',
     )

     return task_ids
+def get_repo_head_sha(repo_id: str = DEFAULT_REPO_ID, token: str | None = None) -> str:
+    """Return the commit SHA reported for the ``main`` revision of the dataset repo.
+
+    Raises:
+        RuntimeError: If the returned repo info carries no SHA.
+    """
+    client = HfApi(token=token)
+    repo = client.repo_info(repo_id=repo_id, repo_type='dataset', revision='main', token=token)
+    if getattr(repo, 'sha', None):
+        return repo.sha
+    raise RuntimeError(f'Failed to fetch HEAD SHA for dataset repo {repo_id}.')
 def allocate_next_task_id(domain: str, existing_task_ids: Iterable[str]) -> str:
     domain = normalize_domain_token(domain)
     if not DOMAIN_TOKEN_RE.fullmatch(domain):
     *,
     repo_id: str = DEFAULT_REPO_ID,
     token: str | None = None,
+    parent_commit: str | None = None,
 ):
     token = token or load_hf_token()
     if not token:
         token=token,
         create_pr=True,
         revision='main',
+        parent_commit=parent_commit,
     )

validator.py CHANGED Viewed

@@ -6,6 +6,8 @@ import re
 import shutil
 import stat
 import tempfile
 import zipfile
 from dataclasses import asdict, dataclass
 from pathlib import Path, PurePosixPath
@@ -43,6 +45,16 @@ IGNORED_ARCHIVE_NAMES = {'.DS_Store'}
 DEFAULT_MAX_FILES = int(os.environ.get('RCB_SPACE_MAX_FILES', '5000'))
 DEFAULT_MAX_TOTAL_BYTES = int(os.environ.get('RCB_SPACE_MAX_TOTAL_BYTES', str(5 * 1024 * 1024 * 1024)))
 DEFAULT_MAX_SINGLE_FILE_BYTES = int(os.environ.get('RCB_SPACE_MAX_SINGLE_FILE_BYTES', str(1024 * 1024 * 1024)))
 @dataclass
@@ -151,8 +163,88 @@ def cleanup_work_dir(work_dir: str | Path | None) -> None:
     shutil.rmtree(Path(work_dir), ignore_errors=True)
 def create_work_dir() -> Path:
-    return Path(tempfile.mkdtemp(prefix='rcb_space_submit_'))
 def extract_submission_zip(

 import shutil
 import stat
 import tempfile
+import time
+import uuid
 import zipfile
 from dataclasses import asdict, dataclass
 from pathlib import Path, PurePosixPath
 DEFAULT_MAX_FILES = int(os.environ.get('RCB_SPACE_MAX_FILES', '5000'))
 DEFAULT_MAX_TOTAL_BYTES = int(os.environ.get('RCB_SPACE_MAX_TOTAL_BYTES', str(5 * 1024 * 1024 * 1024)))
 DEFAULT_MAX_SINGLE_FILE_BYTES = int(os.environ.get('RCB_SPACE_MAX_SINGLE_FILE_BYTES', str(1024 * 1024 * 1024)))
+# Name prefixes that mark temp entries managed by this module: work
+# directories (see create_work_dir) and persisted upload archives.
+WORK_DIR_PREFIX = 'rcb_space_submit_'
+ARCHIVE_PREFIX = 'rcb_space_upload_'
+# Resolved directories under which archives may be deleted: the system temp
+# dir plus GRADIO_TEMP_DIR when set. The set literal collapses duplicates and
+# `if root` drops the empty default used when the variable is unset.
+TEMP_ROOTS = tuple(
+    Path(root).resolve()
+    for root in {
+        tempfile.gettempdir(),
+        os.environ.get('GRADIO_TEMP_DIR', ''),
+    }
+    if root
+)
 @dataclass
     shutil.rmtree(Path(work_dir), ignore_errors=True)
+def cleanup_submission_state(state: dict[str, Any] | None) -> None:
+    """Remove the work directory referenced by a submission state dict.
+
+    Empty or non-dict values are ignored.
+    """
+    if state and isinstance(state, dict):
+        cleanup_work_dir(state.get('work_dir'))
+def _is_under_temp_root(path: Path) -> bool:
+    """Return True when ``path`` resolves to, or below, one of TEMP_ROOTS."""
+    try:
+        target = path.resolve()
+    except OSError:
+        return False
+    for root in TEMP_ROOTS:
+        if root == target or root in target.parents:
+            return True
+    return False
+def cleanup_uploaded_archive(archive_path: str | Path | None) -> None:
+    """Delete a managed upload archive, ignoring anything that is not one.
+
+    The file is removed only when it exists as a regular file, has a ``.zip``
+    suffix, lives under a known temp root and carries the managed archive
+    prefix. Deletion errors are swallowed (best effort).
+    """
+    if not archive_path:
+        return
+    candidate = Path(archive_path)
+    is_managed_archive = (
+        candidate.exists()
+        and candidate.is_file()
+        and candidate.suffix.lower() == '.zip'
+        and _is_under_temp_root(candidate)
+        and candidate.name.startswith(ARCHIVE_PREFIX)
+    )
+    if not is_managed_archive:
+        return
+    try:
+        candidate.unlink()
+    except OSError:
+        pass
+def persist_uploaded_archive(archive_path: str | Path) -> Path:
+    """Copy an uploaded ZIP to a uniquely named managed file in the temp dir.
+
+    Returns:
+        Path of the managed copy, named ``ARCHIVE_PREFIX`` + random hex + ``.zip``.
+
+    Raises:
+        ValidationError: If the upload is missing, is not a regular file, or
+            does not have a ``.zip`` suffix.
+    """
+    upload = Path(archive_path)
+    if not (upload.exists() and upload.is_file()):
+        raise ValidationError(f'Uploaded archive does not exist: {upload}')
+    if upload.suffix.lower() != '.zip':
+        raise ValidationError('Only .zip uploads are supported.')
+    destination = Path(tempfile.gettempdir()) / f'{ARCHIVE_PREFIX}{uuid.uuid4().hex}.zip'
+    # NOTE(review): copy2 also copies the source mtime, which the stale-file
+    # sweep uses to age entries - confirm uploads always arrive with a fresh mtime.
+    shutil.copy2(upload, destination)
+    return destination
+def cleanup_stale_managed_files(max_age_seconds: int) -> int:
+    """Delete managed work dirs and upload archives older than ``max_age_seconds``.
+
+    Only direct children of the system temp dir are considered: directories
+    whose name starts with WORK_DIR_PREFIX, and ``.zip`` files whose name
+    starts with ARCHIVE_PREFIX. Age is judged by modification time.
+
+    Args:
+        max_age_seconds: Minimum age for removal; values <= 0 disable the sweep.
+
+    Returns:
+        Number of entries actually removed. Entries that could not be deleted
+        are not counted.
+    """
+    if max_age_seconds <= 0:
+        return 0
+    temp_root = Path(tempfile.gettempdir())
+    if not temp_root.exists():
+        return 0
+    cutoff = time.time() - max_age_seconds
+    removed = 0
+    try:
+        entries = list(temp_root.iterdir())
+    except OSError:
+        # Best-effort housekeeping: an unreadable temp dir must not raise to the caller.
+        return 0
+    for path in entries:
+        if path.is_dir():
+            if not path.name.startswith(WORK_DIR_PREFIX):
+                continue
+        elif path.is_file():
+            if not path.name.startswith(ARCHIVE_PREFIX) or path.suffix.lower() != '.zip':
+                continue
+        else:
+            continue
+        try:
+            if path.stat().st_mtime > cutoff:
+                continue
+        except OSError:
+            continue
+        if path.is_dir():
+            shutil.rmtree(path, ignore_errors=True)
+            # rmtree swallows errors here, so confirm the directory is really gone
+            # before counting it.
+            if path.exists():
+                continue
+        else:
+            try:
+                path.unlink()
+            except OSError:
+                continue
+        removed += 1
+    return removed
 def create_work_dir() -> Path:
+    """Create a fresh temp directory named with WORK_DIR_PREFIX and return its path."""
+    work_dir = tempfile.mkdtemp(prefix=WORK_DIR_PREFIX)
+    return Path(work_dir)
 def extract_submission_zip(