File size: 9,048 Bytes
8ed167c
b81d3dd
f65aaaf
a8989f9
f65aaaf
b81d3dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f65aaaf
f0eee3f
 
 
 
f65aaaf
f0eee3f
 
b81d3dd
 
f0eee3f
b81d3dd
f0eee3f
 
 
 
f65aaaf
f0eee3f
 
 
a8989f9
 
 
 
d5f5654
f0eee3f
 
 
 
 
 
 
 
71ddcd2
d5f5654
 
 
 
 
 
07c18c7
d5f5654
07c18c7
d5f5654
 
07c18c7
d5f5654
 
2060674
 
d5f5654
 
 
 
2060674
 
 
e76200d
 
 
 
07fd3b8
757c484
e76200d
2060674
d5f5654
 
07c18c7
2060674
d5f5654
 
07c18c7
2060674
6ec1943
b218e8e
6ec1943
e360100
 
 
6ec1943
 
 
d5f5654
 
 
 
 
 
 
 
a5b79af
d5f5654
 
 
 
 
b218e8e
 
 
 
6ec1943
 
 
 
07c18c7
6ec1943
d5f5654
6ec1943
 
 
d5f5654
 
6ec1943
 
 
 
 
d5f5654
a5b79af
d5f5654
 
07c18c7
a5b79af
07c18c7
f0eee3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757c484
f0eee3f
 
 
07c18c7
 
 
 
 
 
 
 
 
 
6ec1943
 
 
 
 
 
 
07c18c7
6ec1943
 
 
 
 
 
07c18c7
 
 
 
 
 
 
 
 
6ec1943
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import mlcroissant._src.operation_graph.operations.download as dl_mod
import requests as _requests_mod
import requests
import os

# Server-level Hugging Face token, read once at import time; None when the
# HF_TOKEN environment variable is not set.
_SERVER_HF_TOKEN = os.environ.get("HF_TOKEN")
print("[DEBUG] HF_TOKEN is", "set" if _SERVER_HF_TOKEN else "missing")

# _active_token holds the HF token to use for the current validation request.
# It defaults to the server-level HF_TOKEN but can be overridden per-request
# via set_active_token() so that logged-in users' own tokens are used instead.
# Stored in a one-key dict (rather than a bare module global) so the patched
# download hook below always reads the latest value without `global` rebinding.
_active_token: dict[str, str | None] = {"token": _SERVER_HF_TOKEN}


def set_active_token(token: str | None) -> None:
    """Select the HF token used for the current validation request.

    A falsy *token* (None or empty string) falls back to the server-level
    default read from the HF_TOKEN environment variable.
    """
    _active_token["token"] = token or _SERVER_HF_TOKEN


def clear_active_token() -> None:
    """Reset the active HF token back to the server-level default."""
    # set_active_token maps a falsy token to the server default, so delegating
    # with None restores the default exactly as a direct assignment would.
    set_active_token(None)


# Patch requests.Session.send so an HTTP 429 aborts immediately instead of
# being retried silently for minutes by mlcroissant / fsspec / huggingface_hub.
_orig_session_send = _requests_mod.Session.send

def _rate_limit_aware_send(self, request, **kwargs):
    """Session.send wrapper that raises HTTPError on 429 rate-limit responses."""
    resp = _orig_session_send(self, request, **kwargs)
    if resp.status_code != 429:
        return resp
    # Surface the server's suggested back-off, if any, in the error message.
    retry_after = resp.headers.get("Retry-After", "unknown")
    raise _requests_mod.exceptions.HTTPError(
        f"HTTP 429 Too Many Requests for {request.url}. "
        f"Retry-After: {retry_after}s. "
        "You are being rate limited. Log in with your Hugging Face account to avoid this.",
        response=resp,
    )

_requests_mod.Session.send = _rate_limit_aware_send


# Only send HF credentials when downloading from huggingface.co.
# The default get_basic_auth_from_env() applies auth to ALL URLs, which
# causes non-HF hosts (e.g. OpenML) to return 400 Bad Request.
_orig_download_from_http = dl_mod.Download._download_from_http

def _hf_aware_download(self, filepath):
    """Download hook that scopes HF basic-auth env vars to huggingface.co URLs."""
    target_url = self.node.content_url or ""
    hf_token = _active_token["token"]
    if hf_token and "huggingface.co" in target_url:
        # mlcroissant reads these env vars via get_basic_auth_from_env().
        os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
        os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = hf_token
    else:
        # Clear any credentials left over from a previous HF download so they
        # are not sent to unrelated hosts.
        for var in ("CROISSANT_BASIC_AUTH_USERNAME", "CROISSANT_BASIC_AUTH_PASSWORD"):
            os.environ.pop(var, None)
    return _orig_download_from_http(self, filepath)

dl_mod.Download._download_from_http = _hf_aware_download

import logging
import mlcroissant as mlc
import func_timeout
import json
import traceback

# Suppress noisy mlcroissant pattern-matching warnings.
# NOTE: logging.getLogger("root") returns a logger *named* "root", which is a
# distinct object from the actual root logger returned by logging.getLogger();
# the filter is attached to both to cover either way the library obtains it.
def _drop_could_not_match(record):
    """Drop records whose message contains mlcroissant's 'Could not match'."""
    return "Could not match" not in record.getMessage()

logging.getLogger("root").addFilter(_drop_could_not_match)
logging.getLogger().addFilter(_drop_could_not_match)

WAIT_TIME = 10 * 60  # seconds

def validate_json(file_path):
    """Validate that the file at *file_path* contains well-formed JSON.

    Args:
        file_path: Path to the candidate JSON file.

    Returns:
        A ``(passed, message, data)`` tuple: ``passed`` is True when the file
        parses, ``message`` is a human-readable result, and ``data`` is the
        parsed object (or ``None`` on failure).
    """
    try:
        # JSON text is UTF-8 per RFC 8259; read it explicitly instead of
        # relying on the platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        return True, "The file is valid JSON.", json_data
    except json.JSONDecodeError as e:
        error_message = f"Invalid JSON format: {str(e)}"
        return False, error_message, None
    except Exception as e:
        # Broad catch is deliberate: any I/O or decode problem becomes a
        # user-facing validation failure rather than a server error.
        error_message = f"Error reading file: {str(e)}"
        return False, error_message, None

REQUIRED_SCHEMA_FIELDS = ["license"]

def validate_croissant(json_data):
    """Validate that the JSON-LD follows the Croissant schema.

    Returns a ``(passed, message, status)`` tuple where status is one of
    "pass" / "error". A missing license is reported with passed=True but an
    "error" status so the UI highlights it without failing the whole run.
    """
    try:
        # Constructing the Dataset runs mlcroissant's schema validation; the
        # instance itself is not needed afterwards.
        mlc.Dataset(jsonld=json_data)
        absent = [name for name in REQUIRED_SCHEMA_FIELDS if not json_data.get(name)]
        if not absent:
            return True, "The dataset passes Croissant validation.", "pass"
        return True, (
            "The `license` field is missing. This is required for NeurIPS dataset submissions. "
            "Please add a `license` field to your Croissant file with the name or URL of the licence governing your dataset. "
            "Where possible, use <a href='https://www.kaggle.com/discussions/getting-started/116476' target='_blank'>open licenses</a> that "
            "allow reuse and reproducibility. However, when the dataset contains sensitive data or stricter licensing is unavoidable, "
            "please select an appropriate license that is as open as possible given the constraints. "
            "You can use our <a href='https://huggingface.co/spaces/JoaquinVanschoren/croissant-rai-checker' target='_blank'>online RAI editing tool</a> to fill this information."
        ), "error"
    except mlc.ValidationError as e:
        details = traceback.format_exc()
        return False, f"Validation failed: {str(e)}\n\n{details}", "error"
    except Exception as e:
        details = traceback.format_exc()
        return False, f"Unexpected error during validation: {str(e)}\n\n{details}", "error"
    
def try_generate_record(record_collection):
    """Try to materialize one record; return "success" or the raised exception.

    Errors are returned rather than raised so the caller (which wraps this in
    func_timeout) can distinguish generation failures from timeouts.
    """
    try:
        # Pulling a single item is enough to prove the pipeline produces data.
        next(iter(record_collection), None)
        return "success"
    except Exception as e:
        return e

def validate_records(json_data):
    """Validate that records can be generated within the time limit.

    Probes one record from each record set, each under a WAIT_TIME cap.
    Returns a ``(passed, message, status)`` tuple with status in
    "pass" / "warning" / "error".
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets
        if not record_sets:
            return True, "No record sets found to validate.", "pass"

        passed_messages = []
        for rs in record_sets:
            try:
                def _probe():
                    # Created and invoked within this same iteration, so there
                    # is no late-binding hazard on `rs`.
                    return try_generate_record(dataset.records(record_set=rs.uuid))

                outcome = func_timeout.func_timeout(WAIT_TIME, _probe)
                if isinstance(outcome, Exception):
                    raise outcome  # re-raise actual error outside timeout
                passed_messages.append(f"Record set '{rs.uuid}' passed validation.")

            except func_timeout.exceptions.FunctionTimedOut:
                return False, f"Record set '{rs.uuid}' generation took too long (>10 minutes).", "warning"

            except Exception as e:
                details = traceback.format_exc()
                message = (
                    f"Record set '{rs.uuid}' failed due to generation error:\n\n"
                    f"```text\n{str(e)}\n\n{details}```"
                )
                return False, message, "warning"

        return True, "\n".join(passed_messages), "pass"
    except Exception as e:
        details = traceback.format_exc()
        return False, f"Unexpected error during records validation: {str(e)}\n\n{details}", "error"
    
# Responsible-AI (RAI) metadata keys that NeurIPS requires in a Croissant file.
RAI_FIELDS = [
    "rai:dataLimitations",
    "rai:dataBiases",
    "rai:personalSensitiveInformation",
    "rai:dataUseCases",
    "rai:dataSocialImpact",
    "rai:hasSyntheticData",
]

RAI_GUIDELINES_URL = "https://neurips.cc/Conferences/2026/EvaluationsDatasetsHosting"

def validate_rai(json_data):
    """Check that all required Responsible AI metadata fields are present.

    Only key *presence* is checked, not value content. Returns a
    ``(passed, message)`` pair.
    """
    absent = [name for name in RAI_FIELDS if name not in json_data]
    if absent:
        bullet_list = "\n".join(f"- `{name}`" for name in absent)
        return False, (
            f"The following required Responsible AI metadata fields are missing:\n{bullet_list}\n\n"
            f"Please refer to the <a href='{RAI_GUIDELINES_URL}' target='_blank'>NeurIPS guidelines for instructions</a> on how to add them.\n\n You can use our <a href='https://huggingface.co/spaces/JoaquinVanschoren/croissant-rai-checker' target='_blank'>online RAI editing tool</a> to facilitate this process."
        )
    return True, "All required Responsible AI metadata fields are present."

def generate_validation_report(filename, json_data, results):
    """Generate a detailed validation report in markdown format.

    Args:
        filename: Name of the validated file, echoed into the report header.
        json_data: The parsed Croissant JSON-LD, appended verbatim as reference.
        results: Iterable of ``(test_name, passed, message)`` or
            ``(test_name, passed, message, status)`` tuples, where status is
            one of "pass", "warning", or "error".

    Returns:
        The complete report as a single markdown string.
    """
    report = []
    report.append("# CROISSANT VALIDATION REPORT")
    report.append("=" * 80)
    report.append("## VALIDATION RESULTS")
    report.append("-" * 80)
    # BUG FIX: the f-string previously hard-coded "(unknown)" and never
    # interpolated the `filename` parameter, leaving it unused.
    report.append(f"Starting validation for file: {filename}")

    # Add validation results
    for result in results:
        if len(result) == 4:
            test_name, passed, message, status = result
        else:
            # Legacy 3-tuple results carry no explicit status; derive one.
            test_name, passed, message = result
            status = "pass" if passed else "error"

        report.append(f"### {test_name}")
        if status == "pass":
            report.append("✓")
        elif status == "warning":
            report.append("?")  # Question mark for warning
        else:
            report.append("✗")
        report.append(message.strip())  # Remove any trailing newlines

    # Add JSON-LD reference
    report.append("## JSON-LD REFERENCE")
    report.append("=" * 80)
    report.append("```json")
    report.append(json.dumps(json_data, indent=2))
    report.append("```")

    return "\n".join(report)