Spaces:
Running
Running
Commit ·
f0eee3f
1
Parent(s): 9c3a4be
Adding RAI checks
Browse files- app.py +20 -16
- validation.py +49 -6
app.py
CHANGED
|
@@ -1,22 +1,10 @@
|
|
| 1 |
-
import mlcroissant._src.operation_graph.operations.download as dl_mod
|
| 2 |
import requests
|
| 3 |
import os
|
| 4 |
-
|
| 5 |
-
# Make sure the HF token is loaded
|
| 6 |
-
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 7 |
-
|
| 8 |
-
# Set the environment variables Croissant expects
|
| 9 |
-
os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
|
| 10 |
-
os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = HF_TOKEN or ""
|
| 11 |
-
|
| 12 |
-
print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
|
| 13 |
-
print("[DEBUG] Basic auth env set for Croissant")
|
| 14 |
-
|
| 15 |
import gradio as gr
|
| 16 |
import json
|
| 17 |
import time
|
| 18 |
import traceback
|
| 19 |
-
from validation import validate_json, validate_croissant, validate_records, generate_validation_report
|
| 20 |
|
| 21 |
def process_file(file):
|
| 22 |
results = []
|
|
@@ -45,6 +33,10 @@ def process_file(file):
|
|
| 45 |
records_message = records_message.replace("\n✓\n", "\n")
|
| 46 |
results.append(("Records Generation Test", records_valid, records_message, records_status))
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
# Generate final report
|
| 49 |
report = generate_validation_report(filename, json_data, results)
|
| 50 |
|
|
@@ -64,6 +56,7 @@ def create_ui():
|
|
| 64 |
1. If the file is valid JSON
|
| 65 |
2. If it passes Croissant schema validation
|
| 66 |
3. If records can be generated within a reasonable time
|
|
|
|
| 67 |
""")
|
| 68 |
|
| 69 |
# Track the active tab for conditional UI updates
|
|
@@ -107,7 +100,8 @@ def create_ui():
|
|
| 107 |
label="Report Content",
|
| 108 |
visible=True,
|
| 109 |
show_copy_button=True,
|
| 110 |
-
lines=10
|
|
|
|
| 111 |
)
|
| 112 |
|
| 113 |
# Define CSS for the validation UI
|
|
@@ -287,6 +281,12 @@ def create_ui():
|
|
| 287 |
.gr-accordion[data-open=true] > .label-wrap::after {
|
| 288 |
transform: rotate(90deg);
|
| 289 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
</style>
|
| 291 |
""")
|
| 292 |
|
|
@@ -380,6 +380,10 @@ def create_ui():
|
|
| 380 |
records_valid, records_message, records_status = validate_records(json_data)
|
| 381 |
results.append(("Records Generation Test (Optional)", records_valid, records_message, records_status))
|
| 382 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
# Generate report
|
| 384 |
report = generate_validation_report(url.split("/")[-1], json_data, results)
|
| 385 |
report_filename = f"report_croissant-validation_{json_data.get('name', 'unnamed')}.md"
|
|
@@ -438,11 +442,11 @@ def create_ui():
|
|
| 438 |
status_class = "status-success"
|
| 439 |
status_icon = "✓"
|
| 440 |
message_with_emoji = "✅ " + message
|
| 441 |
-
elif status == "warning":
|
| 442 |
status_class = "status-warning"
|
| 443 |
status_icon = "?"
|
| 444 |
message_with_emoji = "⚠️ Could not automatically generate records. This is oftentimes not an issue (e.g. datasets could be too large or too complex), and it's not required to pass this test to submit to NeurIPS.\n\n" + message
|
| 445 |
-
else: # error
|
| 446 |
status_class = "status-error"
|
| 447 |
status_icon = "✗"
|
| 448 |
message_with_emoji = "❌ " + message
|
|
|
|
|
|
|
| 1 |
import requests
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
import json
|
| 5 |
import time
|
| 6 |
import traceback
|
| 7 |
+
from validation import validate_json, validate_croissant, validate_records, validate_rai, generate_validation_report
|
| 8 |
|
| 9 |
def process_file(file):
|
| 10 |
results = []
|
|
|
|
| 33 |
records_message = records_message.replace("\n✓\n", "\n")
|
| 34 |
results.append(("Records Generation Test", records_valid, records_message, records_status))
|
| 35 |
|
| 36 |
+
# Check 4: Responsible AI metadata
|
| 37 |
+
rai_valid, rai_message = validate_rai(json_data)
|
| 38 |
+
results.append(("Responsible AI Metadata", rai_valid, rai_message, "pass" if rai_valid else "error"))
|
| 39 |
+
|
| 40 |
# Generate final report
|
| 41 |
report = generate_validation_report(filename, json_data, results)
|
| 42 |
|
|
|
|
| 56 |
1. If the file is valid JSON
|
| 57 |
2. If it passes Croissant schema validation
|
| 58 |
3. If records can be generated within a reasonable time
|
| 59 |
+
4. If all required Responsible AI metadata fields are present
|
| 60 |
""")
|
| 61 |
|
| 62 |
# Track the active tab for conditional UI updates
|
|
|
|
| 100 |
label="Report Content",
|
| 101 |
visible=True,
|
| 102 |
show_copy_button=True,
|
| 103 |
+
lines=10,
|
| 104 |
+
elem_id="report-text-box"
|
| 105 |
)
|
| 106 |
|
| 107 |
# Define CSS for the validation UI
|
|
|
|
| 281 |
.gr-accordion[data-open=true] > .label-wrap::after {
|
| 282 |
transform: rotate(90deg);
|
| 283 |
}
|
| 284 |
+
|
| 285 |
+
/* Prevent report textbox from bubbling scroll to the page */
|
| 286 |
+
#report-text-box textarea {
|
| 287 |
+
overflow-y: auto !important;
|
| 288 |
+
overscroll-behavior: contain;
|
| 289 |
+
}
|
| 290 |
</style>
|
| 291 |
""")
|
| 292 |
|
|
|
|
| 380 |
records_valid, records_message, records_status = validate_records(json_data)
|
| 381 |
results.append(("Records Generation Test (Optional)", records_valid, records_message, records_status))
|
| 382 |
|
| 383 |
+
# Check 4: Responsible AI metadata
|
| 384 |
+
rai_valid, rai_message = validate_rai(json_data)
|
| 385 |
+
results.append(("Responsible AI Metadata", rai_valid, rai_message, "pass" if rai_valid else "error"))
|
| 386 |
+
|
| 387 |
# Generate report
|
| 388 |
report = generate_validation_report(url.split("/")[-1], json_data, results)
|
| 389 |
report_filename = f"report_croissant-validation_{json_data.get('name', 'unnamed')}.md"
|
|
|
|
| 442 |
status_class = "status-success"
|
| 443 |
status_icon = "✓"
|
| 444 |
message_with_emoji = "✅ " + message
|
| 445 |
+
elif status == "warning" and "Records" in test_name:
|
| 446 |
status_class = "status-warning"
|
| 447 |
status_icon = "?"
|
| 448 |
message_with_emoji = "⚠️ Could not automatically generate records. This is oftentimes not an issue (e.g. datasets could be too large or too complex), and it's not required to pass this test to submit to NeurIPS.\n\n" + message
|
| 449 |
+
else: # error or non-records warning
|
| 450 |
status_class = "status-error"
|
| 451 |
status_icon = "✗"
|
| 452 |
message_with_emoji = "❌ " + message
|
validation.py
CHANGED
|
@@ -2,21 +2,40 @@ import mlcroissant._src.operation_graph.operations.download as dl_mod
|
|
| 2 |
import requests
|
| 3 |
import os
|
| 4 |
|
| 5 |
-
# Make sure the HF token is loaded
|
| 6 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
|
|
|
| 15 |
import mlcroissant as mlc
|
| 16 |
import func_timeout
|
| 17 |
import json
|
| 18 |
import traceback
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
WAIT_TIME = 10 * 60 # seconds
|
| 21 |
|
| 22 |
def validate_json(file_path):
|
|
@@ -96,6 +115,30 @@ def validate_records(json_data):
|
|
| 96 |
error_message = f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
|
| 97 |
return False, error_message, "error"
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
def generate_validation_report(filename, json_data, results):
|
| 100 |
"""Generate a detailed validation report in markdown format."""
|
| 101 |
report = []
|
|
|
|
| 2 |
import requests
|
| 3 |
import os
|
| 4 |
|
|
|
|
| 5 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 6 |
+
print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
|
| 7 |
|
| 8 |
+
# Only send HF credentials when downloading from huggingface.co.
|
| 9 |
+
# The default get_basic_auth_from_env() applies auth to ALL URLs, which
|
| 10 |
+
# causes non-HF hosts (e.g. OpenML) to return 400 Bad Request.
|
| 11 |
+
_orig_download_from_http = dl_mod.Download._download_from_http
|
| 12 |
|
| 13 |
+
def _hf_aware_download(self, filepath):
    """Download ``self.node.content_url``, sending HF credentials only to Hugging Face.

    Replacement for ``Download._download_from_http``. Before delegating to the
    original implementation it sets (or clears) the
    ``CROISSANT_BASIC_AUTH_USERNAME`` / ``CROISSANT_BASIC_AUTH_PASSWORD``
    environment variables, so the token is attached only when the target host
    is ``huggingface.co`` or one of its subdomains.

    Args:
        filepath: Destination path, forwarded unchanged to the original
            download implementation.

    Returns:
        Whatever the original ``_download_from_http`` returns.
    """
    # Local import keeps this patch self-contained within the block.
    from urllib.parse import urlsplit

    url = self.node.content_url or ""
    try:
        host = urlsplit(url).hostname or ""
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): never attach credentials.
        host = ""

    # Match on the parsed hostname rather than a substring of the whole URL;
    # otherwise "https://huggingface.co.evil.example/" or
    # "https://evil.example/?x=huggingface.co" would be sent the token.
    is_hf_host = host == "huggingface.co" or host.endswith(".huggingface.co")

    if HF_TOKEN and is_hf_host:
        os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
        os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = HF_TOKEN
    else:
        # Clear any credentials left over from a previous Hugging Face download.
        os.environ.pop("CROISSANT_BASIC_AUTH_USERNAME", None)
        os.environ.pop("CROISSANT_BASIC_AUTH_PASSWORD", None)
    return _orig_download_from_http(self, filepath)
|
| 22 |
+
|
| 23 |
+
dl_mod.Download._download_from_http = _hf_aware_download
|
| 24 |
|
| 25 |
+
import logging
|
| 26 |
import mlcroissant as mlc
|
| 27 |
import func_timeout
|
| 28 |
import json
|
| 29 |
import traceback
|
| 30 |
|
| 31 |
+
# Suppress noisy mlcroissant pattern-matching warnings
|
| 32 |
+
logging.getLogger("root").addFilter(
|
| 33 |
+
lambda r: "Could not match" not in r.getMessage()
|
| 34 |
+
)
|
| 35 |
+
logging.getLogger().addFilter(
|
| 36 |
+
lambda r: "Could not match" not in r.getMessage()
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
WAIT_TIME = 10 * 60 # seconds
|
| 40 |
|
| 41 |
def validate_json(file_path):
|
|
|
|
| 115 |
error_message = f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
|
| 116 |
return False, error_message, "error"
|
| 117 |
|
| 118 |
+
# Top-level Croissant properties that validate_rai() requires to be present.
RAI_FIELDS = [
    "rai:dataLimitations",
    "rai:dataBiases",
    "rai:personalSensitiveInformation",
    "rai:dataUseCases",
    "rai:dataSocialImpact",
    "rai:hasSyntheticData",
    "prov:wasGeneratedBy",
]

# Page linked from the failure message.
RAI_GUIDELINES_URL = "https://neurips.cc/Conferences/2026/EvaluationsDatasetsHosting"


def validate_rai(json_data):
    """Check that all required Responsible AI metadata fields are present.

    Args:
        json_data: The parsed Croissant JSON-LD document. Anything that is not
            a dict (``None``, a list, a string, ...) is treated as a document
            with no fields instead of raising or doing substring matching.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is ``True`` only when
        every entry of ``RAI_FIELDS`` is a key of the document with a
        non-null value; otherwise ``message`` lists the missing fields.
    """
    document = json_data if isinstance(json_data, dict) else {}

    # A key whose value is null counts as missing: JSON-LD treats null-valued
    # keys as absent. Falsy-but-real values such as ``False`` (a legitimate
    # answer for rai:hasSyntheticData) still count as present.
    missing = [field for field in RAI_FIELDS if document.get(field) is None]
    if not missing:
        return True, "All required Responsible AI metadata fields are present."

    missing_list = "\n".join(f"- `{f}`" for f in missing)
    message = (
        f"The following required Responsible AI metadata fields are missing:\n{missing_list}\n\n"
        f"Please refer to the <a href='{RAI_GUIDELINES_URL}' target='_blank'>NeurIPS guidelines for instructions</a> on how to add them."
    )
    return False, message
|
| 141 |
+
|
| 142 |
def generate_validation_report(filename, json_data, results):
|
| 143 |
"""Generate a detailed validation report in markdown format."""
|
| 144 |
report = []
|