JoaquinVanschoren committed on
Commit
f0eee3f
·
1 Parent(s): 9c3a4be

Adding RAI checks

Browse files
Files changed (2) hide show
  1. app.py +20 -16
  2. validation.py +49 -6
app.py CHANGED
@@ -1,22 +1,10 @@
1
- import mlcroissant._src.operation_graph.operations.download as dl_mod
2
  import requests
3
  import os
4
-
5
- # Make sure the HF token is loaded
6
- HF_TOKEN = os.environ.get("HF_TOKEN")
7
-
8
- # Set the environment variables Croissant expects
9
- os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
10
- os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = HF_TOKEN or ""
11
-
12
- print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
13
- print("[DEBUG] Basic auth env set for Croissant")
14
-
15
  import gradio as gr
16
  import json
17
  import time
18
  import traceback
19
- from validation import validate_json, validate_croissant, validate_records, generate_validation_report
20
 
21
  def process_file(file):
22
  results = []
@@ -45,6 +33,10 @@ def process_file(file):
45
  records_message = records_message.replace("\n✓\n", "\n")
46
  results.append(("Records Generation Test", records_valid, records_message, records_status))
47
 
 
 
 
 
48
  # Generate final report
49
  report = generate_validation_report(filename, json_data, results)
50
 
@@ -64,6 +56,7 @@ def create_ui():
64
  1. If the file is valid JSON
65
  2. If it passes Croissant schema validation
66
  3. If records can be generated within a reasonable time
 
67
  """)
68
 
69
  # Track the active tab for conditional UI updates
@@ -107,7 +100,8 @@ def create_ui():
107
  label="Report Content",
108
  visible=True,
109
  show_copy_button=True,
110
- lines=10
 
111
  )
112
 
113
  # Define CSS for the validation UI
@@ -287,6 +281,12 @@ def create_ui():
287
  .gr-accordion[data-open=true] > .label-wrap::after {
288
  transform: rotate(90deg);
289
  }
 
 
 
 
 
 
290
  </style>
291
  """)
292
 
@@ -380,6 +380,10 @@ def create_ui():
380
  records_valid, records_message, records_status = validate_records(json_data)
381
  results.append(("Records Generation Test (Optional)", records_valid, records_message, records_status))
382
 
 
 
 
 
383
  # Generate report
384
  report = generate_validation_report(url.split("/")[-1], json_data, results)
385
  report_filename = f"report_croissant-validation_{json_data.get('name', 'unnamed')}.md"
@@ -438,11 +442,11 @@ def create_ui():
438
  status_class = "status-success"
439
  status_icon = "✓"
440
  message_with_emoji = "✅ " + message
441
- elif status == "warning":
442
  status_class = "status-warning"
443
  status_icon = "?"
444
  message_with_emoji = "⚠️ Could not automatically generate records. This is oftentimes not an issue (e.g. datasets could be too large or too complex), and it's not required to pass this test to submit to NeurIPS.\n\n" + message
445
- else: # error
446
  status_class = "status-error"
447
  status_icon = "✗"
448
  message_with_emoji = "❌ " + message
 
 
1
  import requests
2
  import os
 
 
 
 
 
 
 
 
 
 
 
3
  import gradio as gr
4
  import json
5
  import time
6
  import traceback
7
+ from validation import validate_json, validate_croissant, validate_records, validate_rai, generate_validation_report
8
 
9
  def process_file(file):
10
  results = []
 
33
  records_message = records_message.replace("\n✓\n", "\n")
34
  results.append(("Records Generation Test", records_valid, records_message, records_status))
35
 
36
+ # Check 4: Responsible AI metadata
37
+ rai_valid, rai_message = validate_rai(json_data)
38
+ results.append(("Responsible AI Metadata", rai_valid, rai_message, "pass" if rai_valid else "error"))
39
+
40
  # Generate final report
41
  report = generate_validation_report(filename, json_data, results)
42
 
 
56
  1. If the file is valid JSON
57
  2. If it passes Croissant schema validation
58
  3. If records can be generated within a reasonable time
59
+ 4. If all required Responsible AI metadata fields are present
60
  """)
61
 
62
  # Track the active tab for conditional UI updates
 
100
  label="Report Content",
101
  visible=True,
102
  show_copy_button=True,
103
+ lines=10,
104
+ elem_id="report-text-box"
105
  )
106
 
107
  # Define CSS for the validation UI
 
281
  .gr-accordion[data-open=true] > .label-wrap::after {
282
  transform: rotate(90deg);
283
  }
284
+
285
+ /* Prevent report textbox from bubbling scroll to the page */
286
+ #report-text-box textarea {
287
+ overflow-y: auto !important;
288
+ overscroll-behavior: contain;
289
+ }
290
  </style>
291
  """)
292
 
 
380
  records_valid, records_message, records_status = validate_records(json_data)
381
  results.append(("Records Generation Test (Optional)", records_valid, records_message, records_status))
382
 
383
+ # Check 4: Responsible AI metadata
384
+ rai_valid, rai_message = validate_rai(json_data)
385
+ results.append(("Responsible AI Metadata", rai_valid, rai_message, "pass" if rai_valid else "error"))
386
+
387
  # Generate report
388
  report = generate_validation_report(url.split("/")[-1], json_data, results)
389
  report_filename = f"report_croissant-validation_{json_data.get('name', 'unnamed')}.md"
 
442
  status_class = "status-success"
443
  status_icon = "✓"
444
  message_with_emoji = "✅ " + message
445
+ elif status == "warning" and "Records" in test_name:
446
  status_class = "status-warning"
447
  status_icon = "?"
448
  message_with_emoji = "⚠️ Could not automatically generate records. This is oftentimes not an issue (e.g. datasets could be too large or too complex), and it's not required to pass this test to submit to NeurIPS.\n\n" + message
449
+ else: # error or non-records warning
450
  status_class = "status-error"
451
  status_icon = "✗"
452
  message_with_emoji = "❌ " + message
validation.py CHANGED
@@ -2,21 +2,40 @@ import mlcroissant._src.operation_graph.operations.download as dl_mod
2
  import requests
3
  import os
4
 
5
- # Make sure the HF token is loaded
6
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
7
 
8
- # Set the environment variables Croissant expects
9
- os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
10
- os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = HF_TOKEN or ""
 
11
 
12
- print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
13
- print("[DEBUG] Basic auth env set for Croissant")
 
 
 
 
 
 
 
 
 
14
 
 
15
  import mlcroissant as mlc
16
  import func_timeout
17
  import json
18
  import traceback
19
 
 
 
 
 
 
 
 
 
20
  WAIT_TIME = 10 * 60 # seconds
21
 
22
  def validate_json(file_path):
@@ -96,6 +115,30 @@ def validate_records(json_data):
96
  error_message = f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
97
  return False, error_message, "error"
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  def generate_validation_report(filename, json_data, results):
100
  """Generate a detailed validation report in markdown format."""
101
  report = []
 
2
  import requests
3
  import os
4
 
 
5
  HF_TOKEN = os.environ.get("HF_TOKEN")
6
+ print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
7
 
8
+ # Only send HF credentials when downloading from huggingface.co.
9
+ # The default get_basic_auth_from_env() applies auth to ALL URLs, which
10
+ # causes non-HF hosts (e.g. OpenML) to return 400 Bad Request.
11
+ _orig_download_from_http = dl_mod.Download._download_from_http
12
 
13
def _hf_aware_download(self, filepath):
    """Download over HTTP, exposing the HF token only to Hugging Face hosts.

    Wraps the original ``Download._download_from_http``. Before delegating,
    it sets or clears the ``CROISSANT_BASIC_AUTH_*`` environment variables
    depending on where ``self.node.content_url`` points.

    The decision is made on the parsed hostname, not on a substring of the
    URL. A substring test would also match ``huggingface.co.evil.example`` or
    ``https://evil.example/?x=huggingface.co`` and leak the token to them.

    Args:
        filepath: Passed through unchanged to the original download method.

    Returns:
        Whatever the original ``_download_from_http`` returns.
    """
    # Local import so this patch does not depend on the file's import block.
    from urllib.parse import urlparse

    url = self.node.content_url or ""
    try:
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): never attach credentials.
        host = ""

    # Accept the apex domain and its real subdomains only.
    is_hf_host = host == "huggingface.co" or host.endswith(".huggingface.co")

    # NOTE(review): os.environ is process-wide, so concurrent downloads to
    # different hosts could observe each other's settings - confirm that
    # downloads are effectively serialized.
    if HF_TOKEN and is_hf_host:
        os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
        os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = HF_TOKEN
    else:
        os.environ.pop("CROISSANT_BASIC_AUTH_USERNAME", None)
        os.environ.pop("CROISSANT_BASIC_AUTH_PASSWORD", None)
    return _orig_download_from_http(self, filepath)
22
+
23
+ dl_mod.Download._download_from_http = _hf_aware_download
24
 
25
+ import logging
26
  import mlcroissant as mlc
27
  import func_timeout
28
  import json
29
  import traceback
30
 
31
+ # Suppress noisy mlcroissant pattern-matching warnings
32
+ logging.getLogger("root").addFilter(
33
+ lambda r: "Could not match" not in r.getMessage()
34
+ )
35
+ logging.getLogger().addFilter(
36
+ lambda r: "Could not match" not in r.getMessage()
37
+ )
38
+
39
  WAIT_TIME = 10 * 60 # seconds
40
 
41
  def validate_json(file_path):
 
115
  error_message = f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
116
  return False, error_message, "error"
117
 
118
# Top-level JSON-LD keys that must carry a value for the Responsible AI check.
RAI_FIELDS = [
    "rai:dataLimitations",
    "rai:dataBiases",
    "rai:personalSensitiveInformation",
    "rai:dataUseCases",
    "rai:dataSocialImpact",
    "rai:hasSyntheticData",
    "prov:wasGeneratedBy",
]

# Linked from the failure message so users can find the instructions.
RAI_GUIDELINES_URL = "https://neurips.cc/Conferences/2026/EvaluationsDatasetsHosting"


def _is_blank_rai_value(value):
    """Return True when ``value`` carries no information.

    ``None``, whitespace-only strings and empty lists/dicts count as blank.
    Booleans and numbers never do, so ``"rai:hasSyntheticData": false`` is a
    valid answer.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    if isinstance(value, (list, dict)):
        return not value
    return False


def validate_rai(json_data):
    """Check that all required Responsible AI metadata fields are filled in.

    Args:
        json_data: The parsed Croissant JSON-LD document. Anything that is
            not a dict (``None``, a list, a string) is treated as having no
            fields, instead of raising or doing list/substring membership.

    Returns:
        A ``(valid, message)`` tuple. ``valid`` is True only when every key
        in ``RAI_FIELDS`` is present with a non-blank value. Otherwise
        ``message`` lists the offending keys and links to the guidelines.
    """
    metadata = json_data if isinstance(json_data, dict) else {}
    # An absent key yields None from .get(), which is blank by definition.
    missing = [
        field for field in RAI_FIELDS
        if _is_blank_rai_value(metadata.get(field))
    ]
    if not missing:
        return True, "All required Responsible AI metadata fields are present."
    missing_list = "\n".join(f"- `{f}`" for f in missing)
    message = (
        f"The following required Responsible AI metadata fields are missing or empty:\n{missing_list}\n\n"
        f"Please refer to the <a href='{RAI_GUIDELINES_URL}' target='_blank'>NeurIPS guidelines for instructions</a> on how to add them."
    )
    return False, message
141
+
142
  def generate_validation_report(filename, json_data, results):
143
  """Generate a detailed validation report in markdown format."""
144
  report = []