lukashelff committed on
Commit
1383d87
·
1 Parent(s): c37dbf3
Files changed (2) hide show
  1. IsomorphicPerturbationTesting.py +20 -4
  2. ipt/verifier.py +20 -3
IsomorphicPerturbationTesting.py CHANGED
@@ -91,6 +91,12 @@ Args:
91
  positive_predicate (`str`, default "eastbound")
92
  negative_predicate (`str`, default "westbound")
93
 
 
 
 
 
 
 
94
  Returns:
95
  extensional_accuracy (`float`): Fraction correct under extensional verification.
96
  isomorphic_accuracy (`float`): Fraction correct under isomorphic verification.
@@ -112,8 +118,8 @@ Returns:
112
  # ---------------------------------------------------------------------------
113
 
114
  def _run_eval(args):
115
- prediction, validation_program, eval_config, timeout = args
116
- return verify_ipt(prediction, validation_program, eval_config, timeout=timeout)
117
 
118
 
119
  # ---------------------------------------------------------------------------
@@ -179,7 +185,17 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
179
  " Windows : https://www.swi-prolog.org/download/stable"
180
  )
181
 
182
- def _compute(self, predictions: list, references: list, verbose: bool = True) -> dict:
 
 
 
 
 
 
 
 
 
 
183
  if len(predictions) != len(references):
184
  raise ValueError(
185
  f"predictions ({len(predictions)}) and references ({len(references)}) must have the same length."
@@ -194,7 +210,7 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
194
  cfg = ref.get("evaluation_config", _default_config)
195
  if not vp:
196
  raise ValueError("Each reference must contain a 'validation_program' field.")
197
- inputs.append((pred, vp, cfg, timeout))
198
 
199
  use_parallel = len(predictions) > 500
200
  if use_parallel:
 
91
  positive_predicate (`str`, default "eastbound")
92
  negative_predicate (`str`, default "westbound")
93
 
94
+ enable_parsing (`bool`, default True):
95
+ If True, apply extraction heuristics to pull the Prolog hypothesis out
96
+ of free-form model output (think-blocks, code fences, marker sections,
97
+ etc.) before verification. Set to False when predictions are already
98
+ clean Prolog strings to skip all parsing overhead.
99
+
100
  Returns:
101
  extensional_accuracy (`float`): Fraction correct under extensional verification.
102
  isomorphic_accuracy (`float`): Fraction correct under isomorphic verification.
 
118
  # ---------------------------------------------------------------------------
119
 
120
  def _run_eval(args):
121
+ prediction, validation_program, eval_config, timeout, enable_parsing = args
122
+ return verify_ipt(prediction, validation_program, eval_config, timeout=timeout, enable_parsing=enable_parsing)
123
 
124
 
125
  # ---------------------------------------------------------------------------
 
185
  " Windows : https://www.swi-prolog.org/download/stable"
186
  )
187
 
188
+ def _compute(self, predictions: list, references: list, verbose: bool = True, enable_parsing: bool = True) -> dict:
189
+ """
190
+ Args:
191
+ predictions: List of candidate Prolog hypotheses (or free-form model output).
192
+ references: List of dicts with 'validation_program' and optional 'evaluation_config'.
193
+ verbose: Show a tqdm progress bar (default True).
194
+ enable_parsing: If True (default), apply extraction heuristics to pull the
195
+ Prolog hypothesis out of free-form model output before
196
+ verification. Set to False when predictions are already
197
+ clean Prolog strings to skip all parsing overhead.
198
+ """
199
  if len(predictions) != len(references):
200
  raise ValueError(
201
  f"predictions ({len(predictions)}) and references ({len(references)}) must have the same length."
 
210
  cfg = ref.get("evaluation_config", _default_config)
211
  if not vp:
212
  raise ValueError("Each reference must contain a 'validation_program' field.")
213
+ inputs.append((pred, vp, cfg, timeout, enable_parsing))
214
 
215
  use_parallel = len(predictions) > 500
216
  if use_parallel:
ipt/verifier.py CHANGED
@@ -266,6 +266,7 @@ def verify(
266
  eval_config: dict,
267
  isomorphic: bool = True,
268
  timeout: int = 5,
 
269
  ) -> dict:
270
  """
271
  Verify a hypothesis against a validation program.
@@ -279,6 +280,11 @@ def verify(
279
  isomorphic: If True, apply isomorphic renaming (shortcut-resistant).
280
  If False, use extensional evaluation (shortcuts can pass).
281
  timeout: Prolog execution timeout in seconds.
 
 
 
 
 
282
 
283
  Returns:
284
  dict with keys:
@@ -299,7 +305,8 @@ def verify(
299
  "error": f"Positive predicate '{pos_pred}' not found in hypothesis.",
300
  }
301
 
302
- hypothesis = extract_hypothesis(hypothesis)
 
303
 
304
  pos_examples = re.findall(rf"{pos_pred}\(([^)]+)\)", validation_program)
305
  neg_examples = re.findall(rf"{neg_pred}\(([^)]+)\)", validation_program)
@@ -395,6 +402,7 @@ def verify_ipt(
395
  validation_program: str,
396
  eval_config: dict,
397
  timeout: int = 5,
 
398
  ) -> dict:
399
  """
400
  Run both extensional and isomorphic verification and return a single
@@ -407,6 +415,15 @@ def verify_ipt(
407
  unstructured or prose-containing output (fallback_text extractions) without
408
  affecting the accuracy measurement for models that solved correctly.
409
 
 
 
 
 
 
 
 
 
 
410
  Returns:
411
  dict with keys:
412
  - extensional_correct (bool)
@@ -420,8 +437,8 @@ def verify_ipt(
420
  pos_pred = eval_config.get("positive_predicate", "eastbound")
421
  neg_pred = eval_config.get("negative_predicate", "westbound")
422
 
423
- ext = verify(hypothesis, validation_program, eval_config, isomorphic=False, timeout=timeout)
424
- iso = verify(hypothesis, validation_program, eval_config, isomorphic=True, timeout=timeout)
425
  is_shortcut = ext["is_correct"] and not iso["is_correct"]
426
 
427
  # Secondary scan: only when the standard hypothesis failed the isomorphic test.
 
266
  eval_config: dict,
267
  isomorphic: bool = True,
268
  timeout: int = 5,
269
+ enable_parsing: bool = True,
270
  ) -> dict:
271
  """
272
  Verify a hypothesis against a validation program.
 
280
  isomorphic: If True, apply isomorphic renaming (shortcut-resistant).
281
  If False, use extensional evaluation (shortcuts can pass).
282
  timeout: Prolog execution timeout in seconds.
283
+ enable_parsing: If True (default), extract the Prolog hypothesis from
284
+ free-form text before verification. Set to False when
285
+ predictions are already clean Prolog strings, skipping
286
+ all extraction heuristics and passing the text directly
287
+ to SWI-Prolog.
288
 
289
  Returns:
290
  dict with keys:
 
305
  "error": f"Positive predicate '{pos_pred}' not found in hypothesis.",
306
  }
307
 
308
+ if enable_parsing:
309
+ hypothesis = extract_hypothesis(hypothesis)
310
 
311
  pos_examples = re.findall(rf"{pos_pred}\(([^)]+)\)", validation_program)
312
  neg_examples = re.findall(rf"{neg_pred}\(([^)]+)\)", validation_program)
 
402
  validation_program: str,
403
  eval_config: dict,
404
  timeout: int = 5,
405
+ enable_parsing: bool = True,
406
  ) -> dict:
407
  """
408
  Run both extensional and isomorphic verification and return a single
 
415
  unstructured or prose-containing output (fallback_text extractions) without
416
  affecting the accuracy measurement for models that solved correctly.
417
 
418
+ Args:
419
+ hypothesis: A Prolog rule or set of facts (or free-form model output).
420
+ validation_program: Background knowledge + labeled examples in Prolog.
421
+ eval_config: Dict with positive_predicate / negative_predicate keys.
422
+ timeout: Prolog execution timeout in seconds.
423
+ enable_parsing: If True (default), extract the Prolog hypothesis from
424
+ free-form text before verification. Set to False when
425
+ predictions are already clean Prolog strings.
426
+
427
  Returns:
428
  dict with keys:
429
  - extensional_correct (bool)
 
437
  pos_pred = eval_config.get("positive_predicate", "eastbound")
438
  neg_pred = eval_config.get("negative_predicate", "westbound")
439
 
440
+ ext = verify(hypothesis, validation_program, eval_config, isomorphic=False, timeout=timeout, enable_parsing=enable_parsing)
441
+ iso = verify(hypothesis, validation_program, eval_config, isomorphic=True, timeout=timeout, enable_parsing=enable_parsing)
442
  is_shortcut = ext["is_correct"] and not iso["is_correct"]
443
 
444
  # Secondary scan: only when the standard hypothesis failed the isomorphic test.