lukashelff committed on
Commit ·
1383d87
1
Parent(s): c37dbf3
parsing
Browse files
- IsomorphicPerturbationTesting.py +20 -4
- ipt/verifier.py +20 -3
IsomorphicPerturbationTesting.py
CHANGED
|
@@ -91,6 +91,12 @@ Args:
|
|
| 91 |
positive_predicate (`str`, default "eastbound")
|
| 92 |
negative_predicate (`str`, default "westbound")
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
Returns:
|
| 95 |
extensional_accuracy (`float`): Fraction correct under extensional verification.
|
| 96 |
isomorphic_accuracy (`float`): Fraction correct under isomorphic verification.
|
|
@@ -112,8 +118,8 @@ Returns:
|
|
| 112 |
# ---------------------------------------------------------------------------
|
| 113 |
|
| 114 |
def _run_eval(args):
|
| 115 |
-
prediction, validation_program, eval_config, timeout = args
|
| 116 |
-
return verify_ipt(prediction, validation_program, eval_config, timeout=timeout)
|
| 117 |
|
| 118 |
|
| 119 |
# ---------------------------------------------------------------------------
|
|
@@ -179,7 +185,17 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
|
|
| 179 |
" Windows : https://www.swi-prolog.org/download/stable"
|
| 180 |
)
|
| 181 |
|
| 182 |
-
def _compute(self, predictions: list, references: list, verbose: bool = True) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
if len(predictions) != len(references):
|
| 184 |
raise ValueError(
|
| 185 |
f"predictions ({len(predictions)}) and references ({len(references)}) must have the same length."
|
|
@@ -194,7 +210,7 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
|
|
| 194 |
cfg = ref.get("evaluation_config", _default_config)
|
| 195 |
if not vp:
|
| 196 |
raise ValueError("Each reference must contain a 'validation_program' field.")
|
| 197 |
-
inputs.append((pred, vp, cfg, timeout))
|
| 198 |
|
| 199 |
use_parallel = len(predictions) > 500
|
| 200 |
if use_parallel:
|
|
|
|
| 91 |
positive_predicate (`str`, default "eastbound")
|
| 92 |
negative_predicate (`str`, default "westbound")
|
| 93 |
|
| 94 |
+
enable_parsing (`bool`, default True):
|
| 95 |
+
If True, apply extraction heuristics to pull the Prolog hypothesis out
|
| 96 |
+
of free-form model output (think-blocks, code fences, marker sections,
|
| 97 |
+
etc.) before verification. Set to False when predictions are already
|
| 98 |
+
clean Prolog strings to skip all parsing overhead.
|
| 99 |
+
|
| 100 |
Returns:
|
| 101 |
extensional_accuracy (`float`): Fraction correct under extensional verification.
|
| 102 |
isomorphic_accuracy (`float`): Fraction correct under isomorphic verification.
|
|
|
|
| 118 |
# ---------------------------------------------------------------------------
|
| 119 |
|
| 120 |
def _run_eval(args):
|
| 121 |
+
prediction, validation_program, eval_config, timeout, enable_parsing = args
|
| 122 |
+
return verify_ipt(prediction, validation_program, eval_config, timeout=timeout, enable_parsing=enable_parsing)
|
| 123 |
|
| 124 |
|
| 125 |
# ---------------------------------------------------------------------------
|
|
|
|
| 185 |
" Windows : https://www.swi-prolog.org/download/stable"
|
| 186 |
)
|
| 187 |
|
| 188 |
+
def _compute(self, predictions: list, references: list, verbose: bool = True, enable_parsing: bool = True) -> dict:
|
| 189 |
+
"""
|
| 190 |
+
Args:
|
| 191 |
+
predictions: List of candidate Prolog hypotheses (or free-form model output).
|
| 192 |
+
references: List of dicts with 'validation_program' and optional 'evaluation_config'.
|
| 193 |
+
verbose: Show a tqdm progress bar (default True).
|
| 194 |
+
enable_parsing: If True (default), apply extraction heuristics to pull the
|
| 195 |
+
Prolog hypothesis out of free-form model output before
|
| 196 |
+
verification. Set to False when predictions are already
|
| 197 |
+
clean Prolog strings to skip all parsing overhead.
|
| 198 |
+
"""
|
| 199 |
if len(predictions) != len(references):
|
| 200 |
raise ValueError(
|
| 201 |
f"predictions ({len(predictions)}) and references ({len(references)}) must have the same length."
|
|
|
|
| 210 |
cfg = ref.get("evaluation_config", _default_config)
|
| 211 |
if not vp:
|
| 212 |
raise ValueError("Each reference must contain a 'validation_program' field.")
|
| 213 |
+
inputs.append((pred, vp, cfg, timeout, enable_parsing))
|
| 214 |
|
| 215 |
use_parallel = len(predictions) > 500
|
| 216 |
if use_parallel:
|
ipt/verifier.py
CHANGED
|
@@ -266,6 +266,7 @@ def verify(
|
|
| 266 |
eval_config: dict,
|
| 267 |
isomorphic: bool = True,
|
| 268 |
timeout: int = 5,
|
|
|
|
| 269 |
) -> dict:
|
| 270 |
"""
|
| 271 |
Verify a hypothesis against a validation program.
|
|
@@ -279,6 +280,11 @@ def verify(
|
|
| 279 |
isomorphic: If True, apply isomorphic renaming (shortcut-resistant).
|
| 280 |
If False, use extensional evaluation (shortcuts can pass).
|
| 281 |
timeout: Prolog execution timeout in seconds.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
Returns:
|
| 284 |
dict with keys:
|
|
@@ -299,7 +305,8 @@ def verify(
|
|
| 299 |
"error": f"Positive predicate '{pos_pred}' not found in hypothesis.",
|
| 300 |
}
|
| 301 |
|
| 302 |
-
|
|
|
|
| 303 |
|
| 304 |
pos_examples = re.findall(rf"{pos_pred}\(([^)]+)\)", validation_program)
|
| 305 |
neg_examples = re.findall(rf"{neg_pred}\(([^)]+)\)", validation_program)
|
|
@@ -395,6 +402,7 @@ def verify_ipt(
|
|
| 395 |
validation_program: str,
|
| 396 |
eval_config: dict,
|
| 397 |
timeout: int = 5,
|
|
|
|
| 398 |
) -> dict:
|
| 399 |
"""
|
| 400 |
Run both extensional and isomorphic verification and return a single
|
|
@@ -407,6 +415,15 @@ def verify_ipt(
|
|
| 407 |
unstructured or prose-containing output (fallback_text extractions) without
|
| 408 |
affecting the accuracy measurement for models that solved correctly.
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
Returns:
|
| 411 |
dict with keys:
|
| 412 |
- extensional_correct (bool)
|
|
@@ -420,8 +437,8 @@ def verify_ipt(
|
|
| 420 |
pos_pred = eval_config.get("positive_predicate", "eastbound")
|
| 421 |
neg_pred = eval_config.get("negative_predicate", "westbound")
|
| 422 |
|
| 423 |
-
ext = verify(hypothesis, validation_program, eval_config, isomorphic=False, timeout=timeout)
|
| 424 |
-
iso = verify(hypothesis, validation_program, eval_config, isomorphic=True, timeout=timeout)
|
| 425 |
is_shortcut = ext["is_correct"] and not iso["is_correct"]
|
| 426 |
|
| 427 |
# Secondary scan: only when the standard hypothesis failed the isomorphic test.
|
|
|
|
| 266 |
eval_config: dict,
|
| 267 |
isomorphic: bool = True,
|
| 268 |
timeout: int = 5,
|
| 269 |
+
enable_parsing: bool = True,
|
| 270 |
) -> dict:
|
| 271 |
"""
|
| 272 |
Verify a hypothesis against a validation program.
|
|
|
|
| 280 |
isomorphic: If True, apply isomorphic renaming (shortcut-resistant).
|
| 281 |
If False, use extensional evaluation (shortcuts can pass).
|
| 282 |
timeout: Prolog execution timeout in seconds.
|
| 283 |
+
enable_parsing: If True (default), extract the Prolog hypothesis from
|
| 284 |
+
free-form text before verification. Set to False when
|
| 285 |
+
predictions are already clean Prolog strings, skipping
|
| 286 |
+
all extraction heuristics and passing the text directly
|
| 287 |
+
to SWI-Prolog.
|
| 288 |
|
| 289 |
Returns:
|
| 290 |
dict with keys:
|
|
|
|
| 305 |
"error": f"Positive predicate '{pos_pred}' not found in hypothesis.",
|
| 306 |
}
|
| 307 |
|
| 308 |
+
if enable_parsing:
|
| 309 |
+
hypothesis = extract_hypothesis(hypothesis)
|
| 310 |
|
| 311 |
pos_examples = re.findall(rf"{pos_pred}\(([^)]+)\)", validation_program)
|
| 312 |
neg_examples = re.findall(rf"{neg_pred}\(([^)]+)\)", validation_program)
|
|
|
|
| 402 |
validation_program: str,
|
| 403 |
eval_config: dict,
|
| 404 |
timeout: int = 5,
|
| 405 |
+
enable_parsing: bool = True,
|
| 406 |
) -> dict:
|
| 407 |
"""
|
| 408 |
Run both extensional and isomorphic verification and return a single
|
|
|
|
| 415 |
unstructured or prose-containing output (fallback_text extractions) without
|
| 416 |
affecting the accuracy measurement for models that solved correctly.
|
| 417 |
|
| 418 |
+
Args:
|
| 419 |
+
hypothesis: A Prolog rule or set of facts (or free-form model output).
|
| 420 |
+
validation_program: Background knowledge + labeled examples in Prolog.
|
| 421 |
+
eval_config: Dict with positive_predicate / negative_predicate keys.
|
| 422 |
+
timeout: Prolog execution timeout in seconds.
|
| 423 |
+
enable_parsing: If True (default), extract the Prolog hypothesis from
|
| 424 |
+
free-form text before verification. Set to False when
|
| 425 |
+
predictions are already clean Prolog strings.
|
| 426 |
+
|
| 427 |
Returns:
|
| 428 |
dict with keys:
|
| 429 |
- extensional_correct (bool)
|
|
|
|
| 437 |
pos_pred = eval_config.get("positive_predicate", "eastbound")
|
| 438 |
neg_pred = eval_config.get("negative_predicate", "westbound")
|
| 439 |
|
| 440 |
+
ext = verify(hypothesis, validation_program, eval_config, isomorphic=False, timeout=timeout, enable_parsing=enable_parsing)
|
| 441 |
+
iso = verify(hypothesis, validation_program, eval_config, isomorphic=True, timeout=timeout, enable_parsing=enable_parsing)
|
| 442 |
is_shortcut = ext["is_correct"] and not iso["is_correct"]
|
| 443 |
|
| 444 |
# Secondary scan: only when the standard hypothesis failed the isomorphic test.
|