Spaces:

AIML-TUDA
/

IsomorphicPerturbationTesting

Running

App Files Community

lukashelff committed on Apr 2

Commit

1383d87

1 Parent(s): c37dbf3

parsing

Browse files

Files changed (2) hide show

IsomorphicPerturbationTesting.py +20 -4
ipt/verifier.py +20 -3

IsomorphicPerturbationTesting.py CHANGED Viewed

@@ -91,6 +91,12 @@ Args:
               positive_predicate (`str`, default "eastbound")
               negative_predicate (`str`, default "westbound")
 Returns:
     extensional_accuracy (`float`): Fraction correct under extensional verification.
     isomorphic_accuracy  (`float`): Fraction correct under isomorphic verification.
@@ -112,8 +118,8 @@ Returns:
 # ---------------------------------------------------------------------------
 def _run_eval(args):
-    prediction, validation_program, eval_config, timeout = args
-    return verify_ipt(prediction, validation_program, eval_config, timeout=timeout)
 # ---------------------------------------------------------------------------
@@ -179,7 +185,17 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
                 "  Windows       : https://www.swi-prolog.org/download/stable"
             )
-    def _compute(self, predictions: list, references: list, verbose: bool = True) -> dict:
         if len(predictions) != len(references):
             raise ValueError(
                 f"predictions ({len(predictions)}) and references ({len(references)}) must have the same length."
@@ -194,7 +210,7 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
             cfg = ref.get("evaluation_config", _default_config)
             if not vp:
                 raise ValueError("Each reference must contain a 'validation_program' field.")
-            inputs.append((pred, vp, cfg, timeout))
         use_parallel = len(predictions) > 500
         if use_parallel:

               positive_predicate (`str`, default "eastbound")
               negative_predicate (`str`, default "westbound")
+    enable_parsing (`bool`, default True):
+        If True, apply extraction heuristics to pull the Prolog hypothesis out
+        of free-form model output (think-blocks, code fences, marker sections,
+        etc.) before verification.  Set to False when predictions are already
+        clean Prolog strings to skip all parsing overhead.
 Returns:
     extensional_accuracy (`float`): Fraction correct under extensional verification.
     isomorphic_accuracy  (`float`): Fraction correct under isomorphic verification.
 # ---------------------------------------------------------------------------
 def _run_eval(args):
+    prediction, validation_program, eval_config, timeout, enable_parsing = args
+    return verify_ipt(prediction, validation_program, eval_config, timeout=timeout, enable_parsing=enable_parsing)
 # ---------------------------------------------------------------------------
                 "  Windows       : https://www.swi-prolog.org/download/stable"
             )
+    def _compute(self, predictions: list, references: list, verbose: bool = True, enable_parsing: bool = True) -> dict:
+        """
+        Args:
+            predictions: List of candidate Prolog hypotheses (or free-form model output).
+            references:  List of dicts with 'validation_program' and optional 'evaluation_config'.
+            verbose:     Show a tqdm progress bar (default True).
+            enable_parsing: If True (default), apply extraction heuristics to pull the
+                            Prolog hypothesis out of free-form model output before
+                            verification.  Set to False when predictions are already
+                            clean Prolog strings to skip all parsing overhead.
+        """
         if len(predictions) != len(references):
             raise ValueError(
                 f"predictions ({len(predictions)}) and references ({len(references)}) must have the same length."
             cfg = ref.get("evaluation_config", _default_config)
             if not vp:
                 raise ValueError("Each reference must contain a 'validation_program' field.")
+            inputs.append((pred, vp, cfg, timeout, enable_parsing))
         use_parallel = len(predictions) > 500
         if use_parallel:

ipt/verifier.py CHANGED Viewed

@@ -266,6 +266,7 @@ def verify(
     eval_config: dict,
     isomorphic: bool = True,
     timeout: int = 5,
 ) -> dict:
     """
     Verify a hypothesis against a validation program.
@@ -279,6 +280,11 @@ def verify(
         isomorphic: If True, apply isomorphic renaming (shortcut-resistant).
                     If False, use extensional evaluation (shortcuts can pass).
         timeout: Prolog execution timeout in seconds.
     Returns:
         dict with keys:
@@ -299,7 +305,8 @@ def verify(
             "error": f"Positive predicate '{pos_pred}' not found in hypothesis.",
         }
-    hypothesis = extract_hypothesis(hypothesis)
     pos_examples = re.findall(rf"{pos_pred}\(([^)]+)\)", validation_program)
     neg_examples = re.findall(rf"{neg_pred}\(([^)]+)\)", validation_program)
@@ -395,6 +402,7 @@ def verify_ipt(
     validation_program: str,
     eval_config: dict,
     timeout: int = 5,
 ) -> dict:
     """
     Run both extensional and isomorphic verification and return a single
@@ -407,6 +415,15 @@ def verify_ipt(
     unstructured or prose-containing output (fallback_text extractions) without
     affecting the accuracy measurement for models that solved correctly.
     Returns:
         dict with keys:
             - extensional_correct (bool)
@@ -420,8 +437,8 @@ def verify_ipt(
     pos_pred = eval_config.get("positive_predicate", "eastbound")
     neg_pred = eval_config.get("negative_predicate", "westbound")
-    ext = verify(hypothesis, validation_program, eval_config, isomorphic=False, timeout=timeout)
-    iso = verify(hypothesis, validation_program, eval_config, isomorphic=True,  timeout=timeout)
     is_shortcut = ext["is_correct"] and not iso["is_correct"]
     # Secondary scan: only when the standard hypothesis failed the isomorphic test.

     eval_config: dict,
     isomorphic: bool = True,
     timeout: int = 5,
+    enable_parsing: bool = True,
 ) -> dict:
     """
     Verify a hypothesis against a validation program.
         isomorphic: If True, apply isomorphic renaming (shortcut-resistant).
                     If False, use extensional evaluation (shortcuts can pass).
         timeout: Prolog execution timeout in seconds.
+        enable_parsing: If True (default), extract the Prolog hypothesis from
+                        free-form text before verification.  Set to False when
+                        predictions are already clean Prolog strings, skipping
+                        all extraction heuristics and passing the text directly
+                        to SWI-Prolog.
     Returns:
         dict with keys:
             "error": f"Positive predicate '{pos_pred}' not found in hypothesis.",
         }
+    if enable_parsing:
+        hypothesis = extract_hypothesis(hypothesis)
     pos_examples = re.findall(rf"{pos_pred}\(([^)]+)\)", validation_program)
     neg_examples = re.findall(rf"{neg_pred}\(([^)]+)\)", validation_program)
     validation_program: str,
     eval_config: dict,
     timeout: int = 5,
+    enable_parsing: bool = True,
 ) -> dict:
     """
     Run both extensional and isomorphic verification and return a single
     unstructured or prose-containing output (fallback_text extractions) without
     affecting the accuracy measurement for models that solved correctly.
+    Args:
+        hypothesis: A Prolog rule or set of facts (or free-form model output).
+        validation_program: Background knowledge + labeled examples in Prolog.
+        eval_config: Dict with positive_predicate / negative_predicate keys.
+        timeout: Prolog execution timeout in seconds.
+        enable_parsing: If True (default), extract the Prolog hypothesis from
+                        free-form text before verification.  Set to False when
+                        predictions are already clean Prolog strings.
     Returns:
         dict with keys:
             - extensional_correct (bool)
     pos_pred = eval_config.get("positive_predicate", "eastbound")
     neg_pred = eval_config.get("negative_predicate", "westbound")
+    ext = verify(hypothesis, validation_program, eval_config, isomorphic=False, timeout=timeout, enable_parsing=enable_parsing)
+    iso = verify(hypothesis, validation_program, eval_config, isomorphic=True,  timeout=timeout, enable_parsing=enable_parsing)
     is_shortcut = ext["is_correct"] and not iso["is_correct"]
     # Secondary scan: only when the standard hypothesis failed the isomorphic test.