diff --git "a/Geolip_Procrustes_Bert_Model_Step_Model_Scaling.ipynb" "b/Geolip_Procrustes_Bert_Model_Step_Model_Scaling.ipynb" new file mode 100644--- /dev/null +++ "b/Geolip_Procrustes_Bert_Model_Step_Model_Scaling.ipynb" @@ -0,0 +1,3522 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "dp7hwReFXokU", + "0wpUVBCiXmJg" + ], + "machine_shape": "hm", + "gpuType": "G4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "d5c2a63f6f8544e79268b9bade807345": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f027a57fcb6a49cfbbd28b95a6c0adf7", + "IPY_MODEL_4df257d047cb4e7ab2a0a6c57be58b81", + "IPY_MODEL_214861fbbd124b45ba164e934f91a024" + ], + "layout": "IPY_MODEL_f6cf4e1cd78c4c0da8bf756a7cc8760a" + } + }, + "f027a57fcb6a49cfbbd28b95a6c0adf7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bd3077d18b9640abb14a4c385cac7c39", + "placeholder": "​", + "style": "IPY_MODEL_c39efce145764de59d3dc9cb7a818d33", + "value": "Loading weights: 100%" + } + }, + "4df257d047cb4e7ab2a0a6c57be58b81": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b9fc695d6d70408690bbd7ef8bb5841a", + "max": 202, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a1486fd248674e9584e90d907b5156c2", + "value": 202 + } + }, + "214861fbbd124b45ba164e934f91a024": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3472ba9857a543568ec3cd2f340571d6", + "placeholder": "​", + "style": "IPY_MODEL_89eae54541d447bdaa9625cbbe357e55", + "value": " 202/202 [00:00<00:00, 4335.55it/s, Materializing param=cls.predictions.transform.dense.weight]" + } + }, + "f6cf4e1cd78c4c0da8bf756a7cc8760a": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bd3077d18b9640abb14a4c385cac7c39": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c39efce145764de59d3dc9cb7a818d33": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b9fc695d6d70408690bbd7ef8bb5841a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a1486fd248674e9584e90d907b5156c2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3472ba9857a543568ec3cd2f340571d6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "89eae54541d447bdaa9625cbbe357e55": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ae1bdb89a9704d09a1c03ec82354905e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8d03191c19b64a698be6d6fc141817cb", + "IPY_MODEL_8d24919b783b47748aefac2a6c234313", + "IPY_MODEL_38bea3901f6c4e339d17a0269f0e535b" + ], + "layout": "IPY_MODEL_e4655e0917814b1b950d53a8f59d1fa5" + } + }, + "8d03191c19b64a698be6d6fc141817cb": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_325923e6a4054298be70581c2cd1061d", + "placeholder": "​", + "style": "IPY_MODEL_97bcf95ab5ac4823b2f696f42db534da", + "value": "Loading weights: 100%" + } + }, + "8d24919b783b47748aefac2a6c234313": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_866d9713b1f5436b924a5505e36b9e7d", + "max": 202, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8f481ca713e04111a7dcab82f19a072b", + "value": 202 + } + }, + "38bea3901f6c4e339d17a0269f0e535b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8566ff523c5d4bb5b799cdd12a95c633", + "placeholder": "​", + "style": "IPY_MODEL_bd53b01466524897aea749b68000684a", + "value": " 202/202 [00:00<00:00, 4209.01it/s, Materializing param=cls.predictions.transform.dense.weight]" + } + }, + "e4655e0917814b1b950d53a8f59d1fa5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "325923e6a4054298be70581c2cd1061d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "97bcf95ab5ac4823b2f696f42db534da": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "866d9713b1f5436b924a5505e36b9e7d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f481ca713e04111a7dcab82f19a072b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8566ff523c5d4bb5b799cdd12a95c633": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bd53b01466524897aea749b68000684a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# geolip-rescale experimentation" + ], + "metadata": { + "id": "dp7hwReFXokU" + } + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KrVSx-_rMPBx", + "outputId": "0972f0ef-a170-4554-cfac-13f98e6a894a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "======================================================================\n", + "ITERATIVE MULTI-SCALE GEOMETRIC TRANSFER\n", + "======================================================================\n", + " Device: cuda\n", + "\n", + " Task: 16-class pattern recognition, seq_len=64, noise=0.3\n", + " Chance accuracy: 6.2%\n", + " Scales: 256 → 224 → 192 → 160 → 131 → 113 → 97 → 73 → 64\n", + " CV tolerance: ±0.05\n", + "\n", + "======================================================================\n", + "SCALE 0: 256-dim (ROOT — train from scratch)\n", + "======================================================================\n", + " Params: 153,872\n", + " CV before training: 0.0771\n", + " Trained: 200 epochs → acc=0.8720, cv=0.0839\n", + " layer_0: CV=0.1256 eff_rank=62.2\n", + " layer_1: CV=0.0966 eff_rank=203.9\n", + " layer_2: CV=0.1023 eff_rank=194.6\n", + " layer_3: CV=0.0308 eff_rank=15.9\n", + "\n", + "======================================================================\n", + "SCALE 1: 256-dim → 224-dim (12% reduction)\n", + "======================================================================\n", + " Params: 120,304 (78.2% of root)\n", + "\n", + " Projecting 256 → 224...\n", + " After transfer: acc=0.8214, cv=0.1000\n", + " layer_0: CV=0.1393 eff_rank=61.9\n", + " layer_1: CV=0.0941 eff_rank=176.7\n", + " layer_2: CV=0.1014 eff_rank=169.4\n", + " layer_3: CV=0.0395 eff_rank=15.9\n", + "\n", + " Healing toward parent CV=0.0839 (±0.05)...\n", + " Healed: 1 epochs (0.5s) → acc=0.8600, cv=0.0950\n", + " layer_0: CV=0.1305 eff_rank=61.9\n", + " layer_1: CV=0.0888 eff_rank=176.7\n", + " layer_2: CV=0.1015 eff_rank=170.0\n", + " layer_3: CV=0.0421 eff_rank=15.9\n", + "\n", + "======================================================================\n", 
+ "SCALE 2: 224-dim → 192-dim (14% reduction)\n", + "======================================================================\n", + " Params: 90,832 (59.0% of root)\n", + "\n", + " Projecting 224 → 192...\n", + " After transfer: acc=0.8190, cv=0.0968\n", + " layer_0: CV=0.1230 eff_rank=61.4\n", + " layer_1: CV=0.1080 eff_rank=151.3\n", + " layer_2: CV=0.1105 eff_rank=145.6\n", + " layer_3: CV=0.0510 eff_rank=15.9\n", + "\n", + " Healing toward parent CV=0.0950 (±0.05)...\n", + " Healed: 1 epochs (0.5s) → acc=0.8855, cv=0.0971\n", + " layer_0: CV=0.1356 eff_rank=61.5\n", + " layer_1: CV=0.0970 eff_rank=151.3\n", + " layer_2: CV=0.1015 eff_rank=145.9\n", + " layer_3: CV=0.0538 eff_rank=15.9\n", + "\n", + "======================================================================\n", + "SCALE 3: 192-dim → 160-dim (17% reduction)\n", + "======================================================================\n", + " Params: 65,456 (42.5% of root)\n", + "\n", + " Projecting 192 → 160...\n", + " After transfer: acc=0.7910, cv=0.0999\n", + " layer_0: CV=0.1229 eff_rank=60.8\n", + " layer_1: CV=0.1212 eff_rank=125.2\n", + " layer_2: CV=0.1105 eff_rank=122.0\n", + " layer_3: CV=0.0496 eff_rank=15.8\n", + "\n", + " Healing toward parent CV=0.0971 (±0.05)...\n", + " Healed: 1 epochs (0.5s) → acc=0.8845, cv=0.1006\n", + " layer_0: CV=0.1307 eff_rank=60.8\n", + " layer_1: CV=0.1207 eff_rank=125.3\n", + " layer_2: CV=0.1217 eff_rank=122.1\n", + " layer_3: CV=0.0535 eff_rank=15.9\n", + "\n", + "======================================================================\n", + "SCALE 4: 160-dim → 131-dim (18% reduction)\n", + "======================================================================\n", + " Params: 45,997 (29.9% of root)\n", + "\n", + " Projecting 160 → 131...\n", + " After transfer: acc=0.7444, cv=0.1079\n", + " layer_0: CV=0.1331 eff_rank=59.9\n", + " layer_1: CV=0.1135 eff_rank=102.2\n", + " layer_2: CV=0.1180 eff_rank=100.1\n", + " layer_3: CV=0.0790 eff_rank=15.8\n", + "\n", + " Healing toward parent CV=0.1006 (±0.05)...\n", + " Healed: 1 epochs (0.5s) → acc=0.8670, cv=0.1126\n", + " layer_0: CV=0.1277 eff_rank=59.9\n", + " layer_1: CV=0.1292 eff_rank=102.2\n", + " layer_2: CV=0.1101 eff_rank=99.9\n", + " layer_3: CV=0.0695 eff_rank=15.8\n", + "\n", + "======================================================================\n", + "SCALE 5: 131-dim → 113-dim (14% reduction)\n", + "======================================================================\n", + " Params: 35,611 (23.1% of root)\n", + "\n", + " Projecting 131 → 113...\n", + " After transfer: acc=0.7944, cv=0.1136\n", + " layer_0: CV=0.1412 eff_rank=59.0\n", + " layer_1: CV=0.1237 eff_rank=88.6\n", + " layer_2: CV=0.1269 eff_rank=86.3\n", + " layer_3: CV=0.0863 eff_rank=15.8\n", + "\n", + " Healing toward parent CV=0.1126 (±0.05)...\n", + " Healed: 1 epochs (0.5s) → acc=0.8775, cv=0.1123\n", + " layer_0: CV=0.1186 eff_rank=59.0\n", + " layer_1: CV=0.1226 eff_rank=88.6\n", + " layer_2: CV=0.1189 eff_rank=86.1\n", + " layer_3: CV=0.0726 eff_rank=15.8\n", + "\n", + "======================================================================\n", + "SCALE 6: 113-dim → 97-dim (14% reduction)\n", + "======================================================================\n", + " Params: 27,467 (17.9% of root)\n", + "\n", + " Projecting 113 → 97...\n", + " After transfer: acc=0.7706, cv=0.1218\n", + " layer_0: CV=0.1447 eff_rank=57.8\n", + " layer_1: CV=0.1389 eff_rank=76.0\n", + " layer_2: CV=0.1464 eff_rank=73.9\n", + " layer_3: CV=0.0872 eff_rank=15.7\n", 
+ "\n", + " Healing toward parent CV=0.1123 (±0.05)...\n", + " Healed: 1 epochs (0.5s) → acc=0.8685, cv=0.1193\n", + " layer_0: CV=0.1383 eff_rank=57.8\n", + " layer_1: CV=0.1388 eff_rank=76.0\n", + " layer_2: CV=0.1364 eff_rank=73.5\n", + " layer_3: CV=0.0780 eff_rank=15.8\n", + "\n", + "======================================================================\n", + "SCALE 7: 97-dim → 73-dim (25% reduction)\n", + "======================================================================\n", + " Params: 17,171 (11.2% of root)\n", + "\n", + " Projecting 97 → 73...\n", + " After transfer: acc=0.6966, cv=0.1427\n", + " layer_0: CV=0.1180 eff_rank=54.3\n", + " layer_1: CV=0.1580 eff_rank=56.0\n", + " layer_2: CV=0.1436 eff_rank=54.4\n", + " layer_3: CV=0.1031 eff_rank=15.6\n", + "\n", + " Healing toward parent CV=0.1193 (±0.05)...\n", + " Healed: 1 epochs (0.5s) → acc=0.8670, cv=0.1325\n", + " layer_0: CV=0.1279 eff_rank=54.3\n", + " layer_1: CV=0.1600 eff_rank=56.0\n", + " layer_2: CV=0.1592 eff_rank=54.3\n", + " layer_3: CV=0.0945 eff_rank=15.7\n", + "\n", + "======================================================================\n", + "SCALE 8: 73-dim → 64-dim (12% reduction)\n", + "======================================================================\n", + " Params: 13,904 (9.0% of root)\n", + "\n", + " Projecting 73 → 64...\n", + " After transfer: acc=0.7416, cv=0.1433\n", + " layer_0: CV=0.1420 eff_rank=51.5\n", + " layer_1: CV=0.1614 eff_rank=49.2\n", + " layer_2: CV=0.1705 eff_rank=48.2\n", + " layer_3: CV=0.1047 eff_rank=15.6\n", + "\n", + " Healing toward parent CV=0.1325 (±0.05)...\n", + " Healed: 1 epochs (0.5s) → acc=0.8455, cv=0.1372\n", + " layer_0: CV=0.1499 eff_rank=51.6\n", + " layer_1: CV=0.1602 eff_rank=49.2\n", + " layer_2: CV=0.1749 eff_rank=47.9\n", + " layer_3: CV=0.0936 eff_rank=15.7\n", + "\n", + "======================================================================\n", + "BASELINE: Train 64-dim from scratch\n", + "======================================================================\n", + " Trained: 200 epochs → acc=0.8560, cv=0.1436\n", + "\n", + "======================================================================\n", + "DIRECT PROJECTION: 256 → 64 (single jump)\n", + "======================================================================\n", + " After direct transfer: acc=0.2960, cv=0.1740\n", + "\n", + "======================================================================\n", + "RESULTS — ITERATIVE CASCADE\n", + "======================================================================\n", + "\n", + " Scale Params Acc(proj) Acc(heal) CV(proj) CV(heal) Epochs\n", + " ──────── ──────── ────────── ────────── ───────── ───────── ───────\n", + " 256 153,872 0.8720 0.8720 0.0839 0.0839 200\n", + " 224 120,304 0.8214 0.8600 0.1000 0.0950 1\n", + " 192 90,832 0.8190 0.8855 0.0968 0.0971 1\n", + " 160 65,456 0.7910 0.8845 0.0999 0.1006 1\n", + " 131 45,997 0.7444 0.8670 0.1079 0.1126 1\n", + " 113 35,611 0.7944 0.8775 0.1136 0.1123 1\n", + " 97 27,467 0.7706 0.8685 0.1218 0.1193 1\n", + " 73 17,171 0.6966 0.8670 0.1427 0.1325 1\n", + " 64 13,904 0.7416 0.8455 0.1433 0.1372 1\n", + "\n", + " COMPARISONS\n", + " ────────────────────────────────────────────────────────────\n", + " Cascade 64-dim: acc=0.8455 cv=0.1372 (total 8 heal epochs)\n", + " Direct proj 64-dim: acc=0.2960 cv=0.1740 (0 training)\n", + " Scratch 64-dim: acc=0.8560 cv=0.1436 (200 epochs)\n", + " Chance: acc=0.0625\n", + "\n", + " COMPRESSION:\n", + " Root: 153,872 params\n", + " Target: 13,904 params 
(9.0%)\n", + " Ratio: 11.1×\n", + "\n", + " GEOMETRIC PRESERVATION:\n", + " Root CV: 0.0839\n", + " Final CV: 0.1372\n", + " Δ CV: 0.0533\n", + " Direct CV: 0.1740\n", + " Scratch CV: 0.1436\n", + "\n", + "Done.\n" + ] + } + ], + "source": [ + "# ============================================================================\n", + "# ITERATIVE MULTI-SCALE GEOMETRIC TRANSFER\n", + "#\n", + "# Cascade: 256 → 224 → 192 → 160 → 131\n", + "# At each scale:\n", + "# 1. Procrustes-project from parent\n", + "# 2. Measure accuracy + CV\n", + "# 3. Train ONLY until CV reaches parent's CV band (±tolerance)\n", + "# 4. Measure accuracy again\n", + "# 5. Project down to next scale\n", + "#\n", + "# The hypothesis: small iterative steps preserve geometric structure\n", + "# better than one large jump, because each intermediate model can\n", + "# \"heal\" the projection distortion through minimal training.\n", + "# ============================================================================\n", + "\n", + "import math\n", + "import time\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "import numpy as np\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# GEOMETRIC UTILITIES\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "def cayley_menger_vol2(pts):\n", + " with torch.amp.autocast(\"cuda\", enabled=False):\n", + " pts = pts.float()\n", + " diff = pts.unsqueeze(-2) - pts.unsqueeze(-3)\n", + " d2 = (diff * diff).sum(-1)\n", + " B, V, _ = d2.shape\n", + " cm = torch.zeros(B, V+1, V+1, device=d2.device, dtype=torch.float32)\n", + " cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2\n", + " s = (-1.0)**V; f = math.factorial(V-1)\n", + " return s / ((2.0**(V-1)) * f*f) * torch.linalg.det(cm)\n", + "\n", + "\n", + "def pentachoron_cv(embeddings, n_samples=200):\n", + " B = embeddings.shape[0]\n", + " if B < 5:\n", + " return 0.0\n", + " vols = []\n", + " for _ in range(n_samples):\n", + " idx = torch.randperm(B, device=embeddings.device)[:5]\n", + " v2 = cayley_menger_vol2(embeddings[idx].unsqueeze(0))\n", + " v = torch.sqrt(F.relu(v2[0]) + 1e-12).item()\n", + " if v > 0:\n", + " vols.append(v)\n", + " if len(vols) < 10:\n", + " return 0.0\n", + " a = np.array(vols, dtype=np.float64)\n", + " return float(a.std() / max(a.mean(), 1e-12))\n", + "\n", + "\n", + "def profile_model(model):\n", + " \"\"\"Profile all linear layers: CV, effective rank.\"\"\"\n", + " results = {}\n", + " for i, layer in enumerate(model.get_linear_layers()):\n", + " W = layer.weight.detach().float()\n", + " cv = pentachoron_cv(W, n_samples=200)\n", + " S = torch.linalg.svdvals(W)\n", + " S_norm = S / S.sum()\n", + " eff_rank = torch.exp(-torch.sum(S_norm * torch.log(S_norm + 1e-12))).item()\n", + " results[f\"layer_{i}\"] = {\"cv\": cv, \"eff_rank\": eff_rank}\n", + " mean_cv = np.mean([v[\"cv\"] for v in results.values()])\n", + " return results, mean_cv\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# TASK: Multi-class sequence pattern recognition (harder than needle)\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "class PatternTask:\n", + " \"\"\"\n", + " Multi-pattern classification. 
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "# TASK: Multi-class sequence pattern recognition (harder than needle)\n",
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "\n",
+ "class PatternTask:\n",
+ "    \"\"\"\n",
+ "    Multi-pattern classification. Each class has a distinct learned template.\n",
+ "    The model must learn to recognize WHICH pattern, not just WHERE.\n",
+ "    This forces genuine geometric restructuring during training.\n",
+ "\n",
+ "    Input: (B, seq_len) — noisy pattern\n",
+ "    Target: (B,) — pattern class\n",
+ "    \"\"\"\n",
+ "    def __init__(self, n_classes=16, seq_len=64, noise=0.3, device=\"cpu\"):\n",
+ "        self.n_classes = n_classes\n",
+ "        self.seq_len = seq_len\n",
+ "        self.noise = noise\n",
+ "        self.device = device\n",
+ "\n",
+ "        # Fixed random templates — each class has a unique pattern\n",
+ "        torch.manual_seed(42)\n",
+ "        self.templates = torch.randn(n_classes, seq_len, device=device)\n",
+ "        self.templates = F.normalize(self.templates, dim=-1)\n",
+ "\n",
+ "    def generate(self, n_samples):\n",
+ "        labels = torch.randint(0, self.n_classes, (n_samples,), device=self.device)\n",
+ "        patterns = self.templates[labels]\n",
+ "        noise = torch.randn_like(patterns) * self.noise\n",
+ "        inputs = patterns + noise\n",
+ "        return inputs, labels\n",
+ "\n",
+ "\n",
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "# MODEL\n",
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "\n",
+ "class PatternModel(nn.Module):\n",
+ "    def __init__(self, seq_len, hidden_dim, n_classes, n_layers=4):\n",
+ "        super().__init__()\n",
+ "        self.seq_len = seq_len\n",
+ "        self.hidden_dim = hidden_dim\n",
+ "        self.n_classes = n_classes\n",
+ "\n",
+ "        layers = []\n",
+ "        layers.append(nn.Linear(seq_len, hidden_dim))\n",
+ "        layers.append(nn.GELU())\n",
+ "        layers.append(nn.LayerNorm(hidden_dim))\n",
+ "        for _ in range(n_layers - 2):\n",
+ "            layers.append(nn.Linear(hidden_dim, hidden_dim))\n",
+ "            layers.append(nn.GELU())\n",
+ "            layers.append(nn.LayerNorm(hidden_dim))\n",
+ "        layers.append(nn.Linear(hidden_dim, n_classes))\n",
+ "        self.network = nn.Sequential(*layers)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        return self.network(x)\n",
+ "\n",
+ "    def get_linear_layers(self):\n",
+ "        return [m for m in self.network.modules() if isinstance(m, nn.Linear)]\n",
+ "\n",
+ "\n",
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "# PROCRUSTES PROJECTION: truncated SVD\n",
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "\n",
+ "@torch.no_grad()\n",
+ "def svd_project(W_large, out_dim, in_dim):\n",
+ "    \"\"\"Project weight matrix via truncated SVD reconstruction.\"\"\"\n",
+ "    W = W_large.float()\n",
+ "    U, S, Vt = torch.linalg.svd(W, full_matrices=True)\n",
+ "    k = min(S.shape[0], out_dim, in_dim)\n",
+ "    U_k = U[:min(W.shape[0], out_dim), :k]\n",
+ "    Vt_k = Vt[:k, :min(W.shape[1], in_dim)]\n",
+ "    W_small = U_k @ torch.diag(S[:k]) @ Vt_k\n",
+ "    result = torch.zeros(out_dim, in_dim, device=W.device)\n",
+ "    r, c = W_small.shape\n",
+ "    result[:r, :c] = W_small\n",
+ "    return result\n",
+ "\n",
+ "\n",
b)\n", + " elif b.shape[0] < to:\n", + " S.bias.data.zero_()\n", + " S.bias.data[:b.shape[0]].copy_(b)\n", + " else:\n", + " S.bias.data.copy_(b)\n", + "\n", + " # LayerNorms\n", + " src_norms = [m for m in source.network.modules() if isinstance(m, nn.LayerNorm)]\n", + " tgt_norms = [m for m in target.network.modules() if isinstance(m, nn.LayerNorm)]\n", + " for ln_s, ln_t in zip(src_norms, tgt_norms):\n", + " d = min(ln_s.weight.shape[0], ln_t.weight.shape[0])\n", + " ln_t.weight.data[:d].copy_(ln_s.weight.data[:d])\n", + " ln_t.bias.data[:d].copy_(ln_s.bias.data[:d])\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# CV-GATED TRAINING\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "def train_until_cv(model, task, target_cv, cv_tolerance=0.05,\n", + " max_epochs=200, lr=3e-4, batch_size=256):\n", + " \"\"\"\n", + " Train until mean CV reaches target ± tolerance.\n", + " Returns: epochs_used, final_acc, final_cv\n", + " \"\"\"\n", + " device = next(model.parameters()).device\n", + " train_x, train_y = task.generate(10000)\n", + " test_x, test_y = task.generate(2000)\n", + " optimizer = torch.optim.AdamW(model.parameters(), lr=lr)\n", + " scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epochs)\n", + "\n", + " for epoch in range(max_epochs):\n", + " model.train()\n", + " perm = torch.randperm(train_x.shape[0], device=device)\n", + " for i in range(0, train_x.shape[0], batch_size):\n", + " idx = perm[i:i+batch_size]\n", + " loss = F.cross_entropy(model(train_x[idx]), train_y[idx])\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " scheduler.step()\n", + "\n", + " # Check CV every 5 epochs\n", + " if (epoch + 1) % 5 == 0 or epoch == 0:\n", + " model.eval()\n", + " _, mean_cv = profile_model(model)\n", + " with torch.no_grad():\n", + " acc = (model(test_x).argmax(-1) == test_y).float().mean().item()\n", + "\n", + " if abs(mean_cv - target_cv) <= cv_tolerance:\n", + " return epoch + 1, acc, mean_cv\n", + " if acc >= 0.99 and epoch > 20:\n", + " return epoch + 1, acc, mean_cv\n", + "\n", + " # Max epochs reached\n", + " model.eval()\n", + " _, final_cv = profile_model(model)\n", + " with torch.no_grad():\n", + " final_acc = (model(test_x).argmax(-1) == test_y).float().mean().item()\n", + " return max_epochs, final_acc, final_cv\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# EXPERIMENT\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "def run_experiment():\n", + " print(\"=\" * 70)\n", + " print(\"ITERATIVE MULTI-SCALE GEOMETRIC TRANSFER\")\n", + " print(\"=\" * 70)\n", + "\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " print(f\" Device: {device}\")\n", + "\n", + " # ── Configuration ──\n", + " SEQ_LEN = 64\n", + " N_CLASSES = 16\n", + " NOISE = 0.3\n", + " N_LAYERS = 4\n", + " SCALES = [256, 224, 192, 160, 131, 113, 97, 73, 64] # cascade\n", + " CV_TOLERANCE = 0.05\n", + " MAX_HEAL_EPOCHS = 500\n", + "\n", + " print(f\"\\n Task: {N_CLASSES}-class pattern recognition, seq_len={SEQ_LEN}, noise={NOISE}\")\n", + " print(f\" Chance accuracy: {1/N_CLASSES:.1%}\")\n", + " print(f\" Scales: {' → '.join(str(s) for s in SCALES)}\")\n", + " print(f\" CV tolerance: ±{CV_TOLERANCE}\")\n", + "\n", + " task = PatternTask(N_CLASSES, SEQ_LEN, NOISE, device)\n", + " test_x, test_y = task.generate(5000)\n", + "\n", + " # 
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "# EXPERIMENT\n",
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "\n",
+ "def run_experiment():\n",
+ "    print(\"=\" * 70)\n",
+ "    print(\"ITERATIVE MULTI-SCALE GEOMETRIC TRANSFER\")\n",
+ "    print(\"=\" * 70)\n",
+ "\n",
+ "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "    print(f\" Device: {device}\")\n",
+ "\n",
+ "    # ── Configuration ──\n",
+ "    SEQ_LEN = 64\n",
+ "    N_CLASSES = 16\n",
+ "    NOISE = 0.3\n",
+ "    N_LAYERS = 4\n",
+ "    SCALES = [256, 224, 192, 160, 131, 113, 97, 73, 64]  # cascade\n",
+ "    CV_TOLERANCE = 0.05\n",
+ "    MAX_HEAL_EPOCHS = 500\n",
+ "\n",
+ "    print(f\"\\n Task: {N_CLASSES}-class pattern recognition, seq_len={SEQ_LEN}, noise={NOISE}\")\n",
+ "    print(f\" Chance accuracy: {1/N_CLASSES:.1%}\")\n",
+ "    print(f\" Scales: {' → '.join(str(s) for s in SCALES)}\")\n",
+ "    print(f\" CV tolerance: ±{CV_TOLERANCE}\")\n",
+ "\n",
+ "    task = PatternTask(N_CLASSES, SEQ_LEN, NOISE, device)\n",
+ "    test_x, test_y = task.generate(5000)\n",
+ "\n",
+ "    # ══════════════════════════════════════════════════════════\n",
+ "    # STEP 0: Train root model (256-dim) to convergence\n",
+ "    # ══════════════════════════════════════════════════════════\n",
+ "\n",
+ "    print(f\"\\n{'='*70}\")\n",
+ "    print(f\"SCALE 0: {SCALES[0]}-dim (ROOT — train from scratch)\")\n",
+ "    print(f\"{'='*70}\")\n",
+ "\n",
+ "    root = PatternModel(SEQ_LEN, SCALES[0], N_CLASSES, N_LAYERS).to(device)\n",
+ "    n_root = sum(p.numel() for p in root.parameters())\n",
+ "    print(f\" Params: {n_root:,}\")\n",
+ "\n",
+ "    # Profile before training\n",
+ "    _, cv_before = profile_model(root)\n",
+ "    print(f\" CV before training: {cv_before:.4f}\")\n",
+ "\n",
+ "    epochs, acc, cv = train_until_cv(root, task, target_cv=0.20,\n",
+ "                                     cv_tolerance=0.1, max_epochs=200)\n",
+ "    print(f\" Trained: {epochs} epochs → acc={acc:.4f}, cv={cv:.4f}\")\n",
+ "\n",
+ "    # Full profile\n",
+ "    profile, _ = profile_model(root)\n",
+ "    for name, stats in profile.items():\n",
+ "        print(f\" {name}: CV={stats['cv']:.4f} eff_rank={stats['eff_rank']:.1f}\")\n",
+ "\n",
+ "    # ══════════════════════════════════════════════════════════\n",
+ "    # ITERATIVE CASCADE\n",
+ "    # ══════════════════════════════════════════════════════════\n",
+ "\n",
+ "    results = [{\n",
+ "        \"scale\": SCALES[0],\n",
+ "        \"params\": n_root,\n",
+ "        \"acc_after_transfer\": acc,\n",
+ "        \"acc_after_heal\": acc,\n",
+ "        \"cv_after_transfer\": cv,\n",
+ "        \"cv_after_heal\": cv,\n",
+ "        \"heal_epochs\": epochs,\n",
+ "        \"source\": \"scratch\",\n",
+ "    }]\n",
+ "\n",
+ "    parent_model = root\n",
+ "    parent_cv = cv\n",
+ "\n",
+ "    for i in range(1, len(SCALES)):\n",
+ "        dim = SCALES[i]\n",
+ "        parent_dim = SCALES[i-1]\n",
+ "\n",
+ "        print(f\"\\n{'='*70}\")\n",
+ "        print(f\"SCALE {i}: {parent_dim}-dim → {dim}-dim \"\n",
+ "              f\"({(parent_dim-dim)/parent_dim:.0%} reduction)\")\n",
+ "        print(f\"{'='*70}\")\n",
+ "\n",
+ "        # Build target model\n",
+ "        child = PatternModel(SEQ_LEN, dim, N_CLASSES, N_LAYERS).to(device)\n",
+ "        n_child = sum(p.numel() for p in child.parameters())\n",
+ "        print(f\" Params: {n_child:,} ({n_child/n_root:.1%} of root)\")\n",
+ "\n",
+ "        # ── Transfer ──\n",
+ "        print(f\"\\n Projecting {parent_dim} → {dim}...\")\n",
+ "        transfer_weights(parent_model, child)\n",
+ "\n",
+ "        # Measure immediately after transfer (no training)\n",
+ "        child.eval()\n",
+ "        _, cv_transfer = profile_model(child)\n",
+ "        with torch.no_grad():\n",
+ "            acc_transfer = (child(test_x).argmax(-1) == test_y).float().mean().item()\n",
+ "        print(f\" After transfer: acc={acc_transfer:.4f}, cv={cv_transfer:.4f}\")\n",
+ "\n",
+ "        child_profile, _ = profile_model(child)\n",
+ "        for name, stats in child_profile.items():\n",
+ "            print(f\" {name}: CV={stats['cv']:.4f} eff_rank={stats['eff_rank']:.1f}\")\n",
+ "\n",
+ "        # ── Heal: train until CV matches parent ──\n",
+ "        print(f\"\\n Healing toward parent CV={parent_cv:.4f} (±{CV_TOLERANCE})...\")\n",
+ "        t0 = time.time()\n",
+ "        heal_epochs, acc_heal, cv_heal = train_until_cv(\n",
+ "            child, task, target_cv=parent_cv,\n",
+ "            cv_tolerance=CV_TOLERANCE, max_epochs=MAX_HEAL_EPOCHS)\n",
+ "        elapsed = time.time() - t0\n",
+ "        print(f\" Healed: {heal_epochs} epochs ({elapsed:.1f}s) → \"\n",
+ "              f\"acc={acc_heal:.4f}, cv={cv_heal:.4f}\")\n",
+ "\n",
+ "        # Post-heal profile\n",
+ "        heal_profile, _ = profile_model(child)\n",
+ "        for name, stats in heal_profile.items():\n",
+ "            print(f\" {name}: CV={stats['cv']:.4f} eff_rank={stats['eff_rank']:.1f}\")\n",
+ "\n",
+ "        results.append({\n",
+ "            \"scale\": dim,\n",
\"params\": n_child,\n", + " \"acc_after_transfer\": acc_transfer,\n", + " \"acc_after_heal\": acc_heal,\n", + " \"cv_after_transfer\": cv_transfer,\n", + " \"cv_after_heal\": cv_heal,\n", + " \"heal_epochs\": heal_epochs,\n", + " \"source\": f\"projected from {parent_dim}\",\n", + " })\n", + "\n", + " # This child becomes next parent\n", + " parent_model = child\n", + " parent_cv = cv_heal\n", + "\n", + " # ══════════════════════════════════════════════════════════\n", + " # BASELINE: Train 131-dim from scratch\n", + " # ══════════════════════════════════════════════════════════\n", + "\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"BASELINE: Train {SCALES[-1]}-dim from scratch\")\n", + " print(f\"{'='*70}\")\n", + "\n", + " baseline = PatternModel(SEQ_LEN, SCALES[-1], N_CLASSES, N_LAYERS).to(device)\n", + " n_base = sum(p.numel() for p in baseline.parameters())\n", + " base_epochs, base_acc, base_cv = train_until_cv(\n", + " baseline, task, target_cv=0.20,\n", + " cv_tolerance=0.05, max_epochs=200)\n", + " print(f\" Trained: {base_epochs} epochs → acc={base_acc:.4f}, cv={base_cv:.4f}\")\n", + "\n", + " # ══════════════════════════════════════════════════════════\n", + " # DIRECT PROJECTION: 256 → 131 (single jump baseline)\n", + " # ══════════════════════════════════════════════════════════\n", + "\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"DIRECT PROJECTION: {SCALES[0]} → {SCALES[-1]} (single jump)\")\n", + " print(f\"{'='*70}\")\n", + "\n", + " direct = PatternModel(SEQ_LEN, SCALES[-1], N_CLASSES, N_LAYERS).to(device)\n", + " transfer_weights(root, direct)\n", + " direct.eval()\n", + " _, direct_cv = profile_model(direct)\n", + " with torch.no_grad():\n", + " direct_acc = (direct(test_x).argmax(-1) == test_y).float().mean().item()\n", + " print(f\" After direct transfer: acc={direct_acc:.4f}, cv={direct_cv:.4f}\")\n", + "\n", + " # ══════════════════════════════════════════════════════════\n", + " # FINAL REPORT\n", + " # ══════════════════════════════════════════════════════════\n", + "\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"RESULTS — ITERATIVE CASCADE\")\n", + " print(f\"{'='*70}\\n\")\n", + "\n", + " print(f\" {'Scale':<8s} {'Params':>8s} {'Acc(proj)':>10s} {'Acc(heal)':>10s} \"\n", + " f\"{'CV(proj)':>9s} {'CV(heal)':>9s} {'Epochs':>7s}\")\n", + " print(f\" {'─'*8} {'─'*8} {'─'*10} {'─'*10} {'─'*9} {'─'*9} {'─'*7}\")\n", + "\n", + " total_heal_epochs = 0\n", + " for r in results:\n", + " print(f\" {r['scale']:<8d} {r['params']:>8,} {r['acc_after_transfer']:>10.4f} \"\n", + " f\"{r['acc_after_heal']:>10.4f} {r['cv_after_transfer']:>9.4f} \"\n", + " f\"{r['cv_after_heal']:>9.4f} {r['heal_epochs']:>7d}\")\n", + " if r['source'] != 'scratch':\n", + " total_heal_epochs += r['heal_epochs']\n", + "\n", + " print(f\"\\n {'COMPARISONS':}\")\n", + " print(f\" {'─'*60}\")\n", + " print(f\" Cascade {SCALES[-1]}-dim: acc={results[-1]['acc_after_heal']:.4f} \"\n", + " f\"cv={results[-1]['cv_after_heal']:.4f} \"\n", + " f\"(total {total_heal_epochs} heal epochs)\")\n", + " print(f\" Direct proj {SCALES[-1]}-dim: acc={direct_acc:.4f} \"\n", + " f\"cv={direct_cv:.4f} (0 training)\")\n", + " print(f\" Scratch {SCALES[-1]}-dim: acc={base_acc:.4f} \"\n", + " f\"cv={base_cv:.4f} ({base_epochs} epochs)\")\n", + " print(f\" Chance: acc={1/N_CLASSES:.4f}\")\n", + "\n", + " print(f\"\\n COMPRESSION:\")\n", + " print(f\" Root: {n_root:>8,} params\")\n", + " print(f\" Target: {n_child:>8,} params ({n_child/n_root:.1%})\")\n", + " print(f\" Ratio: {n_root/n_child:.1f}×\")\n", + "\n", + " # 
+ "    # ── Geometric preservation ──\n",
+ "    root_cv = results[0][\"cv_after_heal\"]\n",
+ "    final_cv = results[-1][\"cv_after_heal\"]\n",
+ "    print(f\"\\n GEOMETRIC PRESERVATION:\")\n",
+ "    print(f\" Root CV: {root_cv:.4f}\")\n",
+ "    print(f\" Final CV: {final_cv:.4f}\")\n",
+ "    print(f\" Δ CV: {abs(root_cv - final_cv):.4f}\")\n",
+ "    print(f\" Direct CV: {direct_cv:.4f}\")\n",
+ "    print(f\" Scratch CV: {base_cv:.4f}\")\n",
+ "\n",
+ "    print(f\"\\nDone.\")\n",
+ "    return results\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    results = run_experiment()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# ============================================================================\n",
+ "# OPTIMAL SCALING RATIO EXPERIMENT\n",
+ "#\n",
+ "# Sweep: What ratio between consecutive scales minimizes accuracy loss\n",
+ "# while maximizing compression?\n",
+ "#\n",
+ "# For each ratio r ∈ {0.50, 0.55, 0.60, 0.618, 0.65, 0.70, 0.707, 0.75,\n",
+ "#                     0.80, 0.85, 0.90, 0.95}:\n",
+ "#   - Build cascade from 256 → 64 using steps of dim[i+1] = round(dim[i] * r)\n",
+ "#   - Apply iterative project + 1-epoch heal at each step\n",
+ "#   - Measure: final accuracy, total heal epochs, CV preservation\n",
+ "#\n",
+ "# Natural candidates:\n",
+ "#   φ⁻¹ = 0.6180 (golden ratio inverse — nature's scaling constant)\n",
+ "#   2⁻⁰·⁵ = 0.7071 (inverse sqrt 2 — octave halving)\n",
+ "#   1-0.29514 = 0.7049 (Phil's recurring ratio complement)\n",
+ "#   e⁻¹ = 0.3679 (too aggressive — defined below but left out of the sweep)\n",
+ "# ============================================================================\n",
+ "\n",
+ "import math\n",
+ "import time\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "import torch.nn.functional as F\n",
+ "import numpy as np\n",
+ "\n",
+ "\n",
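+ "# Rough step-count arithmetic for the sweep (illustration only, never called;\n",
+ "# build_cascade below is authoritative and its rounding can shift counts by one).\n",
+ "# A cascade from start_dim to end_dim at constant ratio r needs the smallest n\n",
+ "# with start_dim * r**n <= end_dim, i.e. n = ceil(log(end/start) / log(r)).\n",
+ "def _expected_steps(start_dim, end_dim, ratio):\n",
+ "    # tiny epsilon guards float edge cases such as ratio = 1/sqrt(2) exactly\n",
+ "    return math.ceil(math.log(end_dim / start_dim) / math.log(ratio) - 1e-9)\n",
+ "\n",
+ "# e.g. 256 → 64: r=0.500 → 2 steps, r=0.618 → 3, r=0.707 → 4, r=0.950 → ~28.\n",
+ "\n",
+ "\n",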
PatternTask:\n", + " def __init__(self, n_classes=16, seq_len=64, noise=0.3, device=\"cpu\"):\n", + " self.n_classes = n_classes\n", + " self.seq_len = seq_len\n", + " self.noise = noise\n", + " self.device = device\n", + " torch.manual_seed(42)\n", + " self.templates = F.normalize(\n", + " torch.randn(n_classes, seq_len, device=device), dim=-1)\n", + "\n", + " def generate(self, n_samples):\n", + " labels = torch.randint(0, self.n_classes, (n_samples,), device=self.device)\n", + " patterns = self.templates[labels]\n", + " return patterns + torch.randn_like(patterns) * self.noise, labels\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# MODEL\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "class PatternModel(nn.Module):\n", + " def __init__(self, seq_len, hidden_dim, n_classes, n_layers=4):\n", + " super().__init__()\n", + " layers = []\n", + " layers.append(nn.Linear(seq_len, hidden_dim))\n", + " layers.append(nn.GELU())\n", + " layers.append(nn.LayerNorm(hidden_dim))\n", + " for _ in range(n_layers - 2):\n", + " layers.append(nn.Linear(hidden_dim, hidden_dim))\n", + " layers.append(nn.GELU())\n", + " layers.append(nn.LayerNorm(hidden_dim))\n", + " layers.append(nn.Linear(hidden_dim, n_classes))\n", + " self.network = nn.Sequential(*layers)\n", + "\n", + " def forward(self, x):\n", + " return self.network(x)\n", + "\n", + " def get_linear_layers(self):\n", + " return [m for m in self.network.modules() if isinstance(m, nn.Linear)]\n", + "\n", + "\n", + "# ══════════════════════════���═══════════════════════════════════════\n", + "# PROJECTION + HEALING\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def svd_project(W_large, out_dim, in_dim):\n", + " W = W_large.float()\n", + " U, S, Vt = torch.linalg.svd(W, full_matrices=True)\n", + " k = min(S.shape[0], out_dim, in_dim)\n", + " U_k = U[:min(W.shape[0], out_dim), :k]\n", + " Vt_k = Vt[:k, :min(W.shape[1], in_dim)]\n", + " W_small = U_k @ torch.diag(S[:k]) @ Vt_k\n", + " result = torch.zeros(out_dim, in_dim, device=W.device)\n", + " r, c = W_small.shape\n", + " result[:r, :c] = W_small\n", + " return result\n", + "\n", + "\n", + "@torch.no_grad()\n", + "def transfer_weights(source, target):\n", + " src_layers = source.get_linear_layers()\n", + " tgt_layers = target.get_linear_layers()\n", + " for L, Sm in zip(src_layers, tgt_layers):\n", + " to, ti = Sm.weight.shape\n", + " Sm.weight.data.copy_(svd_project(L.weight.data, to, ti))\n", + " if L.bias is not None and Sm.bias is not None:\n", + " b = L.bias.data.float()\n", + " if b.shape[0] > to:\n", + " U, _, _ = torch.linalg.svd(L.weight.data.float(), full_matrices=True)\n", + " Sm.bias.data.copy_(U[:, :to].T @ b)\n", + " elif b.shape[0] < to:\n", + " Sm.bias.data.zero_()\n", + " Sm.bias.data[:b.shape[0]].copy_(b)\n", + " else:\n", + " Sm.bias.data.copy_(b)\n", + " src_norms = [m for m in source.network.modules() if isinstance(m, nn.LayerNorm)]\n", + " tgt_norms = [m for m in target.network.modules() if isinstance(m, nn.LayerNorm)]\n", + " for ln_s, ln_t in zip(src_norms, tgt_norms):\n", + " d = min(ln_s.weight.shape[0], ln_t.weight.shape[0])\n", + " ln_t.weight.data[:d].copy_(ln_s.weight.data[:d])\n", + " ln_t.bias.data[:d].copy_(ln_s.bias.data[:d])\n", + "\n", + "\n", + "def heal_one_epoch(model, task, batch_size=256, lr=3e-4):\n", + " \"\"\"Single healing epoch. 
+ "def heal_one_epoch(model, task, batch_size=256, lr=3e-4):\n",
+ "    \"\"\"Run a single healing epoch in place; returns the model.\"\"\"\n",
+ "    device = next(model.parameters()).device\n",
+ "    train_x, train_y = task.generate(10000)\n",
+ "    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)\n",
+ "    model.train()\n",
+ "    perm = torch.randperm(train_x.shape[0], device=device)\n",
+ "    for i in range(0, train_x.shape[0], batch_size):\n",
+ "        idx = perm[i:i+batch_size]\n",
+ "        loss = F.cross_entropy(model(train_x[idx]), train_y[idx])\n",
+ "        optimizer.zero_grad()\n",
+ "        loss.backward()\n",
+ "        optimizer.step()\n",
+ "    return model\n",
+ "\n",
+ "\n",
+ "def evaluate(model, test_x, test_y):\n",
+ "    model.eval()\n",
+ "    with torch.no_grad():\n",
+ "        return (model(test_x).argmax(-1) == test_y).float().mean().item()\n",
+ "\n",
+ "\n",
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "# BUILD SCALE CASCADE FOR A GIVEN RATIO\n",
+ "# ══════════════════════════════════════════════════════════════════\n",
+ "\n",
+ "def build_cascade(start_dim, end_dim, ratio):\n",
+ "    \"\"\"Generate dimension sequence: start, round(start*r), round(start*r²), ... until ≤ end.\"\"\"\n",
+ "    dims = [start_dim]\n",
+ "    while True:\n",
+ "        next_dim = max(round(dims[-1] * ratio), end_dim)\n",
+ "        if next_dim >= dims[-1]:\n",
+ "            # Ratio too close to 1, force a step down\n",
+ "            next_dim = dims[-1] - 1\n",
+ "        if next_dim <= end_dim:\n",
+ "            if dims[-1] != end_dim:\n",
+ "                dims.append(end_dim)\n",
+ "            break\n",
+ "        dims.append(next_dim)\n",
+ "    return dims\n",
+ "\n",
+ "\n",
+ "def run_cascade(root_model, task, test_x, test_y, scales, device):\n",
+ "    \"\"\"\n",
+ "    Run a full cascade: project + heal at each scale.\n",
+ "    Returns: final_acc, total_heal_epochs, final_cv, per-step data.\n",
+ "    \"\"\"\n",
+ "    parent = root_model\n",
+ "    steps = []\n",
+ "    total_epochs = 0\n",
+ "\n",
+ "    for i in range(1, len(scales)):\n",
+ "        dim = scales[i]\n",
+ "        child = PatternModel(task.seq_len, dim, task.n_classes, 4).to(device)\n",
+ "        transfer_weights(parent, child)\n",
+ "\n",
+ "        acc_proj = evaluate(child, test_x, test_y)\n",
+ "        cv_proj = profile_model(child)\n",
+ "\n",
+ "        # 1 healing epoch\n",
+ "        heal_one_epoch(child, task)\n",
+ "        total_epochs += 1\n",
+ "\n",
+ "        acc_heal = evaluate(child, test_x, test_y)\n",
+ "        cv_heal = profile_model(child)\n",
+ "\n",
+ "        steps.append({\n",
+ "            \"from\": scales[i-1], \"to\": dim,\n",
+ "            \"acc_proj\": acc_proj, \"acc_heal\": acc_heal,\n",
+ "            \"cv_proj\": cv_proj, \"cv_heal\": cv_heal,\n",
+ "        })\n",
+ "\n",
+ "        parent = child\n",
+ "\n",
+ "    return acc_heal, total_epochs, cv_heal, steps\n",
+ "\n",
+ "\n",
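+ "# For illustration, the cascades build_cascade generates for a few ratios\n",
+ "# (hand-traced from the rounding rules above; the sweep printout is authoritative):\n",
+ "#   build_cascade(256, 64, 0.500) → [256, 128, 64]           (2 steps)\n",
+ "#   build_cascade(256, 64, 0.618) → [256, 158, 98, 64]       (3 steps)\n",
+ "#   build_cascade(256, 64, 0.707) → [256, 181, 128, 91, 64]  (4 steps)\n",
+ "\n",
+ "\n",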
+ " (0.65, \"0.650\"),\n", + " (0.70, \"0.700\"),\n", + " (PHIL_COMP, f\"0.705 (1-0.295)\"),\n", + " (SQRT2_INV, f\"0.707 (1/√2)\"),\n", + " (0.75, \"0.750\"),\n", + " (0.80, \"0.800\"),\n", + " (0.85, \"0.850\"),\n", + " (0.90, \"0.900\"),\n", + " (0.95, \"0.950\"),\n", + " ]\n", + "\n", + " task = PatternTask(N_CLASSES, SEQ_LEN, NOISE, device)\n", + " test_x, test_y = task.generate(5000)\n", + "\n", + " print(f\"\\n Task: {N_CLASSES}-class, seq_len={SEQ_LEN}, noise={NOISE}\")\n", + " print(f\" Compression: {START_DIM} → {END_DIM}\")\n", + " print(f\" Testing {len(RATIOS)} scaling ratios\")\n", + "\n", + " # ── Train root model ──\n", + " print(f\"\\n Training root model ({START_DIM}-dim)...\")\n", + " root = PatternModel(SEQ_LEN, START_DIM, N_CLASSES, N_LAYERS).to(device)\n", + " optimizer = torch.optim.AdamW(root.parameters(), lr=3e-4)\n", + " scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)\n", + " train_x, train_y = task.generate(10000)\n", + "\n", + " for epoch in range(200):\n", + " root.train()\n", + " perm = torch.randperm(10000, device=device)\n", + " for i in range(0, 10000, 256):\n", + " idx = perm[i:i+256]\n", + " loss = F.cross_entropy(root(train_x[idx]), train_y[idx])\n", + " optimizer.zero_grad(); loss.backward(); optimizer.step()\n", + " scheduler.step()\n", + " if (epoch+1) % 50 == 0:\n", + " acc = evaluate(root, test_x, test_y)\n", + " print(f\" Epoch {epoch+1}: acc={acc:.4f}\")\n", + "\n", + " root_acc = evaluate(root, test_x, test_y)\n", + " root_cv = profile_model(root)\n", + " print(f\" Root: acc={root_acc:.4f}, cv={root_cv:.4f}\")\n", + "\n", + " # ── Sweep ratios ──\n", + " print(f\"\\n{'='*70}\")\n", + " print(\"RATIO SWEEP\")\n", + " print(f\"{'='*70}\\n\")\n", + "\n", + " results = []\n", + "\n", + " for ratio, name in RATIOS:\n", + " scales = build_cascade(START_DIM, END_DIM, ratio)\n", + " n_steps = len(scales) - 1\n", + "\n", + " t0 = time.time()\n", + " final_acc, total_epochs, final_cv, steps = run_cascade(\n", + " root, task, test_x, test_y, scales, device)\n", + " elapsed = time.time() - t0\n", + "\n", + " # Compute efficiency metric: accuracy retained per heal epoch\n", + " acc_retained = final_acc / max(root_acc, 1e-8)\n", + " efficiency = acc_retained / max(total_epochs, 1)\n", + "\n", + " result = {\n", + " \"ratio\": ratio,\n", + " \"name\": name,\n", + " \"scales\": scales,\n", + " \"n_steps\": n_steps,\n", + " \"final_acc\": final_acc,\n", + " \"final_cv\": final_cv,\n", + " \"total_epochs\": total_epochs,\n", + " \"acc_retained\": acc_retained,\n", + " \"efficiency\": efficiency,\n", + " \"elapsed\": elapsed,\n", + " \"steps\": steps,\n", + " }\n", + " results.append(result)\n", + "\n", + " scale_str = \"→\".join(str(s) for s in scales)\n", + " print(f\" r={ratio:.3f} ({name:20s}): {n_steps} steps \"\n", + " f\"acc={final_acc:.4f} ret={acc_retained:.1%} \"\n", + " f\"cv={final_cv:.4f} eff={efficiency:.4f} \"\n", + " f\"[{scale_str}]\")\n", + "\n", + " # ── Direct jump baseline ──\n", + " direct = PatternModel(SEQ_LEN, END_DIM, N_CLASSES, N_LAYERS).to(device)\n", + " transfer_weights(root, direct)\n", + " direct_acc = evaluate(direct, test_x, test_y)\n", + " heal_one_epoch(direct, task)\n", + " direct_heal_acc = evaluate(direct, test_x, test_y)\n", + "\n", + " # ══════════════════════════════════════════════════════════════\n", + " # ANALYSIS\n", + " # ══════════════════════════════════════════════════════════════\n", + "\n", + " # Sort by final accuracy\n", + " results.sort(key=lambda x: x[\"final_acc\"], 
reverse=True)\n", + "\n", + " print(f\"\\n{'='*70}\")\n", + " print(\"RESULTS — SORTED BY ACCURACY\")\n", + " print(f\"{'='*70}\\n\")\n", + "\n", + " print(f\" {'Ratio':<22s} {'Steps':>5s} {'Acc':>7s} {'Retained':>9s} \"\n", + " f\"{'CV':>7s} {'Epochs':>6s} {'Eff':>7s}\")\n", + " print(f\" {'─'*22} {'─'*5} {'─'*7} {'─'*9} {'─'*7} {'─'*6} {'─'*7}\")\n", + "\n", + " for r in results:\n", + " marker = \" ★\" if \"golden\" in r[\"name\"] or \"0.295\" in r[\"name\"] or \"√2\" in r[\"name\"] else \"\"\n", + " print(f\" {r['name']:<22s} {r['n_steps']:>5d} {r['final_acc']:>7.4f} \"\n", + " f\"{r['acc_retained']:>8.1%} {r['final_cv']:>7.4f} \"\n", + " f\"{r['total_epochs']:>6d} {r['efficiency']:>7.4f}{marker}\")\n", + "\n", + " print(f\"\\n {'Direct 256→64':22s} {'1':>5s} {direct_heal_acc:>7.4f} \"\n", + " f\"{direct_heal_acc/root_acc:>8.1%} {'—':>7s} {'1':>6s}\")\n", + " print(f\" {'Root (256)':22s} {'—':>5s} {root_acc:>7.4f} \"\n", + " f\"{'100.0%':>9s} {root_cv:>7.4f} {'200':>6s}\")\n", + "\n", + " # ── Find optimal ──\n", + " best = results[0]\n", + " print(f\"\\n OPTIMAL RATIO: {best['name']}\")\n", + " print(f\" Accuracy: {best['final_acc']:.4f} ({best['acc_retained']:.1%} retained)\")\n", + " print(f\" Steps: {best['n_steps']}\")\n", + " print(f\" Scales: {'→'.join(str(s) for s in best['scales'])}\")\n", + " print(f\" CV: {best['final_cv']:.4f} (root: {root_cv:.4f})\")\n", + "\n", + " # ── Natural constant comparison ──\n", + " phi_result = next(r for r in results if \"golden\" in r[\"name\"])\n", + " sqrt2_result = next(r for r in results if \"√2\" in r[\"name\"])\n", + " phil_result = next(r for r in results if \"0.295\" in r[\"name\"])\n", + "\n", + " print(f\"\\n NATURAL CONSTANTS:\")\n", + " print(f\" 1/φ (0.618): acc={phi_result['final_acc']:.4f} \"\n", + " f\"steps={phi_result['n_steps']} scales={'→'.join(str(s) for s in phi_result['scales'])}\")\n", + " print(f\" 1/√2 (0.707): acc={sqrt2_result['final_acc']:.4f} \"\n", + " f\"steps={sqrt2_result['n_steps']} scales={'→'.join(str(s) for s in sqrt2_result['scales'])}\")\n", + " print(f\" 1-0.295(0.705): acc={phil_result['final_acc']:.4f} \"\n", + " f\"steps={phil_result['n_steps']} scales={'→'.join(str(s) for s in phil_result['scales'])}\")\n", + "\n", + " # ── Pareto analysis: accuracy vs training cost ──\n", + " print(f\"\\n PARETO FRONTIER (accuracy vs epochs):\")\n", + " print(f\" {'─'*50}\")\n", + " pareto = []\n", + " best_acc_so_far = 0\n", + " for r in sorted(results, key=lambda x: x[\"total_epochs\"]):\n", + " if r[\"final_acc\"] > best_acc_so_far:\n", + " best_acc_so_far = r[\"final_acc\"]\n", + " pareto.append(r)\n", + " print(f\" {r['total_epochs']:2d} epochs → {r['final_acc']:.4f} \"\n", + " f\"({r['name']})\")\n", + "\n", + " print(f\"\\nDone.\")\n", + " return results\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " results = run_experiment()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gZk3JSs4UCke", + "outputId": "2e8b6628-056e-44f0-c168-931134031e84" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "======================================================================\n", + "OPTIMAL SCALING RATIO EXPERIMENT\n", + "======================================================================\n", + " Device: cuda\n", + "\n", + " Task: 16-class, seq_len=64, noise=0.3\n", + " Compression: 256 → 64\n", + " Testing 13 scaling ratios\n", + "\n", + " Training root model (256-dim)...\n", + " Epoch 50: acc=0.8788\n", + 
" Epoch 100: acc=0.8766\n", + " Epoch 150: acc=0.8746\n", + " Epoch 200: acc=0.8746\n", + " Root: acc=0.8746, cv=0.0929\n", + "\n", + "======================================================================\n", + "RATIO SWEEP\n", + "======================================================================\n", + "\n", + " r=0.500 (0.500 (halving) ): 2 steps acc=0.7724 ret=88.3% cv=0.1617 eff=0.4416 [256→128→64]\n", + " r=0.550 (0.550 ): 3 steps acc=0.8236 ret=94.2% cv=0.1478 eff=0.3139 [256→141→78→64]\n", + " r=0.600 (0.600 ): 3 steps acc=0.8148 ret=93.2% cv=0.1406 eff=0.3105 [256→154→92→64]\n", + " r=0.618 (0.618 (1/φ golden) ): 3 steps acc=0.8114 ret=92.8% cv=0.1378 eff=0.3092 [256→158→98→64]\n", + " r=0.650 (0.650 ): 4 steps acc=0.8416 ret=96.2% cv=0.1357 eff=0.2406 [256→166→108→70→64]\n", + " r=0.700 (0.700 ): 4 steps acc=0.8248 ret=94.3% cv=0.1418 eff=0.2358 [256→179→125→88→64]\n", + " r=0.705 (0.705 (1-0.295) ): 4 steps acc=0.8232 ret=94.1% cv=0.1378 eff=0.2353 [256→180→127→90→64]\n", + " r=0.707 (0.707 (1/√2) ): 4 steps acc=0.8194 ret=93.7% cv=0.1404 eff=0.2342 [256→181→128→91→64]\n", + " r=0.750 (0.750 ): 5 steps acc=0.8350 ret=95.5% cv=0.1483 eff=0.1909 [256→192→144→108→81→64]\n", + " r=0.800 (0.800 ): 7 steps acc=0.8626 ret=98.6% cv=0.1432 eff=0.1409 [256→205→164→131→105→84→67→64]\n", + " r=0.850 (0.850 ): 9 steps acc=0.8680 ret=99.2% cv=0.1400 eff=0.1103 [256→218→185→157→133→113→96→82→70→64]\n", + " r=0.900 (0.900 ): 14 steps acc=0.8852 ret=101.2% cv=0.1316 eff=0.0723 [256→230→207→186→167→150→135→122→110→99→89→80→72→65→64]\n", + " r=0.950 (0.950 ): 27 steps acc=0.8916 ret=101.9% cv=0.1318 eff=0.0378 [256→243→231→219→208→198→188→179→170→162→154→146→139→132→125→119→113→107→102→97→92→87→83→79→75→71→67→64]\n", + "\n", + "======================================================================\n", + "RESULTS — SORTED BY ACCURACY\n", + "======================================================================\n", + "\n", + " Ratio Steps Acc Retained CV Epochs Eff\n", + " ────────────────────── ───── ─────── ───────── ─────── ────── ─────���─\n", + " 0.950 27 0.8916 101.9% 0.1318 27 0.0378\n", + " 0.900 14 0.8852 101.2% 0.1316 14 0.0723\n", + " 0.850 9 0.8680 99.2% 0.1400 9 0.1103\n", + " 0.800 7 0.8626 98.6% 0.1432 7 0.1409\n", + " 0.650 4 0.8416 96.2% 0.1357 4 0.2406\n", + " 0.750 5 0.8350 95.5% 0.1483 5 0.1909\n", + " 0.700 4 0.8248 94.3% 0.1418 4 0.2358\n", + " 0.550 3 0.8236 94.2% 0.1478 3 0.3139\n", + " 0.705 (1-0.295) 4 0.8232 94.1% 0.1378 4 0.2353 ★\n", + " 0.707 (1/√2) 4 0.8194 93.7% 0.1404 4 0.2342 ★\n", + " 0.600 3 0.8148 93.2% 0.1406 3 0.3105\n", + " 0.618 (1/φ golden) 3 0.8114 92.8% 0.1378 3 0.3092 ★\n", + " 0.500 (halving) 2 0.7724 88.3% 0.1617 2 0.4416\n", + "\n", + " Direct 256→64 1 0.7426 84.9% — 1\n", + " Root (256) — 0.8746 100.0% 0.0929 200\n", + "\n", + " OPTIMAL RATIO: 0.950\n", + " Accuracy: 0.8916 (101.9% retained)\n", + " Steps: 27\n", + " Scales: 256→243→231→219→208→198→188→179→170→162→154→146→139→132→125→119→113→107→102→97→92→87→83→79→75→71→67→64\n", + " CV: 0.1318 (root: 0.0929)\n", + "\n", + " NATURAL CONSTANTS:\n", + " 1/φ (0.618): acc=0.8114 steps=3 scales=256→158→98→64\n", + " 1/√2 (0.707): acc=0.8194 steps=4 scales=256→181→128→91→64\n", + " 1-0.295(0.705): acc=0.8232 steps=4 scales=256→180→127→90→64\n", + "\n", + " PARETO FRONTIER (accuracy vs epochs):\n", + " ──────────────────────────────────────────────────\n", + " 2 epochs → 0.7724 (0.500 (halving))\n", + " 3 epochs → 0.8236 (0.550)\n", + " 4 epochs → 0.8416 (0.650)\n", + " 7 epochs → 0.8626 (0.800)\n", + " 9 
epochs → 0.8680 (0.850)\n", + " 14 epochs → 0.8852 (0.900)\n", + " 27 epochs → 0.8916 (0.950)\n", + "\n", + "Done.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# BERT rescaling" + ], + "metadata": { + "id": "0wpUVBCiXmJg" + } + }, + { + "cell_type": "code", + "source": [ + "# ============================================================================\n", + "# ITERATIVE GEOMETRIC CASCADE ON PRETRAINED BERT\n", + "#\n", + "# Take BERT-base (768-dim, 12 layers, 110M params) and cascade it down:\n", + "# 768 → 720 → 672 → 624 → 576 → 528 → 480 → 432 → 384\n", + "#\n", + "# At each scale:\n", + "# 1. SVD-project ALL weight matrices from parent\n", + "# 2. Evaluate MLM accuracy (can it still predict masked words?)\n", + "# 3. Heal with HEAL_EPOCHS epochs of MLM\n", + "# 4. Project to next scale\n", + "#\n", + "# No fine-tuning on any downstream task. Pure compression of pretrained\n", + "# knowledge via geometric projection.\n", + "# ============================================================================\n", + "\n", + "import math\n", + "import time\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "import numpy as np\n", + "from dataclasses import dataclass\n", + "from typing import Dict, List, Tuple, Optional\n", + "from transformers import (\n", + " BertForMaskedLM, BertTokenizer, BertConfig,\n", + " DataCollatorForLanguageModeling\n", + ")\n", + "from datasets import load_dataset\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# GEOMETRIC UTILITIES\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "def cayley_menger_vol2(pts):\n", + " with torch.amp.autocast(\"cuda\", enabled=False):\n", + " pts = pts.float()\n", + " diff = pts.unsqueeze(-2) - pts.unsqueeze(-3)\n", + " d2 = (diff * diff).sum(-1)\n", + " B, V, _ = d2.shape\n", + " cm = torch.zeros(B, V+1, V+1, device=d2.device, dtype=torch.float32)\n", + " cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2\n", + " s = (-1.0)**V; f = math.factorial(V-1)\n", + " return s / ((2.0**(V-1)) * f*f) * torch.linalg.det(cm)\n", + "\n", + "\n", + "def pentachoron_cv(W, n_samples=200):\n", + " \"\"\"CV of sampled 5-point (pentachoron) volumes over weight-matrix rows.\"\"\"\n", + " if W.dim() != 2 or W.shape[0] < 5:\n", + " return 0.0\n", + " B = W.shape[0]\n", + " vols = []\n", + " for _ in range(n_samples):\n", + " idx = torch.randperm(B, device=W.device)[:5]\n", + " v2 = cayley_menger_vol2(W[idx].unsqueeze(0))\n", + " v = torch.sqrt(F.relu(v2[0]) + 1e-12).item()\n", + " if v > 0:\n", + " vols.append(v)\n", + " if len(vols) < 10:\n", + " return 0.0\n", + " a = np.array(vols, dtype=np.float64)\n", + " return float(a.std() / max(a.mean(), 1e-12))\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# SVD PROJECTION\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def svd_project_matrix(W, out_dim, in_dim):\n", + " \"\"\"Project weight matrix via truncated SVD.\"\"\"\n", + " W = W.float()\n", + " U, S, Vt = torch.linalg.svd(W, full_matrices=True)\n", + " k = min(S.shape[0], out_dim, in_dim)\n", + " U_k = U[:min(W.shape[0], out_dim), :k]\n", + " Vt_k = Vt[:k, :min(W.shape[1], in_dim)]\n", + " W_small = U_k @ torch.diag(S[:k]) @ Vt_k\n", + " result = torch.zeros(out_dim, in_dim, dtype=W.dtype, device=W.device)\n", + " r, c = W_small.shape\n", + " result[:r, :c] = W_small\n", + " return result\n", + "\n", + "\n", + "@torch.no_grad()\n", + "def 
svd_project_vector(v, target_dim):\n", + " \"\"\"Project 1D vector (bias, layernorm) by truncation or padding.\"\"\"\n", + " if v.shape[0] == target_dim:\n", + " return v.clone()\n", + " elif v.shape[0] > target_dim:\n", + " return v[:target_dim].clone()\n", + " else:\n", + " result = torch.zeros(target_dim, dtype=v.dtype, device=v.device)\n", + " result[:v.shape[0]] = v\n", + " return result\n", + "\n", + "\n", + "@torch.no_grad()\n", + "def svd_project_embedding(E, target_dim):\n", + " \"\"\"Project embedding matrix (vocab_size, hidden) → (vocab_size, target_dim).\"\"\"\n", + " E = E.float()\n", + " # Keep all vocab rows, reduce hidden dim via SVD on the embedding matrix\n", + " U, S, Vt = torch.linalg.svd(E, full_matrices=False)\n", + " k = min(S.shape[0], target_dim)\n", + " # Reconstruct at reduced dimension\n", + " projected = U[:, :k] @ torch.diag(S[:k]) @ Vt[:k, :target_dim]\n", + " if projected.shape[1] < target_dim:\n", + " result = torch.zeros(E.shape[0], target_dim, dtype=E.dtype, device=E.device)\n", + " result[:, :projected.shape[1]] = projected\n", + " return result\n", + " return projected\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# BERT WEIGHT TRANSFER\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def create_scaled_bert(source_model, target_hidden, target_intermediate, device):\n", + " \"\"\"\n", + " Create a new BERT with smaller hidden/intermediate dims,\n", + " SVD-projecting all weights from source.\n", + " \"\"\"\n", + " src_config = source_model.config\n", + " src_hidden = src_config.hidden_size\n", + " src_inter = src_config.intermediate_size\n", + " n_heads = src_config.num_attention_heads\n", + " head_dim = target_hidden // n_heads\n", + "\n", + " # New config\n", + " new_config = BertConfig(\n", + " vocab_size=src_config.vocab_size,\n", + " hidden_size=target_hidden,\n", + " num_hidden_layers=src_config.num_hidden_layers,\n", + " num_attention_heads=n_heads,\n", + " intermediate_size=target_intermediate,\n", + " max_position_embeddings=src_config.max_position_embeddings,\n", + " type_vocab_size=src_config.type_vocab_size,\n", + " hidden_act=src_config.hidden_act,\n", + " hidden_dropout_prob=0.0,\n", + " attention_probs_dropout_prob=0.0,\n", + " )\n", + "\n", + " new_model = BertForMaskedLM(new_config).to(device)\n", + " src_sd = source_model.state_dict()\n", + " new_sd = new_model.state_dict()\n", + "\n", + " transferred = {}\n", + "\n", + " for name, param in new_sd.items():\n", + " if name not in src_sd:\n", + " continue\n", + " src_p = src_sd[name].to(device)\n", + "\n", + " if src_p.shape == param.shape:\n", + " transferred[name] = src_p.clone()\n", + " elif src_p.dim() == 2:\n", + " transferred[name] = svd_project_matrix(\n", + " src_p, param.shape[0], param.shape[1])\n", + " elif src_p.dim() == 1:\n", + " transferred[name] = svd_project_vector(src_p, param.shape[0])\n", + " else:\n", + " # Skip or pad higher-dim tensors\n", + " transferred[name] = param.clone()\n", + "\n", + " # Load transferred weights\n", + " missing, unexpected = new_model.load_state_dict(transferred, strict=False)\n", + " n_transferred = len(transferred)\n", + " n_total = len(new_sd)\n", + " print(f\" Transferred {n_transferred}/{n_total} params, \"\n", + " f\"{len(missing)} missing, {len(unexpected)} unexpected\")\n", + "\n", + " return new_model\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# PROFILING\n", + 
"# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def profile_bert(model, tag=\"\"):\n", + " \"\"\"Profile CV of attention and FFN weight matrices.\"\"\"\n", + " cvs = []\n", + " for name, param in model.named_parameters():\n", + " if param.dim() == 2 and param.shape[0] >= 5 and param.shape[1] >= 5:\n", + " if \"weight\" in name and (\"dense\" in name or \"query\" in name\n", + " or \"key\" in name or \"value\" in name):\n", + " cv = pentachoron_cv(param.detach(), n_samples=100)\n", + " cvs.append(cv)\n", + " mean_cv = np.mean(cvs) if cvs else 0.0\n", + " n_params = sum(p.numel() for p in model.parameters())\n", + " if tag:\n", + " print(f\" [{tag}] {n_params:,} params, mean CV={mean_cv:.4f} \"\n", + " f\"(across {len(cvs)} weight matrices)\")\n", + " return mean_cv, n_params\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# EVALUATION: MLM accuracy on short stories\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def evaluate_mlm(model, tokenizer, texts, device, mask_prob=0.15, max_len=128):\n", + " \"\"\"\n", + " Mask random tokens, see if model predicts them correctly.\n", + " Returns: top-1 accuracy, top-5 accuracy.\n", + " \"\"\"\n", + " model.eval()\n", + " total_correct_1 = 0\n", + " total_correct_5 = 0\n", + " total_masked = 0\n", + "\n", + " for text in texts:\n", + " tokens = tokenizer(text, return_tensors=\"pt\", max_length=max_len,\n", + " truncation=True, padding=False).to(device)\n", + " input_ids = tokens[\"input_ids\"][0]\n", + " seq_len = input_ids.shape[0]\n", + "\n", + " if seq_len < 5:\n", + " continue\n", + "\n", + " # Create masks (skip [CLS], [SEP], [PAD])\n", + " special_mask = torch.zeros(seq_len, dtype=torch.bool, device=device)\n", + " special_mask[0] = True # CLS\n", + " special_mask[seq_len - 1] = True # SEP\n", + " special_mask[input_ids == tokenizer.pad_token_id] = True\n", + "\n", + " maskable = ~special_mask\n", + " n_mask = max(1, int(maskable.sum().item() * mask_prob))\n", + " mask_positions = maskable.nonzero(as_tuple=True)[0]\n", + " if len(mask_positions) == 0:\n", + " continue\n", + " chosen = mask_positions[torch.randperm(len(mask_positions))[:n_mask]]\n", + "\n", + " # Save originals\n", + " original_ids = input_ids[chosen].clone()\n", + "\n", + " # Mask\n", + " masked_ids = input_ids.clone()\n", + " masked_ids[chosen] = tokenizer.mask_token_id\n", + "\n", + " # Forward\n", + " outputs = model(masked_ids.unsqueeze(0),\n", + " attention_mask=tokens[\"attention_mask\"])\n", + " logits = outputs.logits[0, chosen] # (n_mask, vocab_size)\n", + "\n", + " # Top-1\n", + " preds = logits.argmax(dim=-1)\n", + " total_correct_1 += (preds == original_ids).sum().item()\n", + "\n", + " # Top-5\n", + " top5 = logits.topk(5, dim=-1).indices\n", + " total_correct_5 += (top5 == original_ids.unsqueeze(-1)).any(dim=-1).sum().item()\n", + "\n", + " total_masked += n_mask\n", + "\n", + " if total_masked == 0:\n", + " return 0.0, 0.0\n", + " return total_correct_1 / total_masked, total_correct_5 / total_masked\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# HEAL: minimal MLM training\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "def heal_mlm(model, tokenizer, texts, device, n_epochs=1,\n", + " lr=5e-5, max_len=128, batch_size=16):\n", + " \"\"\"Quick MLM training to heal projection distortion.\"\"\"\n", 
+ " model.train()\n", + "\n", + " # Tokenize\n", + " encodings = tokenizer(texts, max_length=max_len, truncation=True,\n", + " padding=\"max_length\", return_tensors=\"pt\")\n", + " dataset_ids = encodings[\"input_ids\"]\n", + " dataset_mask = encodings[\"attention_mask\"]\n", + "\n", + " collator = DataCollatorForLanguageModeling(\n", + " tokenizer=tokenizer, mlm=True, mlm_probability=0.15)\n", + "\n", + " optimizer = torch.optim.AdamW(model.parameters(), lr=lr)\n", + " n_samples = dataset_ids.shape[0]\n", + "\n", + " total_loss = 0\n", + " n_batches = 0\n", + "\n", + " for epoch in range(n_epochs):\n", + " perm = torch.randperm(n_samples)\n", + " for i in range(0, n_samples, batch_size):\n", + " idx = perm[i:i+batch_size]\n", + " batch_ids = dataset_ids[idx]\n", + " batch_mask = dataset_mask[idx]\n", + "\n", + " # Manual masking\n", + " collated = collator([{\"input_ids\": ids, \"attention_mask\": m}\n", + " for ids, m in zip(batch_ids, batch_mask)])\n", + "\n", + " input_ids = collated[\"input_ids\"].to(device)\n", + " attention_mask = collated[\"attention_mask\"].to(device)\n", + " labels = collated[\"labels\"].to(device)\n", + "\n", + " outputs = model(input_ids=input_ids,\n", + " attention_mask=attention_mask,\n", + " labels=labels)\n", + " loss = outputs.loss\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " total_loss += loss.item()\n", + " n_batches += 1\n", + "\n", + " return total_loss / max(n_batches, 1)\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# EXPERIMENT\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "def run_experiment():\n", + " print(\"=\" * 70)\n", + " print(\"ITERATIVE CASCADE ON PRETRAINED BERT-BASE\")\n", + " print(\"=\" * 70)\n", + "\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " print(f\" Device: {device}\")\n", + "\n", + " # ── Configuration ──\n", + " # hidden_size must be divisible by num_heads=12\n", + " # 768/12=64, 672/12=56, 576/12=48, 480/12=40, 384/12=32\n", + " SCALES = [768, 720, 672, 624, 576, 528, 480, 432, 384]\n", + "\n", + " # intermediate scales proportionally: 3072 → ...\n", + " INTER_SCALES = [3072, 2880, 2688, 2496, 2304, 2112, 1920, 1728, 1536]\n", + " N_EVAL_TEXTS = 200\n", + " N_HEAL_TEXTS = 5000\n", + " HEAL_EPOCHS = 5\n", + "\n", + " print(f\" Scales: {' → '.join(str(s) for s in SCALES)}\")\n", + " print(f\" Compression: {SCALES[0]} → {SCALES[-1]} \"\n", + " f\"({SCALES[-1]/SCALES[0]:.0%})\")\n", + "\n", + " # ── Load data ──\n", + " print(f\"\\n Loading evaluation data...\")\n", + " ds = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=\"validation\")\n", + " eval_texts = [r[\"text\"].strip() for r in ds if len(r[\"text\"].strip()) > 50]\n", + " eval_texts = eval_texts[:N_EVAL_TEXTS]\n", + " print(f\" {len(eval_texts)} eval texts\")\n", + "\n", + " ds_train = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=\"train\")\n", + " heal_texts = [r[\"text\"].strip() for r in ds_train if len(r[\"text\"].strip()) > 100]\n", + " heal_texts = heal_texts[:N_HEAL_TEXTS]\n", + " print(f\" {len(heal_texts)} heal texts\")\n", + "\n", + " # ── Load BERT-base ──\n", + " print(f\"\\n Loading BERT-base...\")\n", + " tokenizer = BertTokenizer.from_pretrained(\"google-bert/bert-base-uncased\")\n", + " root_model = BertForMaskedLM.from_pretrained(\"google-bert/bert-base-uncased\").to(device)\n", + " root_model.eval()\n", + "\n", + " # ── Profile + evaluate root ──\n", 
+ " root_cv, root_params = profile_bert(root_model, \"Root 768-dim\")\n", + "\n", + " print(f\" Evaluating root MLM accuracy...\")\n", + " root_top1, root_top5 = evaluate_mlm(root_model, tokenizer, eval_texts, device)\n", + " print(f\" Root: top1={root_top1:.4f} top5={root_top5:.4f}\")\n", + "\n", + " # ── Cascade ──\n", + " results = [{\n", + " \"scale\": SCALES[0],\n", + " \"inter\": INTER_SCALES[0],\n", + " \"params\": root_params,\n", + " \"cv\": root_cv,\n", + " \"top1_proj\": root_top1,\n", + " \"top5_proj\": root_top5,\n", + " \"top1_heal\": root_top1,\n", + " \"top5_heal\": root_top5,\n", + " \"heal_loss\": 0,\n", + " \"heal_epochs\": 0,\n", + " }]\n", + "\n", + " parent_model = root_model\n", + "\n", + " for i in range(1, len(SCALES)):\n", + " hidden = SCALES[i]\n", + " inter = INTER_SCALES[i]\n", + " parent_hidden = SCALES[i-1]\n", + "\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"SCALE {i}: {parent_hidden} → {hidden} \"\n", + " f\"({(parent_hidden-hidden)/parent_hidden:.0%} reduction)\")\n", + " print(f\"{'='*70}\")\n", + "\n", + " # ── Project ──\n", + " print(f\" Projecting...\")\n", + " t0 = time.time()\n", + " child_model = create_scaled_bert(\n", + " parent_model, hidden, inter, device)\n", + " proj_time = time.time() - t0\n", + " print(f\" Projection took {proj_time:.1f}s\")\n", + "\n", + " # ── Profile + evaluate after projection ──\n", + " child_cv, child_params = profile_bert(child_model, f\"Projected {hidden}-dim\")\n", + "\n", + " print(f\" Evaluating MLM after projection...\")\n", + " proj_top1, proj_top5 = evaluate_mlm(\n", + " child_model, tokenizer, eval_texts, device)\n", + " print(f\" After projection: top1={proj_top1:.4f} top5={proj_top5:.4f}\")\n", + "\n", + " # ── Heal ──\n", + " print(f\" Healing ({HEAL_EPOCHS} epoch MLM)...\")\n", + " t0 = time.time()\n", + " heal_loss = heal_mlm(child_model, tokenizer, heal_texts, device,\n", + " n_epochs=HEAL_EPOCHS, lr=5e-5)\n", + " heal_time = time.time() - t0\n", + " print(f\" Heal loss: {heal_loss:.4f} ({heal_time:.1f}s)\")\n", + "\n", + " # ── Profile + evaluate after heal ──\n", + " heal_cv, _ = profile_bert(child_model, f\"Healed {hidden}-dim\")\n", + "\n", + " print(f\" Evaluating MLM after heal...\")\n", + " heal_top1, heal_top5 = evaluate_mlm(\n", + " child_model, tokenizer, eval_texts, device)\n", + " print(f\" After heal: top1={heal_top1:.4f} top5={heal_top5:.4f}\")\n", + "\n", + " results.append({\n", + " \"scale\": hidden,\n", + " \"inter\": inter,\n", + " \"params\": child_params,\n", + " \"cv\": heal_cv,\n", + " \"top1_proj\": proj_top1,\n", + " \"top5_proj\": proj_top5,\n", + " \"top1_heal\": heal_top1,\n", + " \"top5_heal\": heal_top5,\n", + " \"heal_loss\": heal_loss,\n", + " \"heal_epochs\": HEAL_EPOCHS,\n", + " })\n", + "\n", + " parent_model = child_model\n", + "\n", + " # ── Direct jump: 768 → 384 ──\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"DIRECT PROJECTION: {SCALES[0]} → {SCALES[-1]}\")\n", + " print(f\"{'='*70}\")\n", + "\n", + " direct_model = create_scaled_bert(\n", + " root_model, SCALES[-1], INTER_SCALES[-1], device)\n", + " direct_cv, direct_params = profile_bert(direct_model, f\"Direct {SCALES[-1]}-dim\")\n", + " direct_top1, direct_top5 = evaluate_mlm(\n", + " direct_model, tokenizer, eval_texts, device)\n", + " print(f\" Direct: top1={direct_top1:.4f} top5={direct_top5:.4f}\")\n", + "\n", + " # ── Report ──\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"RESULTS\")\n", + " print(f\"{'='*70}\\n\")\n", + "\n", + " print(f\" {'Scale':>6s} {'Params':>12s} {'Top1(proj)':>11s} 
{'Top1(heal)':>11s} \"\n", + " f\"{'Top5(proj)':>11s} {'Top5(heal)':>11s} {'CV':>7s}\")\n", + " print(f\" {'─'*6} {'─'*12} {'─'*11} {'─'*11} {'─'*11} {'─'*11} {'─'*7}\")\n", + "\n", + " for r in results:\n", + " print(f\" {r['scale']:>6d} {r['params']:>12,} {r['top1_proj']:>11.4f} \"\n", + " f\"{r['top1_heal']:>11.4f} {r['top5_proj']:>11.4f} \"\n", + " f\"{r['top5_heal']:>11.4f} {r['cv']:>7.4f}\")\n", + "\n", + " print(f\"\\n DIRECT {SCALES[-1]}: {direct_params:>12,} \"\n", + " f\"top1={direct_top1:.4f} top5={direct_top5:.4f} cv={direct_cv:.4f}\")\n", + "\n", + " # ── Retention ──\n", + " final = results[-1]\n", + " print(f\"\\n SUMMARY:\")\n", + " print(f\" Root: {root_params:>12,} params top1={root_top1:.4f} top5={root_top5:.4f}\")\n", + " print(f\" Cascade: {final['params']:>12,} params \"\n", + " f\"top1={final['top1_heal']:.4f} top5={final['top5_heal']:.4f}\")\n", + " print(f\" Direct: {direct_params:>12,} params \"\n", + " f\"top1={direct_top1:.4f} top5={direct_top5:.4f}\")\n", + " print(f\" Compression: {root_params/final['params']:.1f}×\")\n", + " print(f\" Top1 retained (cascade): {final['top1_heal']/root_top1:.1%}\")\n", + " print(f\" Top1 retained (direct): {direct_top1/root_top1:.1%}\")\n", + "\n", + " print(f\"\\nDone.\")\n", + " return results\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " results = run_experiment()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "d5c2a63f6f8544e79268b9bade807345", + "f027a57fcb6a49cfbbd28b95a6c0adf7", + "4df257d047cb4e7ab2a0a6c57be58b81", + "214861fbbd124b45ba164e934f91a024", + "f6cf4e1cd78c4c0da8bf756a7cc8760a", + "bd3077d18b9640abb14a4c385cac7c39", + "c39efce145764de59d3dc9cb7a818d33", + "b9fc695d6d70408690bbd7ef8bb5841a", + "a1486fd248674e9584e90d907b5156c2", + "3472ba9857a543568ec3cd2f340571d6", + "89eae54541d447bdaa9625cbbe357e55" + ] + }, + "id": "dOhM9mTJXnQl", + "outputId": "f68265c8-ba6a-4182-fba7-e5736bbaf37d" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "======================================================================\n", + "ITERATIVE CASCADE ON PRETRAINED BERT-BASE\n", + "======================================================================\n", + " Device: cuda\n", + " Scales: 768 → 720 → 672 → 624 → 576 → 528 → 480 → 432 → 384\n", + " Compression: 768 → 384 (50%)\n", + "\n", + " Loading evaluation data...\n", + " 200 eval texts\n", + " 5000 heal texts\n", + "\n", + " Loading BERT-base...\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Loading weights: 0%| | 0/202 [00:00 0:\n", + " vols.append(v)\n", + " if len(vols) < 10:\n", + " return 0.0\n", + " a = np.array(vols, dtype=np.float64)\n", + " return float(a.std() / max(a.mean(), 1e-12))\n", + "\n", + "def profile_bert(model, tag=\"\"):\n", + " cvs = []\n", + " for name, param in model.named_parameters():\n", + " if param.dim() == 2 and param.shape[0] >= 5 and param.shape[1] >= 5:\n", + " if \"weight\" in name and (\"dense\" in name or \"query\" in name\n", + " or \"key\" in name or \"value\" in name):\n", + " cv = pentachoron_cv(param.detach(), n_samples=100)\n", + " cvs.append(cv)\n", + " mean_cv = np.mean(cvs) if cvs else 0.0\n", + " n_params = sum(p.numel() for p in model.parameters())\n", + " if tag:\n", + " print(f\" [{tag}] {n_params:,} params, CV={mean_cv:.4f} ({len(cvs)} matrices)\")\n", + " return mean_cv, n_params\n", + "\n", + "\n", + "# 
══════════════════════════════════════════════════════════════════\n", + "# FIX 3: SVD PROJECTION WITH PROPER BIAS HANDLING\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def svd_project_weight_and_bias(W, b, out_dim, in_dim):\n", + " \"\"\"\n", + " Project weight matrix via truncated SVD AND project bias\n", + " into the new SVD output basis.\n", + "\n", + " W: (D_out, D_in) → (out_dim, in_dim)\n", + " b: (D_out,) → (out_dim,) projected via U_k.T\n", + "\n", + " Returns: W_new, b_new\n", + " \"\"\"\n", + " W = W.float()\n", + " U, S, Vt = torch.linalg.svd(W, full_matrices=True)\n", + " k = min(S.shape[0], out_dim, in_dim)\n", + "\n", + " U_k = U[:, :k] # (D_out, k) — output basis\n", + " S_k = S[:k] # (k,)\n", + " Vt_k = Vt[:k, :] # (k, D_in)\n", + "\n", + " # Truncate input dimension\n", + " Vt_k_trunc = Vt_k[:, :min(W.shape[1], in_dim)]\n", + "\n", + " # Reconstruct at target dimensions\n", + " W_new = torch.zeros(out_dim, in_dim, dtype=W.dtype, device=W.device)\n", + " core = U_k[:min(W.shape[0], out_dim), :] @ torch.diag(S_k) @ Vt_k_trunc\n", + " r, c = core.shape\n", + " W_new[:r, :c] = core\n", + "\n", + " # Project bias into new output basis\n", + " b_new = torch.zeros(out_dim, dtype=W.dtype, device=W.device)\n", + " if b is not None:\n", + " b = b.float()\n", + " # b_projected = U_k[:out_dim, :].T @ b → but U_k might be (D_out, k) with k < out_dim\n", + " # Use the same truncation as W\n", + " b_proj = U_k[:min(W.shape[0], out_dim), :].T @ b[:min(W.shape[0], out_dim)]\n", + " b_new[:min(k, out_dim)] = b_proj[:min(k, out_dim)]\n", + "\n", + " return W_new, b_new\n", + "\n", + "\n", + "@torch.no_grad()\n", + "def svd_project_matrix_only(W, out_dim, in_dim):\n", + " \"\"\"SVD project weight matrix without bias.\"\"\"\n", + " W = W.float()\n", + " U, S, Vt = torch.linalg.svd(W, full_matrices=True)\n", + " k = min(S.shape[0], out_dim, in_dim)\n", + " U_k = U[:min(W.shape[0], out_dim), :k]\n", + " Vt_k = Vt[:k, :min(W.shape[1], in_dim)]\n", + " W_small = U_k @ torch.diag(S[:k]) @ Vt_k\n", + " result = torch.zeros(out_dim, in_dim, dtype=W.dtype, device=W.device)\n", + " r, c = W_small.shape\n", + " result[:r, :c] = W_small\n", + " return result\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# FIX 4: L1 MAGNITUDE PRUNING FOR FFN INTERMEDIATE\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def l1_prune_ffn(W_up, b_up, W_down, b_down, target_intermediate):\n", + " \"\"\"\n", + " Prune FFN intermediate dimension by keeping rows/cols with highest L1 norm.\n", + " Preserves coordinate alignment with GELU nonlinearity.\n", + "\n", + " W_up: (src_inter, src_hidden) — expands\n", + " b_up: (src_inter,)\n", + " W_down: (src_hidden, src_inter) — contracts\n", + " b_down: (src_hidden,)\n", + "\n", + " Returns: pruned W_up, b_up, W_down (columns pruned)\n", + " \"\"\"\n", + " src_inter = W_up.shape[0]\n", + " if src_inter <= target_intermediate:\n", + " return W_up, b_up, W_down\n", + "\n", + " # Importance = L1 norm of each intermediate neuron\n", + " # Combined from both up-projection row and down-projection column\n", + " importance = W_up.float().abs().sum(dim=1) + W_down.float().abs().sum(dim=0)\n", + "\n", + " # Keep top-k\n", + " _, keep_idx = importance.topk(target_intermediate)\n", + " keep_idx = keep_idx.sort().values\n", + "\n", + " W_up_pruned = W_up[keep_idx, :]\n", + " b_up_pruned = b_up[keep_idx] if 
b_up is not None else None\n", + " W_down_pruned = W_down[:, keep_idx]\n", + " # b_down stays same dimension (hidden_size)\n", + "\n", + " return W_up_pruned, b_up_pruned, W_down_pruned\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# CORRECTED BERT PROJECTION\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def create_projected_bert(source_model, target_hidden, target_intermediate, device):\n", + " \"\"\"\n", + " Project BERT with:\n", + " - SVD + proper bias projection for attention/embedding matrices\n", + " - L1 magnitude pruning for FFN intermediate (respects GELU)\n", + " \"\"\"\n", + " src_config = source_model.config\n", + " new_config = BertConfig(\n", + " vocab_size=src_config.vocab_size,\n", + " hidden_size=target_hidden,\n", + " num_hidden_layers=src_config.num_hidden_layers,\n", + " num_attention_heads=src_config.num_attention_heads,\n", + " intermediate_size=target_intermediate,\n", + " max_position_embeddings=src_config.max_position_embeddings,\n", + " type_vocab_size=src_config.type_vocab_size,\n", + " hidden_act=src_config.hidden_act,\n", + " hidden_dropout_prob=0.0,\n", + " attention_probs_dropout_prob=0.0,\n", + " )\n", + " target = BertForMaskedLM(new_config).to(device)\n", + " src = source_model\n", + "\n", + " # ── Embeddings (SVD on hidden dim, keep vocab) ──\n", + " for emb_name in [\"word_embeddings\", \"position_embeddings\", \"token_type_embeddings\"]:\n", + " src_w = getattr(src.bert.embeddings, emb_name).weight.data\n", + " tgt_w = getattr(target.bert.embeddings, emb_name).weight.data\n", + " tgt_w.copy_(svd_project_matrix_only(src_w, tgt_w.shape[0], tgt_w.shape[1]))\n", + "\n", + " # Embedding LayerNorm — truncate (element-wise, no rotation)\n", + " target.bert.embeddings.LayerNorm.weight.data.copy_(\n", + " src.bert.embeddings.LayerNorm.weight.data[:target_hidden])\n", + " target.bert.embeddings.LayerNorm.bias.data.copy_(\n", + " src.bert.embeddings.LayerNorm.bias.data[:target_hidden])\n", + "\n", + " # ── Encoder layers ──\n", + " for i, (src_layer, tgt_layer) in enumerate(\n", + " zip(src.bert.encoder.layer, target.bert.encoder.layer)):\n", + "\n", + " # Q, K, V: (src_hidden, src_hidden) → (target_hidden, target_hidden)\n", + " for attr in [\"query\", \"key\", \"value\"]:\n", + " src_mod = getattr(src_layer.attention.self, attr)\n", + " tgt_mod = getattr(tgt_layer.attention.self, attr)\n", + " W_new, b_new = svd_project_weight_and_bias(\n", + " src_mod.weight.data, src_mod.bias.data,\n", + " target_hidden, target_hidden)\n", + " tgt_mod.weight.data.copy_(W_new)\n", + " tgt_mod.bias.data.copy_(b_new)\n", + "\n", + " # Attention output: (src_hidden, src_hidden) → (target_hidden, target_hidden)\n", + " W_new, b_new = svd_project_weight_and_bias(\n", + " src_layer.attention.output.dense.weight.data,\n", + " src_layer.attention.output.dense.bias.data,\n", + " target_hidden, target_hidden)\n", + " tgt_layer.attention.output.dense.weight.data.copy_(W_new)\n", + " tgt_layer.attention.output.dense.bias.data.copy_(b_new)\n", + "\n", + " # Attention LayerNorm — truncate\n", + " tgt_layer.attention.output.LayerNorm.weight.data.copy_(\n", + " src_layer.attention.output.LayerNorm.weight.data[:target_hidden])\n", + " tgt_layer.attention.output.LayerNorm.bias.data.copy_(\n", + " src_layer.attention.output.LayerNorm.bias.data[:target_hidden])\n", + "\n", + " # FFN: L1 magnitude pruning (respects GELU coordinate alignment)\n", + " W_up = 
src_layer.intermediate.dense.weight.data.to(device)\n", + " b_up = src_layer.intermediate.dense.bias.data.to(device)\n", + " W_down = src_layer.output.dense.weight.data.to(device)\n", + " b_down = src_layer.output.dense.bias.data.to(device)\n", + "\n", + " # First prune intermediate dimension\n", + " W_up_p, b_up_p, W_down_p = l1_prune_ffn(\n", + " W_up, b_up, W_down, b_down, target_intermediate)\n", + "\n", + " # Then SVD-project the hidden dimensions with proper bias\n", + " W_up_final, b_up_final = svd_project_weight_and_bias(\n", + " W_up_p, b_up_p, target_intermediate, target_hidden)\n", + " tgt_layer.intermediate.dense.weight.data.copy_(W_up_final)\n", + " tgt_layer.intermediate.dense.bias.data.copy_(b_up_final)\n", + "\n", + " W_down_final, b_down_final = svd_project_weight_and_bias(\n", + " W_down_p, b_down, target_hidden, target_intermediate)\n", + " tgt_layer.output.dense.weight.data.copy_(W_down_final)\n", + " tgt_layer.output.dense.bias.data.copy_(b_down_final)\n", + "\n", + " # Output LayerNorm — truncate\n", + " tgt_layer.output.LayerNorm.weight.data.copy_(\n", + " src_layer.output.LayerNorm.weight.data[:target_hidden])\n", + " tgt_layer.output.LayerNorm.bias.data.copy_(\n", + " src_layer.output.LayerNorm.bias.data[:target_hidden])\n", + "\n", + " # ── MLM Head ──\n", + " if hasattr(src.cls.predictions.transform, 'dense'):\n", + " W_new, b_new = svd_project_weight_and_bias(\n", + " src.cls.predictions.transform.dense.weight.data,\n", + " src.cls.predictions.transform.dense.bias.data,\n", + " target_hidden, target_hidden)\n", + " target.cls.predictions.transform.dense.weight.data.copy_(W_new)\n", + " target.cls.predictions.transform.dense.bias.data.copy_(b_new)\n", + "\n", + " if hasattr(src.cls.predictions.transform, 'LayerNorm'):\n", + " target.cls.predictions.transform.LayerNorm.weight.data.copy_(\n", + " src.cls.predictions.transform.LayerNorm.weight.data[:target_hidden])\n", + " target.cls.predictions.transform.LayerNorm.bias.data.copy_(\n", + " src.cls.predictions.transform.LayerNorm.bias.data[:target_hidden])\n", + "\n", + " if hasattr(src.cls.predictions, 'bias'):\n", + " target.cls.predictions.bias.data.copy_(src.cls.predictions.bias.data)\n", + "\n", + " return target\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# FIX 2: FROZEN PER-LAYER PROJECTORS\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "class FrozenLayerProjectors(nn.Module):\n", + " \"\"\"\n", + " Per-layer projectors initialized from Procrustes, then FROZEN.\n", + " The student must move to match the fixed target — no shortcut collapse.\n", + " \"\"\"\n", + " def __init__(self, teacher_dim, student_dim, n_layers, device):\n", + " super().__init__()\n", + " self.projectors = nn.ModuleList([\n", + " nn.Linear(teacher_dim, student_dim, bias=False).to(device)\n", + " for _ in range(n_layers + 1)\n", + " ])\n", + "\n", + " @torch.no_grad()\n", + " def init_from_layer_procrustes(self, teacher_model, student_dim, device):\n", + " teacher_dim = teacher_model.config.hidden_size\n", + " for i, layer in enumerate(teacher_model.bert.encoder.layer):\n", + " weights = [\n", + " layer.attention.self.query.weight.data.T,\n", + " layer.attention.self.key.weight.data.T,\n", + " layer.attention.self.value.weight.data.T,\n", + " layer.intermediate.dense.weight.data.T,\n", + " ]\n", + " L = torch.cat(weights, dim=1).float().to(device)\n", + " U, S, Vt = torch.linalg.svd(L, full_matrices=False)\n", + " P_layer = U[:, :student_dim] # 
(teacher_dim, student_dim)\n", + " self.projectors[i + 1].weight.data.copy_(P_layer.T)\n", + " if i == 0:\n", + " self.projectors[0].weight.data.copy_(P_layer.T)\n", + "\n", + " # FREEZE all projectors\n", + " for p in self.parameters():\n", + " p.requires_grad = False\n", + "\n", + " print(f\" {len(self.projectors)} projectors initialized + FROZEN\")\n", + "\n", + " def forward(self, teacher_hiddens):\n", + " projected = []\n", + " for t_h, proj in zip(teacher_hiddens, self.projectors):\n", + " projected.append(proj(t_h.float()))\n", + " return projected\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# FIX 1: DISTILLATION LOSS ON ACTIVE TOKENS ONLY\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "class TeacherGuidedHealerV3:\n", + " def __init__(self, teacher_model, projectors, device,\n", + " mlm_weight=1.0, distill_weight=2.0):\n", + " self.teacher = teacher_model\n", + " self.teacher.eval()\n", + " self.projectors = projectors # FROZEN\n", + " self.device = device\n", + " self.mlm_weight = mlm_weight\n", + " self.distill_weight = distill_weight\n", + "\n", + " def compute_distillation_loss(self, student_model, input_ids, attention_mask):\n", + " with torch.no_grad():\n", + " teacher_out = self.teacher.bert(\n", + " input_ids=input_ids, attention_mask=attention_mask,\n", + " output_hidden_states=True, return_dict=True)\n", + "\n", + " student_out = student_model.bert(\n", + " input_ids=input_ids, attention_mask=attention_mask,\n", + " output_hidden_states=True, return_dict=True)\n", + "\n", + " # Per-layer projection (frozen projectors)\n", + " projected = self.projectors(teacher_out.hidden_states)\n", + " student_hiddens = student_out.hidden_states\n", + "\n", + " n_layers = min(len(projected), len(student_hiddens))\n", + " total_loss = torch.tensor(0.0, device=self.device)\n", + "\n", + " # FIX 1: Only compute loss on active (non-padding) tokens\n", + " active_mask = attention_mask.float() # (B, seq), 1=active, 0=pad\n", + " n_active = active_mask.sum().clamp(min=1.0)\n", + "\n", + " for layer_idx in range(1, n_layers):\n", + " t_proj = projected[layer_idx] # (B, seq, student_dim)\n", + " s_h = student_hiddens[layer_idx].float()\n", + "\n", + " # Cosine similarity per token\n", + " t_norm = F.normalize(t_proj, dim=-1)\n", + " s_norm = F.normalize(s_h, dim=-1)\n", + " cos_sim = (t_norm * s_norm).sum(-1) # (B, seq)\n", + "\n", + " # Mask out padding, average over active tokens only\n", + " cos_sim_active = cos_sim * active_mask\n", + " layer_loss = 1.0 - cos_sim_active.sum() / n_active\n", + " total_loss = total_loss + layer_loss\n", + "\n", + " return total_loss / max(n_layers - 1, 1)\n", + "\n", + " def heal(self, student_model, tokenizer, texts, n_epochs=5,\n", + " lr=5e-5, max_len=128, batch_size=16):\n", + " student_model.train()\n", + " # Only student params — projectors are frozen\n", + " optimizer = torch.optim.AdamW(student_model.parameters(), lr=lr)\n", + "\n", + " enc = tokenizer(texts, max_length=max_len, truncation=True,\n", + " padding=\"max_length\", return_tensors=\"pt\")\n", + " ids, masks = enc[\"input_ids\"], enc[\"attention_mask\"]\n", + " collator = DataCollatorForLanguageModeling(\n", + " tokenizer=tokenizer, mlm=True, mlm_probability=0.15)\n", + " n = ids.shape[0]\n", + " total_loss = 0\n", + " n_batches = 0\n", + "\n", + " for epoch in range(n_epochs):\n", + " perm = torch.randperm(n)\n", + " for i in range(0, n, batch_size):\n", + " idx = perm[i:i+batch_size]\n", + " 
batch = [{\"input_ids\": ids[j], \"attention_mask\": masks[j]}\n", + " for j in idx]\n", + " c = collator(batch)\n", + " c_ids = c[\"input_ids\"].to(self.device)\n", + " c_mask = c[\"attention_mask\"].to(self.device)\n", + " c_labels = c[\"labels\"].to(self.device)\n", + "\n", + " mlm_out = student_model(\n", + " input_ids=c_ids, attention_mask=c_mask, labels=c_labels)\n", + "\n", + " # Distillation on unmasked input\n", + " orig_ids = ids[idx].to(self.device)\n", + " orig_mask = masks[idx].to(self.device)\n", + " distill_loss = self.compute_distillation_loss(\n", + " student_model, orig_ids, orig_mask)\n", + "\n", + " loss = (self.mlm_weight * mlm_out.loss +\n", + " self.distill_weight * distill_loss)\n", + "\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " total_loss += loss.item()\n", + " n_batches += 1\n", + "\n", + " if (epoch + 1) % 2 == 0 or epoch == 0:\n", + " nb = max(n_batches, 1)\n", + " print(f\" Epoch {epoch+1}: loss={total_loss/nb:.4f} \"\n", + " f\"(mlm≈{mlm_out.loss.item():.3f}, \"\n", + " f\"distill≈{distill_loss.item():.3f})\")\n", + "\n", + " return total_loss / max(n_batches, 1)\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# FIX 5: SEEDED EVALUATION\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "@torch.no_grad()\n", + "def evaluate_mlm(model, tokenizer, texts, device, mask_prob=0.15,\n", + " max_len=128, seed=42):\n", + " \"\"\"Deterministic masking for consistent cross-scale comparison.\"\"\"\n", + " model.eval()\n", + " gen = torch.Generator().manual_seed(seed)\n", + " total_1 = total_5 = total_m = 0\n", + "\n", + " for text in texts:\n", + " tokens = tokenizer(text, return_tensors=\"pt\", max_length=max_len,\n", + " truncation=True, padding=False).to(device)\n", + " input_ids = tokens[\"input_ids\"][0]\n", + " seq_len = input_ids.shape[0]\n", + " if seq_len < 5:\n", + " continue\n", + " special = torch.zeros(seq_len, dtype=torch.bool, device=device)\n", + " special[0] = special[seq_len-1] = True\n", + " special[input_ids == tokenizer.pad_token_id] = True\n", + " maskable = (~special).nonzero(as_tuple=True)[0]\n", + " if len(maskable) == 0:\n", + " continue\n", + " n_mask = max(1, int(len(maskable) * mask_prob))\n", + " chosen = maskable[torch.randperm(len(maskable), generator=gen)[:n_mask]]\n", + " orig = input_ids[chosen].clone()\n", + " masked = input_ids.clone()\n", + " masked[chosen] = tokenizer.mask_token_id\n", + " logits = model(masked.unsqueeze(0),\n", + " attention_mask=tokens[\"attention_mask\"]).logits[0, chosen]\n", + " total_1 += (logits.argmax(-1) == orig).sum().item()\n", + " top5 = logits.topk(5, dim=-1).indices\n", + " total_5 += (top5 == orig.unsqueeze(-1)).any(-1).sum().item()\n", + " total_m += n_mask\n", + " if total_m == 0:\n", + " return 0.0, 0.0\n", + " return total_1 / total_m, total_5 / total_m\n", + "\n", + "\n", + "# ══════════════════════════════════════════════════════════════════\n", + "# EXPERIMENT\n", + "# ══════════════════════════════════════════════════════════════════\n", + "\n", + "def run_experiment():\n", + " print(\"=\" * 70)\n", + " print(\"TEACHER-GUIDED CASCADE v3 — ALL FIXES\")\n", + " print(\"=\" * 70)\n", + "\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " print(f\" Device: {device}\")\n", + "\n", + " SCALES = [768, 720, 672, 624, 576, 528, 480, 432, 384]\n", + " INTER_SCALES = [3072, 2880, 2688, 2496, 2304, 2112, 1920, 1728, 1536]\n", + " N_EVAL 
= 200\n", + " N_HEAL = 5000\n", + " HEAL_EPOCHS = 5\n", + "\n", + " print(f\" Scales: {' → '.join(str(s) for s in SCALES)}\")\n", + " print(f\" Fixes: padding-masked loss, frozen projectors, \"\n", + " f\"SVD bias projection, L1 FFN pruning, seeded eval\")\n", + "\n", + " # ── Data ──\n", + " print(f\"\\n Loading data...\")\n", + " ds_val = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=\"validation\")\n", + " eval_texts = [r[\"text\"].strip() for r in ds_val if len(r[\"text\"].strip()) > 50][:N_EVAL]\n", + " ds_train = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=\"train\")\n", + " heal_texts = [r[\"text\"].strip() for r in ds_train if len(r[\"text\"].strip()) > 100][:N_HEAL]\n", + " print(f\" {len(eval_texts)} eval, {len(heal_texts)} heal texts\")\n", + "\n", + " # ── Teacher ──\n", + " print(f\"\\n Loading BERT-base (teacher)...\")\n", + " tokenizer = BertTokenizer.from_pretrained(\"google-bert/bert-base-uncased\")\n", + " teacher = BertForMaskedLM.from_pretrained(\"google-bert/bert-base-uncased\").to(device)\n", + " teacher.eval()\n", + " for p in teacher.parameters():\n", + " p.requires_grad = False\n", + "\n", + " root_cv, root_params = profile_bert(teacher, \"Teacher 768\")\n", + " root_top1, root_top5 = evaluate_mlm(teacher, tokenizer, eval_texts, device)\n", + " print(f\" Teacher: top1={root_top1:.4f} top5={root_top5:.4f}\")\n", + "\n", + " results = [{\n", + " \"scale\": 768, \"params\": root_params, \"cv\": root_cv,\n", + " \"top1_proj\": root_top1, \"top5_proj\": root_top5,\n", + " \"top1_heal\": root_top1, \"top5_heal\": root_top5,\n", + " }]\n", + "\n", + " parent = teacher\n", + " n_encoder_layers = teacher.config.num_hidden_layers\n", + "\n", + " for i in range(1, len(SCALES)):\n", + " hidden = SCALES[i]\n", + " inter = INTER_SCALES[i]\n", + " parent_hidden = SCALES[i-1]\n", + "\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"SCALE {i}: {parent_hidden} → {hidden} \"\n", + " f\"({(parent_hidden-hidden)/parent_hidden:.0%} reduction)\")\n", + " print(f\"{'='*70}\")\n", + "\n", + " # ── Project ──\n", + " print(f\" Projecting (SVD + L1 FFN prune)...\")\n", + " t0 = time.time()\n", + " child = create_projected_bert(parent, hidden, inter, device)\n", + " print(f\" Projection: {time.time()-t0:.1f}s\")\n", + "\n", + " child_cv, child_params = profile_bert(child, f\"Projected {hidden}\")\n", + " proj_top1, proj_top5 = evaluate_mlm(child, tokenizer, eval_texts, device)\n", + " print(f\" After proj: top1={proj_top1:.4f} top5={proj_top5:.4f}\")\n", + "\n", + " # ── Frozen per-layer projectors ──\n", + " print(f\" Initializing frozen per-layer projectors...\")\n", + " projectors = FrozenLayerProjectors(768, hidden, n_encoder_layers, device)\n", + " projectors.init_from_layer_procrustes(teacher, hidden, device)\n", + "\n", + " # ── Teacher-guided healing ──\n", + " print(f\" Healing ({HEAL_EPOCHS} epochs)...\")\n", + " healer = TeacherGuidedHealerV3(\n", + " teacher, projectors, device,\n", + " mlm_weight=1.0, distill_weight=2.0)\n", + " t0 = time.time()\n", + " heal_loss = healer.heal(child, tokenizer, heal_texts,\n", + " n_epochs=HEAL_EPOCHS, lr=5e-5)\n", + " print(f\" Heal: {time.time()-t0:.1f}s\")\n", + "\n", + " heal_cv, _ = profile_bert(child, f\"Healed {hidden}\")\n", + " heal_top1, heal_top5 = evaluate_mlm(child, tokenizer, eval_texts, device)\n", + " print(f\" After heal: top1={heal_top1:.4f} top5={heal_top5:.4f}\")\n", + "\n", + " results.append({\n", + " \"scale\": hidden, \"params\": child_params, \"cv\": heal_cv,\n", + " \"top1_proj\": proj_top1, 
\"top5_proj\": proj_top5,\n", + " \"top1_heal\": heal_top1, \"top5_heal\": heal_top5,\n", + " })\n", + "\n", + " parent = child\n", + "\n", + " # ── Report ──\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"RESULTS\")\n", + " print(f\"{'='*70}\\n\")\n", + "\n", + " print(f\" {'Scale':>6s} {'Params':>12s} {'Top1(proj)':>11s} {'Top1(heal)':>11s} \"\n", + " f\"{'Top5(proj)':>11s} {'Top5(heal)':>11s} {'CV':>7s}\")\n", + " print(f\" {'─'*6} {'─'*12} {'─'*11} {'─'*11} {'─'*11} {'─'*11} {'─'*7}\")\n", + "\n", + " for r in results:\n", + " print(f\" {r['scale']:>6d} {r['params']:>12,} {r['top1_proj']:>11.4f} \"\n", + " f\"{r['top1_heal']:>11.4f} {r['top5_proj']:>11.4f} \"\n", + " f\"{r['top5_heal']:>11.4f} {r['cv']:>7.4f}\")\n", + "\n", + " final = results[-1]\n", + " print(f\"\\n SUMMARY:\")\n", + " print(f\" Teacher: {root_params:>12,} top1={root_top1:.4f} top5={root_top5:.4f}\")\n", + " print(f\" Cascade: {final['params']:>12,} \"\n", + " f\"top1={final['top1_heal']:.4f} top5={final['top5_heal']:.4f}\")\n", + " print(f\" Compression: {root_params/final['params']:.1f}×\")\n", + " print(f\" Top1 retained: {final['top1_heal']/root_top1:.1%}\")\n", + "\n", + " print(f\"\\n ALL APPROACHES:\")\n", + " print(f\" v1 Independent SVD + blind MLM: 61.5%\")\n", + " print(f\" v2 + teacher global P: 62.5%\")\n", + " print(f\" v2 + per-layer projectors (buggy): ???\")\n", + " print(f\" v3 all fixes: {final['top1_heal']/root_top1:.1%}\")\n", + "\n", + " print(f\"\\nDone.\")\n", + " return results\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " results = run_experiment()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "ae1bdb89a9704d09a1c03ec82354905e", + "8d03191c19b64a698be6d6fc141817cb", + "8d24919b783b47748aefac2a6c234313", + "38bea3901f6c4e339d17a0269f0e535b", + "e4655e0917814b1b950d53a8f59d1fa5", + "325923e6a4054298be70581c2cd1061d", + "97bcf95ab5ac4823b2f696f42db534da", + "866d9713b1f5436b924a5505e36b9e7d", + "8f481ca713e04111a7dcab82f19a072b", + "8566ff523c5d4bb5b799cdd12a95c633", + "bd53b01466524897aea749b68000684a" + ] + }, + "id": "ddAQ1-RGp3Fx", + "outputId": "a9f9c9f5-fc9a-4120-d1c4-06a50e9f7483" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "======================================================================\n", + "TEACHER-GUIDED CASCADE v3 — ALL FIXES\n", + "======================================================================\n", + " Device: cuda\n", + " Scales: 768 → 720 → 672 → 624 → 576 → 528 → 480 → 432 → 384\n", + " Fixes: padding-masked loss, frozen projectors, SVD bias projection, L1 FFN pruning, seeded eval\n", + "\n", + " Loading data...\n", + " 200 eval, 5000 heal texts\n", + "\n", + " Loading BERT-base (teacher)...\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Loading weights: 0%| | 0/202 [00:00