jiadisu Claude Opus 4.6 committed on
Commit
e6066e8
·
1 Parent(s): 1b389ac

Switch back to Docker SDK with local pkgs

Browse files

- Dockerfile: CUDA 12.4 base image, install MagiCompiler from pkgs/,
flash-attn, and stable-audio whl
- README.md: sdk: docker
- app.py: remove spaces.GPU decorator
- pkgs/: MagiCompiler source + stable-audio whl

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. Dockerfile +30 -18
  3. README.md +1 -2
  4. app.py +1 -4
  5. pkgs/MagiCompiler/.gitignore +216 -0
  6. pkgs/MagiCompiler/.pre-commit-config.yaml +60 -0
  7. pkgs/MagiCompiler/LICENSE +201 -0
  8. pkgs/MagiCompiler/README.md +186 -0
  9. pkgs/MagiCompiler/docs/AutoCudaGraphDesign.md +174 -0
  10. pkgs/MagiCompiler/docs/Hunyuan15Benchmark.md +79 -0
  11. pkgs/MagiCompiler/docs/Wan2.2Benchmark.md +72 -0
  12. pkgs/MagiCompiler/docs/WhyMagiCompiler.md +246 -0
  13. pkgs/MagiCompiler/docs/WhyMagiDepyf.md +175 -0
  14. pkgs/MagiCompiler/docs/assets/submod_0_rank_0.pdf +3 -0
  15. pkgs/MagiCompiler/magi_compiler/__init__.py +17 -0
  16. pkgs/MagiCompiler/magi_compiler/_cache_data_cls.py +28 -0
  17. pkgs/MagiCompiler/magi_compiler/api.py +666 -0
  18. pkgs/MagiCompiler/magi_compiler/compile_artifacts.py +125 -0
  19. pkgs/MagiCompiler/magi_compiler/config.py +282 -0
  20. pkgs/MagiCompiler/magi_compiler/cuda/cudart.py +60 -0
  21. pkgs/MagiCompiler/magi_compiler/cuda_graph_mgr.py +931 -0
  22. pkgs/MagiCompiler/magi_compiler/joint_graph_partition.py +180 -0
  23. pkgs/MagiCompiler/magi_compiler/magi_backend.py +607 -0
  24. pkgs/MagiCompiler/magi_compiler/magi_compiler_base.py +219 -0
  25. pkgs/MagiCompiler/magi_compiler/magi_depyf/__init__.py +21 -0
  26. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/__init__.py +19 -0
  27. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/__init__.py +22 -0
  28. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/decompile_context.py +53 -0
  29. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handler_registry.py +62 -0
  30. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/__init__.py +22 -0
  31. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/arithmetic.py +144 -0
  32. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/calls.py +200 -0
  33. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/containers.py +200 -0
  34. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/control_flow.py +273 -0
  35. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/load_store.py +262 -0
  36. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/stack_ops.py +84 -0
  37. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/instruction.py +129 -0
  38. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/source_emitter.py +153 -0
  39. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/decompiler.py +230 -0
  40. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/postprocess/__init__.py +35 -0
  41. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/postprocess/branch_dedup.py +99 -0
  42. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/postprocess/for_temps.py +57 -0
  43. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/postprocess/inline_temps.py +165 -0
  44. pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/recompiler.py +53 -0
  45. pkgs/MagiCompiler/magi_compiler/magi_depyf/demo_toy_example.py +54 -0
  46. pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/__init__.py +57 -0
  47. pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/dump_src.py +78 -0
  48. pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/introspect.py +524 -0
  49. pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/model.py +241 -0
  50. pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/result.py +51 -0
.gitattributes CHANGED
@@ -4,3 +4,5 @@
4
  *.png filter=lfs diff=lfs merge=lfs -text
5
  *.jpg filter=lfs diff=lfs merge=lfs -text
6
  *.jpeg filter=lfs diff=lfs merge=lfs -text
 
 
 
4
  *.png filter=lfs diff=lfs merge=lfs -text
5
  *.jpg filter=lfs diff=lfs merge=lfs -text
6
  *.jpeg filter=lfs diff=lfs merge=lfs -text
7
+ *.pdf filter=lfs diff=lfs merge=lfs -text
8
+ *.whl filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,28 +1,46 @@
1
  # =============================================================================
2
  # HF Spaces Docker image for daVinci-MagiHuman
3
- # Hardware: A100-80GB (or H100)
4
  # =============================================================================
5
- # Based on the official MagiCompiler image which includes:
6
- # - CUDA 12.4, cuDNN, Python 3.12, PyTorch 2.9
7
- # - MagiCompiler (pre-installed)
8
- # - Flash Attention 3 (Hopper) (pre-installed)
9
- # =============================================================================
10
- FROM sandai/magi-compiler:latest
11
 
12
  ENV DEBIAN_FRONTEND=noninteractive
13
  ENV PYTHONUNBUFFERED=1
14
  ENV GRADIO_SERVER_NAME=0.0.0.0
15
  ENV GRADIO_SERVER_PORT=7860
16
 
17
- # System deps needed for audio/video processing
18
  RUN apt-get update && apt-get install -y --no-install-recommends \
19
- ffmpeg libsndfile1 && \
20
- rm -rf /var/lib/apt/lists/*
 
 
 
21
 
22
  WORKDIR /app
23
 
24
  # ---------------------------------------------------------------------------
25
- # Python dependencies
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # ---------------------------------------------------------------------------
27
  COPY requirements.txt requirements-nodeps.txt ./
28
  RUN pip install --no-cache-dir -r requirements.txt && \
@@ -36,16 +54,10 @@ COPY inference/ inference/
36
  COPY example/ example/
37
  COPY app.py .
38
 
39
- # ---------------------------------------------------------------------------
40
  # Model weights are downloaded at runtime from HF Hub.
41
- # Set HF_TOKEN as a Space secret if any repos are gated/private.
42
- #
43
- # Persistent storage (/data) is recommended on HF Spaces so weights survive
44
- # container restarts. Enable it in Space settings → "Persistent storage".
45
- # ---------------------------------------------------------------------------
46
  ENV MODEL_ROOT=/data/models
47
 
48
- # HF Spaces requires the app to listen on port 7860
49
  EXPOSE 7860
50
 
51
  CMD ["python", "app.py"]
 
1
  # =============================================================================
2
  # HF Spaces Docker image for daVinci-MagiHuman
3
+ # Hardware: A100-80GB (recommended)
4
  # =============================================================================
5
+ FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
 
 
 
 
 
6
 
7
  ENV DEBIAN_FRONTEND=noninteractive
8
  ENV PYTHONUNBUFFERED=1
9
  ENV GRADIO_SERVER_NAME=0.0.0.0
10
  ENV GRADIO_SERVER_PORT=7860
11
 
12
+ # System deps
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
14
+ python3.12 python3.12-dev python3.12-venv python3-pip \
15
+ git ffmpeg libsndfile1 ninja-build && \
16
+ rm -rf /var/lib/apt/lists/* && \
17
+ ln -sf /usr/bin/python3.12 /usr/bin/python && \
18
+ ln -sf /usr/bin/python3.12 /usr/bin/python3
19
 
20
  WORKDIR /app
21
 
22
  # ---------------------------------------------------------------------------
23
+ # PyTorch (must be installed first — MagiCompiler build depends on it)
24
+ # ---------------------------------------------------------------------------
25
+ RUN pip install --no-cache-dir --upgrade pip && \
26
+ pip install --no-cache-dir torch torchvision torchaudio \
27
+ --index-url https://download.pytorch.org/whl/cu124
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Local packages: MagiCompiler + stable-audio whl
31
+ # ---------------------------------------------------------------------------
32
+ COPY pkgs/ pkgs/
33
+ RUN pip install -e ./pkgs/MagiCompiler \
34
+ --no-build-isolation --config-settings editable_mode=compat && \
35
+ pip install --no-cache-dir pkgs/magife_stable_audio_open-1.0.0+mav.1-py3-none-any.whl
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Flash Attention (pre-built wheel for CUDA 12.4 + PyTorch 2.9)
39
+ # ---------------------------------------------------------------------------
40
+ RUN pip install --no-cache-dir flash-attn --no-build-isolation
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Project Python dependencies
44
  # ---------------------------------------------------------------------------
45
  COPY requirements.txt requirements-nodeps.txt ./
46
  RUN pip install --no-cache-dir -r requirements.txt && \
 
54
  COPY example/ example/
55
  COPY app.py .
56
 
 
57
  # Model weights are downloaded at runtime from HF Hub.
58
+ # Enable "Persistent storage" in Space settings so /data survives restarts.
 
 
 
 
59
  ENV MODEL_ROOT=/data/models
60
 
 
61
  EXPOSE 7860
62
 
63
  CMD ["python", "app.py"]
README.md CHANGED
@@ -3,8 +3,7 @@ title: daVinci-MagiHuman
3
  emoji: 🎬
4
  colorFrom: blue
5
  colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.23.0
8
  app_port: 7860
9
  ---
10
 
 
3
  emoji: 🎬
4
  colorFrom: blue
5
  colorTo: purple
6
+ sdk: docker
 
7
  app_port: 7860
8
  ---
9
 
app.py CHANGED
@@ -2,7 +2,7 @@
2
  """
3
  Gradio frontend for daVinci-MagiHuman distilled model.
4
 
5
- Designed for Hugging Face Spaces with ZeroGPU (Gradio SDK).
6
  Accepts an image + text prompt + duration, generates audio-video output.
7
  """
8
 
@@ -12,8 +12,6 @@ import sys
12
  import tempfile
13
  import uuid
14
 
15
- import spaces
16
-
17
  # ---------------------------------------------------------------------------
18
  # 1. Download all model weights from HF Hub (runs on CPU, cached)
19
  # ---------------------------------------------------------------------------
@@ -132,7 +130,6 @@ print("[app] Pipeline ready.")
132
  # 4. Inference wrapper — @spaces.GPU requests a ZeroGPU allocation
133
  # duration= sets the max GPU time in seconds (default 60, max 300)
134
  # ---------------------------------------------------------------------------
135
- @spaces.GPU(duration=300)
136
  def generate_video(
137
  image,
138
  prompt: str,
 
2
  """
3
  Gradio frontend for daVinci-MagiHuman distilled model.
4
 
5
+ Designed for Hugging Face Spaces (Docker SDK, A100-80GB GPU).
6
  Accepts an image + text prompt + duration, generates audio-video output.
7
  """
8
 
 
12
  import tempfile
13
  import uuid
14
 
 
 
15
  # ---------------------------------------------------------------------------
16
  # 1. Download all model weights from HF Hub (runs on CPU, cached)
17
  # ---------------------------------------------------------------------------
 
130
  # 4. Inference wrapper — @spaces.GPU requests a ZeroGPU allocation
131
  # duration= sets the max GPU time in seconds (default 60, max 300)
132
  # ---------------------------------------------------------------------------
 
133
  def generate_video(
134
  image,
135
  prompt: str,
pkgs/MagiCompiler/.gitignore ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # magi_compiler
2
+ magi_compiler/_version.py
3
+ magi_dump_src_dir/
4
+ *.nsys-rep
5
+ *.ncu-rep
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[codz]
10
+ *$py.class
11
+
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py.cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ cover/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Vscode stuff:
65
+ .vscode
66
+
67
+ # Django stuff:
68
+ *.log
69
+ local_settings.py
70
+ db.sqlite3
71
+ db.sqlite3-journal
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ .pybuilder/
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ # For a library or package, you might want to ignore these files since the code is
96
+ # intended to run in multiple environments; otherwise, check them in:
97
+ # .python-version
98
+
99
+ # pipenv
100
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
102
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
103
+ # install all needed dependencies.
104
+ #Pipfile.lock
105
+
106
+ # UV
107
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ #uv.lock
111
+
112
+ # poetry
113
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
114
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
115
+ # commonly ignored for libraries.
116
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
117
+ #poetry.lock
118
+ #poetry.toml
119
+
120
+ # pdm
121
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
122
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
123
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
124
+ #pdm.lock
125
+ pdm.toml
126
+ .pdm-python
127
+ .pdm-build/
128
+
129
+ # pixi
130
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
131
+ #pixi.lock
132
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
133
+ # in the .venv directory. It is recommended not to include this directory in version control.
134
+ .pixi
135
+
136
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
137
+ __pypackages__/
138
+
139
+ # Celery stuff
140
+ celerybeat-schedule
141
+ celerybeat.pid
142
+
143
+ # SageMath parsed files
144
+ *.sage.py
145
+
146
+ # Environments
147
+ .env
148
+ .envrc
149
+ .venv
150
+ env/
151
+ venv/
152
+ ENV/
153
+ env.bak/
154
+ venv.bak/
155
+
156
+ # Spyder project settings
157
+ .spyderproject
158
+ .spyproject
159
+
160
+ # Rope project settings
161
+ .ropeproject
162
+
163
+ # mkdocs documentation
164
+ /site
165
+
166
+ # mypy
167
+ .mypy_cache/
168
+ .dmypy.json
169
+ dmypy.json
170
+
171
+ # Pyre type checker
172
+ .pyre/
173
+
174
+ # pytype static type analyzer
175
+ .pytype/
176
+
177
+ # Cython debug symbols
178
+ cython_debug/
179
+
180
+ # PyCharm
181
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
182
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
183
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
184
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
185
+ #.idea/
186
+
187
+ # Abstra
188
+ # Abstra is an AI-powered process automation framework.
189
+ # Ignore directories containing user credentials, local state, and settings.
190
+ # Learn more at https://abstra.io/docs
191
+ .abstra/
192
+
193
+ # Visual Studio Code
194
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
195
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
196
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
197
+ # you could uncomment the following to ignore the entire vscode folder
198
+ # .vscode/
199
+
200
+ # Ruff stuff:
201
+ .ruff_cache/
202
+
203
+ # PyPI configuration file
204
+ .pypirc
205
+
206
+ # Cursor
207
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
208
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
209
+ # refer to https://docs.cursor.com/context/ignore-files
210
+ .cursorignore
211
+ .cursorindexingignore
212
+
213
+ # Marimo
214
+ marimo/_static/
215
+ marimo/_lsp/
216
+ __marimo__/
pkgs/MagiCompiler/.pre-commit-config.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exclude: \.patch$
2
+ repos:
3
+ - repo: local
4
+ hooks:
5
+ - id: copyright_checker
6
+ name: copyright_checker
7
+ entry: python3 ./.github/.codestyle/copyright.hook
8
+ language: system
9
+ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$
10
+ - repo: https://github.com/pre-commit/pre-commit-hooks
11
+ rev: v4.4.0
12
+ hooks:
13
+ - id: check-added-large-files
14
+ args:
15
+ - --maxkb=30720
16
+ - id: check-merge-conflict
17
+ - id: check-symlinks
18
+ - id: detect-private-key
19
+ files: (?!.*third_party)^.*$ | (?!.*book)^.*$
20
+ - id: end-of-file-fixer
21
+ - id: trailing-whitespace
22
+ - id: requirements-txt-fixer
23
+ - id: sort-simple-yaml
24
+ - repo: https://github.com/Lucas-C/pre-commit-hooks.git
25
+ rev: v1.5.1
26
+ hooks:
27
+ - id: remove-crlf
28
+ files: (?!.*third_party)^.*$ | (?!.*book)^.*$
29
+ - id: remove-tabs
30
+ name: Tabs remover (C++)
31
+ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps)$
32
+ args: [--whitespaces-count, '2']
33
+ - id: remove-tabs
34
+ name: Tabs remover (Python)
35
+ files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
36
+ args: [--whitespaces-count, '4']
37
+ - repo: https://github.com/psf/black.git
38
+ rev: 23.3.0
39
+ hooks:
40
+ - id: black
41
+ args: [--line-length=127, --skip-string-normalization, --skip-magic-trailing-comma]
42
+ files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
43
+ - repo: https://github.com/pre-commit/mirrors-isort
44
+ rev: v5.10.1
45
+ hooks:
46
+ - id: isort
47
+ args: [--profile=black, --line-length=127, --multi-line=3, --force-grid-wrap=0]
48
+ files: \.py$
49
+ - repo: https://github.com/PyCQA/autoflake
50
+ rev: v2.3.1
51
+ hooks:
52
+ - id: autoflake
53
+ args: [--remove-all-unused-imports, --remove-unused-variables, --in-place, --ignore-init-module-imports, --ignore-pass-after-docstring]
54
+ files: \.py$
55
+ - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks.git
56
+ rev: v2.9.0
57
+ hooks:
58
+ - id: pretty-format-yaml
59
+ args: [--autofix, --indent, '4']
60
+ additional_dependencies: [setuptools]
pkgs/MagiCompiler/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
pkgs/MagiCompiler/README.md ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## MagiCompiler
2
+
3
+ An engineering-oriented compiler and execution augmentation library for PyTorch 2.8+, providing module-level compilation decorators, backend adapters, graph partitioning strategies, readable and reusable compile artifacts, and tightly integrated runtime scheduling for any inference engine. The design goal is to systematically expose capabilities of PyTorch Dynamo / AOTAutograd / Inductor / Triton while prioritizing correctness, stability, observability, and maintainability.
4
+
5
+ ### Design Overview
6
+
7
+ - Compilation entrypoint: the `@magi_compile` decorator augments `nn.Module.forward` with compilation, including dynamic-shape annotation and argument validation.
8
+ - Partitioning and passes: configurable graph partitioning and pass management (e.g., `InductorPass`, `PostGradPassManager`) for fusion, kernel generation, and tuning.
9
+ - Artifact system: persists compile artifacts using a Python file/directory layout for readability, auditability, and portability (see “Compile Cache Overview”).
10
+ - Configuration: `CompileConfig` centralizes backend, partition rules, cache root, runtime shapes, and other key parameters.
11
+
12
+ ### Key Features
13
+
14
+ - Dynamic-shape annotations:
15
+ - Automatic inference: when a `forward` parameter is annotated as `torch.Tensor` or `torch.Tensor | None`, dimension 0 is treated as dynamic by default.
16
+ - Explicit specification: use `@magi_compile(dynamic_arg_dims={...})` to mark dimensions (negative indices supported).
17
+ - Consistency constraints: parameters that alternately appear as `None` and non-`None` across the model lifetime cannot be captured into the same computation graph.
18
+ - Backend selection and standalone compilation:
19
+ - `inductor` mode defaults to PyTorch 2.8+ `standalone_compile`, producing reusable artifacts.
20
+ - `eager` mode is available for debugging or fallback paths.
21
+ - Partitioning and passes: operator-set-driven partition rules and pass contexts that stabilize subgraph boundaries and kernel generation across runtime shapes.
22
+ - Readable, portable artifacts: structured directories with Python files for quick triage and cross-environment debugging.
23
+ - Engine integration: the decorator reads engine-level `CompileConfig` to stay aligned with distributed/scheduling components.
24
+
25
+ ## Installation and Requirements
26
+
27
+ - Python ≥ 3.10
28
+ - PyTorch ≥ 2.8 (with `torch._inductor.standalone_compile` available)
29
+ - Recommended to be used within the Athena environment, together with its dependencies and distributed components (e.g., CUDA Graph manager).
30
+
31
+ For local development, install in editable mode:
32
+
33
+ ```bash
34
+ pip install -e . --no-build-isolation --config-settings editable_mode=compat
35
+ ```
36
+
37
+ ## Quick Start
38
+
39
+ ### Minimal Example (automatic dynamic-dim inference)
40
+
41
+ ```python
42
+ import torch
43
+ from torch import nn
44
+ from magi_compiler.decorator import magi_compile
45
+
46
+ @magi_compile
47
+ class MyModel(nn.Module):
48
+ def __init__(self, *, model_config):
49
+ super().__init__()
50
+ self.linear = nn.Linear(10, 5)
51
+
52
+ def forward(self, x: torch.Tensor, y: torch.Tensor | None) -> torch.Tensor:
53
+ if y is not None:
54
+ return self.linear(x + y)
55
+ return self.linear(x)
56
+
57
+ # In Athena, model_config is typically provided by the engine
58
+ model = MyModel(model_config=...)
59
+ out1 = model(torch.randn(4, 10), torch.randn(4, 10))
60
+ out2 = model(torch.randn(8, 10), None) # dynamic batch dimension
61
+ ```
62
+
63
+ ### Explicit Dynamic-Dim Specification
64
+
65
+ ```python
66
+ @magi_compile(dynamic_arg_dims={"x": -1}) # mark the last dimension as dynamic
67
+ class DynamicDimModel(nn.Module):
68
+ def __init__(self, *, model_config):
69
+ super().__init__()
70
+ self.proj = nn.Linear(16, 16, bias=False)
71
+
72
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
73
+ return self.proj(x)
74
+
75
+ m = DynamicDimModel(model_config=...)
76
+ _ = m(torch.randn(2, 16))
77
+ _ = m(torch.randn(2, 32)) # allow the last dimension to vary
78
+ ```
79
+
80
+ ## Configuration and Modes
81
+
82
+ - `CompileConfig`: centralizes compile parameters (backend, cache paths, partition strategy, dynamic shapes, traced files, etc.).
83
+ - `CompileMode`: typical setting is `CompileMode.TORCH_COMPILE`.
84
+ - Backends:
85
+ - `inductor`: uses `standalone_compile` to produce reusable artifacts, ideal for production deployments.
86
+ - `eager`: convenient for rapid debugging or as a fallback.
87
+
88
+ ## Architecture and Execution Flow (Brief)
89
+
90
+ 1. `@magi_compile` wraps `nn.Module`:
91
+ - infers/validates `dynamic_arg_dims`;
92
+ - extends MRO by injecting `MagiCompilerBase`;
93
+ - reads engine-level `CompileConfig` in MagiCompiler.
94
+ 2. `CompilerManager`:
95
+ - defines cache keys using `(runtime_shape, graph_index, backend)`;
96
+ - dispatches to backends via `CompilerInterface` (`InductorStandaloneAdaptor` or `EagerAdaptor`);
97
+ - applies partition rules and pass contexts within `compile_context(...)`;
98
+ - serializes compile artifacts into a human-readable directory structure.
99
+ 3. Monitoring and statistics:
100
+ - counters and timestamps report per-shape/per-subgraph latencies and milestones.
101
+
102
+ ## Compile Cache Overview
103
+
104
+ This document summarizes the cache files generated by `torch.compile` (TorchDynamo + TorchInductor + AOTAutograd + Triton). Reference path: `cache/`.
105
+
106
+ ### Directory Layout (Tree)
107
+
108
+ ```text
109
+ cache/
110
+ ├─ depyf/
111
+ │ └─ rank_0/
112
+ │ ├─ __transformed_code_0_for_forward.py
113
+ │ ├─ decompiled_code.py
114
+ │ ├─ full_code_for_forward_0.py
115
+ │ ├─ __compiled_fn_1.BEFORE_PRE_GRAD.{0..N}.py
116
+ │ ├─ __compiled_fn_1.kernel_{0..K}.py
117
+ │ ├─ __compiled_fn_1.__compiled_fn_1_<uuid>.0.py
118
+ │ ├─ __compiled_fn_1.Before_split.0.py
119
+ │ ├─ __compiled_fn_1.After_split.0.py
120
+ │ ├─ __compiled_fn_1.pre_split_module.0.py
121
+ │ ├─ __compiled_fn_1.post_split_module.0.py
122
+ │ └─ __compiled_fn_1.pre_insert_deferred_runtime_asserts__<uuid>.0.py
123
+
124
+ └─ torch_compile_cache/
125
+ └─ bfa0df33ea/ # Hash for graph + compile options + device, etc.
126
+ └─ rank_0/ # Rank id in distributed/multi-GPU runs
127
+ └─ backbone/
128
+ ├─ computation_graph.py
129
+ ├─ magi_compile_cache.py
130
+ ├─ artifact_shape_None_subgraph_0/
131
+ ├─ artifact_shape_None_subgraph_1/
132
+ ├─ ...
133
+ └─ artifact_shape_None_subgraph_30/
134
+ ├─ ir/
135
+ │ └─ *.py # Python/Triton kernels generated by Inductor for this subgraph
136
+ ├─ fxgraph/
137
+ │ └─ */*/<hash> # Binary FX IR/metadata snapshots (not human-readable)
138
+ ├─ aotautograd/
139
+ │ └─ */*/<hash> # AOTAutograd partition/capture metadata and artifacts
140
+ ├─ 44/
141
+ │ └─ *.py # Other sharded/generated code buckets
142
+ └─ ... # Structure may vary slightly across subgraphs
143
+ ```
144
+
145
+ ### What Each File/Dir Is For
146
+
147
+ - `cache/depyf/` (TorchDynamo/Depyf debug exports)
148
+ - `__transformed_code_0_for_forward.py`: The Dynamo-transformed `forward` code (diff-friendly view of pre/post transformation).
149
+ - `decompiled_code.py`: Decompiled snapshot to help map traced graphs back to original Python.
150
+ - `full_code_for_forward_0.py`: A more complete expanded `forward` for inspection.
151
+ - `__compiled_fn_1.BEFORE_PRE_GRAD.{i}.py`: Intermediate wrapper snapshots at specific compile stages (e.g., before autodiff).
152
+ - `__compiled_fn_1.kernel_{k}.py`: Entrypoints/wrappers for kernels generated at various stages.
153
+ - `Before_split` / `After_split` / `pre_split_module` / `post_split_module`: Intermediate forms around graph partitioning.
154
+ - `pre_insert_deferred_runtime_asserts__*.py`: Snapshot before inserting deferred runtime assertions (dynamic shapes/guards).
155
+
156
+ - `cache/torch_compile_cache/` (TorchInductor artifacts)
157
+ - `bfa0df33ea/`: Namespace keyed by a hash of model structure, compile settings, and device info.
158
+ - `rank_0/`: Bucket per process rank for distributed runs.
159
+ - `backbone/`:
160
+ - `computation_graph.py`: Full model FX GraphModule with symbolic dims; shared across subgraph kernels.
161
+ - `magi_compile_cache.py`:
162
+ - Maps subgraph indices to artifact directories, e.g. `(None, i, 'inductor_standalone') -> artifact_shape_None_subgraph_i/`.
163
+ - Registers and asynchronously compiles Triton kernels via `AsyncCompile.triton(...)`, including autotune metadata, device properties, scheduling hints, etc.
164
+ - `artifact_shape_None_subgraph_{N}/`:
165
+ - `ir/*.py`: Inductor-generated Python/Triton kernels and scheduling code for this subgraph (readable).
166
+ - `fxgraph/*/*/<hash>`: FX IR/metadata snapshots for fast graph reconstruction (binary; do not edit).
167
+ - `aotautograd/*/*/<hash>`: AOTAutograd partitions/captures and replay requirements.
168
+ - Additional hashed/prefixed buckets (e.g., `44/`, `o5/`, `55/`, `br/`) containing generated operator/subtask code.
169
+
170
+ ### FAQ
171
+
172
+ - How are these caches produced?
173
+ - At runtime by `torch.compile(...)`, after TorchDynamo tracing, AOTAutograd partitioning, TorchInductor lowering/fusion, and Triton codegen.
174
+ - Will they change across runs?
175
+ - Yes. Different input shapes, env vars, device info, or compile options can produce different hash namespaces (e.g., a new `bfa0df33ea`).
176
+ - Is it safe to delete them?
177
+ - Yes. You can delete `cache/`. It will be rebuilt on demand; the next run will be slower due to recompilation.
178
+
179
+ ## Compatibility and Recommendations
180
+
181
+ - Prefer official PyTorch ≥ 2.8 builds to ensure `standalone_compile` availability.
182
+ - For highly dynamic models, explicitly mark key dynamic dimensions to improve graph capture and cache reuse.
183
+
184
+ ## Acknowledgments
185
+
186
+ This library builds upon capabilities of PyTorch Dynamo, AOTAutograd, Inductor, and Triton, and incorporates engineering practices and interface designs inspired by the vLLM community. We thank the relevant open-source communities and contributors.
pkgs/MagiCompiler/docs/AutoCudaGraphDesign.md ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## AutoCudaGraph Design
2
+
3
+ Author: ZhiyaoCen
4
+
5
+ ## Overview
6
+ AutoCudaGraph is a CUDA Graph optimization module integrated into the MagiCompiler framework, designed to automate CUDA Graph capture, caching, replay, and tensor memory management for PyTorch-based neural network inference. It targets Transformer architectures with dynamic sequence lengths, optimizing kernel execution by reusing pre-captured computation graphs and static tensor buffers. Core Goals:
7
+ * Automate CUDA Graph lifecycle (capture/replay/cache) with minimal code intrusion
8
+ * Support dynamic shape adaptation (sequence length expansion)
9
+ * Optimize memory efficiency via global memory pool and static tensor reuse
10
+ * Ensure consistency between cached graphs and runtime inputs/outputs
11
+ ## Key Components
12
+
13
+
14
+ ### CudaGraphMgr (Core Manager)
15
+ Singleton class managing all CUDA Graph operations:
16
+ ```python
17
+ class CudaGraphMgr:
18
+ def __init__(self):
19
+ self.cache: Dict[StaticSignature, StaticTensorEntry] = dict()
20
+ self.graph_mem_pool: Optional[torch.cuda.graph_pool_handle] = None
21
+ ```
22
+
23
+ **Core Methods**
24
+ | Method | Purpose |
25
+ |---------------------------------|----------------------------------------|
26
+ | run() | Main entry: Replay cached graph or warm up & capture new graph|
27
+ | wrapped_graph_capture() | Capture CUDA Graph with sliced static input/output tensors |
28
+ | wrapped_graph_replay() | Replay cached CUDA Graph with sliced static tensors and output template wrapping |
29
+ | get_expanded_static_tensors() | Expand static tensors, reuse buffers if dimensionally compatible|
30
+
31
+
32
+ ### Signature System
33
+
34
+ StaticSignature
35
+ ```python
36
+ @dataclass(unsafe_hash=True)
37
+ class StaticSignature(HashableDataclass):
38
+ func_name: str = ""
39
+ tensor_static_infos: Tuple[TensorStaticInfo, ...] = tuple()
40
+ ```
41
+ * Encodes fixed properties of input tensors (dtype, static dimensions)
42
+ * Used as primary key for static tensor buffer caching
43
+
44
+ DynamicSignature
45
+ ```python
46
+ @dataclass(unsafe_hash=True)
47
+ class DynamicSignature(HashableDataclass):
48
+ tensor_dynamic_infos: Tuple[TensorDynamicInfo, ...] = tuple()
49
+ literals_info: LiteralsInfo = None
50
+ ```
51
+ * Tracks dynamic dimensions (sequence length) and literal parameters
52
+ * Secondary key for graph entry lookup
53
+
54
+ ### Tensor Management
55
+ ```python
56
+ @dataclass
57
+ class StaticTensorEntry:
58
+ input_tensors: Optional[List[torch.Tensor]] = None
59
+ output_tensors: Optional[List[torch.Tensor]] = None
60
+ template_entry_dict: Dict[DynamicSignature, OutputTemplateEntry] = None
61
+ ```
62
+ * Memory Reuse: Reuse existing tensor buffers when possible to avoid reallocation
63
+ * Dynamic Expansion: Only expand static tensors when new input dimensions exceed current buffer size
64
+ * Shape Validation: Ensure static dimensions (non-sequence) match between cached and new tensors
65
+
66
+
67
+ ### Graph Management
68
+ ```python
69
+ @dataclass
70
+ class GraphEntry:
71
+ graph: Optional[torch.cuda.CUDAGraph] = None
72
+ inconsistent: bool = False
73
+ invalid: bool = False
74
+
75
+ @dataclass
76
+ class OutputTemplateEntry:
77
+ graph_entry_dict: Dict[int, GraphEntry] = None
78
+ output_template: Any = None
79
+ ```
80
+ * Graph State Tracking: GraphEntry tracks CUDA Graph instances and validity states to control replay eligibility.
81
+ * Layer-wise Organization: OutputTemplateEntry maps dynamic signatures to per-layer GraphEntry for layer-specific graph reuse.
82
+ * Output Consistency: output_template preserves output object structure to ensure consistent result wrapping during replay.
83
+
84
+ ## Execution Flow
85
+ ### Inline Replay (Fast Path)
86
+ * Extract input signatures from runtime arguments
87
+ * Look up cached CUDA Graph via StaticSignature + DynamicSignature + layer number
88
+ * Validate graph consistency (not inconsistent/invalid)
89
+ * Reuse static tensors with dynamic slicing
90
+ * Replay graph and return sliced output
91
+ ### Graph Capture (Slow Path)
92
+ Triggered when no valid cached graph exists or tensor expansion is needed:
93
+ * Execute function to get output tensors
94
+ * Ensure input signatures match post-warmup
95
+ * Expand static buffers if new shapes require it
96
+ * Capture new CUDA Graph with static tensors
97
+ * Store new graph and update tensor entries
98
+ * Return warmup execution output as final result
99
+ ### Sequence Length Handling
100
+ * Only last dimension is static for ND tensors (ND > 1)
101
+ * All dimensions are dynamic for 1D tensors (ND=1)
102
+ * Automatic buffer expansion for increasing sequence lengths
103
+ * Invalidates old graphs when tensors are expanded
104
+
105
+
106
+ ## Examples
107
+ ```python
108
+ import torch
109
+ import torch.nn as nn
110
+ from magi_compiler.cuda_graph_mgr import cuda_graph_mgr, cuda_graph_enable_if
111
+
112
+ class SimpleTransformerLayer(nn.Module):
113
+ def __init__(self, hidden_dim: int = 1024, num_heads: int = 8):
114
+ super().__init__()
115
+ self.self_attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
116
+ self.linear = nn.Linear(hidden_dim, hidden_dim)
117
+ self.layer_norm = nn.LayerNorm(hidden_dim)
118
+ self.layer_number = 0
119
+
120
+ @cuda_graph_enable_if(lambda: torch.cuda.is_available())
121
+ def forward(self, x: torch.Tensor):
122
+ attn_out, _ = self.self_attn(x, x, x)
123
+ out = self.linear(self.layer_norm(x + attn_out))
124
+ return out
125
+
126
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
127
+ model = SimpleTransformerLayer(hidden_dim=1024, num_heads=8).to(device).eval()
128
+ graph_mgr = cuda_graph_mgr()
129
+
130
+ with torch.no_grad():
131
+ input_1 = torch.randn(2, 512, 1024, device=device)
132
+ output_1 = model(input_1)
133
+ print(f"First run (graph capture): Output shape = {output_1.shape}")
134
+ print(f"Cached graphs count: {graph_mgr.graph_count}")
135
+
136
+ input_2 = torch.randn(2, 512, 1024, device=device)
137
+ output_2 = model(input_2)
138
+ print(f"Second run (graph replay): Output shape = {output_2.shape}")
139
+ print(f"Cached graphs count: {graph_mgr.graph_count}")
140
+
141
+ input_3 = torch.randn(2, 1024, 1024, device=device)
142
+ output_3 = model(input_3)
143
+ print(f"Third run (tensor expansion): Output shape = {output_3.shape}")
144
+ print(f"Cached graphs count: {graph_mgr.graph_count}")
145
+ print(f"Static tensor memory usage: {graph_mgr.tensor_mem_size:.2f} MB")
146
+
147
+ print("\nCUDA Graph Cache Details:")
148
+ print(graph_mgr.formatted_cache_str())
149
+
150
+ # StaticSignature: StaticSignature(_cached_hash=None, func_name='SimpleTransformerLayer.forward', tensor_static_infos=(TensorStaticInfo(_cached_hash=None, name='', shapes=(-1, -1, 1024), dtype='torch.float32'),))
151
+ # Input Static Tensors: [shape=[2, 1024, 1024],dtype=torch.float32]
152
+ # Output Static Tensors: [shape=[2, 1024, 1024],dtype=torch.float32]
153
+ # DynamicSignature: DynamicSignature(_cached_hash=None, tensor_dynamic_infos=(TensorDynamicInfo(_cached_hash=None, name='', shapes=(2, 512, -1)),), literals_info=LiteralsInfo(_cached_hash=None, literals=()))
154
+ # Output Template: FakeTensor(shape=[2, 512, 1024], dtype='torch.float32', device='cuda:0')
155
+ # Layer 0: Graph Status: Invalid
156
+ # DynamicSignature: DynamicSignature(_cached_hash=None, tensor_dynamic_infos=(TensorDynamicInfo(_cached_hash=None, name='', shapes=(2, 1024, -1)),), literals_info=LiteralsInfo(_cached_hash=None, literals=()))
157
+ # Output Template: FakeTensor(shape=[2, 1024, 1024], dtype='torch.float32', device='cuda:0')
158
+ # Layer 0: Graph Status: Valid
159
+ ```
160
+
161
+ ## Limitations and Constraints
162
+ * No support for data-dependent control flow in captured functions
163
+ * Graph capture fails if function contains CPU/GPU synchronization
164
+ * Only supports CUDA tensors (CPU tensors trigger fallback)
165
+ * Custom input classes must inherit from InplaceSubstituteFakeClass
166
+ * Assumes input tensors of captured graphs are not reused externally (risk of cross-scenario static tensor reuse)
167
+ * Relies on identical function, input tensors shapes, and constants for valid graph reuse
168
+ * No support for multi-stream execution scenarios
169
+
170
+ ## Best Practices
171
+ * Dynamic Dimensions: Tensors should use sequence length as dimension 0 where possible
172
+ * Monitor Memory Usage: Track graph_mem_pool_size and tensor_mem_size to avoid OOM
173
+ * Specify Layer IDs: Use layer_number to distinguish graphs across different models/layers
174
+ * LRU Cache (Future): Implement cache eviction to limit total graph/tensor count
pkgs/MagiCompiler/docs/Hunyuan15Benchmark.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Hunyuan1.5 Benchmark
2
+
3
+ ### Executive Summary
4
+ This report presents a comprehensive performance evaluation of the **[Athena](https://github.com/world-sim-dev/athena)** framework compared to the baseline **[LightX2V](https://github.com/ModelTC/LightX2V)** framework. The benchmarks were conducted using the **[Hunyuan-1.5](https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5)** model on NVIDIA H100 hardware.
5
+
6
+ ---
7
+ ### 🎯Test Environment & Versioning
8
+ #### Hardware & Settings
9
+
10
+ | Parameter | Value |
11
+ | ------------------- | -------------- |
12
+ | Hardware | NVIDIA H100 |
13
+ | Model | Hunyuan-1.5 480p_t2v_distilled |
14
+ | Precision | torch.bfloat16 |
15
+ | Inference Steps | 20 |
16
+ | Resolution | 480p |
17
+ | FPS | 24 |
18
+ | CFG | Disable |
19
+ #### Software Versioning
20
+ To ensure reproducibility, the following specific commits were used for this benchmark:
21
+ | Framework | Branch / Tag | Commit |
22
+ | --------- | ------------ | ------ |
23
+ | Athena | main|[5e6086b](https://github.com/world-sim-dev/athena/commit/5e6086b4dc2ab60bc4d44dbe39745b4354075121) |
24
+ | LightX2V | main | [5573905](https://github.com/ModelTC/LightX2V/commit/5573905f3f38d876d468b815f86d417a608975b6) |
25
+
26
+ ### 🏆 Performance Benchmarks
27
+ 📊 We compared the iteration speed (seconds per iteration) between Athena and LightX2V across four distinct Context Parallel (CP) configurations.
28
+ | Configuration | Frames | LightX2V (s/it) | Athena (s/it) | Speedup |
29
+ | ------------- | ------ | -------------- | -------------- | ------- |
30
+ | CP1 | 121 | 2.42 | **2.06** | **1.17x** 🚀|
31
+ | CP2 | 121 | 1.38 | **1.13** | **1.22x** 🚀|
32
+ | CP4 | 241 | 2.25 | **1.85** | **1.22x** 🚀|
33
+ | CP8 | 241 | 1.28 | **1.01** | **1.27x** 🚀|
34
+
35
+ ---
36
+ ### 📹 Output Comparison
37
+ | Framework | Video Result |
38
+ | --------- | ---------------------------- |
39
+ | Athena | <img src="../../../assets/athena_hunyuan_1_5_test_videos_20260213_155842_idx0_A_close-up815965.gif" width="480" /> |
40
+ | LightX2V | <img src="../../../assets/lightx2v_hunyuan_1_5_result_A_close-up122526.gif" width="480" /> |
41
+
42
+
43
+ ### 💡 Reproduction Guide
44
+ To reproduce the results presented in this report, follow the steps below using the specified commit hashes.
45
+ #### Setup
46
+ ```bash
47
+ git clone https://github.com/world-sim-dev/athena
48
+ cd athena
49
+ git checkout 5e6086b
50
+ pip install -r requirements.txt
51
+ pip install -r requirements-nodeps.txt
52
+ pip install -e ./pkgs/MagiCompiler --no-build-isolation --config-settings editable_mode=compat
53
+
54
+
55
+ # Clone and install LightX2V (for baseline comparison)
56
+ git clone https://github.com/ModelTC/LightX2V
57
+ cd lightx2v
58
+ git checkout 5573905
59
+ pip install -v .
60
+ ```
61
+
62
+ #### Running Benchmarks
63
+ For Athena, run:
64
+ ```
65
+ RESOLUTION=480p CFG_DISTILLED=true TASK=t2v CHECKPOINT_PATH=path/to/480p_t2v_distilled bash ./scripts/run_hunyuan.sh
66
+ ```
67
+ For LightX2V:
68
+ Clone the scripts from [Benchmark for LightX2V](https://gist.github.com/wtr0504/d80bbebb7da1ef7b58f3e6faf1c68880) and run:
69
+ ```
70
+ git clone https://gist.github.com/wtr0504/d80bbebb7da1ef7b58f3e6faf1c68880
71
+ MODEL_PATH=path/to/HunyuanVideo-1.5 DISTILL_CKPT=path/to/480p_t2v_distilled/diffusion_pytorch_model.safetensors bash run_hunyuan.sh
72
+ ```
73
+
74
+ ### 🔎 MagiCompiler Optimization Methodology
75
+ **Whole Graph Compilation**
76
+ Constant Folding & Dead Code Elimination: Streamlining the computation graph prior to execution.
77
+
78
+ **Coarse-grained Kernel Fusion**
79
+ MagiCompiler aggregates multiple smaller operators into larger, fused kernels. This optimization is critical for efficient execution on the GPU.
pkgs/MagiCompiler/docs/Wan2.2Benchmark.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Wan2.2 Benchmark
2
+
3
+ ### Executive Summary
4
+ This report presents a comprehensive performance evaluation of the **[Athena](https://github.com/world-sim-dev/athena)** framework compared to the baseline **[LightX2V](https://github.com/ModelTC/LightX2V)** framework. The benchmarks were conducted using the **[Wan2.2-TI2V-5B](https://huggingface.co/Wan-AI)** model on NVIDIA H100 hardware.
5
+
6
+ ---
7
+ ### 🎯Test Environment & Versioning
8
+ #### Hardware & Settings
9
+
10
+ | Parameter | Value |
11
+ | ------------------- | -------------- |
12
+ | Hardware | NVIDIA H100 |
13
+ | Model | Wan2.2-TI2V-5B |
14
+ | Precision | torch.bfloat16 |
15
+ | Inference Steps | 50 |
16
+ | Resolution | 704 × 1280(720p)|
17
+ | FPS | 24 |
18
+ | CFG | Enabled |
19
+ #### Software Versioning
20
+ To ensure reproducibility, the following specific commits were used for this benchmark:
21
+ | Framework | Branch / Tag | Commit |
22
+ | --------- | ------------ | ------ |
23
+ | Athena | main|[f676ae6](https://github.com/world-sim-dev/athena/commit/f676ae64ad2fc581289d1c3ae5eb51c15ce76f1d) |
24
+ | LightX2V | main | [33f0f67](https://github.com/ModelTC/LightX2V/commit/33f0f67f4ecdff86b1db676d3e0786628cc31c7b) |
25
+
26
+ ### 🏆 Performance Benchmarks
27
+ 📊 We compared the iteration speed (seconds per iteration) between Athena and LightX2V across three distinct Context Parallel (CP) configurations.
28
+ | Configuration | Frames | LightX2V (s/it) | Athena (s/it) | Speedup |
29
+ | ------------- | ------ | -------------- | -------------- | ------- |
30
+ | CP1 | 121 | 1.928 | **1.69** | **1.14x** 🚀|
31
+ | CP2 | 121 | 1.197 | **1.06** | **1.13x** 🚀|
32
+ | CP4 | 241 | 1.767 | **1.32** | **1.34x** 🚀|
33
+ | CP8 | 241 | 1.507 | **1.35** | **1.12x** 🚀|
34
+
35
+ ---
36
+
37
+ ### 💡 Reproduction Guide
38
+ To reproduce the results presented in this report, follow the steps below using the specified commit hashes.
39
+ #### Setup
40
+ ```bash
41
+ git clone https://github.com/world-sim-dev/athena
42
+ cd athena
43
+ git checkout f676ae6
44
+ pip install -r requirements.txt
45
+
46
+ # Clone and install LightX2V (for baseline comparison)
47
+ git clone https://github.com/ModelTC/LightX2V
48
+ cd lightx2v
49
+ git checkout 33f0f67
50
+ pip install -r requirements.txt
51
+
52
+ ```
53
+
54
+ #### Running Benchmarks
55
+ For Athena, run:
56
+ ```
57
+ bash ./scripts/run_wan2_2_ti2v_i2v.sh
58
+ ```
59
+ For LightX2V:
60
+ Clone the scripts from [Benchmark for LightX2V](https://gist.github.com/wtr0504/629388f17ed38d1c12d5ef5c25a15197) and run:
61
+ ```
62
+ git clone https://gist.github.com/wtr0504/629388f17ed38d1c12d5ef5c25a15197
63
+ bash run_wan.sh
64
+ ```
65
+
66
+ ### 🔎 MagiCompiler Optimization Methodology
67
+ **Whole Graph Compilation**
68
+ Constant Folding & Dead Code Elimination: Streamlining the computation graph prior to execution.
69
+ **Coarse-grained Kernel Fusion**
70
+ MagiCompiler aggregates multiple smaller operators into larger, fused kernels. This optimization is critical for efficient execution on the GPU.
71
+ **All to All Communication**
72
+ MagiCompiler Uses ``all_to_all_single`` (1 communication op per sync point) while LightX2V Uses all_to_all x 3 (3 separate communication ops).
pkgs/MagiCompiler/docs/WhyMagiCompiler.md ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Why MagiCompiler?
2
+
3
+ ## 1. Compiler Overview
4
+
5
+ ### 1.1 Background
6
+
7
+ We have long encountered several significant challenges in model optimization:
8
+
9
+ 1. **Blurred Acceleration Boundaries:** There is ambiguity regarding the extent of optimization required to achieve "extreme" performance.
10
+ 2. **Complex Performance Tuning:** Optimization strategies are often tightly coupled with model architectures, necessitating extensive and repetitive manual intervention.
11
+ 3. **Deficiency in Optimization Tools:** The infrastructure lacks sufficient mechanisms for computational graph-level optimizations, such as operator substitution and communication overlap.
12
+
13
+ MagiCompiler addresses these issues through the following approaches:
14
+
15
+ * **Addressing Challenge 1:** It adopts **whole-graph compilation**, thoroughly transcending the boundaries of `TransformerLayer` to maximize the scope of kernel fusion.
16
+ * **Addressing Challenge 2:** It integrates infrastructure optimizations directly into MagiCompiler, implementing features such as `AutoCudaGraph` and `AutoCheckpointing(WIP)`.
17
+ * **Addressing Challenge 3:** It leverages the dynamic-to-static capabilities provided by **Dynamo**, capturing `fx.graph` IR in eager mode to perform pass optimizations at the IR level.
18
+
19
+ #### Illustrative Example
20
+
21
+ ```python
22
+ from magi_compiler import magi_compile
23
+
24
+ @magi_compile()
25
+ class TinyModel(nn.Module):
26
+ def __init__(self):
27
+ super().__init__()
28
+ self.linear = nn.Linear(1024, 1024, device="cuda")
29
+
30
+ @no_grad()
31
+ def forward(self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor) -> torch.Tensor:
32
+ return self.linear(x + y - z + 1)
33
+
34
+
35
+ def magi_compiler_demo():
36
+ model = TinyModel()
37
+ x = torch.randn(1024, 1024, device="cuda")
38
+ y = torch.randn(1024, 1024, device="cuda")
39
+ z = torch.randn(1024, 1024, device="cuda")
40
+ model(x, y, z)
41
+ ```
42
+
43
+ **Optimized Code (Triton Kernel):**
44
+
45
+ ```python
46
+ triton_poi_fused_add_sub_0 = async_compile.triton('triton_poi_fused_add_sub_0', '''
47
+ import triton
48
+ import triton.language as tl
49
+
50
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
51
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
52
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
53
+ triton_helpers.set_driver_to_gpu()
54
+
55
+ @triton_heuristics.pointwise(
56
+ size_hints={'x': 1048576},
57
+ filename=__file__,
58
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}]},
59
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_sub_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'B8F4209CBFC2377D6AF9CF3C65D610CA2B56C138A443862350DE1E56F5BF54C3', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
60
+ min_elem_per_thread=0
61
+ )
62
+ @triton.jit
63
+ def triton_poi_fused_add_sub_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
64
+ xoffset = tl.program_id(0) * XBLOCK
65
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
66
+ xmask = xindex < xnumel
67
+ x0 = xindex
68
+ tmp0 = tl.load(in_ptr0 + (x0), xmask)
69
+ tmp1 = tl.load(in_ptr1 + (x0), xmask)
70
+ tmp3 = tl.load(in_ptr2 + (x0), xmask)
71
+ tmp2 = tmp0 + tmp1
72
+ tmp4 = tmp2 - tmp3
73
+ tmp5 = 1.0
74
+ tmp6 = tmp4 + tmp5
75
+ tl.store(out_ptr0 + (x0), tmp6, xmask)
76
+ ''', device_str='cuda')
77
+ ```
78
+
79
+ ### 1.2 Frontend (Dynamo)
80
+
81
+ ![Dynamo](./assets/why_magicompiler_1_dynamo.jpeg)
82
+
83
+ * **PyFrameObject (Dynamic Call Stack):**
84
+ * Represents the context environment during function execution. Python creates a new `PyFrameObject` for each function call.
85
+ * **PyCodeObject (Static Bytecode):**
86
+ * The compiled product of Python code, which is static and read-only. A single `PyCodeObject` exists regardless of how many times the function is invoked.
87
+
88
+ ```python
89
+ def f(x, mod):
90
+ for guard, transformed_code in f.compiled_entries:
91
+ if guard(x, mod):
92
+ return transformed_code(x, mod)
93
+ try:
94
+ guard, transformed_code = compile_and_optimize(x, mod)
95
+ f.compiled_entries.append([guard, transformed_code])
96
+ return transformed_code(x, mod)
97
+ except FailToCompileError:
98
+ y = mod(x)
99
+ z = torch.log(y)
100
+ return z
101
+ ```
102
+
103
+ #### Symbolic Shape
104
+
105
+ MagiCompiler specifically targets the Transformer architecture and supports custom `dynamic_arg_dims` (typically for `seq_len`).
106
+
107
+ **Example:**
108
+
109
+ ```python
110
+ @magi_compile(dynamic_arg_dims={"x": 0, "y": 0, "z": 0})
111
+ class TinyModel(nn.Module):
112
+ def __init__(self):
113
+ super().__init__()
114
+ self.linear = nn.Linear(1024, 1024, device="cuda")
115
+
116
+ @no_grad()
117
+ def forward(self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor) -> torch.Tensor:
118
+ return self.linear(x + y - z + 1)
119
+ ```
120
+
121
+ **Guard Mechanism and Elimination in Symbolic Shape Deduction:**
122
+
123
+ ```log
124
+ I1204 16:31:35.745000 1859360 torch/_dynamo/symbolic_convert.py:3842] [0/0] Step 1: torchdynamo start tracing inner /usr/local/lib/python3.12/dist-packages/torch/_dynamo/external_utils.py:66
125
+ I1204 16:31:35.746000 1859360 torch/fx/experimental/symbolic_shapes.py:3775] [0/0] create_env
126
+ I1204 16:31:35.781000 1859360 torch/fx/experimental/symbolic_shapes.py:5120] [0/0] create_symbol s33 = 1024 for L['args'][0].size()[0] [2, int_oo] return self.linear(x + y - z + 1) # ome/niubility2/hongyu/athena/integration_test/scripts/linear_demo.py:50 in forward (_dynamo/variables/builder.py:3501 in <lambda>), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="s33" or to suppress this message run with TORCHDYNAMO_EXTENDED_ADVICE="0"
127
+ I1204 16:31:35.785000 1859360 torch/fx/experimental/symbolic_shapes.py:5120] [0/0] create_symbol s6 = 1024 for L['args'][1].size()[0] [2, int_oo] return self.linear(x + y - z + 1) # ome/niubility2/hongyu/athena/integration_test/scripts/linear_demo.py:50 in forward (_dynamo/variables/builder.py:3501 in <lambda>), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="s6" or to suppress this message run with TORCHDYNAMO_EXTENDED_ADVICE="0"
128
+ I1204 16:31:35.794000 1859360 torch/fx/experimental/symbolic_shapes.py:7213] [0/0] eval Eq(s33, s6) [guard added] return self.linear(x + y - z + 1) # ome/niubility2/hongyu/athena/integration_test/scripts/linear_demo.py:50 in forward (_subclasses/fake_impls.py:1148 in infer_size), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s33, s6)"
129
+ I1204 16:31:35.795000 1859360 torch/fx/experimental/symbolic_shapes.py:6792] [0/0] set_replacement s6 = s33 (solve) VR[2, int_oo]
130
+ I1204 16:31:35.800000 1859360 torch/fx/experimental/symbolic_shapes.py:5120] [0/0] create_symbol s21 = 1024 for L['args'][2].size()[0] [2, int_oo] return self.linear(x + y - z + 1) # ome/niubility2/hongyu/athena/integration_test/scripts/linear_demo.py:50 in forward (_dynamo/variables/builder.py:3501 in <lambda>), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="s21" or to suppress this message run with TORCHDYNAMO_EXTENDED_ADVICE="0"
131
+ I1204 16:31:35.806000 1859360 torch/fx/experimental/symbolic_shapes.py:7213] [0/0] eval Eq(s33, s21) [guard added] return self.linear(x + y - z + 1) # ome/niubility2/hongyu/athena/integration_test/scripts/linear_demo.py:50 in forward (_subclasses/fake_impls.py:1148 in infer_size), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s33, s21)"
132
+ I1204 16:31:35.807000 1859360 torch/fx/experimental/symbolic_shapes.py:6792] [0/0] set_replacement s33 = s21 (solve) VR[2, int_oo]
133
+ I1204 16:31:35.828000 1859360 torch/_dynamo/symbolic_convert.py:4059] [0/0] Step 1: torchdynamo done tracing inner (RETURN_VALUE)
134
+ I1204 16:31:35.837000 1859360 torch/fx/experimental/symbolic_shapes.py:6792] [0/0] set_replacement s6 = s21 (find) VR[2, int_oo]
135
+ ```
136
+
137
+ ### 1.3 Backend (Inductor, MagiBackend, etc.)
138
+
139
+ ![Backend Architecture](./assets/why_magicompiler_2_arch.png)
140
+
141
+ MagiCompiler hijacks the `torch.compile` logic through the following components:
142
+
143
+ * **`custom_partitioner_fn`:** Segments the forward and backward computational graphs and determines which intermediate results are transmitted to the backward pass.
144
+ * **`post_grad_custom_pre_pass`:** Performs pass optimizations at the whole-graph level (computational graph matching and rewriting).
145
+ * **`PartitionFunc`:** Implements custom subgraph partitioning logic, utilizing attention mechanisms as splitting points.
146
+
147
+ ![Partition](./assets/why_magicompiler_3_partition.png)
148
+
149
+ * **`post_grad_custom_post_pass`:** Executes pass optimizations at the subgraph level (computation/communication overlap).
150
+
151
+ ---
152
+
153
+ ## 2. Best Practices
154
+
155
+ ### 2.1 Model Adaptation
156
+
157
+ MagiCompiler has certain limitations, such as mandatory whole-graph capture and the inability to support implicit subgraph interruptions. Consequently, manual adaptation is required in specific scenarios:
158
+
159
+ **1. Computational Graph Dependencies or CPU/GPU Synchronization**
160
+
161
+ ```python
162
+ @magi_compile
163
+ class MeanModule(torch.nn.Module):
164
+ def __init__(self):
165
+ super().__init__()
166
+
167
+ def forward(self, x: torch.Tensor, y: torch.Tensor):
168
+ x = x.cos().sin()
169
+ if x.mean() > 0.5:
170
+ x = x - 1
171
+ return x * y
172
+ ```
173
+
174
+ > **Note:** In typical Transformer models, certain pre/post-processing operations are unavoidable. Therefore, the recommended practice for `magi_compiler` is to perform **whole-graph capture at the `TransformerBlock` level**, as `TransformerBlock` computations constitute over 95% of the total workload.
175
+
176
+ **2. Custom Operators (e.g., FlashAttention, FlexFlashAttention, MoE kernels)**
177
+
178
+ * **Operator Registration:** A mechanism for operator registration is provided. Commonly used operators like FlashAttention (FA) and FlexFlashAttention (FFA) are already registered.
179
+
180
+ ```python
181
+ # Operator Registration
182
+ @torch.library.custom_op("athena::flash_attn_func", mutates_args=())
183
+ def flash_attn_func(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
184
+ ...
185
+
186
+ # Operator Deduce Function
187
+ @flash_attn_func.register_fake
188
+ def _(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
189
+ return torch.empty_like(query)
190
+
191
+ # Call flash_attn_func
192
+ self_attn_out = torch.ops.athena.flash_attn_func(q, k, v)
193
+ out, _ = torch.ops.athena.flex_flash_attn_func(q, k, v, q_ranges=ffa_handler.q_ranges, k_ranges=ffa_handler.k_ranges)
194
+ ```
195
+
196
+ * **Unit Testing:** Provide independent unit tests for each custom operator in the production environment.
197
+
198
+ ```python
199
+ @pytest.mark.parametrize("batch_size", [1])
200
+ @pytest.mark.parametrize("seq_len", [1024, 2048, 4096])
201
+ @pytest.mark.parametrize("query_head", [48])
202
+ @pytest.mark.parametrize("kv_head", [4, 8])
203
+ @pytest.mark.parametrize("head_dim", [128, 256])
204
+ def test_fake_fa3(batch_size, seq_len, query_head, kv_head, head_dim):
205
+ q = torch.randn((batch_size, seq_len, query_head, head_dim), device="cuda", dtype=torch.bfloat16)
206
+ k = torch.randn((batch_size, seq_len, kv_head, head_dim), device="cuda", dtype=torch.bfloat16)
207
+ v = torch.randn((batch_size, seq_len, kv_head, head_dim), device="cuda", dtype=torch.bfloat16)
208
+ torch.library.opcheck(torch.ops.athena.flash_attn_func, (q, k, v))
209
+ ```
210
+
211
+ ### 2.2 Debugging Methods
212
+
213
+ Key questions for debugging:
214
+ * Is the bug originating from the compiler?
215
+ * Which specific component of the compiler is causing the bug?
216
+
217
+ ![Debugging](./assets/why_magicompiler_4_debug.png)
218
+
219
+ ```python
220
+ class CompileConfig(BaseModel):
221
+ # Basic configs
222
+ backend: str = Field("inductor", description="Compilation backend.")
223
+ compile_mode: CompileMode = Field(CompileMode.MAGI_COMPILE, description="Compilation mode.")
224
+ ...
225
+
226
+ # Cudagraph configs
227
+ cudagraph_mode: CudaGraphMode = Field(CudaGraphMode.NONE, description="Cudagraph mode.")
228
+ ...
229
+
230
+ # Pass configs
231
+ pass_config: PassConfig = Field(PassConfig(), description="Pass configuration.")
232
+ ...
233
+ ```
234
+
235
+ ### 2.3 Profiling Results
236
+
237
+ For further details, please refer to the [**Wan2.2 Benchmark**](Wan2.2Benchmark.md).
238
+
239
+ ---
240
+
241
+ ## References
242
+
243
+ 1. [PyTorch 2.0 Overview](https://docs.pytorch.org/assets/pytorch2-2.pdf)
244
+ 2. [TorchDynamo: An Experiment in Dynamic Python Bytecode Transformation](https://dev-discuss.pytorch.org/t/torchdynamo-an-experiment-in-dynamic-python-bytecode-transformation/361)
245
+ 3. [Depyf Walkthrough](https://depyf.readthedocs.io/en/latest/walk_through.html)
246
+ 4. [Getting Started with PyTorch Compiler](https://docs.pytorch.org/docs/main/torch.compiler_get_started.html)
pkgs/MagiCompiler/docs/WhyMagiDepyf.md ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # magi_depyf
2
+
3
+ A structured inspector for `torch.compile` and MagiCompiler compilation
4
+ artifacts — decompiled source, Inductor kernels, guard conditions, graph break
5
+ chains, and more — all organized in a navigable directory tree.
6
+
7
+ ## Why
8
+
9
+ ### The problem: compilation is a black box
10
+
11
+ `torch.compile` and MagiCompiler accelerate models by transforming Python
12
+ functions through a deep pipeline: Dynamo captures bytecode into FX graphs,
13
+ a backend (Inductor, etc.) compiles them into optimized kernels, and the
14
+ runtime dispatches through a chain of cache entries, compiled functions, and
15
+ resume functions. The result is fast — but opaque.
16
+
17
+ When something goes wrong — a correctness bug, an unexpected graph break, a
18
+ performance cliff — you need to see what the compiler actually produced.
19
+ What bytecode did Dynamo generate? Which subgraphs went to Inductor vs.
20
+ eager fallback? What do the kernels look like? How do resume functions
21
+ chain together? MagiCompiler adds further layers: CUDA graph capture
22
+ regions, piecewise subgraph splits, and its own dispatch logic.
23
+
24
+ None of this is easily accessible.
25
+
26
+ ### depyf: a pioneering effort
27
+
28
+ [depyf](https://github.com/thuml/depyf) was the first tool to address this,
29
+ hooking into `torch._dynamo` to dump decompiled source, FX graphs, and
30
+ Inductor output. It made `torch.compile` significantly more transparent.
31
+
32
+ ### Why a new tool?
33
+
34
+ magi_depyf is purpose-built for MagiCompiler's compilation stack, and takes
35
+ a fundamentally different approach from depyf:
36
+
37
+ | | depyf | magi_depyf |
38
+ |-|-------|------------|
39
+ | **When artifacts are collected** | During compilation, via monkey-patching internal hooks | **After** compilation completes, by walking the final CacheEntry chain — a single, clean post-hoc pass |
40
+ | **Output structure** | Flat files (`full_code_0.py`, `__transformed_code_0_for_xxx.py`, …) — hard to navigate for complex models | **Hierarchical directory tree** mirroring the compilation structure: function → entries → compiled\_fns / resume\_fns |
41
+ | **MagiCompiler support** | None | First-class: per-subgraph Inductor source, CUDA graph mode, piecewise split metadata |
42
+ | **Decompiler** | Monolithic class supporting Python 3.8–3.12 | Modular handler registry; focused on 3.10+ |
43
+
44
+ ### Key features
45
+
46
+ **See everything the compiler produced, in one structured tree.**
47
+ One context manager call gives you a complete, navigable dump: decompiled
48
+ bytecode (before and after Dynamo), Inductor kernel source for every compiled
49
+ function, guard conditions, bytecode metadata (`co_flags`, `co_consts`,
50
+ `dis` output), and the full resume function chain — recursively.
51
+
52
+ **MagiCompiler-native.**
53
+ Understands MagiCompiler's backend, extracting per-subgraph Inductor source,
54
+ CUDA graph capture mode (full / piecewise), and split metadata that
55
+ `torch.compile`-only tools cannot see.
56
+
57
+ **Post-hoc introspection.**
58
+ Artifacts are collected after compilation finishes, by walking the CacheEntry
59
+ linked list and extracting what Dynamo and the backend actually produced.
60
+ No monkey-patching of internal compilation hooks, no interference with the
61
+ compilation process itself.
62
+
63
+ ## Usage
64
+
65
+ ### `dump_src` — the main entry point
66
+
67
+ ```python
68
+ import torch
69
+ from magi_compiler.magi_depyf.inspect import dump_src
70
+
71
+ @torch.compile
72
+ def toy_example(a, b):
73
+ x = a / (torch.abs(a) + 1)
74
+ if b.sum() < 0:
75
+ b = b * -1
76
+ return x * b
77
+
78
+ with dump_src("./output"):
79
+ for _ in range(100):
80
+ toy_example(torch.randn(10), torch.randn(10))
81
+ ```
82
+
83
+ This produces:
84
+
85
+ ```
86
+ output/
87
+ toy_example/
88
+ overview.md # Navigable index with links to everything
89
+ decompiled_code.py # Original function source
90
+ bytecode_info.txt # CodeType metadata + dis output
91
+ entry_0/
92
+ decompiled_code.py # Dynamo-transformed bytecode → Python
93
+ bytecode_info.txt # Transformed code metadata
94
+ guards.txt # Guard conditions for this cache entry
95
+ compiled_fns/
96
+ __compiled_fn_1_xxx.py # FX graph (readable)
97
+ __compiled_fn_1_xxx_post_grad.py # Post-grad graph
98
+ __compiled_fn_1_xxx_runnable.py # Inductor kernel source
99
+ resume_fns/
100
+ __resume_at_94_2/ # Resume function after graph break
101
+ overview.md
102
+ decompiled_code.py # Resume function source
103
+ bytecode_info.txt
104
+ entry_0/ # Dynamo compiles resume fns too
105
+ decompiled_code.py
106
+ guards.txt
107
+ compiled_fns/
108
+ ...
109
+ __resume_at_104_3/
110
+ ...
111
+ ```
112
+
113
+ ### Programmatic API
114
+
115
+ ```python
116
+ from magi_compiler.magi_depyf import decompile
117
+
118
+ # Decompile a code object to Python source
119
+ source = decompile(my_function.__code__)
120
+
121
+ # Introspect a compiled function
122
+ from magi_compiler.magi_depyf.inspect import Introspector
123
+ info = Introspector.build_function_info(fn, fn_globals=fn.__globals__)
124
+ # info.entries[0].decompiled_src — decompiled transformed code
125
+ # info.entries[0].compiled_fns — backend-compiled functions
126
+ # info.entries[0].resume_fns — resume functions after graph breaks
127
+ ```
128
+
129
+ ### Tested model architectures
130
+
131
+ The test suite verifies the decompile → recompile round-trip on real model
132
+ structures, ensuring the decompiler produces correct source for Dynamo output:
133
+
134
+ | Category | Models |
135
+ |----------|--------|
136
+ | **PyTorch core** | MLP, Conv-BN-ReLU, MultiheadAttention, TransformerEncoderLayer, Embedding, residual blocks, depthwise separable conv |
137
+ | **Diffusion blocks** | GEGLU, RMSNorm, sinusoidal embeddings, cross-attention, AdaLayerNorm, DiT blocks, timestep MLP |
138
+ | **HuggingFace transformers** | BERT, GPT-2, T5 encoder (tiny configs) |
139
+ | **HuggingFace diffusers** | Attention (self / cross), BasicTransformerBlock |
140
+ | **timm** | ResNet-18, MobileNetV3, EfficientNet-B0, ViT, ConvNeXt, Swin, DeiT |
141
+ | **Graph breaks** | `print()` breaks, explicit `graph_break()`, multi-break chains — with recursive resume function round-tripping |
142
+
143
+ ## Code structure
144
+
145
+ ```
146
+ magi_depyf/
147
+ ├── __init__.py # Public API: decompile, safe_decompile
148
+
149
+ ├── decompile/ # Bytecode → Python source (no torch dependency)
150
+ │ ├── decompiler.py # Decompiler: orchestrates the pipeline
151
+ │ ├── recompiler.py # CodeRecompiler: decompile → compile() → CodeType
152
+ │ ├── bytecode/
153
+ │ │ ├── instruction.py # Mutable wrapper over dis.Instruction
154
+ │ │ ├── source_emitter.py # Stack machine + source accumulator
155
+ │ │ ├── decompile_context.py # Read-only context for handlers
156
+ │ │ ├── handler_registry.py # Opcode → handler dispatch table
157
+ │ │ └── handlers/ # One module per opcode category
158
+ │ └── postprocess/ # Source-level cleanup passes
159
+
160
+ └── inspect/ # torch.compile introspection (requires torch)
161
+ ├── dump_src.py # dump_src(): the main entry point
162
+ ├── introspect.py # Introspector: walk CacheEntry chain
163
+ ├── model.py # Data model (FunctionInfo, EntryInfo, ...)
164
+ ├── writer.py # Serialize to directory tree
165
+ ├── session.py # CaptureSession: bytecode hook lifecycle
166
+ └── result.py # CaptureResult: one compilation event
167
+ ```
168
+
169
+ ## Compatibility
170
+
171
+ | Requirement | Version |
172
+ |-------------|---------|
173
+ | **Python** | >= 3.10 |
174
+ | **PyTorch** | >= 2.0 (requires `torch._dynamo` internals) |
175
+ | **depyf** | Optional; used as fallback by `safe_decompile` |
pkgs/MagiCompiler/docs/assets/submod_0_rank_0.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:583d97460eb7ebf48efbdeb7a6ae424f640de9ff99e6f33bbadef432583f40d3
3
+ size 16122
pkgs/MagiCompiler/magi_compiler/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .api import magi_compile
16
+
17
+ __all__ = ["magi_compile"]
pkgs/MagiCompiler/magi_compiler/_cache_data_cls.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import dataclasses
16
+
17
+
18
+ @dataclasses.dataclass(frozen=True)
19
+ class CacheHandle:
20
+ key: str | None
21
+ path: str
22
+
23
+
24
+ @dataclasses.dataclass(frozen=True)
25
+ class CacheEntry:
26
+ runtime_shape: int | None
27
+ graph_index: int
28
+ backend_name: str
pkgs/MagiCompiler/magi_compiler/api.py ADDED
@@ -0,0 +1,666 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import gc
17
+ import inspect
18
+ import os
19
+ from contextlib import contextmanager
20
+ from typing import Callable, TypeVar, get_args, get_origin, overload
21
+ from unittest.mock import patch
22
+
23
+ import magi_compiler.utils.envs as envs
24
+ import torch
25
+ from magi_compiler.cuda.cudart import pin_memory_in_place
26
+ from magi_compiler.magi_compiler_base import MagiCompilerBase
27
+ from magi_compiler.utils import compilation_counter, magi_logger
28
+ from magi_compiler.utils.compile_time_monitor import CompileMonitor
29
+ from torch import distributed as dist
30
+ from torch import nn
31
+ from torch._dynamo.symbolic_convert import InliningInstructionTranslator
32
+
33
+ from .config import CompileConfig, CompileMode, get_compile_config
34
+
35
+
36
+ # =============================================================================
37
+ # Workaround: TorchInductor autotune get_raw_stream
38
+ # =============================================================================
39
+ # TorchInductor autotune code blocks may reference get_raw_stream() without
40
+ # defining it, causing "name 'get_raw_stream' is not defined" at runtime.
41
+ # Register it as a builtin so the exec'd autotune snippets can always find it.
42
+ def _patch_get_raw_stream():
43
+ try:
44
+ import builtins
45
+
46
+ from torch._C import _cuda_getCurrentRawStream as _get_raw_stream
47
+ except Exception:
48
+ return
49
+ if not hasattr(builtins, "get_raw_stream"):
50
+ builtins.get_raw_stream = _get_raw_stream
51
+
52
+
53
+ _patch_get_raw_stream()
54
+
55
+ # =============================================================================
56
+ # Dynamo Config Isolation
57
+ # =============================================================================
58
+ # Capture the default dynamo config at module load time (before any torch.compile).
59
+ # This ensures we have a "clean" baseline config that hasn't been modified by
60
+ # external torch.compile calls (e.g., with dynamic=True).
61
+ _DEFAULT_DYNAMO_CONFIG: dict = torch._dynamo.config.get_config_copy()
62
+
63
+
64
+ @contextmanager
65
+ def _isolated_dynamo_config():
66
+ """
67
+ Context manager that provides an isolated dynamo config environment.
68
+ """
69
+ with torch._dynamo.config.patch(**_DEFAULT_DYNAMO_CONFIG):
70
+ yield
71
+
72
+
73
+ _T = TypeVar("_T", bound=type[nn.Module])
74
+ _W = TypeVar("_W", bound="MagiCompilerBase")
75
+
76
+
77
+ @overload
78
+ def magi_compile(*, enable_if: Callable[None, bool] | None = None) -> Callable[[_T], _T]:
79
+ ...
80
+
81
+
82
+ @overload
83
+ def magi_compile(*, dynamic_arg_dims: dict[str, int | list[int]] | None) -> Callable[[_T], _T]:
84
+ ...
85
+
86
+
87
+ @overload
88
+ def magi_compile(*, config_patch: Callable[[CompileConfig], CompileConfig] | None = None) -> Callable[[_T], _T]:
89
+ ...
90
+
91
+
92
+ @overload
93
+ def magi_compile(cls: _T) -> _T:
94
+ ...
95
+
96
+
97
+ def magi_compile(
98
+ cls: _T | None = None,
99
+ *,
100
+ model_tag: str | None = None,
101
+ dynamic_arg_dims: dict[str, int | list[int]] | None = None,
102
+ enable_if: Callable[None, bool] | None = None,
103
+ config_patch: Callable[[CompileConfig], CompileConfig] | None = None,
104
+ ) -> Callable[[_T], _T] | _T:
105
+ """
106
+ A decorator to add support for compiling the forward method of a class.
107
+
108
+ Usage:
109
+ 1. use directly as a decorator without arguments:
110
+ ```python
111
+ @magi_compile
112
+ class MyModel(nn.Module):
113
+ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
114
+ ```
115
+
116
+ 2. use as a decorator with arguments:
117
+ ```python
118
+ @magi_compile(dynamic_arg_dims={"x": 0, "y": 0})
119
+ class MyModel(nn.Module):
120
+ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ...
121
+ ```
122
+
123
+ Arguments:
124
+ - model_tag: optional tag in cache path (e.g. "wan_ti2v"). If not set, class name is used.
125
+ Path segment: model_{idx}_{model_tag}_rank_{rank}.
126
+ - dynamic_arg_dims: a dictionary that maps argument names to the dynamic
127
+ dimensions of the argument. The dynamic dimensions can be either a single
128
+ integer or a list of integers.
129
+ - enable_if: a function that returns a boolean value indicating whether to compile the model or not.
130
+ This is useful if you want to compile the model only when certain conditions are met.
131
+
132
+ Notes:
133
+ - dynamic_arg_dims will be inferred from the type annotation of the forward method if not provided,
134
+ if the argument is annotated as `torch.Tensor` or `Optional[torch.Tensor]`,
135
+ the first dimension will be marked as dynamic.
136
+
137
+ - if an argument is `None`, it should always be passed as `None` during
138
+ the lifetime of the model, otherwise, it cannot be captured as a single
139
+ computation graph.
140
+
141
+ """
142
+
143
+ def cls_decorator_helper(cls: _T) -> _T:
144
+ nonlocal dynamic_arg_dims
145
+ dynamic_arg_dims = dynamic_arg_dims or _infer_dynamic_arg_dims(cls)
146
+
147
+ # Accuracy check
148
+ assert hasattr(cls, "forward"), "decorated class should have a forward method."
149
+ assert len(dynamic_arg_dims) > 0, (
150
+ "No dynamic dimensions found in the forward method of " f"{cls}. Please provide dynamic_arg_dims explicitly."
151
+ )
152
+ for k in dynamic_arg_dims:
153
+ assert k in inspect.signature(cls.forward).parameters, f"Argument {k} not found in the forward method of {cls}"
154
+
155
+ return _magi_compile(cls, dynamic_arg_dims, enable_if, config_patch, model_tag=model_tag)
156
+
157
+ if cls is not None:
158
+ # use `magi_compile` as a decorator without arguments, cls is the class to be decorated
159
+ assert isinstance(cls, type)
160
+ return cls_decorator_helper(cls)
161
+
162
+ return cls_decorator_helper
163
+
164
+
165
+ def offload(obj):
166
+ if isinstance(obj, torch.Tensor):
167
+ return obj.cpu()
168
+ elif isinstance(obj, dict):
169
+ return {k: offload(v) for k, v in obj.items()}
170
+ elif isinstance(obj, (list, tuple)):
171
+ return type(obj)(offload(item) for item in obj)
172
+ return obj
173
+
174
+
175
+ def _magi_compile(
176
+ cls: _T,
177
+ dynamic_arg_dims: dict[str, int | list[int]],
178
+ enable_if: Callable[None, bool] | None = None,
179
+ config_patch: Callable[[CompileConfig], CompileConfig] | None = None,
180
+ model_tag: str | None = None,
181
+ ) -> _T:
182
+ """
183
+ A decorator to add support for compiling the forward method of a class.
184
+ """
185
+ if MagiCompilerBase in cls.__bases__:
186
+ return cls
187
+
188
+ # take care of method resolution order, make sure super().__init__ is called on the base class
189
+ # other than MagiCompilerBase
190
+ cls.__bases__ = cls.__bases__ + (MagiCompilerBase,)
191
+
192
+ if get_compile_config().offload_config.model_cpu_offload:
193
+ magi_logger.info(f"Enabling CPU offload for {cls}")
194
+ _orig_apply = cls._apply
195
+
196
+ def _cpu_apply(self, fn):
197
+ if getattr(self, "_magi_offloaded_once", False):
198
+ return _orig_apply(self, fn)
199
+
200
+ # First, move all parameters/buffers to CPU
201
+ def _force_cpu(t):
202
+ return fn(t).cpu()
203
+
204
+ _orig_apply(self, _force_cpu)
205
+
206
+ # create shared memory tensors for all parameters/buffers on CPU
207
+ if dist.is_initialized():
208
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
209
+ full_state_dict = self.state_dict()
210
+
211
+ grouped_params = {} # {dtype: [(name, tensor), ...]}
212
+ for name, tensor in full_state_dict.items():
213
+ if tensor.device.type == 'cpu':
214
+ dt = tensor.dtype
215
+ if dt not in grouped_params:
216
+ grouped_params[dt] = []
217
+ grouped_params[dt].append((name, tensor))
218
+
219
+ shared_state_dict = {}
220
+ self._magi_giant_buffers = []
221
+
222
+ dist.barrier()
223
+
224
+ for dtype, param_list in grouped_params.items():
225
+ dtype_str = str(dtype).split('.')[-1]
226
+ shared_bin_path = (
227
+ f"{envs.MAGI_SHARED_BIN_PATH}/magi_model_shared_{dtype_str}_{self.__class__.__name__}.bin"
228
+ )
229
+
230
+ total_numel = sum(t.numel() for _, t in param_list)
231
+
232
+ if local_rank == 0:
233
+ flat_buffer = torch.zeros(total_numel, dtype=dtype)
234
+ offset = 0
235
+ for _, tensor in param_list:
236
+ numel = tensor.numel()
237
+ flat_buffer[offset : offset + numel].copy_(tensor.view(-1))
238
+ offset += numel
239
+
240
+ if dtype == torch.bfloat16:
241
+ flat_buffer.view(torch.int16).numpy().tofile(shared_bin_path)
242
+ elif dtype.itemsize == 1 and dtype.is_floating_point:
243
+ # fp8
244
+ flat_buffer.view(torch.uint8).numpy().tofile(shared_bin_path)
245
+ else:
246
+ flat_buffer.numpy().tofile(shared_bin_path)
247
+
248
+ del flat_buffer
249
+ gc.collect()
250
+
251
+ dist.barrier()
252
+
253
+ giant_shared_tensor = torch.from_file(
254
+ shared_bin_path, shared=True, size=total_numel, dtype=dtype, device="cpu"
255
+ )
256
+ self._magi_giant_buffers.append(giant_shared_tensor)
257
+
258
+ pin_memory_in_place(giant_shared_tensor)
259
+
260
+ offset = 0
261
+ for name, original_tensor in param_list:
262
+ numel = original_tensor.numel()
263
+ shared_param = giant_shared_tensor[offset : offset + numel].view(original_tensor.shape)
264
+
265
+ if original_tensor.requires_grad:
266
+ shared_param.requires_grad_(True)
267
+
268
+ shared_state_dict[name] = shared_param
269
+ offset += numel
270
+
271
+ dist.barrier()
272
+ if local_rank == 0:
273
+ if os.path.exists(shared_bin_path):
274
+ os.remove(shared_bin_path)
275
+
276
+ self.load_state_dict(shared_state_dict, assign=True)
277
+
278
+ else:
279
+
280
+ def _pinner(t):
281
+ return t.pin_memory()
282
+
283
+ _orig_apply(self, _pinner)
284
+
285
+ self._magi_offloaded_once = True
286
+ return self
287
+
288
+ cls._apply = _cpu_apply
289
+
290
+ old_init = cls.__init__
291
+
292
def __init__(self: _W, *args, **kwargs):
    """
    Replacement constructor installed on the wrapped class: build the module
    normally, then attach compilation state (config, enable flag, model index).
    """
    # Run the wrapped class's original constructor first so the module is fully
    # built before any compilation bookkeeping happens.
    old_init(self, *args, **kwargs)
    compile_config = get_compile_config()
    # `config_patch` (decorator argument) lets the caller tweak the global config
    # for this class only.
    if config_patch is not None:
        compile_config = config_patch(compile_config)
    # deepcopy the compile config to avoid modifying the original compile config
    self.compile_config = copy.deepcopy(compile_config)

    # `enable_if` (decorator argument) allows per-instance opt-out at construction time.
    enable_compile = enable_if is None or enable_if()
    self.enable_compile = self.compile_config.compile_mode != CompileMode.NONE and enable_compile
    if not self.enable_compile:
        return

    # Assign a unique model index/tag so per-model cache directories do not collide.
    compilation_counter.num_models_seen += 1
    self.compile_config.model_idx = compilation_counter.num_models_seen
    self.compile_config.model_tag = model_tag if model_tag is not None else self.__class__.__name__
    MagiCompilerBase.__init__(self, compile_config=self.compile_config)
309
+
310
+ cls.__init__ = __init__
311
+
312
+ old_call = cls.__call__
313
+
314
def __call__(self: _W, *args, **kwargs):
    """
    Replacement __call__ installed on the wrapped class.

    Dispatch order: run eagerly when compilation is disabled or we are already
    inside a compilation; run the JIT-compiled code if it exists; try cached AOT
    artifacts; otherwise mark dynamic shapes and trigger the first compilation.
    """
    ### Step1: Run compiled module directly if disable compile or captured before ###
    # NOTE(review): offload() presumably relocates inputs for CPU-offload mode
    # before the first (uncompiled) run — confirm against its definition.
    if self.compile_config.offload_config.model_cpu_offload and self.compiled_code is None:
        args = offload(args)
        kwargs = offload(kwargs)

    if not self.enable_compile or torch.compiler.is_compiling():
        # Skip compiling the model if inside the compilation process.
        return old_call(self, *args, **kwargs)

    if self.compiled_code is not None:
        # Run the compiled function if compiled code is available.
        with self.dispatch_to_compiled_fwd(mode="jit"):
            return old_call(self, *args, **kwargs)

    if envs.MAGI_AOT_COMPILE:
        # Try load AOT artifacts from cache and run directly.
        self.aot_compiled_fn = self.try_load_aot_compile_artifacts()
        if self.aot_compiled_fn is not None:
            with self.dispatch_to_compiled_fwd(mode="aot"):
                return old_call(self, *args, **kwargs)

    ### Step2: Mark dynamic shapes for the first compilation ###
    # Bind the call against the real forward() signature so dynamic dims can be
    # looked up by parameter name regardless of positional/keyword usage.
    bound_args = inspect.signature(self.__class__.forward).bind(self, *args, **kwargs)
    bound_args.apply_defaults()
    for k, dims in dynamic_arg_dims.items():
        arg = bound_args.arguments.get(k)
        if arg is None:
            continue
        dims = [dims] if isinstance(dims, int) else dims
        assert isinstance(arg, torch.Tensor), f"Unsupported dynamic dim {dims} for argument {k} with type {type(arg)}."
        # Normalize negative dims (e.g. -1) to absolute axis indices.
        dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
        torch._dynamo.mark_dynamic(arg, dims)

    ### Step3: Start compiling the model ###
    magi_logger.info(f"Start compiling function {self.original_code_object}")

    CompileMonitor().start(
        self.compile_config.compile_mode == CompileMode.MAGI_COMPILE, self.compile_config.debug_dump_path()
    )
    # Dynamo reuse the compilation across instances, but we need to make sure the compiled code is not reused.
    torch._dynamo.eval_frame.remove_from_cache(self.original_code_object)

    with (
        _hijack_inline_call_to_collect_traced_files(self),
        patch.object(torch.compiler.config, "dynamic_sources", self.compile_config.dynamic_sources),
        patch.object(torch._dynamo.config, "enable_cpp_symbolic_shape_guards", False),
        # Allow mark_dynamic to take effect on tensors reached through module
        # attribute chains (the default True forces module-property tensors to
        # static shapes, ignoring mark_dynamic).
        patch.object(torch._dynamo.config, "force_nn_module_property_static_shapes", False),
        # Redirect the Inductor cache into this model's private cache directory.
        patch.dict(
            os.environ, {"TORCHINDUCTOR_CACHE_DIR": (self.compile_config.cache_dump_path() / "inductor_cache").as_posix()}
        ),
    ):
        if envs.MAGI_AOT_COMPILE:
            # AOT path: compile ahead-of-time, persist the artifacts, then run.
            self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
            self.aot_compiled_fn.save_compiled_function(self.aot_compilation_path)
            with self.dispatch_to_compiled_fwd(mode="aot"):
                output = old_call(self, *args, **kwargs)
        else:
            # JIT path: temporarily swap forward() for the compiling wrapper.
            with patch.object(self, "forward", self.jit_compile):
                output = old_call(self, *args, **kwargs)

    return output
378
+
379
+ # 使用 @torch.compiler.disable 和 _isolated_dynamo_config 包裹整个 __call__
380
+ # 确保 magi compile 在外部嵌套 torch.compile 时也能独立工作不受影响
381
+ isolated_call = _isolated_dynamo_config()(__call__)
382
+ cls.__call__ = torch.compiler.disable(isolated_call)
383
+ return cls
384
+
385
+
386
+ # Collect all relevant files traced by Dynamo, re-compile the model when any of these files change.
387
+ # 1. the file containing the top-level forward function
388
+ # 2. hijack function to know all the functions called during Dynamo tracing, every time Dynamo sees a function call, it will inline
389
+ # the function by calling InliningInstructionTranslator.inline_call_
390
def _hijack_inline_call_to_collect_traced_files(owner: _W):
    """
    Patch Dynamo's inline-call hook so every source file it traces is recorded
    on ``owner.compile_config.traced_files``.

    Dynamo inlines each called function through
    ``InliningInstructionTranslator.inline_call_``; wrapping that entry point
    lets us observe every file touched during tracing. The file holding the
    top-level forward is recorded up front. Returns the ``patch`` context
    manager (not yet entered).
    """
    traced = owner.compile_config.traced_files
    traced.add(owner.original_code_object.co_filename)
    original_inline_call = InliningInstructionTranslator.inline_call_

    def _recording_inline_call(translator):
        # Record the file of the function being inlined, then delegate.
        traced.add(translator.f_code.co_filename)
        return original_inline_call(translator)

    return patch.object(InliningInstructionTranslator, "inline_call_", _recording_inline_call)
400
+
401
+
402
def _infer_dynamic_arg_dims(cls: _T) -> dict[str, int | list[int]]:
    """
    Infer which ``forward`` parameters should get a dynamic dim 0.

    Every parameter annotated as ``torch.Tensor`` (or ``torch.Tensor | None``)
    is mapped to dimension 0; everything else is ignored.
    """
    tensor_annotations = (torch.Tensor, torch.Tensor | None)
    params = inspect.signature(cls.forward).parameters
    inferred_dynamic_arg_dims = {
        name: 0 for name, param in params.items() if param.annotation in tensor_annotations
    }
    magi_logger.info(f"Inferred dynamic dimensions for forward method of {cls}: {list(inferred_dynamic_arg_dims.keys())}")
    return inferred_dynamic_arg_dims
411
+
412
+
413
+ def _get_num_outputs_from_return_annotation(fn: Callable) -> int:
414
+ """
415
+ Get the number of outputs from the function's return type annotation.
416
+
417
+ Returns:
418
+ - 1 if the return type is a single Tensor
419
+ - N if the return type is tuple[Tensor, Tensor, ...] with N elements
420
+ - 1 if no annotation or unrecognized annotation (default to single output)
421
+ """
422
+ sig = inspect.signature(fn)
423
+ return_annotation = sig.return_annotation
424
+
425
+ if return_annotation is inspect.Parameter.empty:
426
+ return 1
427
+
428
+ # Check if it's a tuple type (e.g., tuple[Tensor, Tensor])
429
+ origin = get_origin(return_annotation)
430
+ if origin is tuple:
431
+ args = get_args(return_annotation)
432
+ # Filter out ellipsis (for variable-length tuples like tuple[Tensor, ...])
433
+ if args and args[-1] is not ...:
434
+ return len(args)
435
+ return 1
436
+
437
+ return 1
438
+
439
+
440
+ def _generate_op_name(fn: Callable) -> str:
441
+ """
442
+ Generate a unique operator name from function's name and source file.
443
+
444
+ The generated name follows the format: namespace::op_name
445
+ - namespace: derived from the source file path (module-like structure)
446
+ - op_name: the function name
447
+
448
+ Example:
449
+ Function `_my_custom_op` in file `/path/to/my_module.py`
450
+ -> "my_module::_my_custom_op"
451
+ """
452
+ import re
453
+ from pathlib import Path
454
+
455
+ func_name = fn.__name__
456
+
457
+ # Get the source file path
458
+ try:
459
+ source_file = inspect.getfile(fn)
460
+ # Extract the file stem (without extension) as namespace
461
+ namespace = Path(source_file).stem
462
+ # Clean up namespace: replace invalid characters with underscores
463
+ namespace = re.sub(r"[^a-zA-Z0-9_]", "_", namespace)
464
+ except (TypeError, OSError):
465
+ # If we can't get the source file, use a default namespace
466
+ namespace = "magi_custom"
467
+
468
+ return f"{namespace}::{func_name}"
469
+
470
+
471
def _create_identity_meta_fn(fn: Callable) -> Callable:
    """
    Build a default meta function that mirrors input tensor metadata onto outputs.

    The output count is derived from ``fn``'s return annotation; output k gets
    the shape/dtype/device of the k-th tensor argument in signature order
    (``self`` excluded). The returned meta function raises ValueError when the
    call does not supply enough tensor arguments to cover every output.

    Example: for ``def my_op(a: Tensor, b: Tensor, scale: float) ->
    tuple[Tensor, Tensor]`` the meta function yields
    ``(torch.empty_like(a), torch.empty_like(b))``.
    """
    num_outputs = _get_num_outputs_from_return_annotation(fn)
    sig = inspect.signature(fn)
    # Parameter names in declaration order, skipping a potential `self`.
    ordered_params = [p for p in sig.parameters if p != "self"]

    def identity_meta_fn(*args, **kwargs):
        bound = sig.bind(*args, **kwargs)
        bound.apply_defaults()

        # Take the first `num_outputs` tensor-valued arguments in order.
        matched = []
        for param in ordered_params:
            candidate = bound.arguments.get(param)
            if isinstance(candidate, torch.Tensor):
                matched.append(candidate)
                if len(matched) >= num_outputs:
                    break

        if len(matched) < num_outputs:
            raise ValueError(
                f"identity_meta_fn requires at least {num_outputs} tensor inputs to match "
                f"{num_outputs} outputs, but only found {len(matched)} tensor inputs. "
                f"Please provide a custom infer_output_meta_fn."
            )

        # Mirror metadata of the matched inputs onto the outputs.
        if num_outputs == 1:
            return torch.empty_like(matched[0])
        return tuple(torch.empty_like(t) for t in matched[:num_outputs])

    return identity_meta_fn
516
+
517
+
518
+ def _create_meta_fn_from_param_names(fn: Callable, param_names: list[str]) -> Callable:
519
+ """
520
+ Create a meta function that returns torch.empty_like() for each specified parameter.
521
+
522
+ This is useful when output tensors have the same shape/dtype/device as specific input
523
+ parameters, but not necessarily in positional order.
524
+
525
+ Example:
526
+ param_names = ["weight", "bias"]
527
+ def my_op(grad: Tensor, weight: Tensor, bias: Tensor) -> tuple[Tensor, Tensor]:
528
+ ...
529
+
530
+ Generated meta function returns:
531
+ (torch.empty_like(weight), torch.empty_like(bias))
532
+ """
533
+ sig = inspect.signature(fn)
534
+
535
+ def meta_fn(*args, **kwargs):
536
+ # Bind arguments to get a mapping of param_name -> value
537
+ bound = sig.bind(*args, **kwargs)
538
+ bound.apply_defaults()
539
+
540
+ # Collect tensors for each specified parameter name
541
+ tensor_outputs = []
542
+ for name in param_names:
543
+ if name not in bound.arguments:
544
+ raise ValueError(
545
+ f"Parameter '{name}' not found in function signature. "
546
+ f"Available parameters: {list(bound.arguments.keys())}"
547
+ )
548
+ arg = bound.arguments[name]
549
+ if not isinstance(arg, torch.Tensor):
550
+ raise ValueError(
551
+ f"Parameter '{name}' is not a Tensor (got {type(arg).__name__}). "
552
+ f"infer_output_meta_fn list should only contain tensor parameter names."
553
+ )
554
+ tensor_outputs.append(torch.empty_like(arg))
555
+
556
+ # Return single tensor or tuple based on number of outputs
557
+ if len(tensor_outputs) == 1:
558
+ return tensor_outputs[0]
559
+ return tuple(tensor_outputs)
560
+
561
+ return meta_fn
562
+
563
+
564
+ def magi_register_custom_op(
565
+ name: str | None = None,
566
+ mutates_args: tuple[str, ...] = (),
567
+ infer_output_meta_fn: Callable | list[str] | None = None,
568
+ setup_context_fn: Callable | None = None,
569
+ backward_fn: Callable | None = None,
570
+ ):
571
+ """
572
+ A unified decorator to register a custom operator with PyTorch's library.
573
+
574
+ This decorator combines the functionality of:
575
+ - @torch.library.custom_op
576
+ - @torch.library.register_fake
577
+ - fn.register_autograd
578
+
579
+ Arguments:
580
+ name: The fully qualified name of the operator (e.g., "namespace::op_name").
581
+ If None, auto-generated from the function name and source file.
582
+ mutates_args: Tuple of argument names that are mutated by the operator.
583
+ infer_output_meta_fn: Specifies output tensor metadata (shape, dtype, device) for tracing.
584
+ - None (default): Assumes each output has the same metadata as the corresponding
585
+ input tensor (1st output matches 1st tensor input, 2nd matches 2nd, etc.).
586
+ - list[str]: Parameter names whose metadata to use for outputs.
587
+ E.g., ["weight", "bias"] means output[0] has same shape as `weight`,
588
+ output[1] has same shape as `bias`.
589
+ - Callable: Custom function with same signature as the op, returns torch.empty_like()
590
+ tensors matching the expected output shapes.
591
+ setup_context_fn: Function to save tensors/values for backward.
592
+ Signature: setup_context_fn(ctx, inputs, output)
593
+ backward_fn: Function to compute gradients.
594
+ Signature: backward_fn(ctx, *grad_outputs) -> tuple of gradients
595
+
596
+ Returns:
597
+ The registered custom operator function.
598
+
599
+ Examples:
600
+ 1. Basic usage (forward only, auto-generated name and meta function):
601
+
602
+ >>> @magi_register_custom_op()
603
+ ... def my_relu(x: torch.Tensor) -> torch.Tensor:
604
+ ... return torch.maximum(x, torch.zeros_like(x))
605
+
606
+ 2. Multiple outputs with explicit output metadata via parameter names:
607
+
608
+ >>> @magi_register_custom_op(
609
+ ... infer_output_meta_fn=["weight", "bias"], # output shapes match weight and bias
610
+ ... )
611
+ ... def compute_gradients(
612
+ ... grad_output: torch.Tensor,
613
+ ... weight: torch.Tensor,
614
+ ... bias: torch.Tensor,
615
+ ... ) -> tuple[torch.Tensor, torch.Tensor]:
616
+ ... grad_weight = grad_output.sum(dim=0).view_as(weight)
617
+ ... grad_bias = grad_output.sum(dim=0).view_as(bias)
618
+ ... return grad_weight, grad_bias
619
+
620
+ 3. Full custom op with autograd support:
621
+
622
+ >>> def _square_meta(x: torch.Tensor) -> torch.Tensor:
623
+ ... return torch.empty_like(x)
624
+ ...
625
+ >>> def _square_setup_context(ctx, inputs, output):
626
+ ... (x,) = inputs
627
+ ... ctx.save_for_backward(x)
628
+ ...
629
+ >>> def _square_backward(ctx, grad_output):
630
+ ... (x,) = ctx.saved_tensors
631
+ ... return grad_output * 2 * x
632
+ ...
633
+ >>> @magi_register_custom_op(
634
+ ... name="my_ops::square",
635
+ ... infer_output_meta_fn=_square_meta,
636
+ ... setup_context_fn=_square_setup_context,
637
+ ... backward_fn=_square_backward,
638
+ ... )
639
+ ... def square(x: torch.Tensor) -> torch.Tensor:
640
+ ... return x * x
641
+ """
642
+
643
+ def decorator(fn: Callable) -> Callable:
644
+ # Auto-generate name if not provided
645
+ op_name = name if name is not None else _generate_op_name(fn)
646
+
647
+ # Step 1: Register the custom op with torch.library.custom_op
648
+ registered_op = torch.library.custom_op(op_name, mutates_args=mutates_args)(fn)
649
+
650
+ # Step 2: Register the output meta inference function
651
+ # Determine meta_fn based on the type of infer_output_meta_fn
652
+ if infer_output_meta_fn is None:
653
+ meta_fn = _create_identity_meta_fn(fn)
654
+ elif isinstance(infer_output_meta_fn, list):
655
+ meta_fn = _create_meta_fn_from_param_names(fn, infer_output_meta_fn)
656
+ else:
657
+ meta_fn = infer_output_meta_fn
658
+ torch.library.register_fake(op_name)(meta_fn)
659
+
660
+ # Step 3: Register autograd if backward_fn is provided
661
+ if backward_fn is not None:
662
+ registered_op.register_autograd(backward_fn, setup_context=setup_context_fn)
663
+
664
+ return registered_op
665
+
666
+ return decorator
pkgs/MagiCompiler/magi_compiler/compile_artifacts.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
17
+
18
+ import inspect
19
+ import pickle
20
+ from unittest.mock import patch
21
+
22
+ import torch
23
+ from torch.utils._pytree import tree_map_only
24
+
25
+ try:
26
+ from torch._dynamo.aot_compile import SerializableCallable
27
+ except ImportError:
28
+ SerializableCallable = object
29
+
30
+ assert isinstance(SerializableCallable, type)
31
+
32
+
33
class MagiSerializableFunction(SerializableCallable):
    """
    A wrapper around a compiled function by vllm. It will forward the tensor
    inputs to the compiled function and return the result.
    It also implements a serialization interface to support PyTorch's precompile
    with custom backend, so that we can save and load the compiled function on
    disk. There's no need to wrap around the compiled function if we don't want
    to serialize them in particular cases.
    Right now serialization for the custom backend is done via
    serializing the Dynamo fx graph plus example inputs.
    """

    def __init__(self, graph_module, example_inputs, model_tag, optimized_call):
        # graph_module: the Dynamo-captured fx graph; example_inputs: the inputs
        # it was traced with; optimized_call: the actual compiled callable.
        assert isinstance(graph_module, torch.fx.GraphModule)
        self.graph_module = graph_module
        self.example_inputs = example_inputs
        self.model_tag = model_tag
        self.optimized_call = optimized_call
        self.shape_env = None
        # If any example input is symbolic, remember its ShapeEnv for later use.
        sym_input = next((i for i in self.example_inputs if isinstance(i, torch.SymInt)), None)
        if sym_input is not None:
            self.shape_env = sym_input.node.shape_env

    def __call__(self, *args, **kwargs):
        # Delegate directly to the compiled callable.
        return self.optimized_call(*args, **kwargs)

    @classmethod
    def serialize_compile_artifacts(cls, compiled_fn: "MagiSerializableFunction") -> bytes:
        """Serialize the wrapper to bytes (fx graph + masked example inputs)."""
        import sympy
        from torch._subclasses import FakeTensorMode
        from torch.fx._graph_pickler import GraphPickler, Options

        state = compiled_fn.__dict__.copy()
        # The compiled callable and ShapeEnv are not picklable; both are
        # reconstructed on load.
        state.pop("optimized_call")
        state.pop("shape_env")
        # Strip per-node metadata that is large and not needed for replay.
        for node in state["graph_module"].graph.nodes:
            node.meta.pop("source_fn_stack", None)
            node.meta.pop("nn_module_stack", None)

        graph_reducer_override = GraphPickler.reducer_override

        def _graph_reducer_override(self, obj):
            # Sympy functions with a torch unpickler serialize via that hook;
            # FakeTensorMode instances are dropped (restored at load time).
            if inspect.isclass(obj) and issubclass(obj, sympy.Function) and hasattr(obj, "_torch_unpickler"):
                return obj._torch_unpickler, (obj._torch_handler_name,)
            if isinstance(obj, FakeTensorMode):
                return type(None), ()
            return graph_reducer_override(self, obj)

        # Mask off tensor inputs since they are large and not needed.
        state["example_inputs"] = tree_map_only(torch.Tensor, lambda _: None, state["example_inputs"])
        with patch.object(GraphPickler, "reducer_override", _graph_reducer_override):
            state["graph_module"] = GraphPickler.dumps(state["graph_module"], Options(ops_filter=None))
            state["example_inputs"] = GraphPickler.dumps(state["example_inputs"])
        return pickle.dumps(state)

    @classmethod
    def deserialize_compile_artifacts(cls, data: bytes) -> "MagiSerializableFunction":
        """Rebuild a wrapper from bytes; recompilation happens lazily on first call."""
        from torch._guards import TracingContext, tracing
        from torch._subclasses import FakeTensorMode
        from torch.fx._graph_pickler import GraphPickler
        from torch.fx.experimental.symbolic_shapes import ShapeEnv

        from .config import get_compile_config
        from .magi_backend import MagiBackend

        state = pickle.loads(data)
        fake_mode = FakeTensorMode(shape_env=ShapeEnv())
        state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode)
        state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode)
        magi_backend = MagiBackend(get_compile_config(), state["model_tag"])

        def optimized_call(*example_inputs):
            """
            On the first run of the optimized call, we rerun the compiler
            backend which should result in a cache hit. After the backend
            call returns, we just do a one-time replacement of the optimized
            call with the compiled function, so that subsequent calls are on
            the AOT compiled path.
            """
            # NOTE: `fn` is the closure variable assigned below; by the time this
            # runs, the wrapper instance exists. Tensor inputs were masked to
            # None during serialization, so fill them from the live call.
            compile_inputs = [inp or example_inputs[i] for i, inp in enumerate(fn.example_inputs)]
            with tracing(TracingContext(fake_mode)):
                fn.optimized_call = magi_backend(state["graph_module"], compile_inputs).optimized_call
            return fn.optimized_call(*example_inputs)

        fn = cls(**state, optimized_call=optimized_call)
        return fn

    @property
    def co_name(self):
        """
        Used for depyf debugging.
        """
        return "MagiSerializableFunction"
pkgs/MagiCompiler/magi_compiler/config.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ from enum import Enum, unique
18
+ from pathlib import Path
19
+ from typing import Any, Literal
20
+
21
+ import torch
22
+ from pydantic import BaseModel, Field
23
+ from pydantic_settings import BaseSettings, SettingsConfigDict
24
+
25
+ from .utils import OrderedSet, compute_hash, magi_logger
26
+
27
+
28
@unique
class CompileMode(Enum):
    """
    How torch.compile-based compilation is applied to the model.

    NONE: no compilation; the model runs as-is in fully eager PyTorch.
    TORCH_COMPILE: the stock `torch.compile` pipeline.
    MAGI_COMPILE: the custom Inductor-based backend with caching, piecewise
        compilation, shape specialization, and custom passes.
    """

    NONE = "NONE"
    TORCH_COMPILE = "TORCH_COMPILE"
    MAGI_COMPILE = "MAGI_COMPILE"
41
+
42
+
43
@unique
class CudaGraphMode(Enum):
    """
    Cudagraph mode constants used by CompileConfig.

    Unlike the LLM-side CUDAGraphMode, diffusion models only need the
    PIECEWISE and FULL variants.

    NONE: cudagraphs are not used.
    PIECEWISE: cudagraphs wrap each piecewise-compiled subgraph.
    FULL: a single cudagraph wraps the full compiled model.
    """

    NONE = "NONE"
    PIECEWISE = "PIECEWISE"
    FULL = "FULL"
57
+
58
+
59
class PassConfig(BaseModel):
    """Configuration for custom Inductor passes"""

    enable_fusion: bool = Field(False, description="Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.")
    enable_attn_fusion: bool = Field(False, description="Whether to enable the custom attention+quant fusion pass.")
    enable_noop: bool = Field(False, description="Whether to enable the custom no-op elimination pass.")
    enable_sequence_parallelism: bool = Field(False, description="Whether to enable sequence parallelism.")
    enable_async_tp: bool = Field(False, description="Whether to enable async TP.")
    enable_fi_allreduce_fusion: bool = Field(False, description="Whether to enable flashinfer allreduce fusion.")
    enable_sage_attn: bool = Field(False, description="Whether to replace flash attention with sage attention.")
    fi_allreduce_fusion_max_token_num: int = Field(
        16384, description="Max number of tokens to used in flashinfer allreduce fusion."
    )

    def model_post_init(self, __context: Any) -> None:
        # BUGFIX: this hook was previously named `__post_init__`, which is a
        # dataclasses hook that pydantic's BaseModel never calls, so these
        # sanity warnings were dead code. Pydantic v2 invokes
        # `model_post_init` after validation.
        if not self.enable_noop:
            if self.enable_fusion:
                magi_logger.warning(
                    "Fusion enabled but reshape elimination disabled. " "RMSNorm/SiluMul + quant (fp8) fusion might not work"
                )
            if self.enable_attn_fusion:
                magi_logger.warning(
                    "Fusion enabled but reshape elimination disabled. " "Attention + quant (fp8) fusion might not work"
                )

    @property
    def hash(self) -> str:
        """Stable content hash of this pass configuration (cache-key component)."""
        return compute_hash(self.model_dump(mode="json"))

    # Compatible with torch pass manager interfaces, which expect a uuid() method.
    def uuid(self) -> str:
        return self.hash
91
+
92
+
93
@unique
class RecomputePolicy(Enum):
    """
    Strategy for activation recomputation (rematerialization), trading memory
    footprint against extra compute.

    HANDCRAFT:
        User-driven: a `memory_budget` threshold in [0.0, 1.0] sets the target
        fraction of activations to keep resident rather than recompute.

    HEURISTIC:
        Rule-based: activations produced by compute-bound operators (MatMul,
        Attention, ...) are saved, while memory-bound / element-wise operator
        outputs are preferentially recomputed to free memory.

    AUTOSEARCH:
        Automated: searches for the best set of saved tensors given available
        device memory, preferring tensors whose recomputation cost is high
        relative to their memory footprint.

    .. note::
        A `repeat_number` argument is currently needed to stabilize the
        profiling/search phase; this requirement goes away once full-graph
        capture is natively supported.
    """

    HANDCRAFT = "HANDCRAFT"
    HEURISTIC = "HEURISTIC"
    AUTOSEARCH = "AUTOSEARCH"
+ AUTOSEARCH = "AUTOSEARCH"
123
+
124
+
125
class RecomputeConfig(BaseModel):
    """Settings controlling activation recomputation (see RecomputePolicy)."""

    # Which recomputation strategy to apply.
    recompute_policy: RecomputePolicy = Field(RecomputePolicy.HEURISTIC, description="Recompute policy.")
    # Fraction of activation memory to keep; only consulted by HANDCRAFT.
    memory_budget: float = Field(0.5, description="Activation memory budget for recomputation, only used for handcraft.")
    # Profiling repetitions to stabilize the search; only consulted by AUTOSEARCH.
    repeat_number: int = Field(default=1, description="Repeat number for recomputation, only used for autosearch.")
129
+
130
+
131
@unique
class OffloadPolicy(Enum):
    """
    Policy used when offloading model weights to CPU.

    BASE:
        Baseline policy: offload every submodule to CPU.
    COST_EFFECTIVE:
        Choose which submodules to offload based on a cost-effectiveness
        criterion.
    HEURISTIC:
        Choose which submodules to offload based on heuristic rules.
    """

    BASE = "BASE"
    COST_EFFECTIVE = "COST_EFFECTIVE"
    HEURISTIC = "HEURISTIC"
+
151
+
152
class OffloadConfig(BaseModel):
    """Settings controlling CPU offload of model weights (see OffloadPolicy)."""

    # Master switch for CPU offload.
    model_cpu_offload: bool = Field(False, description="Whether to offload the model to CPU.")
    # Fraction of weights kept resident on the GPU when offload is enabled.
    gpu_resident_weight_ratio: float = Field(
        0.3, description="The ratio of GPU memory to keep when offloading the model to CPU."
    )
    # Strategy for picking which submodules are offloaded.
    offload_policy: OffloadPolicy = Field(
        OffloadPolicy.COST_EFFECTIVE, description="The policy for offloading the model to CPU."
    )
    # Multiplier (<1.0) applied to the measured host-to-device bandwidth.
    bandwidth_safety_factor: float = Field(0.9, description="The safety factor for the H2D bandwidth.")
161
+
162
+
163
class CompileConfig(BaseSettings):
    """
    Top-level compilation settings for MagiCompiler.

    As a pydantic ``BaseSettings`` with CLI parsing enabled, values can come
    from the command line (unknown args ignored) as well as the defaults below.
    """

    model_config = SettingsConfigDict(cli_parse_args=True, cli_ignore_unknown_args=True, cli_implicit_flags=True)

    # Basic configs
    backend: Literal["inductor", "eager"] = Field("inductor", description="Compilation backend.")
    compile_mode: CompileMode = Field(CompileMode.MAGI_COMPILE, description="Compilation mode.")
    cache_root_dir: str = Field(
        default=os.path.expanduser("~/.cache/magi_compiler"), description="Directory to cache the compiled model."
    )
    dynamic_sources: str = Field(
        default=os.environ.get("TORCH_COMPILE_DYNAMIC_SOURCES", ""),
        description="Comma delimited list of sources that should be marked as dynamic.",
    )

    # CPU Offload
    offload_config: OffloadConfig = Field(OffloadConfig(), description="Offload configuration.")

    # Inductor configs
    # TODO(hongyu): Add unittest for compile_sizes
    compile_sizes: list[int] = Field(default_factory=list, description="Sizes to compile the model for.")
    use_inductor_graph_partition: bool = Field(
        False, description="Whether to use inductor graph partition. Not fully supported yet."
    )
    # TODO(hongyu): Find a better way to specify the splitting ops.
    splitting_ops: list[str] = Field(
        default_factory=lambda: [
            "athena::flash_attn_func",
            "athena::flex_flash_attn_func",
            "athena::sage_attn_func",
            "athena::flash_attn_with_cp",
            "athena::flex_flash_attn_with_cp",
        ],
        description="Operators to split the graph into piecewise graphs.",
    )

    # Pass configs
    pass_config: PassConfig = Field(PassConfig(), description="Pass configuration.")

    # Recompute configs
    recompute_config: RecomputeConfig = Field(RecomputeConfig(), description="Recompute configuration.")

    # Cudagraph configs
    cudagraph_mode: CudaGraphMode = Field(CudaGraphMode.NONE, description="Cudagraph mode.")
    cudagraph_copy_inputs: bool = Field(True, description="Whether to copy inputs for cudagraph.")

    # Runtime configs, maybe changed at runtime
    model_idx: int = Field(0, description="Index of the model.")
    model_tag: str | None = Field(
        default=None, description="Tag in cache path: model_{idx}_{model_tag}_rank_{rank}. Class name if unset."
    )
    inductor_compile_config: dict[str, Any] = Field(default_factory=dict, description="Inductor compilation configuration.")
    traced_files: OrderedSet[str] = Field(default_factory=OrderedSet, description="Files traced by Dynamo.")

    def _model_rank_dir_name(self) -> str:
        """Directory name for this model instance: model_{idx}[_{model_tag}]_rank_{rank}."""
        # Rank 0 is assumed when torch.distributed has not been initialized.
        rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
        if self.model_tag:
            return f"model_{self.model_idx}_{self.model_tag}_rank_{rank}"
        return f"model_{self.model_idx}_rank_{rank}"

    def debug_dump_path(self) -> Path:
        """Per-model directory for magi_depyf debug dumps."""
        return Path(self.cache_root_dir) / "magi_depyf" / self._model_rank_dir_name()

    def cache_dump_path(self) -> Path:
        """Per-model directory for the torch.compile / Inductor cache."""
        return Path(self.cache_root_dir) / "torch_compile_cache" / self._model_rank_dir_name()

    @property
    def hash(self) -> str:
        """
        Stable content hash of this configuration, used for cache invalidation.

        `inductor_compile_config` may hold non-JSON-serializable objects (e.g.
        pass managers), so it is excluded from the normal dump and serialized
        separately via `uuid()` / `str()` fallbacks.
        """
        # Create a copy of the config data for serialization
        data = self.model_dump(mode="json", exclude={"inductor_compile_config"})

        # Handle inductor_compile_config separately to serialize objects with uuid() method
        # This is a workaround to support serialization of PostGradPassManager in Pydantic models.
        if self.inductor_compile_config:
            serialized_inductor_config = {}
            for key, value in self.inductor_compile_config.items():
                # If the value has a uuid() method (like PostGradPassManager), use it
                if hasattr(value, "uuid") and callable(getattr(value, "uuid", None)):
                    try:
                        serialized_inductor_config[key] = value.uuid()
                    except (AttributeError, RuntimeError):
                        # Fallback to string representation if uuid() fails
                        serialized_inductor_config[key] = str(value)
                else:
                    # For other types, try to serialize normally
                    try:
                        # Try to serialize as JSON-serializable
                        json.dumps(value)
                        serialized_inductor_config[key] = value
                    except (TypeError, ValueError):
                        # If not JSON-serializable, use string representation
                        serialized_inductor_config[key] = str(value)
            data["inductor_compile_config"] = serialized_inductor_config

        return compute_hash(data)

    def __str__(self, indent: int = 4):
        """Pretty JSON-style rendering of the config (double quotes stripped)."""
        data = self.model_dump(mode="json")
        formatted = json.dumps(data, indent=indent, ensure_ascii=False, sort_keys=False)

        # add configuration class name as title
        class_name = self.__class__.__name__
        return f"{class_name}:\n{formatted}".replace('"', "")

    def __repr__(self, indent: int = 4):
        return self.__str__(indent=indent)
269
+
270
+
271
+ _GLOBAL_COMPILE_CONFIG = None
272
+
273
+
274
def get_compile_config() -> CompileConfig:
    """Return the process-wide CompileConfig singleton, creating it lazily."""
    global _GLOBAL_COMPILE_CONFIG
    if _GLOBAL_COMPILE_CONFIG is None:
        _GLOBAL_COMPILE_CONFIG = CompileConfig()
        is_rank_zero = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
        if is_rank_zero:
            # Log the config once on first initialization; hidden at the
            # default WARNING log level.
            magi_logger.info("compile config: %s", _GLOBAL_COMPILE_CONFIG)
    assert _GLOBAL_COMPILE_CONFIG is not None, "compile config is not initialized"
    return _GLOBAL_COMPILE_CONFIG
pkgs/MagiCompiler/magi_compiler/cuda/cudart.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import ctypes
17
+ import os
18
+
19
+ import torch
20
+
21
# Lazily-loaded handle to the CUDA runtime shared library (None until loaded).
_cudart = None


def init_cudart():
    """Load libcudart via ctypes, caching the handle in the module global.

    Tries common soname variants first, then the lib64 directory of the CUDA
    home reported by torch.

    Returns:
        The ctypes.CDLL handle for libcudart, or None if no CUDA runtime
        library could be loaded.
    """
    global _cudart
    if _cudart is not None:
        return _cudart
    # BUGFIX: the original list contained the single malformed entry
    # "libcudart.so.12.0, libcudart.so.13" (comma inside one string), so
    # neither of those sonames could ever be loaded. Split into two entries.
    candidates = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.13"]
    try:
        # A bare "import torch" does not import the cpp_extension submodule.
        import torch.utils.cpp_extension

        cuda_path = os.path.dirname(torch.utils.cpp_extension._find_cuda_home())
        candidates.append(os.path.join(cuda_path, "lib64", "libcudart.so"))
    except Exception:
        # Best-effort only: _find_cuda_home is private and may be missing or
        # return None (dirname(None) raises TypeError).
        pass
    for lib in candidates:
        try:
            _cudart = ctypes.CDLL(lib)
            return _cudart
        except OSError:
            continue
    return None
41
+
42
+
43
def pin_memory_in_place(tensor: torch.Tensor):
    """
    Pin memory in-place using cudaHostRegister.
    """
    # Device tensors need no host pinning.
    if tensor.is_cuda:
        return tensor
    runtime = init_cudart()
    if runtime is None:
        # No CUDA runtime available: hand the tensor back unpinned (best effort).
        return tensor

    addr = ctypes.c_void_p(tensor.data_ptr())
    nbytes = ctypes.c_size_t(tensor.numel() * tensor.element_size())
    status = runtime.cudaHostRegister(addr, nbytes, 0)
    if status != 0:
        raise RuntimeError(f"cudaHostRegister failed with error code {status}")
    return tensor
pkgs/MagiCompiler/magi_compiler/cuda_graph_mgr.py ADDED
@@ -0,0 +1,931 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, fields, is_dataclass
16
+ from functools import wraps
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import torch
20
+
21
+ from .utils import magi_logger, nvtx
22
+
23
+
24
class InplaceSubstituteFakeClass:
    """
    Marker base class for objects that must keep their identity during
    argument substitution: instead of being rebuilt as new instances,
    subclasses have their attributes rewritten in-place.
    For example, InferenceParams.
    """

    pass
32
+
33
+
34
@dataclass
class FakeTensor:
    # Lightweight stand-in recording a real tensor's metadata inside cached
    # output templates; holds no data.
    # NOTE(review): extract_output_template stores shape as list(o.shape)
    # despite the Tuple annotation — confirm before tightening these types.
    shape: Tuple[int, ...] = None
    dtype: str = None
    device: str = None
39
+
40
+
41
@dataclass
class HashableDataclass:
    """Base dataclass providing a cached, field-derived hash and hash-based equality.

    Only hashable field values (str/int/float/bool, nested HashableDataclass
    instances, and tuples thereof) participate in the hash; None fields and
    non-hashable values are skipped.
    """

    # Memoized 64-bit hash; computed on first use and never invalidated, so
    # fields must not be mutated after the first hash/eq call.
    _cached_hash: Optional[int] = None

    @nvtx.instrument_nvtx
    def _get_hashable_fields(self) -> Tuple[Any, ...]:
        """Collect the hashable field values (skipping _cached_hash and None)."""
        hashable_values = []
        for f in fields(self):
            if f.name == "_cached_hash":
                continue
            value = getattr(self, f.name)
            if value is None:
                continue
            if isinstance(value, HashableDataclass):
                # Nested dataclasses contribute their cached hash, not the object.
                hashable_values.append(value._get_cached_hash())
            elif isinstance(value, tuple):
                tuple_vals = []
                for item in value:
                    if isinstance(item, (HashableDataclass, str, int, float, bool)):
                        if isinstance(item, HashableDataclass):
                            tuple_vals.append(item._get_cached_hash())
                        else:
                            tuple_vals.append(item)
                # Empty tuples (or tuples of unhashable items) are dropped entirely.
                if tuple_vals:
                    hashable_values.append(tuple(tuple_vals))
            elif isinstance(value, (str, int, float, bool)):
                hashable_values.append(value)
        return tuple(hashable_values)

    @nvtx.instrument_nvtx
    def _compute_hash(self) -> int:
        """Computes a hash value based on the dataclass's hashable fields."""
        hashable_fields = self._get_hashable_fields()
        return hash(hashable_fields) % (1 << 64)  # limit to 64 bits

    @nvtx.instrument_nvtx
    def _get_cached_hash(self) -> int:
        # Lazily compute and memoize; see the mutation caveat on _cached_hash.
        if self._cached_hash is None:
            self._cached_hash = self._compute_hash()
        return self._cached_hash

    @nvtx.instrument_nvtx
    def __hash__(self) -> int:
        return self._get_cached_hash()

    @nvtx.instrument_nvtx
    def __eq__(self, other: Any) -> bool:
        # NOTE(review): equality is decided purely by comparing cached hashes,
        # so a 64-bit hash collision would make distinct instances compare
        # equal — confirm this is an acceptable trade-off for cache lookups.
        if not isinstance(other, self.__class__):
            return False
        if self._get_cached_hash() != other._get_cached_hash():
            return False
        return True
93
+
94
+
95
@dataclass(unsafe_hash=True)
class LiteralsInfo(HashableDataclass):
    # Scalar (int/float/str/bool) arguments extracted from a call, in order.
    literals: Tuple[Any, ...] = tuple()
98
+
99
+
100
@dataclass(unsafe_hash=True)
class TensorStaticInfo(HashableDataclass):
    # Static view of a tensor argument: dimensions allowed to vary are -1.
    name: str = ""
    shapes: Tuple[int, ...] = tuple()
    dtype: str = ""
105
+
106
+
107
@dataclass(unsafe_hash=True)
class TensorDynamicInfo(HashableDataclass):
    # Dynamic view of a tensor argument: static dimensions are -1.
    name: str = ""
    shapes: Tuple[int, ...] = tuple()
111
+
112
+
113
@dataclass(unsafe_hash=True)
class StaticSignature(HashableDataclass):
    # Cache key for the static-buffer level: function name + static tensor infos.
    func_name: str = ""
    tensor_static_infos: Tuple[TensorStaticInfo, ...] = tuple()
117
+
118
+
119
@dataclass(unsafe_hash=True)
class DynamicSignature(HashableDataclass):
    # Cache key for the per-shape level: dynamic tensor shapes + call literals.
    tensor_dynamic_infos: Tuple[TensorDynamicInfo, ...] = tuple()
    literals_info: LiteralsInfo = None
123
+
124
+
125
@dataclass
class GraphEntry:
    # A captured CUDA graph plus two failure flags; only an entry with a
    # non-None graph and both flags False is considered usable.
    graph: Optional[torch.cuda.CUDAGraph] = None
    inconsistent: bool = False
    invalid: bool = False
130
+
131
+
132
@dataclass
class OutputTemplateEntry:
    graph_entry_dict: Dict[int, GraphEntry] = None  # key = layer_number
    output_template: Any = None  # structure template for the output object's literals
136
+
137
+
138
@dataclass
class StaticTensorEntry:
    # Static (pre-allocated, max-shape) buffers shared by all graphs captured
    # under one StaticSignature, plus the per-DynamicSignature templates.
    input_tensors: Optional[List[torch.Tensor]] = None
    output_tensors: Optional[List[torch.Tensor]] = None
    template_entry_dict: Dict[DynamicSignature, OutputTemplateEntry] = None
143
+
144
+
145
class ArgsUtils:
    """Static helpers for CUDA-graph argument handling.

    Responsibilities: extracting tensors/literals from nested argument
    structures, building hashable signatures from them, and copying data
    between caller tensors and the cached static buffers.
    """

    @staticmethod
    @nvtx.instrument_nvtx
    def generate_both_signatures_from_tensors(
        func_name: str, tensors: List[torch.Tensor], names: List[str], literals: List[Any]
    ) -> Tuple[StaticSignature, DynamicSignature]:
        """Build the (static, dynamic) signature pair for one call.

        Convention: for tensors with more than one dimension, the last
        dimension is static and the others are dynamic; 1-D tensors are fully
        dynamic. `names` is currently unused here (infos keep their default name).
        """
        num_tensors = len(tensors)
        tensor_static_infos = [TensorStaticInfo() for _ in range(num_tensors)]
        tensor_dynamic_infos = [TensorDynamicInfo() for _ in range(num_tensors)]

        # Local references for performance
        TensorStaticInfo_setattr = TensorStaticInfo.__setattr__
        TensorDynamicInfo_setattr = TensorDynamicInfo.__setattr__
        _tuple = tuple

        for i in range(num_tensors):
            t = tensors[i]
            t_dim = t.dim()
            t_shape = t.shape
            t_dtype_str = str(t.dtype)
            # Last dimension is static, others are dynamic (except for 1D tensor)
            static_shapes = (
                _tuple(-1 if idx != t_dim - 1 else dim_size for idx, dim_size in enumerate(t_shape)) if t_dim > 1 else (-1,)
            )
            static_info = tensor_static_infos[i]
            TensorStaticInfo_setattr(static_info, "shapes", static_shapes)
            TensorStaticInfo_setattr(static_info, "dtype", t_dtype_str)

            # BUGFIX: the original chained assignment
            # `dynamic_shapes = static_shapes = (...)` accidentally rebound
            # static_shapes as well; only dynamic_shapes is intended here.
            dynamic_shapes = (
                _tuple(-1 if idx == t_dim - 1 else dim_size for idx, dim_size in enumerate(t_shape))
                if t_dim > 1
                else _tuple(t_shape)
            )
            dynamic_info = tensor_dynamic_infos[i]
            TensorDynamicInfo_setattr(dynamic_info, "shapes", dynamic_shapes)

        literals_info = LiteralsInfo(literals=_tuple(literals))
        static_sig = StaticSignature(func_name=func_name, tensor_static_infos=_tuple(tensor_static_infos))
        dynamic_sig = DynamicSignature(tensor_dynamic_infos=_tuple(tensor_dynamic_infos), literals_info=literals_info)
        return static_sig, dynamic_sig

    @staticmethod
    @nvtx.instrument_nvtx
    def replace_sliced_with_static(obj: Any, static_tensors: List[torch.Tensor]) -> Any:
        """Copy each tensor found in obj into its static buffer and return a
        mirror of obj whose tensors are views into those buffers.

        Tensor order must match the extraction order used to build
        static_tensors. Parameters are left untouched.
        """
        tensor_idx = 0

        def recursive_replace(o: Any) -> Any:
            nonlocal tensor_idx
            if isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter):
                # Copy data to the corresponding static tensor slice
                static_tensor = static_tensors[tensor_idx]
                slices = [slice(None)] * static_tensor.ndim
                for i in range(min(o.ndim, static_tensor.ndim)):
                    slices[i] = slice(0, o.shape[i])
                # Only copy if the data_ptrs are different
                if not o.data_ptr() == static_tensor[tuple(slices)].data_ptr():
                    static_tensor[tuple(slices)].copy_(o)
                tensor_idx += 1
                return static_tensor[tuple(slices)]

            elif isinstance(o, dict):
                return {k: recursive_replace(v) for k, v in o.items()}
            elif isinstance(o, (list, tuple)):
                return type(o)(recursive_replace(item) for item in o)
            elif is_dataclass(o):
                field_values = {f.name: recursive_replace(getattr(o, f.name)) for f in fields(o)}
                return type(o)(**field_values)
            elif issubclass(o.__class__, InplaceSubstituteFakeClass):
                # Do not create a new instance, but modify attributes in place (to keep original initialization logic)
                for k, v in o.__dict__.items():
                    if not callable(v):
                        o.__dict__[k] = recursive_replace(v)
                return o
            elif o is None or isinstance(o, (int, float, str, bool)):
                return o  # Keep None and basic types
            else:
                return o

        return recursive_replace(obj)

    @staticmethod
    @nvtx.instrument_nvtx
    def replace_sliced_with_static_simple(
        sliced_tensors: List[torch.Tensor], static_tensors: List[torch.Tensor]
    ) -> List[torch.Tensor]:
        """Fast path: copy each sliced tensor into the matching static buffer
        slice, skipping tensors already backed by the same storage address."""
        for sliced_tensor, static_tensor in zip(sliced_tensors, static_tensors):
            if not sliced_tensor.data_ptr() == static_tensor.data_ptr():
                slices = [slice(None)] * static_tensor.ndim
                for i in range(sliced_tensor.ndim):
                    slices[i] = slice(0, sliced_tensor.shape[i])
                static_tensor[tuple(slices)].copy_(sliced_tensor)

    @staticmethod
    @nvtx.instrument_nvtx
    def replace_static_with_sliced(obj: Any, static_tensors: List[torch.Tensor]) -> Any:
        """Inverse of replace_sliced_with_static: rebuild obj (usually an output
        template containing FakeTensor placeholders) with views into the static
        buffers, sliced to each placeholder's recorded shape."""
        tensor_idx = 0

        def recursive_replace(o: Any) -> Any:
            nonlocal tensor_idx
            if (isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter)) or isinstance(o, FakeTensor):
                # Replace with the corresponding sliced tensor
                static_tensor = static_tensors[tensor_idx]
                shape_to_slice = o.shape
                slices = [slice(0, dim_size) for dim_size in shape_to_slice]
                result_tensor = static_tensor[tuple(slices)]
                tensor_idx += 1
                return result_tensor

            elif isinstance(o, dict):
                return {k: recursive_replace(v) for k, v in o.items()}
            elif isinstance(o, (list, tuple)):
                return type(o)(recursive_replace(item) for item in o)
            elif is_dataclass(o):
                field_values = {f.name: recursive_replace(getattr(o, f.name)) for f in fields(o)}
                return type(o)(**field_values)
            elif issubclass(o.__class__, InplaceSubstituteFakeClass):
                # Do not create a new instance, but modify attributes in place (to keep original initialization logic)
                for k, v in o.__dict__.items():
                    if not callable(v):
                        o.__dict__[k] = recursive_replace(v)
                return o
            elif o is None or isinstance(o, (int, float, str, bool)):
                return o  # Keep None and basic types
            else:
                return o

        return recursive_replace(obj)

    @staticmethod
    @nvtx.instrument_nvtx
    def try_fx_extract_core(
        obj: Any, extract_tensors: bool = True, extract_literals: bool = True, with_names: bool = False
    ) -> Tuple[List[torch.Tensor], List[str], List[Any]]:
        """Fast extraction for the common FX calling convention:
        obj == {"args": flat sequence, "kwargs": {}}.

        Returns (None, None, None) when obj does not match that shape, so the
        caller can fall back to recursive_extract_core. `with_names` is
        accepted for signature parity but names are always empty strings here.
        """
        failed_tuple = None, None, None
        tensors = []
        names = []
        literals = []

        if not isinstance(obj, dict) or "args" not in obj or "kwargs" not in obj:
            return failed_tuple
        args, kwargs = obj["args"], obj["kwargs"]
        if kwargs:
            return failed_tuple
        if not isinstance(args, (list, tuple)):
            return failed_tuple

        for idx, item in enumerate(args):
            if extract_tensors and isinstance(item, torch.Tensor) and not isinstance(item, torch.nn.Parameter):
                tensors.append(item)
            elif extract_literals and isinstance(item, (int, float, str, bool)):
                literals.append(item)

        names = [""] * len(tensors)
        return tensors, names, literals

    @staticmethod
    @nvtx.instrument_nvtx
    def recursive_extract_core(
        obj: Any, extract_tensors: bool = True, extract_literals: bool = True, with_names: bool = False
    ) -> Tuple[List[torch.Tensor], List[str], List[Any]]:
        """Walk obj depth-first and collect tensors, optional dotted path names,
        and scalar literals, in traversal order."""
        tensors = []
        names = []
        literals = []

        def recursive_traverse(o: Any, prefix: str = ""):
            # 1. Extract tensors (if enabled)
            if extract_tensors and isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter):
                tensors.append(o)
                if with_names:
                    names.append(prefix)
            elif extract_literals and isinstance(o, (int, float, str, bool)):
                # FIX: the original re-checked extract_literals in a
                # conditional-expression statement; the elif guard already
                # guarantees it here.
                literals.append(o)
            elif isinstance(o, dict):
                for k, v in o.items():
                    new_prefix = f"{prefix}.{k}" if (with_names and extract_tensors) else prefix
                    recursive_traverse(v, new_prefix)
            elif isinstance(o, (list, tuple)):
                for idx, item in enumerate(o):
                    new_prefix = f"{prefix}[{idx}]" if (with_names and extract_tensors) else prefix
                    recursive_traverse(item, new_prefix)
            elif is_dataclass(o):
                for f in fields(o):
                    new_prefix = f"{prefix}.{f.name}" if (with_names and extract_tensors) else prefix
                    recursive_traverse(getattr(o, f.name), new_prefix)
            elif issubclass(o.__class__, InplaceSubstituteFakeClass):
                for k, v in o.__dict__.items():
                    if not callable(v):
                        new_prefix = f"{prefix}.{k}" if (with_names and extract_tensors) else prefix
                        recursive_traverse(v, new_prefix)
            elif o is None:
                pass
            else:
                pass

        recursive_traverse(obj)
        return tensors, names if with_names else [""] * len(tensors), literals if extract_literals else None

    @staticmethod
    @nvtx.instrument_nvtx
    def extract_output_template(obj: Any) -> Any:
        """Build a data-free structural template of obj: tensors become
        FakeTensor metadata records, containers and dataclasses are rebuilt,
        and scalars are kept as-is."""
        def recursive_template(o: Any) -> Any:
            if isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter):
                return FakeTensor(shape=list(o.shape), dtype=str(o.dtype), device=str(o.device))
            elif isinstance(o, dict):
                return {k: recursive_template(v) for k, v in o.items()}
            elif isinstance(o, (list, tuple)):
                return type(o)(recursive_template(item) for item in o)
            elif is_dataclass(o):
                field_values = {f.name: recursive_template(getattr(o, f.name)) for f in fields(o)}
                return type(o)(**field_values)
            elif issubclass(o.__class__, InplaceSubstituteFakeClass):
                # Do not re-create the instance; modify attributes in place
                # (keeps the original initialization logic).
                for k, v in o.__dict__.items():
                    if not callable(v):
                        o.__dict__[k] = recursive_template(v)
                return o
            elif o is None or isinstance(o, (int, float, str, bool)):
                return o
            else:
                return o

        return recursive_template(obj)
366
+
367
+
368
+ class CudaGraphMgr:
369
+ """CUDA Graph Manager for caching and managing CUDA Graphs and static tensors."""
370
+
371
+ def __init__(self):
372
+ self.cache: Dict[StaticSignature, StaticTensorEntry] = dict()
373
+ self.graph_mem_pool: Optional[torch.cuda.graph_pool_handle] = None
374
+ self.check_output_inconsistency = False # Not enabled by default
375
+
376
+ @property
377
+ def graph_count(self) -> int:
378
+ count = 0
379
+ for tensor_entry in self.cache.values():
380
+ if tensor_entry.template_entry_dict is not None:
381
+ for template_entry in tensor_entry.template_entry_dict.values():
382
+ for graph_entry in template_entry.graph_entry_dict.values():
383
+ if graph_entry.graph is not None and not graph_entry.inconsistent and not graph_entry.invalid:
384
+ count += 1
385
+ return count
386
+
387
+ @property
388
+ def tensor_entry_count(self) -> int:
389
+ count = 0
390
+ for tensor_entry in self.cache.values():
391
+ if tensor_entry.input_tensors is not None and tensor_entry.output_tensors is not None:
392
+ count += 1
393
+ return count
394
+
395
+ @property
396
+ def graph_mem_pool_size(self) -> float:
397
+ if not hasattr(self, "graph_mem_pool") or self.graph_mem_pool is None:
398
+ return 0.0
399
+ pool_stats = torch.cuda.memory.memory_stats(self.graph_mem_pool)
400
+ used_mem = pool_stats.get("allocated_bytes.all.current", 0)
401
+ return used_mem / (1024 * 1024) # 转换为MB
402
+
403
+ @property
404
+ def tensor_mem_size(self) -> float:
405
+ total_size = 0 # 字节
406
+ for tensor_entry in self.cache.values():
407
+ if tensor_entry.input_tensors is not None:
408
+ for t in tensor_entry.input_tensors:
409
+ total_size += t.element_size() * t.nelement()
410
+ if tensor_entry.output_tensors is not None:
411
+ for t in tensor_entry.output_tensors:
412
+ total_size += t.element_size() * t.nelement()
413
+ return total_size / (1024 * 1024) # 转换为MB
414
+
415
    @nvtx.instrument_nvtx
    def formatted_cache_str(self) -> str:
        """Format the cache content as a string for debugging."""
        # NOTE(review): entries created by set_graph_inconsistent can have
        # input_tensors/output_tensors == None, which would raise TypeError in
        # the loops below — confirm this is only called on populated caches.
        lines = []
        for static_sig, tensor_entry in self.cache.items():
            lines.append(f"StaticSignature: {static_sig}")
            s = " Input Static Tensors: "
            for it in tensor_entry.input_tensors:
                s += f"[shape={list(it.shape)},dtype={str(it.dtype)}] "
            lines.append(s)
            s = " Output Static Tensors: "
            for ot in tensor_entry.output_tensors:
                s += f"[shape={list(ot.shape)},dtype={str(ot.dtype)}] "
            lines.append(s)
            if tensor_entry.template_entry_dict is not None:
                for dynamic_sig, template_entry in tensor_entry.template_entry_dict.items():
                    lines.append(f" DynamicSignature: {dynamic_sig}")
                    lines.append(f" Output Template: {template_entry.output_template}")
                    for layer_number, graph_entry in template_entry.graph_entry_dict.items():
                        # inconsistent takes display precedence over invalid.
                        status = "Valid"
                        if graph_entry.inconsistent:
                            status = "Inconsistent"
                        elif graph_entry.invalid:
                            status = "Invalid"
                        lines.append(f" Layer {layer_number}: Graph Status: {status}")
        return "\n".join(lines)
441
+
442
+ @nvtx.instrument_nvtx
443
+ def try_get_cuda_graph(
444
+ self, static_sig: StaticSignature, dynamic_sig: DynamicSignature, layer_number: int
445
+ ) -> Optional[torch.cuda.CUDAGraph]:
446
+ graph_entry = self.try_get_graph_entry(static_sig, dynamic_sig, layer_number)
447
+ if (
448
+ graph_entry is not None
449
+ and graph_entry.graph is not None
450
+ and not graph_entry.inconsistent
451
+ and not graph_entry.invalid
452
+ ):
453
+ return graph_entry.graph
454
+ return None
455
+
456
+ @nvtx.instrument_nvtx
457
+ def get_static_tensors(self, input_static_sig: StaticSignature) -> Optional[Tuple[List[torch.Tensor], List[torch.Tensor]]]:
458
+ if input_static_sig in self.cache:
459
+ cached_entry = self.cache[input_static_sig]
460
+ return cached_entry.input_tensors, cached_entry.output_tensors
461
+ raise ValueError("Cached input/output tensors not found for the given static signature.")
462
+
463
+ @nvtx.instrument_nvtx
464
+ def warmup_run(self, func: Callable, *args, **kwargs) -> Union[torch.Tensor, List[torch.Tensor]]:
465
+ warmup_outputs = None
466
+ s = torch.cuda.Stream()
467
+ s.wait_stream(torch.cuda.current_stream())
468
+ with torch.cuda.stream(s), torch.no_grad():
469
+ for _ in range(1):
470
+ warmup_outputs = func(*args, **kwargs)
471
+ torch.cuda.current_stream().wait_stream(s)
472
+ return warmup_outputs
473
+
474
+ @nvtx.instrument_nvtx
475
+ def add_static_entry(
476
+ self,
477
+ static_sig: StaticSignature,
478
+ input_tensors: Optional[List[torch.Tensor]] = None,
479
+ output_tensors: Optional[List[torch.Tensor]] = None,
480
+ ) -> None:
481
+ assert static_sig not in self.cache
482
+ self.cache[static_sig] = StaticTensorEntry(
483
+ input_tensors=input_tensors, output_tensors=output_tensors, template_entry_dict=dict()
484
+ )
485
+
486
+ @nvtx.instrument_nvtx
487
+ def add_template_entry(
488
+ self, input_static_sig: StaticSignature, input_dynamic_sig: DynamicSignature, output_obj: Any = None
489
+ ) -> None:
490
+ try:
491
+ output_template = ArgsUtils.extract_output_template(output_obj)
492
+ self.cache[input_static_sig].template_entry_dict[input_dynamic_sig] = OutputTemplateEntry(
493
+ graph_entry_dict=dict(), output_template=output_template
494
+ )
495
+ except KeyError:
496
+ raise ValueError("StaticSignature not found in cache when adding template entry.")
497
+
498
+ @nvtx.instrument_nvtx
499
+ def add_graph_entry(
500
+ self,
501
+ input_static_sig: StaticSignature,
502
+ input_dynamic_sig: DynamicSignature,
503
+ layer_number: int,
504
+ graph: torch.cuda.CUDAGraph,
505
+ ) -> None:
506
+ try:
507
+ self.cache[input_static_sig].template_entry_dict[input_dynamic_sig].graph_entry_dict[layer_number] = GraphEntry(
508
+ graph=graph, inconsistent=False, invalid=False
509
+ )
510
+ except KeyError:
511
+ raise ValueError("StaticSignature or DynamicSignature not found in cache when adding graph entry.")
512
+
513
+ @nvtx.instrument_nvtx
514
+ def try_get_graph_entry(
515
+ self, input_static_sig: StaticSignature, input_dynamic_sig: DynamicSignature, layer_number: int
516
+ ) -> Optional[GraphEntry]:
517
+ try:
518
+ return self.cache[input_static_sig].template_entry_dict[input_dynamic_sig].graph_entry_dict[layer_number]
519
+ except KeyError:
520
+ pass
521
+ return None
522
+
523
+ @nvtx.instrument_nvtx
524
+ def batch_set_graph_invalid(self, static_sig: StaticSignature) -> None:
525
+ if static_sig in self.cache:
526
+ static_tensor_entry = self.cache[static_sig]
527
+ if static_tensor_entry.template_entry_dict is not None:
528
+ for template_entry in static_tensor_entry.template_entry_dict.values():
529
+ for graph_entry in template_entry.graph_entry_dict.values():
530
+ graph_entry.invalid = True
531
+
532
    @nvtx.instrument_nvtx
    def set_graph_inconsistent(
        self, input_static_sig: StaticSignature, input_dynamic_sig: DynamicSignature, layer_number: int
    ) -> None:
        """Mark the graph for (static, dynamic, layer) as inconsistent, creating
        any missing cache levels on the way so the flag always sticks."""
        if input_static_sig not in self.cache:
            # Placeholder entry: static tensors stay None until a real capture.
            self.add_static_entry(input_static_sig, None, None)
        if input_dynamic_sig not in self.cache[input_static_sig].template_entry_dict:
            self.add_template_entry(input_static_sig, input_dynamic_sig, None)
        if layer_number not in self.cache[input_static_sig].template_entry_dict[input_dynamic_sig].graph_entry_dict:
            self.cache[input_static_sig].template_entry_dict[input_dynamic_sig].graph_entry_dict[layer_number] = GraphEntry(
                graph=None, inconsistent=True, invalid=False
            )
        # Set unconditionally: the entry may have pre-existed with inconsistent=False.
        self.cache[input_static_sig].template_entry_dict[input_dynamic_sig].graph_entry_dict[layer_number].inconsistent = True
545
+
546
    @nvtx.instrument_nvtx
    def wrapped_graph_capture(
        self,
        func: Callable,
        input_obj: Any,
        static_input_tensors: List[torch.Tensor],
        static_output_tensors: List[torch.Tensor],
    ) -> torch.cuda.CUDAGraph:
        """Capture func into a new CUDA graph backed by the shared memory pool.

        Inputs are first redirected into the static input buffers; the outputs
        produced during capture are copied into the static output buffers so
        replays read/write stable addresses.
        """
        init_cudagraph_global_pool()
        _set_capture_start()
        try:
            graph = torch.cuda.CUDAGraph()
            _static_input_obj = ArgsUtils.replace_sliced_with_static(input_obj, static_input_tensors)
            s = None  # future: s = GreenCtxManager(0).create_stream()
            with torch.cuda.graph(graph, pool=self.graph_mem_pool, stream=s), torch.no_grad():
                _sliced_output_obj = func(*_static_input_obj["args"], **_static_input_obj["kwargs"])
                _static_output_obj = ArgsUtils.replace_sliced_with_static(_sliced_output_obj, static_output_tensors)
        except Exception as e:
            torch.cuda.synchronize()  # wait for all pending async work to finish
            _set_capture_end()
            raise e
        _set_capture_end()
        return graph
569
+
570
+ @nvtx.instrument_nvtx
571
+ def wrapped_graph_replay(
572
+ self,
573
+ graph: torch.cuda.CUDAGraph,
574
+ static_input_tensors: List[torch.Tensor],
575
+ static_output_tensors: List[torch.Tensor],
576
+ input_obj: Any,
577
+ output_template: Any,
578
+ ) -> Any:
579
+ _static_input_obj = ArgsUtils.replace_sliced_with_static(input_obj, static_input_tensors)
580
+ graph.replay()
581
+ output_obj = ArgsUtils.replace_static_with_sliced(output_template, static_output_tensors)
582
+ return output_obj
583
+
584
    @nvtx.instrument_nvtx
    def replay_graph(
        self, input_static_sig: StaticSignature, input_dynamic_sig: DynamicSignature, input_obj: Any, layer_number: int
    ) -> Any:
        """Replay the cached graph for (static, dynamic, layer) on input_obj.

        Asserts the graph exists and is usable; raises KeyError if either
        signature level is missing from the cache.
        """
        output_template = self.cache[input_static_sig].template_entry_dict[input_dynamic_sig].output_template
        static_input_tensors = self.cache[input_static_sig].input_tensors
        static_output_tensors = self.cache[input_static_sig].output_tensors
        graph = self.try_get_cuda_graph(input_static_sig, input_dynamic_sig, layer_number=layer_number)
        assert graph is not None, "CUDA Graph not found for replay."
        output_obj = self.wrapped_graph_replay(
            graph=graph,
            static_input_tensors=static_input_tensors,
            static_output_tensors=static_output_tensors,
            input_obj=input_obj,
            output_template=output_template,
        )
        return output_obj
601
+
602
    @nvtx.instrument_nvtx
    def capture_and_cache(
        self,
        func: Callable,
        input_obj: Any,
        layer_number: int,
        input_static_sig: StaticSignature,
        input_dynamic_sig: DynamicSignature,
    ) -> Any:
        """Capture a new CUDA Graph and cache it."""
        # NOTE(review): annotated -> Any but the method returns None; callers
        # presumably replay separately — confirm before relying on a return value.
        # Access static tensors from cache
        static_tensor_entry = self.cache[input_static_sig]
        assert static_tensor_entry.input_tensors is not None
        assert static_tensor_entry.output_tensors is not None
        static_input_tensors = static_tensor_entry.input_tensors
        static_output_tensors = static_tensor_entry.output_tensors

        # Capture CUDA Graph
        graph = self.wrapped_graph_capture(
            func=func,
            input_obj=input_obj,
            static_input_tensors=static_input_tensors,
            static_output_tensors=static_output_tensors,
        )

        # Cache the captured graph, reusing an existing entry (e.g. one that was
        # previously marked inconsistent/invalid) when present.
        graph_entry = self.try_get_graph_entry(input_static_sig, input_dynamic_sig, layer_number)
        if graph_entry:
            graph_entry.graph = graph
            graph_entry.inconsistent = False
            graph_entry.invalid = False
        else:
            self.add_graph_entry(
                input_static_sig=input_static_sig, input_dynamic_sig=input_dynamic_sig, layer_number=layer_number, graph=graph
            )
637
+
638
    @nvtx.instrument_nvtx
    def if_need_expand_static_tensors(
        self, static_tensors: List[torch.Tensor], new_tensors: List[torch.Tensor], input_static_sig: StaticSignature
    ) -> bool:
        """Judge whether static tensors need to be expanded based on new tensors.

        Returns True when any dynamic dimension of a new tensor exceeds the
        current static buffer; raises AssertionError on count/rank/dtype
        mismatches or when a static (fixed) dimension differs.
        """
        res = False
        static_infos = input_static_sig.tensor_static_infos

        if len(static_tensors) != len(new_tensors) or len(static_tensors) != len(static_infos):
            raise AssertionError(
                f"[CUDA Graph] Tensor count mismatch. {len(static_tensors)=}, {len(new_tensors)=}, {len(static_infos)=}"
            )

        for static_t, new_t, static_info in zip(static_tensors, new_tensors, static_infos):
            if static_t.ndim != new_t.ndim:
                raise AssertionError(f"[CUDA Graph] Rank mismatch. {static_t.shape=}, {new_t.shape=}")
            if static_t.dtype != new_t.dtype:
                raise AssertionError(f"[CUDA Graph] Dtype mismatch. {static_t.dtype=}, {new_t.dtype=}")
            for i in range(static_t.ndim):
                # shapes[i] != -1 marks a static dimension: it must match exactly.
                if static_info.shapes[i] != -1 and static_info.shapes[i] != new_t.shape[i]:
                    raise AssertionError(
                        f"[CUDA Graph] Static dimension mismatch. {static_t.shape=}, {new_t.shape=}, {static_info.shapes=}, dim={i}"
                    )
                if static_t.shape[i] < new_t.shape[i]:
                    res = True
        return res
664
+
665
+ @nvtx.instrument_nvtx
666
+ def get_expanded_static_tensors(
667
+ self, static_tensors: List[torch.Tensor], new_tensors: List[torch.Tensor]
668
+ ) -> List[torch.Tensor]:
669
+ """Get expanded static tensors based on new tensors. Reuses existing tensors when possible."""
670
+ expanded_tensors = []
671
+ for static_t, new_t in zip(static_tensors, new_tensors):
672
+ if static_t.ndim != new_t.ndim:
673
+ raise AssertionError(
674
+ f"[CUDA Graph] Rank mismatch during expansion. Static: {static_t.shape}, New: {new_t.shape}"
675
+ )
676
+ new_shape = tuple(max(s, n) for s, n in zip(static_t.shape, new_t.shape))
677
+
678
+ if static_t.shape == new_shape:
679
+ expanded_tensors.append(static_t)
680
+ elif new_shape == new_t.shape:
681
+ expanded_tensors.append(new_t)
682
+ else:
683
+ expanded_tensor = torch.empty(new_shape, dtype=static_t.dtype, device=static_t.device)
684
+ expanded_tensors.append(expanded_tensor)
685
+ return expanded_tensors
686
+
687
    @nvtx.instrument_nvtx
    def try_replay_graph_inline(
        self, func: Callable, args: Tuple, kwargs: Dict, layer_number: int
    ) -> Tuple[bool, Optional[Union[torch.Tensor, List[torch.Tensor]]]]:
        """Try to replay the CUDA Graph inline for fast execution.

        Fast path of `run`: recompute the input signatures, look up the cached
        static buffers / output template / captured graph, copy the live inputs
        into the static buffers, replay the graph, and rebuild the outputs from
        the static output buffers.

        Returns:
            (True, output_obj) on a successful replay; (False, None) when a
            cache lookup (KeyError) or validity assertion fails, in which case
            the caller falls back to the warmup/capture path.
        """
        try:
            func_name = func.__qualname__
            input_obj = {"args": args, "kwargs": kwargs}

            # Prefer the fx-based extraction; fall back to the generic recursive
            # walk when it returns None for any component.
            input_tensors, input_tensor_names, literals = ArgsUtils.try_fx_extract_core(input_obj)
            if None in (input_tensors, input_tensor_names, literals):
                input_tensors, input_tensor_names, literals = ArgsUtils.recursive_extract_core(input_obj)
            input_static_sig, input_dynamic_sig = ArgsUtils.generate_both_signatures_from_tensors(
                func_name, input_tensors, input_tensor_names, literals
            )
            # Any missing cache level raises KeyError -> handled below as "no replay".
            static_tensor_entry = self.cache[input_static_sig]
            static_input_tensors = static_tensor_entry.input_tensors
            static_output_tensors = static_tensor_entry.output_tensors

            template_entry = static_tensor_entry.template_entry_dict[input_dynamic_sig]
            output_template = template_entry.output_template

            graph_entry = template_entry.graph_entry_dict[layer_number]
            graph = graph_entry.graph

            assert graph is not None, "CUDA Graph not found for inline replay."
            assert graph_entry.inconsistent is False, "CUDA Graph marked as inconsistent for inline replay."
            assert graph_entry.invalid is False, "CUDA Graph marked as invalid for inline replay."

            # Copy live inputs into the captured static buffers, replay, and
            # reconstruct outputs from the static output buffers.
            ArgsUtils.replace_sliced_with_static_simple(input_tensors, static_input_tensors)
            graph.replay()
            output_obj = ArgsUtils.replace_static_with_sliced(output_template, static_output_tensors)

            if self.check_output_inconsistency:
                # NOTE(review): both signature pairs below are derived from the
                # same freshly-built `output_obj`, so this comparison can never
                # differ; presumably the "cached" side should come from the
                # template entry captured at warmup time — verify intent.
                cur_output_tensors, cur_output_tensor_names, cur_output_literals = ArgsUtils.recursive_extract_core(output_obj)
                cur_output_static_sig, cur_output_dynamic_sig = ArgsUtils.generate_both_signatures_from_tensors(
                    func.__qualname__, cur_output_tensors, cur_output_tensor_names, cur_output_literals
                )
                output_tensors, output_tensor_names, output_literals = ArgsUtils.recursive_extract_core(output_obj)
                cached_output_static_sig, cached_output_dynamic_sig = ArgsUtils.generate_both_signatures_from_tensors(
                    func.__qualname__, output_tensors, output_tensor_names, output_literals
                )
                if cur_output_static_sig != cached_output_static_sig or cur_output_dynamic_sig != cached_output_dynamic_sig:
                    magi_logger.warning(
                        f"[CUDA Graph] Warning: Output signature changed during inline replay. {func.__qualname__=}, {layer_number=}"
                    )
                    self.set_graph_inconsistent(input_static_sig, input_dynamic_sig, layer_number)
                    return False, None
            return True, output_obj
        except KeyError:
            # Cache miss at some level: signal the caller to take the slow path.
            return False, None
        except AssertionError:
            # Graph missing / inconsistent / invalid: fall back to the slow path.
            return False, None
        except Exception as e:
            # Anything else is unexpected — log and re-raise.
            magi_logger.info(
                f"[CUDA Graph] Exception during inline replay: {e=}, {func.__qualname__=}, {layer_number=}", rank="all"
            )
            raise e
745
+
746
    @nvtx.instrument_nvtx
    def run(self, func: Callable, *args, layer_number: Optional[int], **kwargs) -> Union[torch.Tensor, List[torch.Tensor]]:
        """Run the function with CUDA Graph optimization if possible.

        Protocol:
          1. Try the fast inline-replay path.
          2. On miss, recompute signatures; execute eagerly if the graph is
             marked inconsistent.
          3. Otherwise do a warmup run, re-check the input signature, update
             the static-tensor / template cache (expanding static buffers if
             the new shapes exceed them), then capture and cache a new graph.

        Returns the warmup outputs (the captured graph serves later calls).
        """

        # Try inline replay first
        success, output_obj = self.try_replay_graph_inline(func=func, args=args, kwargs=kwargs, layer_number=layer_number)
        if success:
            # print_rank_0(f"[CUDA Graph] Current cache stats: {self.tensor_entry_count=}, {self.graph_count=}.")
            return output_obj

        # Extract input signatures
        func_name = func.__qualname__
        input_obj = {"args": args, "kwargs": kwargs}
        input_tensors, input_tensor_names, literals = ArgsUtils.recursive_extract_core(input_obj)
        input_static_sig, input_dynamic_sig = ArgsUtils.generate_both_signatures_from_tensors(
            func_name, input_tensors, input_tensor_names, literals
        )

        # Judge if the graph is marked as inconsistent
        graph_entry = self.try_get_graph_entry(input_static_sig, input_dynamic_sig, layer_number)
        if graph_entry is not None and graph_entry.inconsistent:
            # Inconsistent graphs are never replayed or re-captured: run eagerly.
            return func(*args, **kwargs)

        # Judge if need to expand static tensors
        # (local flag deliberately mirrors the method name `self.if_need_expand_static_tensors`)
        if_need_expand_static_tensors = False
        if_cached_tensor_entry = input_static_sig in self.cache
        if if_cached_tensor_entry:
            static_input_tensors, static_output_tensors = self.get_static_tensors(input_static_sig)
            if_need_expand_static_tensors = self.if_need_expand_static_tensors(
                static_input_tensors, input_tensors, input_static_sig
            )

        # Warmup run
        warmup_output_obj = self.warmup_run(func, *args, **kwargs)

        # Check input signature consistency after warmup
        # (the warmup may mutate inputs in place; re-extract from the same input_obj)
        warmup_input_tensors, warmup_input_tensor_names, warmup_literals = ArgsUtils.recursive_extract_core(input_obj)
        warmup_input_static_sig, warmup_input_dynamic_sig = ArgsUtils.generate_both_signatures_from_tensors(
            func_name, warmup_input_tensors, warmup_input_tensor_names, warmup_literals
        )
        if warmup_input_static_sig != input_static_sig or warmup_input_dynamic_sig != input_dynamic_sig:
            magi_logger.warning(
                f"[CUDA Graph] Warning: Input signature changed during warmup run. {func_name=}, {layer_number=}"
            )
            self.set_graph_inconsistent(input_static_sig, input_dynamic_sig, layer_number)
            return warmup_output_obj

        # Update cache entries
        if if_cached_tensor_entry:
            if if_need_expand_static_tensors:
                output_tensors, _, _ = ArgsUtils.recursive_extract_core(warmup_output_obj, extract_literals=False)
                # Need to expand static tensors
                new_static_input_tensors = self.get_expanded_static_tensors(static_input_tensors, input_tensors)
                new_static_output_tensors = self.get_expanded_static_tensors(static_output_tensors, output_tensors)
                # Register as new cache entries
                # (old graphs captured on the smaller buffers are invalidated first)
                self.batch_set_graph_invalid(input_static_sig)
                self.cache[input_static_sig].input_tensors = new_static_input_tensors
                self.cache[input_static_sig].output_tensors = new_static_output_tensors

                self.add_template_entry(input_static_sig, input_dynamic_sig, warmup_output_obj)
            else:
                # Simply reuse existing static tensor entry
                static_tensor_entry = self.cache[input_static_sig]
                if input_dynamic_sig not in static_tensor_entry.template_entry_dict:
                    self.add_template_entry(input_static_sig, input_dynamic_sig, warmup_output_obj)

        else:
            # Create new static tensor entry
            output_tensors, _, _ = ArgsUtils.recursive_extract_core(warmup_output_obj, extract_literals=False)
            self.add_static_entry(input_static_sig, input_tensors, output_tensors)
            self.add_template_entry(input_static_sig, input_dynamic_sig, warmup_output_obj)

        # Capture and cache new CUDA Graph
        self.capture_and_cache(
            func=func,
            input_obj=input_obj,
            layer_number=layer_number,
            input_static_sig=input_static_sig,
            input_dynamic_sig=input_dynamic_sig,
        )

        magi_logger.info(
            f"[CUDA Graph] Current cache stats: {self.tensor_entry_count=}, {self.graph_count=}, {self.tensor_mem_size=:.2f} MB, {self.graph_mem_pool_size=:.2f} MB"
        )
        return warmup_output_obj
831
+
832
+
833
+ _IS_GRAPH_CAPTURING = False
834
+
835
+
836
+ def _is_graph_capturing():
837
+ """Query if currently capturing."""
838
+ global _IS_GRAPH_CAPTURING
839
+ return _IS_GRAPH_CAPTURING
840
+
841
+
842
+ def _set_capture_start():
843
+ """Set graph capture has started."""
844
+ global _IS_GRAPH_CAPTURING
845
+ _IS_GRAPH_CAPTURING = True
846
+
847
+
848
+ def _set_capture_end():
849
+ """Set graph capture has ended."""
850
+ global _IS_GRAPH_CAPTURING
851
+ _IS_GRAPH_CAPTURING = False
852
+
853
+
854
# Singleton instance of CudaGraphMgr
_CUDA_GRAPH_MGR = CudaGraphMgr()


def cuda_graph_mgr() -> CudaGraphMgr:
    """
    Get the current CudaGraphMgr instance.
    Returns:
        CudaGraphMgr: The current CudaGraphMgr instance.
    Raises:
        AssertionError: If the CudaGraphMgr has not been initialized.
    """
    # NOTE(review): _CUDA_GRAPH_MGR is assigned at import time above, so this
    # assert can only fire if other code later rebinds it to None.
    assert _CUDA_GRAPH_MGR is not None, "cuda graph manager is not initialized"
    return _CUDA_GRAPH_MGR
868
+
869
+
870
def cuda_graph_enable_if(condition: Callable):
    """
    Decorator factory to enable CUDA Graph execution for a function.

    The wrapped function is executed through the CUDA Graph manager only when
    `condition()` returns True and no graph capture is currently in progress;
    otherwise it runs eagerly.

    Args:
        condition (Callable): A callable that returns a bool indicating whether enable CUDA Graph.
    """
    # NOTE: the docstring previously lived on the inner `decorator`, where
    # help()/introspection never surfaced it; it now documents the factory.

    def decorator(func):
        @wraps(func)
        def wrapped_func(*args, **kwargs):
            enable_cuda_graph = condition()
            # Run eagerly while a capture is in progress to avoid nested capture.
            if not enable_cuda_graph or _is_graph_capturing():
                return func(*args, **kwargs)

            # Convention: bound methods may expose `layer_number` on self (args[0]).
            layer_number = getattr(args[0], "layer_number", None) if args else None

            return cuda_graph_mgr().run(func, *args, layer_number=layer_number, **kwargs)

        return wrapped_func

    return decorator
891
+
892
+
893
def gen_wrap_func_for_cudagraph(func: Callable, mode_prefix: str, target_prefix=None) -> Callable:
    """
    Wrap the given function for CUDA Graph:
    1. Generate a unique __qualname__ for caching
    2. Built-in call to cuda_graph_mgr().run

    Args:
        func: The callable to wrap.
        mode_prefix: "full" or a piecewise-mode prefix; controls the cache key format.
        target_prefix: Submodule name, included in the key for piecewise mode only.

    Returns:
        A wrapper that pops `layer_number` from kwargs and dispatches through
        the CUDA Graph manager.
    """
    # Generate a unique identifier to avoid cache conflicts
    func_id = id(func) if not hasattr(func, "__name__") else func.__name__
    if mode_prefix == "full":
        wrapped_func_name = f"Athena_CUDAGraph_{mode_prefix}_{func_id}"
    else:  # piecewise
        wrapped_func_name = f"Athena_CUDAGraph_{mode_prefix}_{target_prefix}_{func_id}"

    @nvtx.instrument_nvtx
    def wrapped_func(*args, **kwargs):
        # `layer_number` travels via kwargs; it must not reach `func` itself.
        layer_number = kwargs.pop("layer_number", None)
        res = cuda_graph_mgr().run(func, *args, layer_number=layer_number, **kwargs)
        return res

    # Deliberately mutate the ORIGINAL function's __qualname__ too: the CUDA
    # Graph cache keys on func.__qualname__, so both must agree.
    func.__qualname__ = wrapped_func_name
    magi_logger.info(f"Set original function qualname to {wrapped_func_name} for CUDA Graph caching.")

    # Copy attributes from the original function to the wrapped function
    wrapped_func.__dict__.update(func.__dict__)
    wrapped_func.__qualname__ = wrapped_func_name
    # Name-mangled markers must be forwarded explicitly (not in __dict__ for all callables).
    for attr in ["__is_first_graph", "__is_last_graph", "__sym_shape_indices"]:
        if hasattr(func, attr):
            setattr(wrapped_func, attr, getattr(func, attr))

    return wrapped_func
923
+
924
+
925
def init_cudagraph_global_pool():
    """Initialize the global CUDA graph memory pool if not already initialized.

    Idempotent: the pool handle is created once and shared by all captured
    graphs managed by the singleton CudaGraphMgr.
    """
    # Local import — presumably to avoid a circular import when this helper is
    # re-exported from another module; TODO confirm.
    from magi_compiler.cuda_graph_mgr import cuda_graph_mgr

    if cuda_graph_mgr().graph_mem_pool is None:
        cuda_graph_mgr().graph_mem_pool = torch.cuda.graph_pool_handle()
        magi_logger.info("Initialized global CUDA graph pool for Athena.")
pkgs/MagiCompiler/magi_compiler/joint_graph_partition.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from typing import Any, Optional, Sequence, Tuple
17
+ from unittest.mock import patch
18
+
19
+ import torch
20
+ import torch.fx as fx
21
+ from torch._functorch.compile_utils import get_aten_target
22
+ from torch._functorch.partitioners import NodeInfo, OpTypes, get_default_op_list, min_cut_rematerialization_partition
23
+ from torch._inductor.custom_graph_pass import CustomPartitionerFn
24
+ from torch.utils._ordered_set import OrderedSet
25
+
26
+ # from magi_compiler.partitioners import min_cut_rematerialization_partition
27
+ from .config import RecomputePolicy, get_compile_config
28
+ from .utils import compute_code_hash, magi_logger
29
+ from .utils.visualize import joint_graph_vis
30
+
31
+ SAVE_TENSOR_NODES: Optional[list[fx.Node]] = None
32
+
33
+
34
def is_memory_increase_by_node(node: fx.Node) -> bool:
    """Return True if this dtype-conversion node enlarges the per-element size.

    Only `prims.convert_element_type` (the decomposition of `aten.to`) is
    supported, enforced by the assert below.
    """
    # Only support aten.to now
    assert get_aten_target(node) == torch.ops.prims.convert_element_type
    # assumes args[0] carries tensor metadata under meta["tensor_meta"] — TODO confirm
    input_dtype = node.args[0].meta["tensor_meta"].dtype
    # Second positional arg of convert_element_type is the target dtype.
    output_dtype = node.args[1]
    assert output_dtype is not None
    return output_dtype.itemsize > input_dtype.itemsize
41
+
42
+
43
+ def is_primal_contribute_to_bwd_directly(primal_node: fx.Node, node_info: NodeInfo, op_types: OpTypes) -> bool:
44
+ """
45
+ FSDP ensures that weights already reside in memory. If there exists a path from the primal to the bwd, and the path does not contain any matmul, then the primal contributes to the bwd directly.
46
+ And we should save this primals.
47
+ """
48
+ if node_info.is_required_bw(primal_node):
49
+ return True
50
+ topology_start = set({primal_node})
51
+
52
+ while len(topology_start) > 0:
53
+ cur_node = topology_start.pop()
54
+ for user in cur_node.users:
55
+ if node_info.is_required_bw(user):
56
+ return True
57
+ if op_types.is_compute_intensive(user):
58
+ continue
59
+ topology_start.add(user)
60
+ return False
61
+
62
+
63
def is_compute_intensive_and_has_following_recomputable_ops(
    intermidiate_node: fx.Node, node_info: NodeInfo, op_types: OpTypes
) -> Tuple[bool, fx.Node]:
    """
    If compute-intensive node(CIN) is not the output of fwd graph(has following memory-intensive ops in the fwd graph), then we should save this CIN node.
    NOTE: For CIN+aten.to, we should save aten.to op instead of CIN op to save more memory.

    Returns:
        (True, node_to_save) when the CIN has further recomputable forward
        users; (False, None) when it is not compute-intensive or flows straight
        to the graph output.
    """
    if not op_types.is_compute_intensive(intermidiate_node):
        return False, None

    # Walk the single-user forward chain, possibly shifting the save point past
    # view ops / non-enlarging dtype downcasts to save less memory.
    save_node = intermidiate_node
    topology_start = set({save_node})
    while len(topology_start) > 0:
        cur_node = topology_start.pop()
        fwd_user_nodes = []
        for user in cur_node.users:
            if node_info.is_required_fw(user):
                fwd_user_nodes.append(user)

        if len(fwd_user_nodes) > 1:  # multiple users, save current node
            return True, save_node
        elif len(fwd_user_nodes) == 0:  # output, return
            return False, None

        # save current node if it's user is recomputable
        next_node = fwd_user_nodes[0]
        if op_types.is_view(next_node):
            # Views are free; only advance the save point if it hasn't diverged yet.
            if save_node == cur_node:
                save_node = next_node
            topology_start.add(next_node)
        # Special case for aten.to, memory efficient case
        elif get_aten_target(next_node) == torch.ops.prims.convert_element_type:
            is_memory_increase = is_memory_increase_by_node(next_node)
            if not is_memory_increase:
                # Downcast shrinks memory: prefer saving the downcast output.
                save_node = next_node
            topology_start.add(next_node)
        elif next_node.op == "output":
            return False, None
        else:
            return True, save_node
    assert False, f"Should not reach here: {intermidiate_node=} {save_node=}"
104
+
105
+
106
# TODO: We find an elegant impl to heuristically save nodes, reconstruct this later
def heuristic_choose_saved_values_set(joint_graph: fx.Graph, node_info: NodeInfo, memory_budget=1) -> list[fx.Node]:
    """Heuristically pick which joint-graph values to save for the backward pass.

    Two rules:
      1. Save forward inputs (primals) that feed backward without crossing a
         compute-intensive op.
      2. Save compute-intensive forward nodes that have further recomputable
         forward users (the save point may be shifted past view/downcast ops).

    The chosen nodes are also stashed in the module-global SAVE_TENSOR_NODES
    for later visualization. `memory_budget` is unused here; it is kept for
    signature compatibility with torch's `choose_saved_values_set`.
    """
    output: OrderedSet[fx.Node] = OrderedSet()
    op_types = get_default_op_list()
    # Select the inputs that are required by the backward pass
    for primal_node in node_info.inputs:
        if is_primal_contribute_to_bwd_directly(primal_node, node_info, op_types):
            output.add(primal_node)
    magi_logger.info("MagiCompiler: saved_output forward-input = %s", output)
    # Select the compute-intensive nodes that are required by the forward pass
    for intermidiate_node in node_info.required_fw_nodes:
        is_save, save_node = is_compute_intensive_and_has_following_recomputable_ops(intermidiate_node, node_info, op_types)
        if is_save:
            output.add(save_node)
    magi_logger.info("MagiCompiler: saved_output compute-intensive = %s", output)
    global SAVE_TENSOR_NODES
    SAVE_TENSOR_NODES = list(output)
    return list(output)
124
+
125
+
126
def custom_joint_graph_partition_fn(
    joint_module: fx.GraphModule,
    _joint_inputs,
    compiler="inductor",
    *,
    num_fwd_outputs,
    static_lifetime_input_indices: Optional[list[int]] = None,
) -> tuple[fx.GraphModule, fx.GraphModule]:
    """Partition the joint fwd/bwd graph according to the configured recompute policy.

    HANDCRAFT patches torch's activation-memory budget; HEURISTIC patches
    `choose_saved_values_set` with our heuristic. Both then delegate to
    `min_cut_rematerialization_partition`.

    Raises:
        ValueError: for AUTOSEARCH (not yet supported) or an unknown policy.
    """
    recompute_config = get_compile_config().recompute_config
    policy = recompute_config.recompute_policy
    # The two supported policies differ only in which torch internal is
    # patched; select the patch context, then run the partitioner once
    # (previously the identical partitioner call was duplicated per branch).
    if policy == RecomputePolicy.HANDCRAFT:
        magi_logger.info("MagiCompiler using handcraft recompute policy")
        # TODO: different memory budget definition from torch
        patch_ctx = patch("torch._functorch.config.activation_memory_budget", recompute_config.memory_budget)
    elif policy == RecomputePolicy.HEURISTIC:
        magi_logger.info("MagiCompiler using heuristic recompute policy")
        patch_ctx = patch("torch._functorch.partitioners.choose_saved_values_set", heuristic_choose_saved_values_set)
    elif policy == RecomputePolicy.AUTOSEARCH:
        # Plain string: the old f-string had no placeholders (F541).
        raise ValueError("AutoSearch recompute policy is not supported yet")
    else:
        raise ValueError(f"Invalid recompute policy: {policy}")

    with patch_ctx:
        fwd_module, bwd_module = min_cut_rematerialization_partition(
            joint_module,
            _joint_inputs,
            compiler,
            num_fwd_outputs=num_fwd_outputs,
            static_lifetime_input_indices=static_lifetime_input_indices,
        )

    joint_graph_vis(joint_module, fwd_module, bwd_module, save_tensor_nodes=SAVE_TENSOR_NODES)

    return fwd_module, bwd_module
164
+
165
+
166
class CustomJointGraphPartitionFn(CustomPartitionerFn):
    """Inductor `CustomPartitionerFn` delegating to MagiCompiler's recompute-policy-aware partitioner."""

    def __call__(
        self, gm: torch.fx.GraphModule, joint_inputs: Sequence[object], **kwargs: Any
    ) -> tuple[torch.fx.GraphModule, torch.fx.GraphModule]:
        """
        Implementation of the custom partitioner.
        """
        return custom_joint_graph_partition_fn(gm, joint_inputs, **kwargs)

    def uuid(self) -> Optional[Any]:
        """
        Return an ID to uniquely identify your custom partitioner implementation.
        Return None to skip inductor code caching entirely.
        """
        # Hash this file's source so any edit here invalidates Inductor's cached artifacts.
        return compute_code_hash({os.path.abspath(__file__)})
pkgs/MagiCompiler/magi_compiler/magi_backend.py ADDED
@@ -0,0 +1,607 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import ast
17
+ import dataclasses
18
+ import pprint
19
+ import time
20
+ from collections.abc import Callable
21
+ from contextlib import contextmanager
22
+ from pathlib import Path
23
+ from typing import Any
24
+
25
+ import magi_compiler.utils.envs as envs
26
+ import torch
27
+ import torch.fx as fx
28
+ from torch._dispatch.python import enable_python_dispatcher
29
+ from torch._dynamo.utils import lazy_format_graph_code
30
+ from torch._guards import detect_fake_mode
31
+
32
+ from ._cache_data_cls import CacheEntry, CacheHandle
33
+ from .compile_artifacts import MagiSerializableFunction
34
+ from .config import CompileConfig, CompileMode, CudaGraphMode
35
+ from .cuda_graph_mgr import gen_wrap_func_for_cudagraph
36
+ from .joint_graph_partition import CustomJointGraphPartitionFn
37
+ from .offload.offload_warpper import OffloadWrapper
38
+ from .partition_rules import inductor_partition_rule_context, resolve_defined_ops
39
+ from .passes import PostGradPassManager
40
+ from .passes.inductor_pass import pass_context
41
+ from .passes.replace_pass import FullGraphPassManager
42
+ from .piecewise_backend import PiecewiseBackend
43
+ from .piecewise_compiler import CompilerInterface, EagerAdaptor, InductorStandaloneAdaptor
44
+ from .utils import (
45
+ CompileMonitor,
46
+ compilation_counter,
47
+ compute_code_hash,
48
+ compute_hash,
49
+ detect_symbolic_tensor_indices,
50
+ magi_logger,
51
+ )
52
+ from .utils.envs import MAGI_CUSTOM_PARTITIONER_FN, MAGI_MODEL_TAG, MAGI_POST_GRAD_PASS
53
+ from .utils.visualize import save_fx_graph_visualization
54
+
55
+ compilation_start_time: float = 0.0
56
+
57
+
58
+ def _print_with_shape_and_time(runtime_shape: int | None, prefix: str = ""):
59
+ elapsed = time.time() - compilation_start_time
60
+ if runtime_shape is None:
61
+ magi_logger.info("%s for dynamic shape, took %.3f s", prefix, elapsed)
62
+ else:
63
+ magi_logger.info("%s for shape %s, took %.3f s", prefix, str(runtime_shape), elapsed)
64
+
65
+
66
@dataclasses.dataclass
class SplitItem:
    # Name of the submodule produced by graph splitting (presumably "submod_<i>") — TODO confirm.
    submod_name: str
    # Position of this piece in the split sequence.
    graph_id: int
    # Whether this piece contains a splitting op (boundary graph) — TODO confirm against splitter.
    is_splitting_graph: bool
    # The fx subgraph itself.
    graph: fx.GraphModule
73
+
74
def make_compiler(compile_config: CompileConfig) -> CompilerInterface:
    """Instantiate the compiler adaptor matching the configured backend ("inductor" or "eager")."""
    backend = compile_config.backend
    if backend == "inductor":
        # Use standalone_compile with PyTorch 2.8+
        assert hasattr(torch._inductor, "standalone_compile"), "standalone_compile not found in PyTorch Inductor"
        magi_logger.info("Using InductorStandaloneAdaptor")
        return InductorStandaloneAdaptor()
    assert backend == "eager", f"Invalid backend for MagiCompiler: {backend}"
    magi_logger.info("Using EagerAdaptor")
    return EagerAdaptor()
84
+
85
+
86
class CompilerManager:
    """
    Manage the compilation process, including graph compilation, compile artifacts caching and loading.

    The cache is a dict mapping `(runtime_shape, graph_index, backend_name)` to `any_data` returned from the compiler.

    When serializing the cache, we save it to a Python file for readability. We don't use json here because json doesn't support int as key.
    """

    def __init__(self, compile_config: CompileConfig):
        # Maps CacheEntry(runtime_shape, graph_index, backend_name) -> CacheHandle(key, path).
        self.cache: dict[CacheEntry, CacheHandle] = dict()
        self.compile_config = compile_config
        self.compiler = make_compiler(compile_config)
        self.disable_cache = envs.MAGI_DISABLE_COMPILE_CACHE

    @property
    def hash(self) -> str:
        """Stable identifier of the underlying compiler adaptor (used for cache keying)."""
        return self.compiler.hash

    @contextmanager
    def compile_context(self, runtime_shape: int | None = None):
        """Provide compilation context for the duration of compilation to set
        any torch global properties we want to scope to a single Inductor
        compilation (e.g. partition rules, pass context)."""
        with pass_context(runtime_shape):
            if self.compile_config.use_inductor_graph_partition:
                inductor_partition_ops = resolve_defined_ops(self.compile_config.splitting_ops)
                with inductor_partition_rule_context(inductor_partition_ops):
                    yield
            else:
                yield

    def initialize_cache(self, cache_dir: Path, prefix: str = ""):
        """
        Initialize the cache directory for the compiler.

        The organization of the cache directory is as follows:
        cache_dir=/path/to/torch_compile_cache/rank_i_j/hash_str/prefix/
        inside cache_dir, there will be:
        - magi_compile_cache.py
        - computation_graph.py

        for multiple prefixes, they can share the same base cache dir of
        /path/to/torch_compile_cache/rank_i_j/hash_str/ to store some
        common compilation artifacts.
        """

        self.cache_dir: Path = cache_dir
        self.cache_file_path: Path = cache_dir / "magi_compile_cache.py"

        if self.disable_cache:
            magi_logger.info("MagiCompiler's cache is disabled.")
            return

        magi_logger.info("Using cache directory: %s for MagiCompiler", cache_dir)
        if self.cache_file_path.exists():
            # load the cache from the file
            with self.cache_file_path.open() as f:
                # Parse Python literals using ast.literal_eval, which is a safe alternative to eval().
                raw = ast.literal_eval(f.read())
            self.cache = {CacheEntry(*entry): CacheHandle(*handle) for entry, handle in raw.items()}

        self.compiler.initialize_cache(cache_dir=self.cache_dir, prefix=prefix)

    def save_to_file(self):
        """Persist the cache index to `magi_compile_cache.py` as a Python literal dict."""
        if self.disable_cache:
            return
        # serialize to a literal-friendly dict
        serializable = {(e.runtime_shape, e.graph_index, e.backend_name): (h.key, h.path) for e, h in self.cache.items()}
        printer = pprint.PrettyPrinter(indent=4)
        data = printer.pformat(serializable)
        with self.cache_file_path.open("w") as f:
            f.write(data)

    def load(self, graph: fx.GraphModule, example_inputs: list[Any], cache_entry: CacheEntry) -> Callable | None:
        """Load a previously compiled graph for `cache_entry`, or return None on a cache miss."""
        if cache_entry not in self.cache:
            return None
        cache_handle = self.cache[cache_entry]
        _print_with_shape_and_time(
            cache_entry.runtime_shape,
            f"Directly load the {cache_entry.graph_index}-th graph from {cache_entry.backend_name} via handle {cache_handle}",
        )
        return self.compiler.load(graph, example_inputs, cache_entry, cache_handle)

    # TODO(hongyu): Support training mode here
    def compile(
        self,
        graph: fx.GraphModule,
        example_inputs: tuple[torch.fx.node.Argument, ...],
        compile_config: CompileConfig,
        graph_index: int = 0,
        num_graphs: int = 1,
        runtime_shape: int | None = None,
    ) -> Callable:
        """Compile one (sub)graph, consulting and updating the artifact cache.

        Returns the compiled callable; `runtime_shape=None` means dynamic shape.
        """
        # Step0: update some global metrics
        compilation_counter.num_backend_compilations += 1
        if graph_index == 0:
            # First subgraph starts the wall clock used by _print_with_shape_and_time.
            global compilation_start_time
            compilation_start_time = time.time()

        # Step1: Try loading from the cache
        cache_entry = CacheEntry(runtime_shape, graph_index, self.compiler.name)
        compiled_graph = self.load(graph, example_inputs, cache_entry)
        if compiled_graph is not None:
            return compiled_graph

        # Step2: Compile the graph
        key = f"artifact_shape_{runtime_shape}_subgraph_{graph_index}"
        with self.compile_context(runtime_shape):
            compiled_graph, cache_handle = self.compiler.compile(
                graph, example_inputs, compile_config.inductor_compile_config, runtime_shape, key
            )
        assert compiled_graph is not None, "Failed to compile the graph"

        # Step3: Store the artifact in the cache
        if not self.disable_cache and cache_handle is not None:
            assert cache_entry not in self.cache, "Cache entry already exists"
            self.cache[cache_entry] = cache_handle
            compilation_counter.num_cache_entries += 1
        _print_with_shape_and_time(runtime_shape, f"Compile the {graph_index}/{num_graphs} graph")

        return compiled_graph
209
+
210
# TODO(hongyu): Support training mode here
class PiecewiseCompileInterpreter(torch.fx.Interpreter):
    """
    Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.
    It runs the given graph with fake inputs, and compile some submodules specified by `compile_submod_names` with compilation configs.

    NOTE: the order in `compile_submod_names` matters, because it will be used to determine the order of the compiled piecewise graphs.
    The first graph will handle logging, and the last graph has some special cudagraph output handling.
    """

    def __init__(
        self,
        module: torch.fx.GraphModule,
        compiler_manager: CompilerManager,
        compile_submod_names: list[str],
        compile_config: CompileConfig,
    ):
        super().__init__(module)

        self.fake_mode = detect_fake_mode()
        self.compiler_manager = compiler_manager
        self.compile_submod_names = compile_submod_names
        self.compile_config = compile_config
        # extra_traceback is attribute of torch.fx.Interpreter, when it is True, it annoyingly dumps the torch.fx.Graph on errors.
        self.extra_traceback = False

    def _fix_graph_device_placement(self, module: torch.nn.Module):
        # Recursively rewrite tensor-factory calls that hard-code device='cpu'
        # to the current CUDA device, recompiling any modified GraphModule.
        # Used when model CPU offload is enabled (see `run`).
        for name, child in module.named_children():
            self._fix_graph_device_placement(child)

        if isinstance(module, torch.fx.GraphModule):
            needs_recompile = False
            target_device = torch.cuda.current_device()

            # Factory ops whose 'device' kwarg may pin tensors to CPU.
            factory_functions = [
                torch.empty,
                torch.zeros,
                torch.ones,
                torch.full,
                torch.rand,
                torch.randn,
                torch.arange,
                torch.tensor,
                torch.ops.aten.empty.memory_format,
            ]

            for node in module.graph.nodes:
                if node.op == 'call_function':
                    # Also match by name to catch aten overloads not listed above.
                    is_factory = node.target in factory_functions or (
                        hasattr(node.target, '__name__') and node.target.__name__ in ['empty', 'zeros', 'ones', 'full']
                    )

                    if is_factory:
                        if 'device' in node.kwargs:
                            current_dev = node.kwargs['device']
                            if str(current_dev) == 'cpu' or current_dev == torch.device('cpu'):
                                node.update_kwarg('device', target_device)
                                needs_recompile = True

            if needs_recompile:
                module.recompile()

    def run(self, *args):
        # Convert real tensors to fake tensors so submodules can be traced/compiled
        # without executing real kernels.
        fake_args = [self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t for t in args]
        if self.compile_config.offload_config.model_cpu_offload:
            self._fix_graph_device_placement(self.module)
            # With CPU offload, example inputs may live on CPU; move them to CUDA.
            for i, arg in enumerate(fake_args):
                if isinstance(arg, torch.Tensor):
                    fake_args[i] = arg.cuda()

        with self.fake_mode, enable_python_dispatcher():
            return super().run(*fake_args)

    def call_module(
        self, target: torch.fx.node.Target, args: tuple[torch.fx.node.Argument, ...], kwargs: dict[str, Any]
    ) -> Any:
        # Run the submodule under fake mode first so downstream nodes get real
        # output metadata; then, if it is one of the pieces to compile, replace
        # it in-place with a PiecewiseBackend (optionally CUDA-Graph-wrapped).
        assert isinstance(target, str)
        output = super().call_module(target, args, kwargs)
        if target not in self.compile_submod_names:
            return output

        index = self.compile_submod_names.index(target)
        submod = self.fetch_attr(target)
        # Positions of symbolic-shape args; the backend re-specializes on these.
        sym_shape_indices = [i for i, x in enumerate(args) if isinstance(x, torch.SymInt)]
        magi_logger.info(f"Compiling {target=}, {sym_shape_indices=}, {args=}")

        compiled_graph_for_dynamic_shape = self.compiler_manager.compile(
            submod, args, self.compile_config, graph_index=index, num_graphs=len(self.compile_submod_names), runtime_shape=None
        )

        piecewise_backend = PiecewiseBackend(
            submod,
            compiled_graph_for_dynamic_shape,
            self.compile_config,
            index,
            len(self.compile_submod_names),
            sym_shape_indices,
            self.compiler_manager,
        )

        if self.compile_config.use_inductor_graph_partition or self.compile_config.cudagraph_mode != CudaGraphMode.PIECEWISE:
            # Inductor handles partition-level cudagraphs itself, or piecewise
            # cudagraphs are disabled: install the backend directly.
            self.module.__dict__[target] = piecewise_backend
        else:
            wrapped_backend = gen_wrap_func_for_cudagraph(
                func=piecewise_backend, mode_prefix=CudaGraphMode.PIECEWISE.name.lower(), target_prefix=target
            )

            self.module.__dict__[target] = wrapped_backend
            magi_logger.info(
                f"Wrapped piecewise submodule {target} (index {index}) with CUDA Graph "
                f"[PIECEWISE mode, first_graph={piecewise_backend.is_first_graph}, last_graph={piecewise_backend.is_last_graph}]"
            )

        return output
324
+
325
+
326
class MagiBackend:
    """
    The compilation backend for `torch.compile` with MagiCompiler.
    It is used for compilation mode of `CompileMode.MAGI_COMPILE`,
    where we customize the compilation.

    The major work of this backend is to split the graph into
    piecewise graphs, and pass them to the piecewise backend.

    This backend also adds the PostGradPassManager to Inductor config,
    which handles the post-grad passes.
    """

    compile_config: CompileConfig
    # A backend instance is single-use: __call__ asserts it only runs once.
    _called_once: bool = False
    # for the graph we compiled
    graph: fx.GraphModule
    compiler_manager: CompilerManager
    # for cudagraph
    sym_tensor_indices: list[int]  # indices for tensors that have symbolic shapes
    input_buffers: list[torch.Tensor]  # buffers for input tensors that have symbolic shapes

    def __init__(self, compile_config: CompileConfig, model_tag: str = ""):
        # Fall back to the global default tag when the caller passes none.
        self.model_tag = model_tag or MAGI_MODEL_TAG
        self.compile_config = compile_config
        self._configure_custom_passes()
        self.compiler_manager: CompilerManager = CompilerManager(self.compile_config)

        self.sym_tensor_indices = []
        self.input_buffers = []

    def _configure_custom_passes(self) -> None:
        """Install the three custom pass hooks into the Inductor config.

        If a hook is already present (e.g. config reuse across models), its
        uuid must match the freshly-built one so compilation cache keys stay
        stable; otherwise the asserts fire.
        """
        # Custom pass 1: full graph passes between Dynamo and AOTAutograd
        self.full_graph_pass_manager = FullGraphPassManager(self.compile_config.pass_config)

        # Custom pass 2: custom partitioner function
        custom_partitioner_fn = CustomJointGraphPartitionFn()
        if MAGI_CUSTOM_PARTITIONER_FN in self.compile_config.inductor_compile_config:
            existing_fn = self.compile_config.inductor_compile_config[MAGI_CUSTOM_PARTITIONER_FN]
            assert isinstance(existing_fn, CustomJointGraphPartitionFn)
            assert existing_fn.uuid() == custom_partitioner_fn.uuid()
        self.compile_config.inductor_compile_config[MAGI_CUSTOM_PARTITIONER_FN] = custom_partitioner_fn

        # Custom pass 3: post-grad passes after AOTAutograd
        post_grad_pass_manager = PostGradPassManager()
        post_grad_pass_manager.configure(self.compile_config)

        # Run post-grad custom passes with post_grad_custom_post_pass hook
        if MAGI_POST_GRAD_PASS in self.compile_config.inductor_compile_config:
            existing_pass = self.compile_config.inductor_compile_config[MAGI_POST_GRAD_PASS]
            assert isinstance(existing_pass, PostGradPassManager)
            assert existing_pass.uuid() == post_grad_pass_manager.uuid()

        self.compile_config.inductor_compile_config[MAGI_POST_GRAD_PASS] = post_grad_pass_manager

    def _init_cache(self) -> None:
        """Compute the cache key and create the on-disk cache directory.

        The key folds together the config hash, the compiler-manager hash,
        and a hash of the files Dynamo traced through; ``traced_files`` is
        cleared afterwards so it only reflects the current compilation.
        """
        hash_key = compute_hash(
            [self.compile_config.hash, self.compiler_manager.hash, compute_code_hash(self.compile_config.traced_files)]
        )
        self.compile_config.traced_files.clear()

        # Path: .../model_{idx}_{model_tag}_rank_{rank}/{hash}/{model_tag}/ (last segment = class name or user tag)
        self.local_cache_dir: Path = self.compile_config.cache_dump_path() / hash_key / self.model_tag
        self.local_cache_dir.mkdir(parents=True, exist_ok=True)

        self.compiler_manager.initialize_cache(self.local_cache_dir, self.model_tag)

    def _save_partitioned_graph(self, split_gm: fx.GraphModule) -> None:
        """Dump the split graph as readable Python source into the cache dir (once)."""
        graph_path = self.local_cache_dir / "computation_graph.py"
        if not graph_path.exists():
            # code adapted from
            # https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30
            # use `print_readable` because it can include submodules
            src = "from __future__ import annotations\nimport torch\n" + split_gm.print_readable(print_output=False)
            src = src.replace("<lambda>", "GraphModule")
            with open(graph_path, "w") as f:
                f.write(src)
            magi_logger.info("Computation graph saved to %s", graph_path)

    def _split_graph(self, graph: fx.GraphModule) -> tuple[fx.GraphModule, list[SplitItem]]:
        """Split the Dynamo graph at the configured splitting ops.

        Each splitting op becomes its own single-node subgraph; the runs of
        nodes between them become piecewise subgraphs to compile. Returns
        the root split GraphModule plus SplitItems sorted by graph id.
        """
        # Step 1: resolve the splitting ops
        if self.compile_config.use_inductor_graph_partition:
            # Let Inductor decide partitioning; avoid FX-level pre-splitting.
            fx_split_ops: list[str] = []
        else:
            fx_split_ops = self.compile_config.splitting_ops or []
        resolved_ops: list[torch._ops.OpOverload] = resolve_defined_ops(fx_split_ops)
        magi_logger.info(f"Setting up FX-level graph split with ops: {fx_split_ops=}")
        magi_logger.info(f"Resolved splitting ops for FX-level graph split: {resolved_ops=}")

        # Step 2: split graph by ops, we split graph based on resolved_ops, which becomes the partitioned single graph.
        subgraph_id = 0
        node_to_subgraph_id = {}
        split_op_graphs = []
        for node in graph.graph.nodes:
            if node.op in ("output", "placeholder"):
                continue
            # Match node.target against resolved_ops, node.target can be OpOverloadPacket, need to check .default
            if node.op == "call_function" and (
                node.target in resolved_ops or (hasattr(node.target, "default") and node.target.default in resolved_ops)
            ):
                magi_logger.info(f"Splitting graph at {node=} with {node.target=}")
                # The splitting op gets its own subgraph id; ids on either
                # side differ so neighbouring nodes land in separate pieces.
                subgraph_id += 1
                node_to_subgraph_id[node] = subgraph_id
                split_op_graphs.append(subgraph_id)
                subgraph_id += 1
            else:
                node_to_subgraph_id[node] = subgraph_id

        # Step 3: split the graph based on node_to_subgraph_id
        # pytorch might reorder the nodes and the semantics of the graph will change when we have mutations in the graph, if we don't set keep_original_order=True
        split_gm = torch.fx.passes.split_module.split_module(
            graph, None, lambda node: node_to_subgraph_id[node], keep_original_order=True
        )

        def _extract_example_values(args) -> list:
            # Flatten nested list/tuple structure and collect each arg's
            # recorded example_value (must exist for meta recovery).
            example_values = []

            def _recurse_extract(arg):
                if isinstance(arg, (list, tuple)):
                    for sub_arg in arg:
                        _recurse_extract(sub_arg)
                else:
                    example_value = arg.meta.get("example_value")
                    assert example_value is not None, f"Output arg {arg} has no example_value for tensor_meta recovery"
                    example_values.append(example_value)

            _recurse_extract(args)
            return example_values

        def _format_output_values(values: list):
            # Mirror FX output conventions: None / single value / tuple.
            if not values:
                return None
            return tuple(values) if len(values) > 1 else values[0]

        def _recursive_recover_tensor_meta(gm: fx.GraphModule):
            """
            Recursively restore ``example_value`` metadata for every node in the
            given GraphModule and all of its nested submodules.
            Supports arbitrarily deep submodule nesting.
            """
            for node in gm.graph.nodes:
                if node.meta.get("example_value") is not None:
                    continue

                if node.op == "call_module":
                    submod: fx.GraphModule = getattr(gm, node.target)
                    _recursive_recover_tensor_meta(submod)  # recurse into nested submodules
                    output_node = next(n for n in submod.graph.nodes if n.op == "output")
                    assert output_node is not None, f"Output node not found in submodule {node.target}"
                    output_values = _extract_example_values(output_node.args)
                    node.meta["example_value"] = _format_output_values(output_values)
                elif node.op == "call_function":
                    if "getitem" in str(node.target):
                        prev_node, getitem_index = node.args
                        prev_example_value = prev_node.meta.get("example_value")
                        assert (
                            prev_example_value is not None
                        ), f"Previous node {prev_node} has no example_value for tensor_meta recovery of node {node}"
                        node.meta["example_value"] = prev_example_value[getitem_index]
                    elif "grad" in str(node.target) or "device" in str(node.target):  # not handled for now
                        node.meta["example_value"] = None
                    # NOTE(review): other call_function targets fall through
                    # without setting example_value; the log line below would
                    # then raise KeyError — presumably all targets are covered
                    # in practice. Confirm.
                elif node.op == "output":
                    output_values = _extract_example_values(node.args[0])
                    node.meta["example_value"] = _format_output_values(output_values)

                else:
                    raise ValueError(f"Unsupported node op for tensor_meta recovery: {node.op} for node {node}")

                magi_logger.info(f"Recovered example_value for node {node.name}: {node.meta['example_value']=}")

        # Recover tensor_meta for all nodes in split_gm and its submodules
        if envs.MAGI_ENABLE_PROFILE:
            _recursive_recover_tensor_meta(split_gm)

        # Step 4: fetch all the submodules
        piecewise_graphs = []
        names = [name for (name, module) in split_gm.named_modules()]
        for name in names:
            # Only keep the top-level modules, skip recursive child modules or the root module
            if "." in name or name == "":
                continue

            module = getattr(split_gm, name)
            assert isinstance(module, fx.GraphModule), f"Expected fx.GraphModule, got {type(module)}"

            graph_id = int(name.replace("submod_", ""))
            piecewise_graphs.append(SplitItem(name, graph_id, (graph_id in split_op_graphs), module))
        # sort by integer graph_id, rather than string name
        piecewise_graphs.sort(key=lambda x: x.graph_id)

        # Step 5: visualize the split graph
        # depyf already hooks lazy_format_graph_code and dumps the graph, we do not print the graph here
        # NOTE(review): print_output=True below seems to contradict the comment above — confirm intent.
        lazy_format_graph_code("Before split", graph, print_output=True, include_stride=True, include_device=True)
        lazy_format_graph_code("After split", split_gm, print_output=True, include_stride=True, include_device=True)

        if envs.MAGI_ENABLE_FX_GRAPH_VIZ:
            save_fx_graph_visualization(split_gm.graph, sub_dir="after_split", filename="split_gm_root")
            for item in piecewise_graphs:
                save_fx_graph_visualization(item.graph.graph, sub_dir="after_split", filename=item.submod_name)

        return split_gm, piecewise_graphs

    def __call__(self, graph: fx.GraphModule, example_inputs) -> MagiSerializableFunction:
        """Entry point invoked by Dynamo with the traced graph.

        Pipeline: full-graph passes -> FX split -> compile each piecewise
        submodule -> optional offload/profile/cudagraph wrapping -> return a
        serializable callable. May only be invoked once per instance.
        """
        assert not self._called_once, "MagiBackend can only be called once cause compilation is a one-time process"
        self._called_once = True
        magi_logger.info("Dynamo traced files (for compilation cache):\n%s", "\n".join(self.compile_config.traced_files))
        compilation_counter.num_graphs_seen += 1
        CompileMonitor().mark("Dynamo bytecode transform")

        self._init_cache()

        self.full_graph_pass_manager(graph)

        split_gm, piecewise_graphs = self._split_graph(graph)

        submod_names_to_compile = [item.submod_name for item in piecewise_graphs if not item.is_splitting_graph]
        compilation_counter.num_piecewise_graphs_seen += len(piecewise_graphs)
        compilation_counter.num_piecewise_capturable_graphs_seen += len(submod_names_to_compile)
        magi_logger.info(f"Piecewise modules waiting for compilation: {submod_names_to_compile}")

        # propagate the split graph to the piecewise backend, compile submodules with symbolic shapes
        try:
            PiecewiseCompileInterpreter(split_gm, self.compiler_manager, submod_names_to_compile, self.compile_config).run(
                *example_inputs
            )
        except Exception as e:
            # Centralized failure entry for Magi compile: log at ERROR so it
            # is easy to grep in large-model logs.
            magi_logger.error("Magi compile failed while compiling piecewise submodules %s: %s", submod_names_to_compile, e)
            raise
        self._save_partitioned_graph(split_gm)

        # TODO: Support DBO and NAT here
        # split_gm = DBOGraphModule(split_gm, self.compile_config)
        if self.compile_config.offload_config.model_cpu_offload:
            split_gm = OffloadWrapper(split_gm, self.compile_config)

        # if envs.MAGI_ENABLE_TOKENFLOW:
        #     from magi_compiler.tokenflow.graph_fork import GraphForkWrapper

        if envs.MAGI_ENABLE_PROFILE:
            from magi_compiler.tokenflow.graph_profile import gen_profile_wrap_func

            split_gm = gen_profile_wrap_func(split_gm)

        if self.compile_config.cudagraph_mode == CudaGraphMode.FULL and self.compile_config.cudagraph_copy_inputs:
            return self._serialize_func_with_cudagraph(graph, split_gm, example_inputs)

        return MagiSerializableFunction(graph, example_inputs, self.model_tag, split_gm)

    def _serialize_func_with_cudagraph(
        self, graph: fx.GraphModule, split_gm: fx.GraphModule, example_inputs: list[Any]
    ) -> MagiSerializableFunction:
        """Wrap the whole split graph for FULL-mode CUDA graph capture.

        Records which input positions carry symbolic shapes (needed to copy
        inputs into the capture buffers) before wrapping.
        """
        fake_mode = detect_fake_mode()
        fake_args = [fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t for t in example_inputs]

        self.sym_tensor_indices = detect_symbolic_tensor_indices(fake_args)

        wrapped_split_gm = gen_wrap_func_for_cudagraph(func=split_gm, mode_prefix=CudaGraphMode.FULL.name.lower())

        return MagiSerializableFunction(graph, example_inputs, self.model_tag, wrapped_split_gm)
586
+
587
+
588
def init_backend(compile_config: CompileConfig) -> str | Callable:
    """
    Initialize the backend based on CompileConfig.

    Returns either a registered torch backend name (TORCH_COMPILE mode) or
    a fresh ``MagiBackend`` instance (MAGI_COMPILE mode); raises ValueError
    when no usable compilation mode is configured.
    """
    mode = compile_config.compile_mode
    if mode is None or mode == CompileMode.NONE:
        raise ValueError("No compilation mode is set.")

    from torch._dynamo.backends.registry import list_backends

    available = list_backends(exclude_tags=tuple())
    magi_logger.info("Supported torch backends: %s", available)

    if mode == CompileMode.TORCH_COMPILE:
        # Plain torch.compile: hand back the backend name unchanged.
        assert compile_config.backend in available, f"Invalid backend for torch compilation: {compile_config.backend}"
        return compile_config.backend

    if mode == CompileMode.MAGI_COMPILE:
        assert compile_config.backend in ["eager", "inductor"], f"Invalid backend for MagiCompiler: {compile_config.backend}"
        tag = getattr(compile_config, "model_tag", None) or MAGI_MODEL_TAG
        return MagiBackend(compile_config, model_tag=tag)

    raise ValueError(f"Invalid compile mode: {compile_config.compile_mode}")
pkgs/MagiCompiler/magi_compiler/magi_compiler_base.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
17
+
18
+ import inspect
19
+ import os
20
+ import sys
21
+ from abc import abstractmethod
22
+ from contextlib import contextmanager
23
+ from types import CodeType
24
+ from typing import Callable, Literal
25
+
26
+ import magi_compiler.utils.envs as envs
27
+ import torch
28
+ from magi_compiler.utils import compute_hash, get_git_version
29
+ from magi_compiler.utils.compile_time_monitor import CompileMonitor
30
+
31
+ from .config import CompileConfig, CompileMode
32
+ from .magi_backend import init_backend
33
+ from .utils import compute_code_hash, compute_code_hash_with_content, magi_logger
34
+
35
+
36
def _verify_source_unchanged(source_info, compile_config: CompileConfig) -> None:
    """Raise if any traced source file changed since the artifact was saved.

    Side effect: every inlined source file path is recorded into
    ``compile_config.traced_files``. The checksum of the serialized file
    contents must match the checksum of the files currently on disk.
    """
    recorded = {}
    for entry in source_info.inlined_sources:
        path = inspect.getfile(sys.modules[entry.module])
        recorded[path] = entry.content
        compile_config.traced_files.add(path)

    snapshot_checksum = compute_code_hash_with_content(recorded)
    on_disk_checksum = compute_code_hash(set(recorded.keys()))
    if snapshot_checksum != on_disk_checksum:
        raise RuntimeError("Source code has changed since the last compilation. Recompiling the model.")
47
+
48
+
49
class MagiCompilerBase:
    """
    A wrapper class for torch.compile, with a custom dispatch logic.
    Subclasses should:
    1. Implement the forward method
    2. Implement the dispatch logic in the __call__ method
       It can use `self.compiled_codes` to access the compiled bytecode,
       and `with self.dispatch_to_compiled_code:` to dispatch to
       the compiled code.
    3. Implement the `__init__` method to determine how to call
       `torch.compile` over the forward method.
    """

    # NOTE: the docstring above was previously placed *after* this
    # annotation, which made it a dead string statement instead of __doc__.
    compile_config: CompileConfig

    def __init__(self, compile_config: CompileConfig):
        # Store the config: aot_compilation_path, bytecode_hook and
        # try_load_aot_compile_artifacts all read self.compile_config.
        # (Previously never assigned here, relying on subclasses to do it.)
        self.compile_config = compile_config
        backend = init_backend(compile_config)
        options = None
        if isinstance(backend, str) and backend == "inductor":
            options = compile_config.inductor_compile_config
        if envs.MAGI_AOT_COMPILE:
            options = options or {}
            # Drop all the guards in the AOT compile mode as bytecode hook is not used anymore.
            options["guard_filter_fn"] = lambda guards: [False for _ in guards]
            assert hasattr(torch._dynamo.config, "enable_aot_compile"), "enable_aot_compile config not available"
            torch._dynamo.config.enable_aot_compile = True

        self.compiled_callable = torch.compile(self.forward, fullgraph=True, backend=backend, options=options)
        self.original_code_object: CodeType = self.__class__.forward.__code__
        self.compiled_code: CodeType | None = None
        self.aot_compiled_fn: Callable | None = None

    @property
    def aot_compilation_path(self) -> str:
        """
        When using torch.compile in AOT mode, we store the cache artifacts
        under cache_root_dir/torch_aot_compile/{hash}/rank_i_j. The {hash}
        contains all of the factors except for the source files being
        traced through, because we don't actually know which source files
        to check at this point (before dynamo runs).
        On loading we will actually look at the source files being traced
        through. If any source file have changed (compared with the
        serialized backend artifacts), then we need to generate a new AOT
        compile artifact from scratch.
        """
        hash_key = compute_hash([self.forward, self.compile_config.model_idx, self.compile_config.hash, get_git_version()])
        cache_dir = os.path.join(self.compile_config.cache_root_dir, "torch_aot_compile", hash_key)
        rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
        cache_dir = os.path.join(cache_dir, f"rank_{rank}")
        os.makedirs(cache_dir, exist_ok=True)
        aot_compilation_path = os.path.join(cache_dir, "model")
        return aot_compilation_path

    def try_load_aot_compile_artifacts(self) -> Callable | None:
        """Load a previously serialized AOT-compiled function, if one exists.

        Returns the cached callable (memoized on first successful load) or
        None when no artifact is on disk. Raises RuntimeError from
        `_verify_source_unchanged` if a traced source file changed since
        the artifact was serialized.
        """
        if self.aot_compiled_fn is not None:
            return self.aot_compiled_fn
        if not os.path.exists(self.aot_compilation_path):
            return None
        with open(self.aot_compilation_path, "rb") as f:
            CompileMonitor().start(
                self.compile_config.compile_mode == CompileMode.MAGI_COMPILE, self.compile_config.debug_dump_path()
            )
            loaded_fn = torch.compiler.load_compiled_function(f)
        # Verify BEFORE memoizing so a stale artifact is never cached.
        _verify_source_unchanged(loaded_fn.source_info(), self.compile_config)
        # FIX: previously the memo checked at the top of this method was
        # never populated; cache the verified function here.
        self.aot_compiled_fn = loaded_fn
        return loaded_fn

    def aot_compile(self, *args, **kwargs):
        """
        Run the model in AOT (Ahead-Of-Time) compile mode.

        All compilation work is completed before execution, suitable for production environment.
        This results in longer compilation time but superior runtime performance.
        """
        assert hasattr(self.compiled_callable, "aot_compile"), "aot_compile is not supported by the current configuration"
        return self.compiled_callable.aot_compile((args, kwargs))

    def jit_compile(self, *args, **kwargs):
        """
        Run the model in JIT (Just-In-Time) compile mode.

        Compilation occurs at runtime, first run may be slower due to compilation overhead.
        """
        handle = torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook)
        # FIX: remove the hook even if compilation/execution raises, so a
        # failed run does not leave a stale hook registered globally.
        try:
            return self.compiled_callable(*args, **kwargs)
        finally:
            handle.remove()

    @abstractmethod
    def forward(self, *args, **kwargs):
        ...

    def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
        """Hook to save the compiled bytecode for direct execution."""
        if old_code is not self.original_code_object:
            return
        # Step1: Check if the old bytecode is from the compiled code
        # code borrowed from depyf enable_debugging.py
        frame = sys._getframe()
        while frame and frame.f_back:
            frame = frame.f_back
            code_name = frame.f_code.co_name
            file_name = frame.f_code.co_filename.split(os.path.sep)[-1]
            if code_name == "_compile" and file_name == "convert_frame.py":
                break
            frame = frame.f_locals["frame"]
        assert frame.f_code == old_code

        # FIX: `hasattr(frame.f_locals, "self")` looked for an *attribute*
        # named "self" on the locals dict, which is always False, so this
        # guard never fired. Key membership is what was intended.
        if "self" in frame.f_locals and frame.f_locals["self"] is not self:
            return

        # Step2: Save the compiled bytecode
        self.compiled_code = new_code

        # Step3: Save the decompiled code
        path = self.compile_config.debug_dump_path()
        decompiled_file = os.path.join(path, "decompiled_code.py")
        if os.path.exists(decompiled_file):
            return
        try:
            # usually the decompilation will succeed for most models, as we guarantee a full-graph compilation in Dynamo.
            # but there's no 100% guarantee, since decompliation is not a reversible process.
            from magi_compiler.magi_depyf import decompile as magi_decompile

            src = magi_decompile(new_code)
            with open(decompiled_file, "w") as f:
                f.write(src)
            magi_logger.info("Dynamo transformed code saved to %s", decompiled_file)
        except Exception:
            # Best-effort: decompilation failure must never break compilation.
            pass

    @contextmanager
    def dispatch_to_compiled_fwd(self, mode: Literal["jit", "aot"] = "jit"):
        """
        Context manager to dispatch to the compiled code.
        Why does this work? Because Dynamo guarantees that the compiled
        bytecode has exactly the same arguments, cell variables, and free
        variables as the original code. Therefore we can directly switch
        the code object in the function and call it.

        See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7
        for more details.

        NOTE: Why compile `forward` but invoke through `old_call`?

        In torch.nn.Module, `__call__` wraps `forward` with critical runtime logic:
        - Pre/post forward hooks
        - FSDP parameter sharding/gathering and device placement

        Our strategy: use this context manager to temporarily replace `self.forward`
        with the compiled version, then invoke `old_call(self, *args, **kwargs)`.

        This way:
        1. `old_call` executes hooks and FSDP mechanics normally
        2. When `old_call` internally calls `self.forward`, it hits our compiled code
        3. Compiled code runs within the proper FSDP/hook context

        Calling `self.forward()` directly would bypass FSDP (seeing sharded/invalid
        params) and skip hooks that other components may rely on.
        """
        if mode == "jit":
            assert self.compiled_code is not None
            self.__class__.forward.__code__ = self.compiled_code
            # FIX: restore in a finally block — otherwise an exception in the
            # managed body leaves the *class-wide* forward permanently
            # pointing at the compiled bytecode.
            try:
                yield
            finally:
                self.__class__.forward.__code__ = self.original_code_object
        elif mode == "aot":
            assert self.aot_compiled_fn is not None
            old_forward = self.forward
            self.forward = lambda *args, **kwargs: self.aot_compiled_fn(self, *args, **kwargs)
            try:
                yield
            finally:
                self.forward = old_forward
        else:
            raise ValueError(f"Invalid mode: {mode}")
pkgs/MagiCompiler/magi_compiler/magi_depyf/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """magi_depyf — a modern bytecode decompiler and torch.compile inspector."""
16
+
17
+ from .decompile import DecompilationError, Decompiler, decompile, safe_decompile
18
+
19
+ __version__ = "0.1.0"
20
+
21
+ __all__ = ["Decompiler", "decompile", "safe_decompile", "DecompilationError", "__version__"]
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Decompilation: bytecode → Python source, plus recompile/fix/postprocess."""
16
+
17
+ from .decompiler import DecompilationError, Decompiler, decompile, safe_decompile
18
+
19
+ __all__ = ["Decompiler", "decompile", "safe_decompile", "DecompilationError"]
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Bytecode processing — pure Python, no torch dependency."""
16
+
17
+ from .decompile_context import DecompileContext
18
+ from .handler_registry import HandlerRegistry, registry
19
+ from .instruction import Instruction
20
+ from .source_emitter import SourceEmitter
21
+
22
+ __all__ = ["Instruction", "SourceEmitter", "HandlerRegistry", "DecompileContext", "registry"]
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/decompile_context.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """DecompileContext — read-only bag passed to every handler.
16
+
17
+ Handlers receive ``(emitter, inst, ctx)`` — they mutate *emitter*
18
+ and call *ctx* methods but never touch the ``Decompiler`` directly.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from types import CodeType
24
+ from typing import TYPE_CHECKING, Callable, Dict, Tuple
25
+
26
+ if TYPE_CHECKING:
27
+ from .instruction import Instruction
28
+
29
+
30
class DecompileContext:
    """Read-only bag of per-code-object state handed to opcode handlers.

    Exposes the code object being decompiled, its instruction sequence, the
    current indentation level, and the ``decompile_range`` callback used for
    recursive sub-block decompilation.
    """

    def __init__(
        self,
        code: CodeType,
        instructions: Tuple["Instruction", ...],
        indentation: int,
        decompile_range: Callable,
        offset_to_index: Dict[int, int],
    ) -> None:
        # Attributes consumed directly by handlers.
        self.code = code
        self.instructions = instructions
        self.indentation = indentation
        self.decompile_range = decompile_range
        # Private map: bytecode offset -> position within `instructions`.
        self._offset_to_index = offset_to_index

    def index_of(self, offset: int) -> int:
        """Return the index of the instruction at *offset* (O(1) lookup)."""
        if offset not in self._offset_to_index:
            raise ValueError(f"No instruction at offset {offset}")
        return self._offset_to_index[offset]
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handler_registry.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """HandlerRegistry — opcode-to-handler dispatch.
16
+
17
+ A *handler* is a plain function with signature::
18
+
19
+ (emitter: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> Optional[int]
20
+
21
+ Returning ``None`` advances to the next instruction.
22
+ Returning an ``int`` jumps to that instruction index.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import TYPE_CHECKING, Callable, List, Optional
28
+
29
+ if TYPE_CHECKING:
30
+ pass
31
+
32
+ HandlerFn = Callable[..., Optional[int]]
33
+
34
+
35
class HandlerRegistry:
    """Dispatch table mapping opcode names to their handler functions."""

    def __init__(self) -> None:
        # opname -> handler; populated exclusively via `register`.
        self._handlers: dict[str, HandlerFn] = {}

    def register(self, *opnames: str) -> Callable[[HandlerFn], HandlerFn]:
        """Decorator that registers *fn* for one or more opcode names."""

        def decorator(fn: HandlerFn) -> HandlerFn:
            for opname in opnames:
                self._handlers[opname] = fn
            return fn

        return decorator

    def get(self, opname: str) -> Optional[HandlerFn]:
        """Return the handler registered for *opname*, or None."""
        return self._handlers.get(opname)

    def __contains__(self, opname: str) -> bool:
        return opname in self._handlers

    def supported_opnames(self) -> List[str]:
        """All registered opcode names, sorted alphabetically."""
        return sorted(self._handlers)
59
+
60
+
61
+ # Singleton registry — handlers register against this at import time.
62
+ registry = HandlerRegistry()
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Import every handler module so they register against the global registry."""
16
+
17
+ from . import arithmetic # noqa: F401
18
+ from . import calls # noqa: F401
19
+ from . import containers # noqa: F401
20
+ from . import control_flow # noqa: F401
21
+ from . import load_store # noqa: F401
22
+ from . import stack_ops # noqa: F401
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/arithmetic.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Handlers for unary, binary, inplace, and comparison operations."""
16
+
17
+ from __future__ import annotations
18
+
19
+ from ..decompile_context import DecompileContext
20
+ from ..handler_registry import registry
21
+ from ..instruction import Instruction
22
+ from ..source_emitter import SourceEmitter
23
+
24
_reg = registry.register

# ── Unary ─────────────────────────────────────────────────────────────────

# Opcode name -> prefix operator text.
_UNARY_SYMBOLS = {"UNARY_NEGATIVE": "-", "UNARY_POSITIVE": "+", "UNARY_INVERT": "~", "UNARY_NOT": "not"}


@_reg(*_UNARY_SYMBOLS)
def _unary(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop TOS and push the corresponding parenthesized prefix expression."""
    operand = em.pop()
    symbol = _UNARY_SYMBOLS[inst.opname]
    em.push(f"({symbol} {operand})")


@_reg("GET_LEN")
def _get_len(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Push ``len(TOS)`` on top of the stack, leaving TOS itself in place."""
    em.push(f"len({em.peek()})")


# ── Binary ────────────────────────────────────────────────────────────────

# Opcode name -> infix operator text (specialized binary opcodes of older CPython).
_BINARY_SYMBOLS = {
    "BINARY_MULTIPLY": "*",
    "BINARY_ADD": "+",
    "BINARY_SUBTRACT": "-",
    "BINARY_TRUE_DIVIDE": "/",
    "BINARY_FLOOR_DIVIDE": "//",
    "BINARY_MODULO": "%",
    "BINARY_POWER": "**",
    "BINARY_AND": "&",
    "BINARY_OR": "|",
    "BINARY_XOR": "^",
    "BINARY_LSHIFT": "<<",
    "BINARY_RSHIFT": ">>",
    "BINARY_MATRIX_MULTIPLY": "@",
}


@_reg(*_BINARY_SYMBOLS)
def _binary(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop two operands and push the parenthesized infix expression."""
    right = em.pop()
    left = em.pop()
    symbol = _BINARY_SYMBOLS[inst.opname]
    em.push(f"({left} {symbol} {right})")


@_reg("BINARY_SUBSCR")
def _subscr(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop index and object; push the subscription expression ``obj[index]``."""
    index = em.pop()
    obj = em.pop()
    em.push(f"{obj}[{index}]")


@_reg("BINARY_SLICE")
def _slice(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop stop, start, and container; push ``container[start:stop]``."""
    stop = em.pop()
    start = em.pop()
    container = em.pop()
    em.push(f"{container}[{start}:{stop}]")
80
+
81
+
82
+ # ── Inplace ───────────────────────────────────────────────────────────────
83
+
84
# Opcode name -> augmented-assignment operator text (pre-unified inplace opcodes).
_INPLACE_SYMBOLS = {
    "INPLACE_MULTIPLY": "*",
    "INPLACE_ADD": "+",
    "INPLACE_SUBTRACT": "-",
    "INPLACE_TRUE_DIVIDE": "/",
    "INPLACE_FLOOR_DIVIDE": "//",
    "INPLACE_MODULO": "%",
    "INPLACE_POWER": "**",
    "INPLACE_AND": "&",
    "INPLACE_OR": "|",
    "INPLACE_XOR": "^",
    "INPLACE_LSHIFT": "<<",
    "INPLACE_RSHIFT": ">>",
    "INPLACE_MATRIX_MULTIPLY": "@",
}


@_reg(*_INPLACE_SYMBOLS)
def _inplace(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit an augmented-assignment statement and leave the target on the stack."""
    right = em.pop()
    target = em.pop()
    symbol = _INPLACE_SYMBOLS[inst.opname]
    em.emit(f"{target} {symbol}= {right}")
    em.push(target)


@_reg("BINARY_OP")
def _binary_op(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Unified BINARY_OP: ``argrepr`` carries the operator text.

    Augmented variants ("+=", ...) contain "=" and become statements; plain
    variants become parenthesized expressions.
    """
    right = em.pop()
    left = em.pop()
    if "=" not in inst.argrepr:
        em.push(f"({left} {inst.argrepr} {right})")
    else:
        em.emit(f"{left} {inst.argrepr} {right}")
        em.push(left)


# ── Comparison ────────────────────────────────────────────────────────────


@_reg("COMPARE_OP")
def _compare(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop two operands and push a comparison; the operator text is in ``argval``."""
    right = em.pop()
    left = em.pop()
    em.push(f"({left} {inst.argval} {right})")


@_reg("IS_OP")
def _is_op(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Identity test: argval 0 -> ``is``, otherwise ``is not``."""
    right = em.pop()
    left = em.pop()
    keyword = "is" if not inst.argval else "is not"
    em.push(f"({left} {keyword} {right})")


@_reg("CONTAINS_OP")
def _contains(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Membership test: argval 0 -> ``in``, otherwise ``not in``."""
    right = em.pop()
    left = em.pop()
    keyword = "in" if not inst.argval else "not in"
    em.push(f"({left} {keyword} {right})")
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/calls.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Handlers for function-call and function-creation opcodes."""
16
+
17
+ from __future__ import annotations
18
+
19
+ import sys
20
+ from typing import Optional
21
+
22
+ from ..decompile_context import DecompileContext
23
+ from ..handler_registry import registry
24
+ from ..instruction import Instruction
25
+ from ..source_emitter import SourceEmitter
26
+
27
+ _reg = registry.register
28
+
29
+
30
+ @_reg("KW_NAMES")
31
+ def _kw_names(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
32
+ # Python 3.11+ instruction that passes keyword argument names to the subsequent CALL.
33
+ # inst.arg indexes into co_consts for the key-name tuple, e.g. ('y', 'z').
34
+ # Push repr so it becomes the string "('y', 'z')"; the CALL handler later eval()s it back to a tuple.
35
+ names = ctx.code.co_consts[inst.arg]
36
+ em.push(repr(names))
37
+
38
+
39
+ @_reg("CALL")
40
+ def _call(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
41
+ """Python 3.11+ unified CALL.
42
+
43
+ 3.12 stack layout: [NULL, callable, arg0, ..., argN-1] (KW_NAMES precedes)
44
+ 3.11 stack layout: [NULL, callable, arg0, ..., argN-1] (KW_NAMES → PRECALL → CALL)
45
+ """
46
+ # Check whether KW_NAMES precedes CALL (indicating keyword arguments exist).
47
+ # 3.12: KW_NAMES → CALL; 3.11: KW_NAMES → PRECALL → CALL
48
+ preceding = [x for x in ctx.instructions if x.offset < inst.offset]
49
+ has_kw = False
50
+ if preceding:
51
+ if preceding[-1].opname == "KW_NAMES" or (
52
+ len(preceding) > 1
53
+ and preceding[-2].opname == "KW_NAMES"
54
+ and preceding[-1].opname == "PRECALL" # 3.11 transitional opcode, removed in 3.12
55
+ ):
56
+ has_kw = True
57
+
58
+ kw_names: tuple = ()
59
+ if has_kw:
60
+ kw_names = eval(em.pop()) # retrieve the tuple stored by KW_NAMES from the stack
61
+ args = [em.pop() for _ in range(inst.argval)][::-1]
62
+ pos_args = args[: len(args) - len(kw_names)]
63
+ kw_args = args[len(args) - len(kw_names) :]
64
+ kwcalls = [f"{n}={v}" for n, v in zip(kw_names, kw_args)]
65
+ func = em.pop()
66
+ # 3.11+ PUSH_NULL / LOAD_GLOBAL(NULL+name) pushes a NULL sentinel before the call.
67
+ # After popping the callable, the top of stack may be NULL (represented as None); clear it.
68
+ if em.stack_size and em.peek() is None:
69
+ em.pop()
70
+ # GET_ITER produces "iter(x)"; if func happens to be "iter(x)" it is actually an argument
71
+ # (e.g. in the next(iter(x)) pattern), and the real callable is further down the stack.
72
+ if "iter(" in str(func):
73
+ pos_args = [func]
74
+ func = em.pop()
75
+ em.push(f"{func}({', '.join(pos_args + kwcalls)})")
76
+ # replace_tos_with_temp: the call result may be referenced multiple times (assignment, passing,
77
+ # method call), so store it in a temp to avoid repeated evaluation and side effects.
78
+ em.replace_tos_with_temp()
79
+
80
+
81
+ @_reg("CALL_FUNCTION", "CALL_METHOD")
82
+ def _call_legacy(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
83
+ """CALL_FUNCTION / CALL_METHOD (Python ≤3.10)."""
84
+ args = [em.pop() for _ in range(inst.argval)][::-1]
85
+ func = em.pop()
86
+ em.push(f"{func}({', '.join(args)})")
87
+ em.replace_tos_with_temp()
88
+
89
+
90
+ @_reg("CALL_FUNCTION_KW")
91
+ def _call_function_kw(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
92
+ kw_args = eval(em.pop())
93
+ kw_vals = [em.pop() for _ in range(len(kw_args))]
94
+ kw_vals.reverse()
95
+ kwcalls = [f"{n}={v}" for n, v in zip(kw_args, kw_vals)]
96
+ pos_args = [em.pop() for _ in range(inst.argval - len(kw_args))][::-1]
97
+ func = em.pop()
98
+ em.push(f"{func}({', '.join(pos_args + kwcalls)})")
99
+ em.replace_tos_with_temp()
100
+
101
+
102
+ @_reg("CALL_FUNCTION_EX")
103
+ def _call_function_ex(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
104
+ # 3.11+ stack: [NULL, func, args (, kwargs)]
105
+ # After popping func, clear the NULL sentinel before pushing the result
106
+ if inst.argval == 0:
107
+ a = em.pop()
108
+ f = em.pop()
109
+ if em.stack_size and em.peek() is None:
110
+ em.pop()
111
+ em.push(f"{f}(*{a})")
112
+ elif inst.argval == 1:
113
+ kw = em.pop()
114
+ a = em.pop()
115
+ f = em.pop()
116
+ if em.stack_size and em.peek() is None:
117
+ em.pop()
118
+ em.push(f"{f}(*{a}, **{kw})")
119
+ em.replace_tos_with_temp()
120
+
121
+
122
+ @_reg("CALL_INTRINSIC_1")
123
+ def _intrinsic_1(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
124
+ """Python 3.12 instruction replacing some internal C-level calls.
125
+ argrepr identifies the specific operation, e.g. INTRINSIC_PRINT, INTRINSIC_UNARY_POSITIVE.
126
+ Most are compiler-internal operations (import *, typealias) rarely triggered by user code."""
127
+ _SKIP = {
128
+ "INTRINSIC_1_INVALID",
129
+ "INTRINSIC_IMPORT_STAR",
130
+ "INTRINSIC_STOPITERATION_ERROR",
131
+ "INTRINSIC_ASYNC_GEN_WRAP",
132
+ "INTRINSIC_TYPEVAR",
133
+ "INTRINSIC_PARAMSPEC",
134
+ "INTRINSIC_TYPEVARTUPLE",
135
+ "INTRINSIC_SUBSCRIPT_GENERIC",
136
+ "INTRINSIC_TYPEALIAS",
137
+ }
138
+ if inst.argrepr in _SKIP:
139
+ return
140
+ if inst.argrepr == "INTRINSIC_PRINT":
141
+ em.emit(f"print({em.pop()})")
142
+ em.push("None")
143
+ elif inst.argrepr == "INTRINSIC_UNARY_POSITIVE":
144
+ em.set_at(0, f"+{em.peek()}")
145
+ elif inst.argrepr == "INTRINSIC_LIST_TO_TUPLE":
146
+ em.push(f"tuple({em.pop()})")
147
+
148
+
149
+ # ── MAKE_FUNCTION ─────────────────────────────────────────────────────────
150
+
151
+
152
+ @_reg("MAKE_FUNCTION")
153
+ def _make_function(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> Optional[int]:
154
+ """Handle bytecode for def inner(...) and lambda.
155
+
156
+ Bytecode: LOAD_CONST <code_object> → MAKE_FUNCTION → STORE_FAST name
157
+ The handler recursively decompiles the inner code object and emits the full def statement.
158
+ """
159
+ if sys.version_info < (3, 11):
160
+ # 3.10: qualified_name string is still on the stack
161
+ qual_name = em.pop()
162
+ try:
163
+ qual_name = eval(qual_name)
164
+ except Exception:
165
+ pass
166
+ func_name = qual_name.split(".")[-1]
167
+ if "<" in func_name: # <lambda>, <listcomp>, etc. — invalid identifiers
168
+ em.emit(f'"original function name {func_name} is illegal, use a temp name."')
169
+ func_name = em.make_temp()
170
+ else:
171
+ func_name = em.make_temp()
172
+
173
+ code = em.pop() # inner CodeType object pushed by LOAD_CONST
174
+ # argval bit flags indicate whether extra function components remain on the stack
175
+ if inst.argval & 0x08:
176
+ em.pop() # closure tuple (cell references for freevars)
177
+ if inst.argval & 0x04:
178
+ em.pop() # annotations dict
179
+ if inst.argval & 0x02:
180
+ em.pop() # keyword-only defaults tuple
181
+ if inst.argval & 0x01:
182
+ em.pop() # positional defaults tuple
183
+
184
+ # If the next instruction is STORE_FAST, use the target variable name as the function name
185
+ this_idx = ctx.index_of(inst.offset)
186
+ immediately_used = False
187
+ if ctx.instructions[this_idx + 1].opname == "STORE_FAST":
188
+ func_name = ctx.instructions[this_idx + 1].argval
189
+ immediately_used = True
190
+
191
+ # Recurse: create a new Decompiler instance for the inner code object
192
+ from ...decompiler import Decompiler
193
+
194
+ inner = Decompiler(code).decompile(overwrite_fn_name=func_name)
195
+ em.emit_raw(inner)
196
+
197
+ if immediately_used:
198
+ return this_idx + 2 # skip the MAKE_FUNCTION + STORE_FAST pair
199
+ em.push(func_name) # not immediately assigned — push onto stack for later use (e.g. as an argument)
200
+ return None
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/containers.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Handlers for BUILD_*, UNPACK_*, LIST_EXTEND/APPEND, SET_ADD, MAP_ADD,
16
+ FORMAT_VALUE, and BUILD_SLICE / BUILD_STRING."""
17
+
18
+ from __future__ import annotations
19
+
20
+ import sys
21
+
22
+ from ..decompile_context import DecompileContext
23
+ from ..handler_registry import registry
24
+ from ..instruction import Instruction
25
+ from ..source_emitter import SourceEmitter
26
+
27
+ _reg = registry.register
28
+
29
+
30
+ # ── BUILD tuple / list / set ──────────────────────────────────────────────
31
+
32
+
33
+ def _safe_str(val) -> str:
34
+ """Convert a stack value to string, handling None sentinels from PUSH_NULL."""
35
+ return "None" if val is None else str(val)
36
+
37
+
38
@_reg("BUILD_TUPLE", "BUILD_TUPLE_UNPACK", "BUILD_TUPLE_UNPACK_WITH_CALL")
def _build_tuple(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop argval elements and push a tuple display (starred for UNPACK variants)."""
    elems = [_safe_str(em.pop()) for _ in range(inst.argval)]
    elems.reverse()
    if "UNPACK" in inst.opname:
        elems = [f"*{e}" for e in elems]
    if inst.argval == 1:
        em.push(f"({elems[0]},)")  # single-element tuple needs the trailing comma
    else:
        em.push(f"({', '.join(elems)})")


@_reg("BUILD_LIST", "BUILD_LIST_UNPACK")
def _build_list(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop argval elements and push a list display; the result lives in a temp."""
    elems = [_safe_str(em.pop()) for _ in range(inst.argval)]
    elems.reverse()
    if "UNPACK" in inst.opname:
        elems = [f"*{e}" for e in elems]
    em.push(f"[{', '.join(elems)}]")
    em.replace_tos_with_temp()


@_reg("BUILD_SET", "BUILD_SET_UNPACK")
def _build_set(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop argval elements and push a set display (``set()`` when empty)."""
    if inst.argval:
        elems = [em.pop() for _ in range(inst.argval)]
        elems.reverse()
        if "UNPACK" in inst.opname:
            elems = [f"*{e}" for e in elems]
        em.push(f"{{{', '.join(elems)}}}")
    else:
        em.push("set()")  # "{}" would be a dict display
    em.replace_tos_with_temp()


# ── BUILD map ─────────────────────────────────────────────────────────────


@_reg("BUILD_MAP")
def _build_map(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop argval key/value pairs (interleaved) and push a dict display."""
    flat = [em.pop() for _ in range(inst.argval * 2)]
    flat.reverse()
    pairs = zip(flat[::2], flat[1::2])
    em.push(f"{{{', '.join(f'{k}: {v}' for k, v in pairs)}}}")
    em.replace_tos_with_temp()


@_reg("BUILD_MAP_UNPACK", "BUILD_MAP_UNPACK_WITH_CALL")
def _build_map_unpack(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Pop argval mappings and push a ``{**a, **b}`` merge display."""
    if inst.argval:
        mappings = [em.pop() for _ in range(inst.argval)]
        mappings.reverse()
        em.push(f"{{{', '.join(f'**{m}' for m in mappings)}}}")
    else:
        em.push("dict()")
    em.replace_tos_with_temp()


@_reg("BUILD_CONST_KEY_MAP")
def _const_key_map(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Keys arrive as one constant tuple (repr on the stack); values are popped singly."""
    key_tuple = eval(em.pop())
    values = [em.pop() for _ in range(inst.argval)]
    values.reverse()
    em.push(f"{{{', '.join(f'{k!r}: {v}' for k, v in zip(key_tuple, values))}}}")
    em.replace_tos_with_temp()


@_reg("BUILD_STRING")
def _build_string(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """f-string assembly: pop argval fragments and join with ``+`` concatenation."""
    pieces = [em.pop() for _ in range(inst.argval)]
    pieces.reverse()
    em.push(" + ".join(pieces))


@_reg("BUILD_SLICE")
def _build_slice(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Push a ``slice(...)`` call built from two or three popped components."""
    top = em.pop()
    below = em.pop()
    if inst.argval == 2:
        em.push(f"slice({below}, {top})")
    elif inst.argval == 3:
        start = em.pop()
        em.push(f"slice({start}, {below}, {top})")


@_reg("LIST_TO_TUPLE")
def _list_to_tuple(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Wrap TOS in ``tuple(...)``."""
    em.push(f"tuple({em.pop()})")
116
+
117
+
118
+ # ── Mutating container ops ────────────────────────────────────────────────
119
+
120
+
121
@_reg("LIST_EXTEND")
def _list_extend(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit ``temp.extend(values)`` against the list argval slots down the stack."""
    extension = em.pop()
    list_temp = em.replace_tos_with_temp(depth=inst.argval)
    em.emit(f"{list_temp}.extend({extension})")


@_reg("LIST_APPEND")
def _list_append(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Comprehension-style append: the list sits argval slots below the value."""
    depth = 2 if inst.argval == 1 else inst.argval
    target = em.stack[-depth]  # read before popping the value
    item = em.pop()
    em.emit(f"{target}.append({item})")


@_reg("SET_UPDATE", "DICT_UPDATE", "DICT_MERGE")
def _generic_update(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit ``temp.update(values)`` on the container just below the popped values."""
    assert inst.argval == 1, "Only tested for argval==1"
    update_src = em.pop()
    container_temp = em.replace_tos_with_temp()
    em.emit(f"{container_temp}.update({update_src})")


@_reg("SET_ADD")
def _set_add(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Comprehension-style add: the set sits argval slots below the value."""
    depth = 2 if inst.argval == 1 else inst.argval
    target = em.stack[-depth]  # read before popping the value
    item = em.pop()
    em.emit(f"{target}.add({item})")


@_reg("MAP_ADD")
def _map_add(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Dict-comprehension insert; key/value pop order flipped in Python 3.8."""
    target = em.stack[-inst.argval - 1]
    if sys.version_info >= (3, 8):
        item = em.pop()
        key = em.pop()
    else:
        key = em.pop()
        item = em.pop()
    em.emit(f"{target}.__setitem__({key}, {item})")
162
+
163
+
164
+ # ── Unpack ────────────────────────────────────────────────────────────────
165
+
166
+
167
@_reg("UNPACK_SEQUENCE")
def _unpack_seq(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit ``t0, t1, ... = source`` into fresh temps; push temps so TOS is the first element."""
    source = em.pop()
    targets = [em.make_temp() for _ in range(inst.argval)]
    em.emit("".join(f"{t}, " for t in targets) + f"= {source}")
    for target in reversed(targets):
        em.push(target)


@_reg("UNPACK_EX")
def _unpack_ex(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit ``t0, ..., *rest = source``; the star temp goes on the stack first."""
    source = em.pop()
    targets = [em.make_temp() for _ in range(inst.argval)]
    rest = em.make_temp()
    em.emit(f"{', '.join(targets)}, *{rest} = {source}")
    em.push(rest)
    for target in reversed(targets):
        em.push(target)


# ── Format ────────────────────────────────────────────────────────────────


@_reg("FORMAT_VALUE")
def _format_value(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """f-string conversion: ``argval`` is (conversion function, has-format-spec flag)."""
    conv, has_spec = inst.argval
    if has_spec:
        spec = em.pop()
        value = em.pop()
        em.push(f"format({value}, {spec})")
    else:
        value = em.pop()
        conv_fn = str if conv is None else conv
        em.push(f"{conv_fn.__name__}({value})")
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/control_flow.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Handlers for control-flow opcodes: jumps, if/else, for, return, yield, raise."""
16
+
17
+ from __future__ import annotations
18
+
19
+ from typing import Optional
20
+
21
+ from ..decompile_context import DecompileContext
22
+ from ..handler_registry import registry
23
+ from ..instruction import Instruction
24
+ from ..source_emitter import LoopContext, SourceEmitter
25
+
26
+ _reg = registry.register
27
+
28
+
29
+ # ── Simple returns / yield / raise ────────────────────────────────────────
30
+
31
+
32
@_reg("RETURN_VALUE")
def _return_value(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit ``return TOS`` and drop TOS."""
    em.emit(f"return {em.pop()}")


@_reg("RETURN_CONST")
def _return_const(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Return a constant directly: the value comes from ``argval``, not the stack."""
    em.emit(f"return {inst.argval!r}")


@_reg("YIELD_VALUE")
def _yield_value(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit ``yield TOS``; TOS is left on the stack."""
    import sys

    if sys.version_info >= (3, 12):
        raise NotImplementedError("YIELD_VALUE is not supported in Python 3.12+")
    em.emit(f"yield {em.peek()}")


@_reg("RETURN_GENERATOR")
def _return_generator(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Python 3.11+ generator function prologue. Each generator has its own stack frame;
    RETURN_GENERATOR creates the generator object and returns it to the caller,
    subsequent next(gen) resumes from RESUME. Push None as a placeholder during decompilation."""
    em.push(None)


@_reg("GEN_START")
def _gen_start(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Generator start marker; only the generator-expression kind (argval 0) is handled."""
    assert inst.argval == 0, "Only generator expression is supported"


@_reg("RAISE_VARARGS")
def _raise_varargs(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit ``raise`` / ``raise exc`` / ``raise exc from cause`` depending on argval."""
    if inst.argval == 0:
        em.emit("raise")
    elif inst.argval == 1:
        em.emit(f"raise {em.pop()}")
    elif inst.argval == 2:
        cause = em.pop()  # TOS is the from-clause
        exc = em.pop()
        em.emit(f"raise {exc} from {cause}")


@_reg("BREAK_LOOP")
def _break_loop(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit ``break``."""
    em.emit("break")
81
+
82
+
83
+ # ── Unconditional jumps ───────────────────────────────────────────────────
84
+
85
+
86
@_reg("JUMP_ABSOLUTE", "JUMP_FORWARD", "JUMP_BACKWARD", "JUMP_BACKWARD_NO_INTERRUPT")
def _abs_jump(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> Optional[int]:
    """Unconditional jump.

    Inside a loop, a jump past the loop end becomes ``break`` and a jump back
    to the loop head becomes ``continue``; both return ``len(instructions)``
    so decompile_range stops immediately. Otherwise the target index is
    returned so decompilation resumes there.
    """
    target_idx = ctx.index_of(inst.jump_target_offset())
    active_loop = em.loop
    if active_loop is None:
        return target_idx
    if target_idx >= active_loop.end_index:
        em.emit("break")
        return len(ctx.instructions)
    if target_idx == active_loop.start_index:
        em.emit("continue")
        return len(ctx.instructions)
    return target_idx
103
+
104
+
105
+ # ── Conditional jumps (if / else) ─────────────────────────────────────────
106
+
107
+
108
@_reg("POP_JUMP_IF_TRUE", "POP_JUMP_IF_FALSE")
@_reg("POP_JUMP_FORWARD_IF_TRUE", "POP_JUMP_FORWARD_IF_FALSE")
@_reg("POP_JUMP_BACKWARD_IF_TRUE", "POP_JUMP_BACKWARD_IF_FALSE")
@_reg("POP_JUMP_FORWARD_IF_NONE", "POP_JUMP_FORWARD_IF_NOT_NONE")
@_reg("POP_JUMP_BACKWARD_IF_NONE", "POP_JUMP_BACKWARD_IF_NOT_NONE")
@_reg("JUMP_IF_TRUE_OR_POP", "JUMP_IF_FALSE_OR_POP")
@_reg("POP_JUMP_IF_NOT_NONE", "POP_JUMP_IF_NONE")
def _jump_if(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> Optional[int]:
    """Decompile if/else structure.

    Standard if/else bytecode::

        POP_JUMP_IF_FALSE else_start   <- this_idx
        (if-body)
        JUMP_FORWARD after_else        <- last instruction of if-body
     >> else_start:                    <- jump_idx
        (else-body)
     >> after_else:                    <- merge point (end)

    Returns the instruction index at which decompilation should resume
    (the end of the else-body), or ``None`` for the backward-jump
    ``if cond: continue`` case.
    """

    jump_offset = inst.jump_target_offset()
    jump_idx = ctx.index_of(jump_offset)
    this_idx = ctx.index_of(inst.offset)

    # ── Step 1: condition expression and branch stack state ──
    # The emitted condition is NEGATED relative to the jump sense: the jump
    # target is the else-branch, so e.g. POP_JUMP_IF_TRUE guards the if-body
    # with "not cond".
    cond = em.peek()
    fall_stack = list(em.stack)
    jump_stack = list(em.stack)

    if "IF_NOT_NONE" in inst.opname:
        cond = f"({cond} is None)"
    elif "IF_NONE" in inst.opname:
        cond = f"({cond} is not None)"
    elif "IF_TRUE" in inst.opname:
        cond = f"(not {cond})"
    else:
        cond = f"{cond}"

    # POP_JUMP_* pops the condition on both paths; *_OR_POP pops only on fall-through.
    if "POP_JUMP" in inst.opname:
        jump_stack.pop()
        fall_stack.pop()
    elif "OR_POP" in inst.opname:
        fall_stack.pop()

    # ── Step 2: merge point candidate upper bounds ──
    merge_upper_bounds = [len(ctx.instructions)]
    if em.loop is not None:
        merge_upper_bounds.append(em.loop.end_index)

    # ── Step 3: find "skip else" JUMPs in the if-body ──
    def _is_forward_past_else(i: Instruction) -> bool:
        return i.is_jump and i.jump_target_offset() >= jump_offset

    forward_targets = [i.jump_target_offset() for i in ctx.instructions[this_idx:jump_idx] if _is_forward_past_else(i)]

    # ── Step 4: compute merge point by case ──
    if not forward_targets:
        if jump_idx <= this_idx:
            # Case C: backward jump (inside loop), emit if cond: continue
            # Here the jump sense is NOT negated: jumping back means the
            # condition held and the loop continues.
            rev_cond = em.peek()
            if "IF_NOT_NONE" in inst.opname:
                rev_cond = f"({rev_cond} is not None)"
            elif "IF_NONE" in inst.opname:
                rev_cond = f"({rev_cond} is None)"
            elif "IF_TRUE" in inst.opname:
                rev_cond = f"{rev_cond}"
            elif "IF_FALSE" in inst.opname:
                rev_cond = f"(not {rev_cond})"
            em.emit(f"if {rev_cond}:")
            em.emit(em.indent("continue\n").rstrip("\n"))
            return None
        # Case B: both branches terminate with RETURN/RAISE
        end = jump_idx
    else:
        # Case A: standard if/else, infer merge point from forward_targets
        max_jump = max(forward_targets)
        max_idx = ctx.index_of(max_jump)
        # Re-scan up to the furthest jump so chained elif jumps are included.
        all_targets = [i.jump_target_offset() for i in ctx.instructions[this_idx:max_idx] if _is_forward_past_else(i)]
        max_idx = ctx.index_of(max(all_targets))

        # NOTE(review): heuristic widening — if the instruction before the merge
        # candidate is not a RAISE/RETURN/STORE, scan forward for a natural
        # statement boundary; confirm against regression traces before changing.
        last = ctx.instructions[max_idx - 1]
        if not ("RAISE" in last.opname or "RETURN" in last.opname or "STORE" in last.opname):
            old = max_idx
            while max_idx < len(ctx.instructions):
                op = ctx.instructions[max_idx].opname
                if "STORE" in op or "RETURN" in op:
                    max_idx += 1
                    break
                if ("JUMP" in op and max_idx > old) or "FOR_ITER" in op:
                    break
                max_idx += 1

        merge_upper_bounds.append(max_idx)
        end = min(merge_upper_bounds)

    # ── Step 5: else-body end position (PR#91 fix) ──
    else_end = end
    if end == jump_idx and jump_idx < len(ctx.instructions):
        last_if = ctx.instructions[jump_idx - 1]
        if "RETURN" in last_if.opname or "RAISE" in last_if.opname:
            # The if-body never falls through, so the "else" is simply the rest
            # of the code (clamped to the enclosing loop if any).
            else_end = len(ctx.instructions)
            if em.loop is not None:
                else_end = min(else_end, em.loop.end_index)

    # ── Step 6: decompile both branches ──
    with em.fork(stack=fall_stack) as if_em:
        ctx.decompile_range(this_idx + 1, jump_idx, if_em)

    if_body = em.indent(if_em.get_source())
    if_end_stack = list(if_em.stack)
    em.emit_raw(f"if {cond}:\n{if_body}")

    with em.fork(stack=jump_stack) as else_em:
        ctx.decompile_range(jump_idx, else_end, else_em)

    else_body = else_em.get_source()
    if else_body:
        em.emit_raw(f"else:\n{em.indent(else_body)}")

    # Continue with the if-branch's resulting stack (assumes both branches
    # leave equivalent stack shapes at the merge point).
    em.stack[:] = if_end_stack
    return else_end
226
+
227
+
228
+ # ── FOR_ITER ──────────────────────────────────────────────────────────────
229
+
230
+
231
@_reg("FOR_ITER")
def _for_iter(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> Optional[int]:
    """Decompile for loop.

    Bytecode layout (3.12)::

        FOR_ITER target        <- get next value; jump to target when exhausted (END_FOR)
        (loop body)
        JUMP_BACKWARD for_iter <- normal back-jump (not continue)
     >> target: END_FOR

    Loop body range excludes the trailing JUMP_BACKWARD to avoid emitting a spurious continue.
    Returns the index just past the loop so decompilation resumes there.
    """
    start_idx = ctx.index_of(inst.offset)
    end_idx = ctx.index_of(inst.jump_target_offset())

    # The loop variable is always a fresh temp; a tuple target such as
    # ``for a, b in ...`` is presumably reconstructed by a following
    # UNPACK_SEQUENCE inside the body — TODO confirm.
    temp = em.make_temp()
    iterator = em.pop()
    em.push(temp)

    # Determine the actual end position of the loop body:
    # if the instruction at end_idx is a back-jump to FOR_ITER, extend end_idx so
    # the LoopContext boundary is correct (break needs to jump past end_idx)
    if end_idx < len(ctx.instructions):
        at_end = ctx.instructions[end_idx]
        if at_end.is_jump and at_end.jump_target_offset() == inst.offset:
            end_idx += 1

    # Exclude the trailing JUMP_BACKWARD: it is the normal loop back-jump mechanism, not continue.
    # Only JUMP_BACKWARDs in the middle of the loop body are continue (handled by _abs_jump).
    body_end = end_idx
    if body_end > start_idx + 1:
        back_jump = ctx.instructions[body_end - 1]
        if back_jump.is_jump and back_jump.jump_target_offset() == inst.offset:
            body_end -= 1

    loop = LoopContext(start_index=start_idx, end_index=end_idx)
    with em.fork(stack=list(em.stack), loop=loop) as body_em:
        ctx.decompile_range(start_idx + 1, body_end, body_em)

    body_src = em.indent(body_em.get_source())
    em.emit_raw(f"for {temp} in {iterator}:\n{body_src}")
    # Adopt the body's final stack (the pushed temp has been consumed inside).
    em.stack[:] = body_em.stack
    return end_idx
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/load_store.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Handlers for LOAD_*, STORE_*, DELETE_*, IMPORT_*, PUSH_NULL, GET_ITER."""
16
+
17
+ from __future__ import annotations
18
+
19
+ from types import CodeType
20
+
21
+ from ..decompile_context import DecompileContext
22
+ from ..handler_registry import registry
23
+ from ..instruction import Instruction
24
+ from ..source_emitter import SourceEmitter
25
+
26
+ _reg = registry.register
27
+
28
+
29
+ # ── NOP / unsupported sentinels ──────────────────────────────────────────
30
+
31
+
32
@_reg("NOP", "RESUME", "EXTENDED_ARG", "SETUP_LOOP", "POP_BLOCK")
@_reg("PRECALL", "BEGIN_FINALLY", "END_FINALLY", "MAKE_CELL")
@_reg("RERAISE", "END_FOR", "COPY_FREE_VARS")
def _nop(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    # Bookkeeping-only opcodes (frame setup, block management, interpreter
    # hints): they neither touch the value stack nor produce source text.
    pass
37
+
38
+
39
@_reg("GET_YIELD_FROM_ITER")
@_reg("POP_EXCEPT", "WITH_EXCEPT_START", "JUMP_IF_NOT_EXC_MATCH")
@_reg("CHECK_EG_MATCH", "PUSH_EXC_INFO", "PREP_RERAISE_STAR")
@_reg("WITH_CLEANUP_FINISH", "CALL_FINALLY", "POP_FINALLY")
@_reg("WITH_CLEANUP_START", "SETUP_EXCEPT", "CHECK_EXC_MATCH")
@_reg("CLEANUP_THROW")
@_reg("GET_AWAITABLE", "GET_AITER", "GET_ANEXT", "END_ASYNC_FOR")
@_reg("BEFORE_ASYNC_WITH", "SETUP_ASYNC_WITH", "SEND", "ASYNC_GEN_WRAP")
@_reg("CACHE")
@_reg("PRINT_EXPR", "COPY_DICT_WITHOUT_KEYS")
@_reg("IMPORT_STAR")
@_reg("YIELD_FROM", "SETUP_ANNOTATIONS", "LOAD_BUILD_CLASS")
@_reg("MATCH_MAPPING", "MATCH_SEQUENCE", "MATCH_KEYS", "MATCH_CLASS")
@_reg("CALL_INTRINSIC_2")
@_reg("SETUP_FINALLY", "SETUP_WITH", "BEFORE_WITH")
def _unsupported(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Explicitly-unsupported opcodes (exception tables, async, match, with …):
    fail fast with the offending instruction attached rather than producing
    silently wrong source."""
    # Deferred import: decompiler.py imports this handlers package at module
    # load time, so a top-level import here would be circular.
    from ...decompiler import DecompilationError

    raise DecompilationError(f"Unsupported opcode: {inst.opname}", instruction=inst)
58
+
59
+
60
+ # ── LOAD instructions ────────────────────────────────────────────────────
61
+
62
+
63
@_reg("LOAD_CONST")
def _load_const(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Load a constant. Branches: can_repr → direct repr / type → importlib /
    torch prefix → import torch / CodeType → push as-is for MAKE_FUNCTION."""
    # Round-trip test: the constant is representable as source iff
    # eval(repr(x)) == x. ``eval`` here is safe in the usual sense that the
    # input comes from the code object's co_consts, not external data.
    can_repr = False
    try:
        can_repr = eval(repr(inst.argval)) == inst.argval
    except BaseException:
        # repr/eval/__eq__ can raise almost anything for exotic constants.
        pass
    if can_repr:
        em.push(repr(inst.argval))
    elif isinstance(inst.argval, type):
        # Classes: re-import by dotted module path and cache in a temp.
        module = inst.argval.__module__
        name = inst.argval.__name__
        em.emit("import importlib")
        tmp = em.make_temp()
        em.emit(f'{tmp} = importlib.import_module("{module}").{name}')
        em.push(tmp)
    elif inst.argrepr.startswith("torch."):
        # torch objects (dtypes, devices, …) whose repr is a dotted path.
        em.emit("import torch")
        tmp = em.make_temp()
        em.emit(f"{tmp} = {inst.argval}")
        em.push(tmp)
    elif isinstance(inst.argval, CodeType):
        # Nested code objects stay as raw objects on the simulated stack;
        # the MAKE_FUNCTION handler decompiles them recursively.
        em.push(inst.argval)
    else:
        from ...decompiler import DecompilationError

        raise DecompilationError(
            f"LOAD_CONST: cannot represent co_consts[{inst.arg}] = {repr(inst.argval)!r} "
            f"(type {type(inst.argval).__name__}) as source code",
            instruction=inst,
        )
96
+
97
+
98
@_reg("LOAD_FAST", "LOAD_FAST_CHECK")
@_reg("LOAD_GLOBAL", "LOAD_DEREF", "LOAD_NAME")
@_reg("LOAD_CLASSDEREF", "LOAD_CLOSURE")
def _generic_load(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Push a variable name onto the simulated stack.

    On 3.11+ a LOAD_GLOBAL whose argrepr reads "NULL + name" first pushes a
    NULL sentinel (consumed later by the CALL handler). On Python <3.12 the
    comprehension parameter named ".0" is renamed to "comp_arg_0"."""
    if "NULL + " in inst.argrepr:
        em.push(None)
    name = inst.argval
    if inst.argrepr.startswith("."):
        name = name.replace(".", "comp_arg_")
    em.push(name)
110
+
111
+
112
# Python 3.12 comprehension variable protection: LOAD_FAST_AND_CLEAR saves old value + STORE_FAST restores.
# During decompilation, temp variables used for loops don't need save/restore; push a sentinel so STORE_FAST skips.
# Identity-compared (``is``) in _generic_store, so a plain unique object suffices.
_CLEAR_SENTINEL = object()
115
+
116
+
117
@_reg("LOAD_FAST_AND_CLEAR")
def _load_fast_and_clear(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """3.12 comprehension prologue: instead of saving the shadowed variable,
    push a sentinel — the matching STORE_FAST will skip the restore."""
    em.push(_CLEAR_SENTINEL)
120
+
121
+
122
@_reg("LOAD_LOCALS")
def _load_locals(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """3.12 class body: snapshot ``locals()`` into a temp so repeated reads
    observe one consistent dict rather than calling locals() again."""
    em.push("locals()")
    em.replace_tos_with_temp()
127
+
128
+
129
@_reg("LOAD_FROM_DICT_OR_GLOBALS", "LOAD_FROM_DICT_OR_DEREF")
def _load_from_dict(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """3.12 class body: look up *name* in the locals dict on TOS first,
    falling back to the enclosing scope (globals / cell) when absent.

    Fix: the dict subscript must quote the key — ``d['name']`` — to match the
    quoted membership test. The original emitted ``d[name]``, which evaluated
    the looked-up *value* of ``name`` as the key at runtime.
    """
    mapping = em.pop()
    name = inst.argval
    em.push(f"{mapping}[{name!r}] if {name!r} in {mapping} else {name}")
    em.replace_tos_with_temp()
135
+
136
+
137
@_reg("LOAD_ATTR")
def _load_attr(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Attribute access: emit ``obj.name`` when the attribute name is a valid
    identifier, otherwise fall back to ``getattr(obj, name)``."""
    obj = str(em.pop())
    attr = inst.argval
    if attr.isidentifier():
        em.push(f"{obj}.{attr}")
    else:
        em.push(f"getattr({obj}, {attr!r})")
143
+
144
+
145
@_reg("LOAD_SUPER_ATTR")
def _load_super_attr(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """3.12 ``super().attr``: the stack holds (top first) self, class, super;
    rebuild the explicit two-argument ``super(cls, self)`` call and cache the
    attribute access in a temp."""
    self_expr = em.pop()
    cls_expr = em.pop()
    super_expr = em.pop()
    em.push(f"{super_expr}({cls_expr}, {self_expr}).{inst.argval}")
    em.replace_tos_with_temp()
152
+
153
+
154
@_reg("LOAD_METHOD")
def _load_method(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Method lookup: replace TOS object with the bound-access expression."""
    receiver = em.pop()
    em.push(f"{receiver}.{inst.argval}")
157
+
158
+
159
@_reg("LOAD_ASSERTION_ERROR")
def _load_assertion_error(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """``assert`` lowering pushes AssertionError; mirror it by name."""
    em.push("AssertionError")
162
+
163
+
164
@_reg("PUSH_NULL")
def _push_null(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """3.11+ pushes a NULL sentinel before function calls; modelled as ``None``
    on the simulated stack and cleared by the CALL handler."""
    em.push(None)
168
+
169
+
170
@_reg("GET_ITER")
def _get_iter(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Wrap TOS in an explicit ``iter(...)`` call."""
    inner = em.pop()
    em.push(f"iter({inner})")
173
+
174
+
175
+ # ── STORE instructions ───────────────────────────────────────────────────
176
+
177
+
178
@_reg("STORE_FAST", "STORE_GLOBAL", "STORE_DEREF", "STORE_NAME")
def _generic_store(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Generic store. Skips _CLEAR_SENTINEL and self-assignment, protects variable names on the stack that are about to be overwritten."""
    left = inst.argval
    right = em.pop()
    if right is _CLEAR_SENTINEL:
        # Value came from LOAD_FAST_AND_CLEAR: this store would only restore a
        # pre-comprehension value that was never saved — emit nothing.
        return
    if left != right:
        # left == right means a no-op self-assignment (x = x): skip entirely.
        if isinstance(left, str) and left in em.stack:
            # Another stack entry still refers to ``left``'s current value;
            # snapshot it into a temp and rewrite those references before the
            # overwrite so later uses see the pre-store value.
            tmp = em.make_temp()
            em.emit(f"{tmp} = {left}")
            em.stack[:] = [tmp if x == left else x for x in em.stack]
        em.emit(f"{left} = {right}")
191
+
192
+
193
@_reg("STORE_SUBSCR")
def _store_subscr(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """``obj[key] = value`` — stack (top first): key, obj, value."""
    key = em.pop()
    container = em.pop()
    value = em.pop()
    em.emit(f"{container}[{key}] = {value}")
199
+
200
+
201
@_reg("STORE_SLICE")
def _store_slice(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """3.12 ``obj[start:end] = value`` — stack (top first): end, start, obj, value."""
    hi = em.pop()
    lo = em.pop()
    container = em.pop()
    value = em.pop()
    em.emit(f"{container}[{lo}:{hi}] = {value}")
208
+
209
+
210
@_reg("STORE_ATTR")
def _store_attr(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """``obj.attr = value`` — stack (top first): obj, value."""
    target = em.pop()
    value = em.pop()
    em.emit(f"{target}.{inst.argval} = {value}")
215
+
216
+
217
+ # ── DELETE instructions ──────────────────────────────────────────────────
218
+
219
+
220
@_reg("DELETE_SUBSCR")
def _delete_subscr(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """``del obj[key]`` — skipped when the identical subscript expression is
    still referenced on the stack (deleting would invalidate that use)."""
    key = em.pop()
    container = em.pop()
    expr = f"{container}[{key}]"
    if expr not in em.stack:
        em.emit(f"del {expr}")
226
+
227
+
228
@_reg("DELETE_NAME", "DELETE_GLOBAL", "DELETE_DEREF")
def _generic_delete(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Emit a plain ``del name`` for name/global/cell deletions."""
    em.emit(f"del {inst.argval}")
231
+
232
+
233
@_reg("DELETE_FAST")
def _delete_fast(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Dynamo cleans up temp variables; no explicit del needed after decompilation."""
    # Intentionally a no-op (unlike DELETE_NAME/GLOBAL/DEREF above).
    pass
237
+
238
+
239
@_reg("DELETE_ATTR")
def _delete_attr(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """``del obj.attr`` — the object expression is TOS."""
    target = em.pop()
    em.emit(f"del {target}.{inst.argval}")
242
+
243
+
244
+ # ── IMPORT instructions ──────────────────────────────────────────────────
245
+
246
+
247
@_reg("IMPORT_NAME")
def _import_name(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """``import os.path`` binds the *top-level* module ('os'); submodules are
    then reached via attribute access (os.path.sep). Stack (top first):
    fromlist, level."""
    top_level = inst.argval.split(".")[0]
    fromlist = em.pop()
    level = em.pop()
    em.emit(f"{top_level} = __import__({inst.argval!r}, fromlist={fromlist}, level={level})")
    em.push(top_level)
255
+
256
+
257
@_reg("IMPORT_FROM")
def _import_from(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """``from mod import name``: bind *name* from the module left on TOS —
    the module stays on the stack for further IMPORT_FROMs."""
    attr = inst.argval
    em.emit(f"{attr} = {em.peek()}.{attr}")
    em.push(attr)
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/handlers/stack_ops.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Handlers for stack-manipulation opcodes: ROT, SWAP, COPY, POP, DUP.
16
+
17
+ See bytecode_explained.py §16 for details.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from ..decompile_context import DecompileContext
23
+ from ..handler_registry import registry
24
+ from ..instruction import Instruction
25
+ from ..source_emitter import SourceEmitter
26
+
27
+ _reg = registry.register
28
+
29
+
30
+ # ── ROT_N family (Python ≤3.10, replaced by SWAP/COPY in 3.11+) ───────────
31
+
32
+
33
@_reg("ROT_N")
@_reg("ROT_TWO")
@_reg("ROT_THREE")
@_reg("ROT_FOUR")
def _rot_n(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Top-n stack rotation: [a, b, c] → [c, a, b] (n=3).

    ROT_TWO:   a, b → b, a          (swap, used for a, b = b, a)
    ROT_THREE: a, b, c → c, a, b    (3-element rotation)
    ROT_FOUR:  a, b, c, d → d, a, b, c
    ROT_N:     generic rotation, n taken from argval
    """
    fixed = {"ROT_TWO": 2, "ROT_THREE": 3, "ROT_FOUR": 4}
    n = fixed.get(inst.opname, inst.argval)
    # Pop TOS and re-insert it n-1 slots deeper: one rotation step.
    top = em.stack.pop()
    em.stack.insert(len(em.stack) - (n - 1), top)
48
+
49
+
50
@_reg("SWAP")
def _swap(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Python 3.11+: exchange stack[-1] and stack[-n]."""
    n = inst.argval
    top, deep = em.stack[-1], em.stack[-n]
    em.stack[-1] = deep
    em.stack[-n] = top
55
+
56
+
57
@_reg("COPY")
def _copy(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Python 3.11+: push a duplicate of stack[-n] (COPY 1 == DUP_TOP)."""
    n = inst.argval
    if not n:
        return
    em.push(em.peek(n - 1))
64
+
65
+
66
@_reg("POP_TOP")
def _pop_top(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Discard TOS; tolerate an already-empty simulated stack."""
    if em.stack_size:
        em.pop()
70
+
71
+
72
@_reg("DUP_TOP")
def _dup_top(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Python ≤3.10: duplicate TOS (superseded by COPY 1 in 3.11+)."""
    em.push(em.peek(0))
76
+
77
+
78
@_reg("DUP_TOP_TWO")
def _dup_top_two(em: SourceEmitter, inst: Instruction, ctx: DecompileContext) -> None:
    """Python ≤3.10: duplicate the top two items in order (a, b → a, b, a, b)."""
    second, first = em.peek(1), em.peek(0)
    em.push(second)
    em.push(first)
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/instruction.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Enhanced Instruction dataclass with rich querying properties."""
16
+
17
+ from __future__ import annotations
18
+
19
+ import dataclasses
20
+ import dis
21
+ import sys
22
+ from typing import Any, Optional
23
+
24
# Every opcode that can transfer control: absolute plus relative jumps.
_ALL_JUMP_OPCODES = frozenset(dis.hasjabs) | frozenset(dis.hasjrel)
# On 3.11+ the dis module already reports relative-jump targets as absolute
# offsets in argval; jump_target_offset() branches on this.
_PY311 = sys.version_info >= (3, 11)

# Name-derived opcode categories backing the is_load/is_store/is_delete queries.
# PUSH_NULL and GET_ITER are counted as loads because they also push one value.
_LOAD_OPCODES = frozenset(n for n in dis.opname if n.startswith("LOAD_") or n in ("PUSH_NULL", "GET_ITER"))
_STORE_OPCODES = frozenset(n for n in dis.opname if n.startswith("STORE_"))
_DELETE_OPCODES = frozenset(n for n in dis.opname if n.startswith("DELETE_"))
30
+
31
+
32
@dataclasses.dataclass
class Instruction:
    """Mutable mirror of ``dis.Instruction`` with convenience queries.

    Unlike the stdlib version this is mutable so cleanup passes can
    modify instructions in-place (e.g. NOP-ing unreachable bytecode).

    Note: the user-defined ``__eq__``/``__hash__`` below take precedence over
    the dataclass-generated ones (the dataclass machinery skips methods the
    class already defines), giving identity semantics despite mutability.
    """

    opcode: int
    opname: str
    # arg: raw integer argument (the number in the bytecode), may be an index into co_consts/co_varnames
    # argval: Python object resolved by the dis module (value of co_consts[arg], or a variable name string)
    # argrepr: human-readable string of argval (e.g. "NULL + print", "to 20")
    # See bytecode_explained.py §1 for details
    arg: Optional[int]
    argval: Any
    argrepr: str
    offset: Optional[int] = None
    starts_line: Optional[int] = None
    is_jump_target: bool = False

    # -- identity / hashing (by object id, not value) ----------------------

    def __hash__(self) -> int:
        return id(self)

    def __eq__(self, other: object) -> bool:
        return self is other

    def __repr__(self) -> str:
        return f"Instruction({self.opname}, offset={self.offset}, argval={self.argrepr!r})"

    # -- category queries ---------------------------------------------------

    @property
    def is_load(self) -> bool:
        """True for LOAD_* opcodes plus PUSH_NULL / GET_ITER."""
        return self.opname in _LOAD_OPCODES

    @property
    def is_store(self) -> bool:
        return self.opname in _STORE_OPCODES

    @property
    def is_delete(self) -> bool:
        return self.opname in _DELETE_OPCODES

    @property
    def is_jump(self) -> bool:
        return self.opcode in _ALL_JUMP_OPCODES

    @property
    def is_conditional_jump(self) -> bool:
        # Name-based heuristic: covers *_IF_* conditional jumps and FOR_ITER
        # (which falls through on the next value, jumps when exhausted).
        return self.is_jump and ("IF" in self.opname or "FOR_ITER" in self.opname)

    @property
    def is_unconditional_jump(self) -> bool:
        return self.is_jump and not self.is_conditional_jump

    @property
    def is_return(self) -> bool:
        return self.opname in ("RETURN_VALUE", "RETURN_CONST")

    @property
    def is_nop(self) -> bool:
        return self.opname == "NOP"

    # -- jump target --------------------------------------------------------

    def jump_target_offset(self) -> Optional[int]:
        """Return the absolute bytecode offset this instruction jumps to,
        or ``None`` if it is not a jump instruction."""
        if not self.is_jump:
            return None
        # Preferred path: dis renders jump targets as "to N" in argrepr on
        # the versions this decompiler supports.
        if "to " in self.argrepr:
            return int(self.argrepr.replace("to ", "").strip())
        if self.opcode in dis.hasjabs:
            return self.argval
        if self.opcode in dis.hasjrel:
            # Pre-3.11 dis keeps relative argval as a delta from this offset.
            return self.argval if _PY311 else self.offset + self.argval
        return None

    # -- mutation helpers (for cleanup passes) ------------------------------

    def nop_(self) -> None:
        """In-place convert this instruction to a NOP."""
        self.opname = "NOP"
        self.opcode = dis.opmap["NOP"]
        self.arg = 0
        self.argval = 0
        self.argrepr = ""
        self.is_jump_target = False

    # -- factory ------------------------------------------------------------

    @staticmethod
    def from_dis(i: dis.Instruction) -> "Instruction":
        """Create from a stdlib ``dis.Instruction``.

        NOTE(review): CPython 3.13 changed ``dis.Instruction.starts_line``
        semantics (boolean, with line numbers moved to ``line_number``) —
        confirm before supporting 3.13.
        """
        return Instruction(i.opcode, i.opname, i.arg, i.argval, i.argrepr, i.offset, i.starts_line, i.is_jump_target)
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/bytecode/source_emitter.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """SourceEmitter: manages the evaluation stack and source-code emission.
16
+
17
+ This replaces the bare ``DecompilerState`` (just ``source_code: str``
18
+ and ``stack: list``) with a proper class that owns *all* mutable state
19
+ touched during decompilation, including the temp-variable counter
20
+ (instance-level, not class-level — thread-safe by design).
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import contextlib
26
+ import dataclasses
27
+ from typing import Any, Iterator, List, Optional
28
+
29
+
30
@dataclasses.dataclass
class LoopContext:
    """Current loop boundaries, used for break/continue determination.

    Range semantics similar to range(start, end):
        start_index: index of FOR_ITER itself (inclusive — part of the loop)
        end_index:   index of the first instruction outside the loop
                     (exclusive — not part of the loop)

    break/continue determination (used in _abs_jump):
        jump target >= end_index   → break (jump out of the loop)
        jump target == start_index → continue (jump back to loop head)
    """

    start_index: int  # index of FOR_ITER, inclusive (part of the loop)
    end_index: int  # first instruction outside the loop, exclusive (not part of the loop)
45
+
46
+
47
class SourceEmitter:
    """Stateful accumulator for the decompiler's output.

    Improvements over depyf's ``DecompilerState``:
      * ``_temp_counter`` is **instance-level** (no thread-safety issues).
      * Stack operations (``push / pop / peek``) are proper methods.
      * ``emit()`` appends with a trailing newline automatically.
      * ``fork()`` context-manager creates a child emitter for sub-blocks
        (if-else branches, loop bodies, etc.) and returns it so the caller
        can inspect the generated source and final stack.
    """

    def __init__(self, indent_size: int = 4, temp_prefix: str = "__temp_", *, _parent_counter: Optional[list] = None) -> None:
        self._lines: List[str] = []
        self._stack: List[Any] = []
        self._indent_size = indent_size
        self._temp_prefix = temp_prefix
        # One shared single-element list per Decompiler invocation: forks
        # alias it, so temp names stay globally unique yet instance-scoped.
        self._counter: list = [0] if _parent_counter is None else _parent_counter
        self.loop: Optional[LoopContext] = None

    # -- source emission ----------------------------------------------------

    def emit(self, line: str) -> None:
        """Append *line* to the accumulated source, newline-terminated."""
        self._lines.append(f"{line}\n")

    def emit_raw(self, text: str) -> None:
        """Append pre-formatted *text* verbatim (e.g. nested function defs)."""
        self._lines.append(text)

    def get_source(self) -> str:
        """Concatenate everything emitted so far."""
        return "".join(self._lines)

    # -- stack operations ---------------------------------------------------

    def push(self, value: Any) -> None:
        self._stack.append(value)

    def pop(self) -> Any:
        return self._stack.pop()

    def peek(self, depth: int = 0) -> Any:
        """Return item at ``stack[-(depth+1)]`` without popping."""
        return self._stack[-(depth + 1)]

    def set_at(self, depth: int, value: Any) -> None:
        """Overwrite ``stack[-(depth+1)]`` with *value*."""
        self._stack[-(depth + 1)] = value

    @property
    def stack(self) -> List[Any]:
        """Direct list access (for complex multi-item operations)."""
        return self._stack

    @property
    def stack_size(self) -> int:
        return len(self._stack)

    # -- temp variables (instance-scoped counter) ---------------------------

    def make_temp(self) -> str:
        """Return a unique temporary variable name."""
        counter = self._counter
        counter[0] += 1
        return self._temp_prefix + str(counter[0])

    def replace_tos_with_temp(self, depth: int = 1) -> str:
        """Replace ``stack[-depth]`` with a fresh temp, emitting the
        assignment ``__temp_N = <old_value>``. Returns the temp name."""
        fresh = self.make_temp()
        self.emit(f"{fresh} = {self._stack[-depth]}")
        self._stack[-depth] = fresh
        return fresh

    # -- sub-block forking --------------------------------------------------

    @contextlib.contextmanager
    def fork(self, stack: Optional[List[Any]] = None, loop: Optional[LoopContext] = None) -> Iterator["SourceEmitter"]:
        """Create a child emitter for a sub-block (if-branch, loop body …).

        The child shares the temp counter but owns fresh ``_lines`` and
        ``_stack``. When *loop* is ``None`` the parent's loop context is
        inherited (matching depyf's ``new_state`` semantics).

        Usage::

            with emitter.fork(stack=my_stack) as child:
                decompile_range(start, end, child)
            child_source = child.get_source()
            child_final_stack = child.stack
        """
        child = SourceEmitter(indent_size=self._indent_size, temp_prefix=self._temp_prefix, _parent_counter=self._counter)
        child._stack = list(self._stack if stack is None else stack)
        child.loop = loop if loop is not None else self.loop
        yield child

    # -- indentation helpers ------------------------------------------------

    def indent(self, text: str) -> str:
        """Return *text* with one indent level added to every line."""
        pad = " " * self._indent_size
        return "".join(f"{pad}{ln}\n" for ln in text.splitlines())
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/decompiler.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Decompiler — the orchestrator that ties everything together.
16
+
17
+ This module is the only place that coordinates ``SourceEmitter``,
18
+ ``HandlerRegistry``, and ``DecompileContext``.
19
+ Individual handler functions never import from here (except for
20
+ ``DecompilationError`` and recursive ``Decompiler`` usage in
21
+ ``MAKE_FUNCTION``).
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import dis
27
+ import inspect
28
+ import os
29
+ from types import CodeType
30
+ from typing import Callable, List, Optional, Union
31
+
32
+ # Force handler registration by importing the package.
33
+ import magi_compiler.magi_depyf.decompile.bytecode.handlers # noqa: F401
34
+
35
+ from .bytecode.decompile_context import DecompileContext
36
+ from .bytecode.handler_registry import registry
37
+ from .bytecode.instruction import Instruction
38
+ from .bytecode.source_emitter import SourceEmitter
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Errors
42
+ # ---------------------------------------------------------------------------
43
+
44
+
45
class DecompilationError(Exception):
    """Raised when decompilation fails.

    Optionally carries the offending ``instruction`` so callers can build
    actionable error messages.
    """

    def __init__(self, message: str = "", *, instruction: Optional[Instruction] = None):
        super().__init__(message)
        self.message = message
        self.instruction = instruction

    def __str__(self) -> str:
        suffix = f" at {self.instruction}" if self.instruction is not None else ""
        return f"DecompilationError: {self.message}{suffix}"
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Signature builder (lives here — it's a Decompiler concern, not a util)
66
+ # ---------------------------------------------------------------------------
67
+
68
+
69
class SignatureBuilder:
    """Build the ``def fn(args):`` header from a ``CodeType``."""

    @staticmethod
    def build(code: CodeType, overwrite_name: Optional[str] = None) -> str:
        """Return ``"def name(params):\\n"`` for *code*.

        Reconstructs the parameter list in true Python order: positional
        args, then ``*args`` (or a bare ``*`` when keyword-only args exist
        without varargs), then keyword-only args, then ``**kwargs``.

        Fix: ``co_varnames`` stores keyword-only names *before* the varargs
        name, and the previous implementation emitted them in that raw order
        (``def f(a, kwonly, *args)``), which binds call arguments differently
        from the original function; it also dropped the ``*`` separator, so
        keyword-only parameters silently became positional.
        """

        def _clean(name: str) -> str:
            # Pre-3.12 comprehension parameter ".0" is not a valid identifier.
            return name.replace(".", "comp_arg_") if name.startswith(".") else name

        pos_count = code.co_argcount
        kw_count = code.co_kwonlyargcount
        params = [_clean(v) for v in code.co_varnames[:pos_count]]
        kwonly = [_clean(v) for v in code.co_varnames[pos_count : pos_count + kw_count]]
        idx = pos_count + kw_count  # varargs/varkw names follow the kw-only block
        if code.co_flags & inspect.CO_VARARGS:
            params.append("*" + _clean(code.co_varnames[idx]))
            idx += 1
        elif kwonly:
            params.append("*")  # bare star keeps kw-only parameters kw-only
        params.extend(kwonly)
        if code.co_flags & inspect.CO_VARKEYWORDS:
            params.append("**" + _clean(code.co_varnames[idx]))
        fn_name = overwrite_name or code.co_name
        return f"def {fn_name}({', '.join(params)}):\n"
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Decompiler
88
+ # ---------------------------------------------------------------------------
89
+
90
+
91
class Decompiler:
    """Decompile a ``CodeType`` into Python source code.

    Design differences from depyf's ``Decompiler``:

    * Handlers live in separate modules and receive
      ``(emitter, inst, ctx)`` — they never reference this class.
    * All mutable state is on ``SourceEmitter`` (instance-scoped counter).
    * ``decompile_range`` is delegated *through* ``DecompileContext``
      so handlers can recurse without importing this class (except
      ``MAKE_FUNCTION`` which needs a fresh ``Decompiler`` instance).
    """

    # Opcodes after which straight-line code is unreachable.
    _TERMINATORS = frozenset({"RETURN_VALUE", "RETURN_CONST", "RAISE_VARARGS"})

    def __init__(self, code: Union[CodeType, Callable]) -> None:
        """Accept a raw code object or any callable (unwrapped to its code)."""
        if callable(code) and not isinstance(code, CodeType):
            code = _get_code_owner(code).__code__
        self.code: CodeType = code
        self.instructions = [Instruction.from_dis(i) for i in dis.get_instructions(code)]
        self._cleanup()

    # -- bytecode cleanup ---------------------------------------------------

    def _cleanup(self) -> None:
        """Propagate line numbers and NOP dead code after unconditional exits."""
        # Pass 1: dis only marks the first instruction of each source line;
        # fill starts_line for every instruction.
        cur: Optional[int] = None
        for inst in self.instructions:
            if inst.starts_line is not None:
                cur = inst.starts_line
            inst.starts_line = cur

        # Pass 2: after a return/raise, everything up to the next jump target
        # is unreachable — convert it to NOPs in place.
        in_dead = False
        for inst in self.instructions:
            if in_dead:
                if inst.is_jump_target:
                    in_dead = False
                else:
                    inst.nop_()
            elif inst.opname in self._TERMINATORS:
                in_dead = True

    # -- core loop ----------------------------------------------------------

    def decompile_range(self, start: int, end: int, emitter: SourceEmitter) -> None:
        """Execute instruction handlers from *start* to *end* (exclusive).

        Raises:
            DecompilationError: when an opcode has no handler or a handler fails.
        """
        # Build the context once per range, not once per instruction: the
        # original re-ran _make_context (which constructs an O(n)
        # offset_to_index dict) for every handler call, making decompilation
        # quadratic in the number of instructions.
        ctx = self._make_context(emitter)
        inst: Optional[Instruction] = None  # stays bound for the except clause
        idx = start
        try:
            while idx < end:
                inst = self.instructions[idx]
                handler = registry.get(inst.opname)
                if handler is None:
                    raise DecompilationError(f"No handler for opcode {inst.opname}", instruction=inst)
                result = handler(emitter, inst, ctx)
                # A handler may return the next index (e.g. after recursively
                # decompiling a loop body); None means "advance by one".
                idx = result if result is not None else idx + 1
        except DecompilationError:
            raise
        except Exception as e:
            raise DecompilationError(f"Failed at {inst!r} in {self.code.co_name}", instruction=inst) from e

    def _make_context(self, emitter: SourceEmitter) -> DecompileContext:
        """Bundle the read-only state handlers need, plus a recursion hook."""
        return DecompileContext(
            code=self.code,
            instructions=tuple(self.instructions),
            indentation=emitter._indent_size,
            decompile_range=lambda start, end, em: self.decompile_range(start, end, em),
            offset_to_index={inst.offset: idx for idx, inst in enumerate(self.instructions)},
        )

    # -- public API ---------------------------------------------------------

    def decompile(self, indentation: int = 4, temp_prefix: str = "__temp_", overwrite_fn_name: Optional[str] = None) -> str:
        """Return decompiled Python source code.

        Args:
            indentation: spaces per indent level in the generated source.
            temp_prefix: prefix for generated temporary variable names.
            overwrite_fn_name: replacement for ``co_name`` in the header.

        Raises:
            DecompilationError: on any failure (original cause chained).
        """
        try:
            emitter = SourceEmitter(indent_size=indentation, temp_prefix=temp_prefix)
            self.decompile_range(0, len(self.instructions), emitter)
            body = emitter.get_source()

            # Optional temp-variable elimination passes (enabled by default).
            if os.environ.get("DEPYF_REMOVE_TEMP", "1") == "1":
                from .postprocess import run_all as _postprocess

                body = _postprocess(body, temp_prefix, indentation)

            header = SignatureBuilder.build(self.code, overwrite_fn_name)

            # Re-declare global/nonlocal names so the regenerated function
            # binds them the same way the original bytecode did.
            global_names = {i.argval for i in dis.get_instructions(self.code) if i.opname == "STORE_GLOBAL"}
            preamble = ""
            if global_names:
                preamble += "global " + ", ".join(global_names) + "\n"
            if self.code.co_freevars:
                preamble += "nonlocal " + ", ".join(self.code.co_freevars) + "\n"

            body = preamble + body
            return header + emitter.indent(body)
        except DecompilationError:
            raise
        except Exception as e:
            raise DecompilationError(f"Failed to decompile {self.code.co_name}") from e

    @staticmethod
    def supported_opnames() -> List[str]:
        """All opcode names that have a registered handler."""
        return registry.supported_opnames()
194
+
195
+
196
+ # ---------------------------------------------------------------------------
197
+ # Module-level convenience
198
+ # ---------------------------------------------------------------------------
199
+
200
+
201
def decompile(code: Union[CodeType, Callable]) -> str:
    """One-liner: decompile a code object or callable to source."""
    decompiler = Decompiler(code)
    return decompiler.decompile()
204
+
205
+
206
def safe_decompile(code: CodeType) -> str:
    """Decompile *code* without raising: try our decompiler, fall back to
    depyf, and as a last resort return a placeholder comment."""
    try:
        return Decompiler(code).decompile()
    except Exception:
        pass
    try:
        from depyf import decompile as _depyf_decompile

        return _depyf_decompile(code)
    except Exception:
        return f"# Failed to decompile {code.co_name}\n"
217
+
218
+
219
+ # ---------------------------------------------------------------------------
220
+ # Private helpers
221
+ # ---------------------------------------------------------------------------
222
+
223
+
224
+ def _get_code_owner(fn):
225
+ """Walk through wrappers to find the object that owns ``__code__``."""
226
+ if hasattr(fn, "__func__"):
227
+ return fn.__func__
228
+ if hasattr(fn, "__wrapped__"):
229
+ return _get_code_owner(fn.__wrapped__)
230
+ return fn
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/postprocess/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Source-level post-processing pipeline for decompiled code.
16
+
17
+ Each pass is a function ``(source, ...) -> source`` that performs one
18
+ semantics-preserving transformation. ``run_all`` applies them in order.
19
+ All passes are best-effort: on any exception they return the input unchanged.
20
+ """
21
+
22
+ from .branch_dedup import dedup_branch_tails
23
+ from .for_temps import eliminate_for_temps
24
+ from .inline_temps import eliminate_inline_temps
25
+
26
+
27
def run_all(source: str, temp_prefix: str = "__temp_", indent: int = 4) -> str:
    """Run every post-processing pass over *source*, in pipeline order.

    Order matters: for-loop temps are rewritten first, generic single-use
    temps are inlined next, and branch-tail dedup runs on the cleaned-up
    source last.
    """
    result = eliminate_for_temps(source, temp_prefix, indent)
    result = eliminate_inline_temps(result, temp_prefix, indent)
    return dedup_branch_tails(result, indent)
33
+
34
+
35
+ __all__ = ["run_all", "eliminate_for_temps", "eliminate_inline_temps", "dedup_branch_tails"]
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/postprocess/branch_dedup.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Pass 3: if/else branch tail deduplication.
16
+
17
+ Move identical trailing statements from if/else branches to after the block.
18
+
19
+ Example::
20
+
21
+ if cond: if cond:
22
+ x = 1 x = 1
23
+ return x → else:
24
+ else: x = 2
25
+ x = 2 return x
26
+ return x
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import ast
32
+ from typing import List, Tuple
33
+
34
+ import astor
35
+
36
+
37
def dedup_branch_tails(source: str, indent: int = 4) -> str:
    """Hoist identical trailing statements out of if/else branches.

    Best-effort: on any parse or transform failure, *source* is returned
    unchanged. When nothing was deduplicated the original text is returned
    verbatim (no re-rendering).
    """
    try:
        module = ast.parse(source)
        any_change = False
        for candidate in ast.walk(module):
            stmt_list = getattr(candidate, "body", None)
            if not isinstance(stmt_list, list):
                continue
            deduped, did_change = _dedup_stmts(stmt_list)
            if did_change:
                candidate.body = deduped
                any_change = True
        if any_change:
            ast.fix_missing_locations(module)
            return astor.to_source(module, indent_with=" " * indent)
        return source
    except Exception:
        return source
54
+
55
+
56
+ def _dedup_stmts(stmts: List[ast.stmt]) -> Tuple[List[ast.stmt], bool]:
57
+ """Process a statement list, extracting common if/else tails."""
58
+ result: List[ast.stmt] = []
59
+ changed = False
60
+
61
+ for stmt in stmts:
62
+ for attr in ("body", "orelse", "handlers", "finalbody"):
63
+ sub = getattr(stmt, attr, None)
64
+ if isinstance(sub, list) and sub:
65
+ new_sub, c = _dedup_stmts(sub)
66
+ if c:
67
+ setattr(stmt, attr, new_sub)
68
+ changed = True
69
+
70
+ if isinstance(stmt, ast.If) and stmt.orelse:
71
+ n = _common_tail_length(stmt.body, stmt.orelse)
72
+ if n > 0:
73
+ common = stmt.body[-n:]
74
+ stmt.body = stmt.body[:-n] or [ast.Pass()]
75
+ stmt.orelse = stmt.orelse[:-n] or []
76
+ result.append(stmt)
77
+ result.extend(common)
78
+ changed = True
79
+ continue
80
+
81
+ result.append(stmt)
82
+
83
+ return result, changed
84
+
85
+
86
+ def _common_tail_length(body: List[ast.stmt], orelse: List[ast.stmt]) -> int:
87
+ """Count identical trailing statements (by AST dump equality)."""
88
+ count = 0
89
+ i, j = len(body) - 1, len(orelse) - 1
90
+ while i >= 0 and j >= 0:
91
+ if ast.dump(body[i]) == ast.dump(orelse[j]):
92
+ count += 1
93
+ i -= 1
94
+ j -= 1
95
+ else:
96
+ break
97
+ if count >= len(body) or count >= len(orelse):
98
+ count = min(len(body), len(orelse)) - 1
99
+ return max(count, 0)
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/postprocess/for_temps.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Pass 1: for-loop temp elimination.
16
+
17
+ ``for __temp in iter: var = __temp; ...`` → ``for var in iter: ...``
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import ast
23
+
24
+ import astor
25
+
26
+
27
def eliminate_for_temps(source: str, temp_prefix: str = "__temp_", indent: int = 4) -> str:
    """Rewrite ``for <temp> in it: var = <temp>; ...`` as ``for var in it: ...``.

    The rewrite only fires when the loop body's first statement is a plain
    assignment from the temp to a real variable. Best-effort: any failure
    returns *source* unchanged.
    """
    try:
        module = ast.parse(source)
        transformed = _ForTempEliminator(temp_prefix).visit(module)
        ast.fix_missing_locations(transformed)
        return astor.to_source(transformed, indent_with=" " * indent)
    except Exception:
        return source
37
+
38
+
39
+ class _ForTempEliminator(ast.NodeTransformer):
40
+ def __init__(self, prefix: str):
41
+ self._prefix = prefix
42
+
43
+ def visit_For(self, node: ast.For) -> ast.For:
44
+ self.generic_visit(node)
45
+ if not (
46
+ isinstance(node.target, ast.Name)
47
+ and node.target.id.startswith(self._prefix)
48
+ and node.body
49
+ and isinstance(node.body[0], ast.Assign)
50
+ and len(node.body[0].targets) == 1
51
+ and isinstance(node.body[0].value, ast.Name)
52
+ and node.body[0].value.id == node.target.id
53
+ ):
54
+ return node
55
+ node.target = node.body[0].targets[0]
56
+ node.body = node.body[1:] or [ast.Pass()]
57
+ return node
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/postprocess/inline_temps.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Pass 2: single-use temp inlining.
16
+
17
+ ``__temp = expr; use(__temp)`` → ``use(expr)`` for single-use temps.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import ast
23
+ from collections import defaultdict
24
+ from typing import List, Optional
25
+
26
+ import astor
27
+
28
+
29
def eliminate_inline_temps(source: str, temp_prefix: str = "__temp_", indent: int = 4) -> str:
    """Inline single-use temporaries into their use site.

    A temp that occurs exactly twice (one definition, one use) is inlined
    by dropping the ``__temp = expr`` assignment and substituting ``expr``
    at the use site. Best-effort: any failure returns *source* unchanged.
    """
    try:
        tree = ast.parse(source)
        # Annotate .parent links so node positions can be reasoned about.
        _set_parents(tree)

        # Collect every Name node whose id carries the temp prefix,
        # in source (walk) order.
        occurrences: dict[str, list] = defaultdict(list)
        for node in ast.walk(tree):
            if isinstance(node, ast.Name) and node.id.startswith(temp_prefix):
                occurrences[node.id].append(node)

        # Node types that open an indented block; inlining across such a
        # boundary could move the expression into a different control scope.
        _INDENT_NODES = (
            ast.FunctionDef,
            ast.AsyncFunctionDef,
            ast.For,
            ast.AsyncFor,
            ast.While,
            ast.If,
            ast.Try,
            ast.With,
            ast.AsyncWith,
            ast.ClassDef,
        )

        for name in occurrences:
            occ = occurrences[name]
            if len(occ) == 2:
                # Exactly one definition + one use: decide if inlining is safe.
                n1, n2 = occ
                _, p1, p2 = _lowest_common_parent(n1, n2)
                # `ap` is the divergence ancestor on the side of the
                # ``__temp = expr`` assignment (occurrences arrive in walk
                # order, so n1 is normally the definition — TODO confirm).
                ap = p1 if isinstance(getattr(n1, "parent", None), ast.Assign) else p2
                can = not isinstance(ap, _INDENT_NODES)
                if can:
                    can = _safe_to_inline(tree, n1, n2)
                # Protocol: append the verdict so the visitors below see
                # [use1, use2, can]; _RemoveAssign later appends the RHS.
                occ.append(can)
            tree = _RemoveAssign(name, occurrences).visit(tree)
            tree = _InlineTemp(name, occurrences).visit(tree)

        return astor.to_source(tree, indent_with=" " * indent)
    except Exception:
        return source
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # AST helpers
73
+ # ---------------------------------------------------------------------------
74
+
75
+
76
+ def _set_parents(node: ast.AST, parent: Optional[ast.AST] = None) -> None:
77
+ for child in ast.iter_child_nodes(node):
78
+ child.parent = parent # type: ignore[attr-defined]
79
+ _set_parents(child, child)
80
+
81
+
82
+ def _get_parents(node: ast.AST) -> List[ast.AST]:
83
+ out = []
84
+ while node:
85
+ out.append(node)
86
+ node = getattr(node, "parent", None)
87
+ return out
88
+
89
+
90
+ def _lowest_common_parent(n1: ast.AST, n2: ast.AST):
91
+ p1 = _get_parents(n1)
92
+ p2 = _get_parents(n2)
93
+ p1.reverse()
94
+ p2.reverse()
95
+ last = c1 = c2 = None
96
+ for a, b in zip(p1, p2):
97
+ if a is b:
98
+ last = a
99
+ else:
100
+ c1, c2 = a, b
101
+ break
102
+ return last, c1, c2
103
+
104
+
105
def _safe_to_inline(tree: ast.AST, def_node: ast.AST, use_node: ast.AST) -> bool:
    """Verify the RHS variable is not reassigned between definition and use.

    *def_node* is the temp's Name at its ``__temp = rhs`` definition and
    *use_node* the Name at its use site. Only the simple ``__temp = <Name>``
    shape is analysed; every other shape is treated as safe (returns True).
    """
    # The definition must sit directly under an Assign whose value is a
    # plain Name — otherwise there is nothing a reassignment can invalidate.
    assign_parent = getattr(def_node, "parent", None)
    if not isinstance(assign_parent, ast.Assign):
        return True
    rhs = assign_parent.value
    if not isinstance(rhs, ast.Name):
        return True

    rhs_name = rhs.id
    # NOTE(review): only the FIRST statement list found by ast.walk is
    # scanned — ast.walk yields the root first, so this is the module body.
    # Statements in nested scopes fall through to the permissive True below.
    stmts: List[ast.stmt] = []
    for node in ast.walk(tree):
        if hasattr(node, "body") and isinstance(node.body, list):
            stmts = node.body
            break
    try:
        # Locate the definition statement by identity.
        def_idx = next(i for i, s in enumerate(stmts) if s is assign_parent)
        # Climb from the use site to its enclosing statement within `stmts`.
        use_stmt = getattr(use_node, "parent", None)
        while use_stmt and use_stmt not in stmts:
            use_stmt = getattr(use_stmt, "parent", None)
        use_idx = next(i for i, s in enumerate(stmts) if s is use_stmt)
    except StopIteration:
        # Definition or use not found at this level: assume safe.
        return True

    # Any plain reassignment of the RHS name strictly between definition
    # and use blocks the inline.
    for stmt in stmts[def_idx + 1 : use_idx]:
        if isinstance(stmt, ast.Assign):
            for t in stmt.targets:
                if isinstance(t, ast.Name) and t.id == rhs_name:
                    return False
    return True
135
+
136
+
137
class _RemoveAssign(ast.NodeTransformer):
    """Drop (or demote) the ``__temp = expr`` assignment for one temp name.

    Shares a mutable occurrence record with ``eliminate_inline_temps``:
    for an inlinable temp the record arrives as ``[use1, use2, can_inline]``
    and this pass appends the assignment's RHS, leaving
    ``[use1, use2, can_inline, rhs]`` for ``_InlineTemp`` to consume.
    """

    def __init__(self, name: str, occ: dict):
        # Temp name handled by this pass and the shared occurrence map.
        self._name = name
        self._occ = occ

    def visit_Assign(self, node: ast.Assign):
        if len(node.targets) == 1 and isinstance(node.targets[0], ast.Name):
            n = node.targets[0].id
            if n == self._name:
                o = self._occ[n]
                if len(o) == 1:
                    # Temp is written but never read: keep the RHS for its
                    # side effects as a bare expression statement.
                    return ast.Expr(value=node.value)
                if len(o) == 3 and isinstance(o[-1], bool):
                    # Record the RHS so _InlineTemp can substitute it.
                    o.append(node.value)
                    if o[-2]:
                        # Inlining approved: remove the assignment entirely.
                        return None
        return node
154
+
155
+
156
+ class _InlineTemp(ast.NodeTransformer):
157
+ def __init__(self, name: str, occ: dict):
158
+ self._name = name
159
+ self._occ = occ
160
+
161
+ def visit_Name(self, node: ast.Name):
162
+ o = self._occ.get(node.id, [])
163
+ if node.id == self._name and len(o) == 4 and isinstance(o[-2], bool) and o[-2]:
164
+ return o[-1]
165
+ return node
pkgs/MagiCompiler/magi_compiler/magi_depyf/decompile/recompiler.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """CodeRecompiler: round-trip decompile -> compile -> extract target CodeType.
16
+
17
+ Pipeline: CodeType -> decompile -> compile -> find target.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from types import CodeType
23
+ from typing import List
24
+
25
+ from .decompiler import Decompiler
26
+
27
+
28
class CodeRecompiler:
    """Round-trip helper: decompile a code object, re-``compile`` the
    source, and extract the ``CodeType`` matching a reference function."""

    @staticmethod
    def recompile(
        code_to_decompile: CodeType, reference_code: CodeType, indentation: int = 4, temp_prefix: str = "__temp_"
    ) -> CodeType:
        """Decompile → compile → return the code object named like *reference_code*."""
        fn_name = reference_code.co_name

        source = Decompiler(code_to_decompile).decompile(
            indentation=indentation, temp_prefix=temp_prefix, overwrite_fn_name=fn_name
        )

        module_code = compile(source, "noname", "exec")
        matches = [c for c in CodeRecompiler.collect_code_objects(module_code) if c.co_name == fn_name]
        # Same semantics as the original: IndexError when nothing matches.
        return matches[0]

    @staticmethod
    def collect_code_objects(code: CodeType) -> List[CodeType]:
        """Pre-order collection of *code* and every nested code object."""
        collected: List[CodeType] = [code]
        for const in code.co_consts:
            if isinstance(const, CodeType):
                collected.extend(CodeRecompiler.collect_code_objects(const))
        return collected
pkgs/MagiCompiler/magi_compiler/magi_depyf/demo_toy_example.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Demo: magi_depyf.dump_src with the depyf tutorial toy_example.
16
+
17
+ Run: PYTHONPATH=. python demo_toy_example.py
18
+ """
19
import torch
from magi_compiler.magi_depyf.inspect import dump_src

# Default tensor allocation to the GPU when one is available so the demo
# runs the same code path as real workloads; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)


@torch.compile
def toy_example(a, b):
    """Tiny compiled function with a data-dependent branch.

    The ``b.sum() < 0`` test depends on runtime tensor values, so Dynamo
    presumably splits/specializes here — this is the depyf tutorial's
    canonical example for producing multiple compiled artifacts.
    """
    x = a / (torch.abs(a) + 1)
    if b.sum() < 0:
        b = b * -1
    return x * b
32
+
33
+
34
def main():
    """Drive ``toy_example`` repeatedly so both branches get compiled.

    With 100 random inputs it is overwhelmingly likely that ``b.sum()``
    is negative at least once, exercising both compiled code paths.
    """
    for _ in range(100):
        toy_example(torch.randn(10), torch.randn(10))
37
+
38
+
39
+ if __name__ == "__main__":
40
+ import os
41
+ import shutil
42
+
43
+ out = "./magi_dump_src_dir"
44
+ if os.path.exists(out):
45
+ shutil.rmtree(out)
46
+ with dump_src(out):
47
+ main()
48
+
49
+ print("\n=== Generated files ===")
50
+ for root, dirs, files in os.walk(out):
51
+ level = root.replace(out, "").count(os.sep)
52
+ print(f"{' ' * level}{os.path.basename(root)}/")
53
+ for f in files:
54
+ print(f"{' ' * (level + 1)}{f}")
pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/__init__.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Inspection layer: capture torch.compile events, introspect artifacts, write structured output."""
16
+
17
+ from typing import Optional
18
+
19
+ from .dump_src import dump_src
20
+ from .introspect import Introspector
21
+ from .model import CompiledFnInfo, EntryInfo, FunctionInfo, GuardInfo, GuardNode, SubgraphInfo
22
+ from .result import CaptureResult
23
+ from .session import CaptureSession
24
+ from .writer import FunctionWriter, write_function
25
+
26
+
27
def debug_compiled(fn, output_dir: Optional[str] = None) -> FunctionInfo:
    """Introspect a compiled function and optionally persist debug output.

    Args:
        fn: The original (uncompiled) function.
        output_dir: Target directory for the organized debug files; when
            ``None`` nothing is written to disk.

    Returns:
        A ``FunctionInfo`` snapshot of the full compilation state.
    """
    function_info = Introspector.build_function_info(fn)
    if output_dir is None:
        return function_info
    write_function(function_info, output_dir)
    return function_info
41
+
42
+
43
+ __all__ = [
44
+ "CompiledFnInfo",
45
+ "EntryInfo",
46
+ "FunctionInfo",
47
+ "GuardInfo",
48
+ "GuardNode",
49
+ "SubgraphInfo",
50
+ "Introspector",
51
+ "FunctionWriter",
52
+ "write_function",
53
+ "dump_src",
54
+ "debug_compiled",
55
+ "CaptureSession",
56
+ "CaptureResult",
57
+ ]
pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/dump_src.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """dump_src: context manager that captures torch.compile artifacts and
16
+ writes structured source output to disk.
17
+
18
+ Usage::
19
+
20
+ from magi_compiler.magi_depyf.inspect import dump_src
21
+
22
+ @torch.compile
23
+ def my_fn(x):
24
+ return x.sum()
25
+
26
+ with dump_src("./output_dir"):
27
+ my_fn(torch.randn(10))
28
+
29
+ Internally uses ``CaptureSession`` to intercept compilation events,
30
+ then runs ``Introspector`` post-hoc for full introspection.
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import contextlib
36
+ from pathlib import Path
37
+ from typing import Set
38
+
39
+ from magi_compiler.utils import magi_logger
40
+
41
+ from .introspect import Introspector
42
+ from .session import CaptureSession
43
+ from .writer import write_function
44
+
45
+
46
@contextlib.contextmanager
def dump_src(dump_src_dir: str):
    """Capture torch.compile artifacts produced inside the ``with`` body
    and write structured source output under *dump_src_dir*.

    Compilation events are recorded via ``CaptureSession``; once the body
    finishes, each captured function is introspected post-hoc and written
    out. Dynamo resume functions and duplicate names are skipped, and
    per-function failures are logged rather than raised.
    """
    out_dir = Path(dump_src_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    with CaptureSession() as session:
        yield

    processed: Set[str] = set()
    overviews: list[Path] = []
    for result in session.results:
        fn_name = result.original_code.co_name
        if fn_name in processed or fn_name.startswith("torch_dynamo_resume_in_"):
            continue
        processed.add(fn_name)

        try:
            info = Introspector.build_function_info(result.original_code, fn_globals=result.fn_globals)
            root = write_function(info, out_dir)
            overviews.append(root / "overview.md")
        except Exception as exc:
            magi_logger.warning("[magi_depyf] failed to process '%s': %s", fn_name, exc)

    for overview in overviews:
        if overview.exists():
            magi_logger.info("[magi_depyf] %s", overview)
pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/introspect.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Runtime introspection of torch.compile artifacts.
16
+
17
+ Walk actual runtime state (CacheEntry chain, guard trees, __compiled_fn
18
+ objects) to build the structured model. All torch imports are lazy so
19
+ this module can be imported without torch.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import io
25
+ from pathlib import Path
26
+ from typing import Any, Dict, Optional
27
+
28
+ from ..decompile import safe_decompile
29
+ from .model import CompiledFnInfo, EntryInfo, FunctionInfo, GuardInfo, GuardNode, SubgraphInfo
30
+
31
+
32
+ class Introspector:
33
+ """Namespace for runtime introspection helpers (all static methods)."""
34
+
35
+ @staticmethod
36
+ def get_cache_entries(fn) -> list:
37
+ """Return CacheEntry list for *fn* (function or code object)."""
38
+ from torch._dynamo.eval_frame import _debug_get_cache_entry_list
39
+
40
+ code = fn.__code__ if hasattr(fn, "__code__") else fn
41
+ return _debug_get_cache_entry_list(code)
42
+
43
+ @staticmethod
44
+ def build_guard_tree(node, max_depth: int = 32, _depth: int = 0) -> GuardNode:
45
+ """Recursively build a GuardNode from a GuardManager C++ object."""
46
+ type_name = type(node).__name__
47
+ leaf_guards = []
48
+ for lg in node.get_leaf_guards():
49
+ for part in lg.verbose_code_parts():
50
+ leaf_guards.append(part.strip()[:120])
51
+ children = []
52
+ if _depth < max_depth:
53
+ for child in node.get_child_managers():
54
+ children.append(Introspector.build_guard_tree(child, max_depth, _depth + 1))
55
+ return GuardNode(type_name=type_name, leaf_guards=leaf_guards, children=children)
56
+
57
+ @staticmethod
58
+ def extract_guard_info(entry) -> Optional[GuardInfo]:
59
+ """Extract structured guard info from a CacheEntry (post-hoc introspection).
60
+
61
+ Operates on the persisted CacheEntry and builds a full GuardNode tree.
62
+ """
63
+ try:
64
+ gm = entry.guard_manager
65
+ tree = Introspector.build_guard_tree(gm.root)
66
+ closure_vars: Dict[str, str] = {}
67
+ if hasattr(gm, "closure_vars") and gm.closure_vars:
68
+ for k, v in list(gm.closure_vars.items())[:10]:
69
+ closure_vars[k] = repr(v)[:100]
70
+ return GuardInfo(tree=tree, closure_vars=closure_vars or None)
71
+ except Exception:
72
+ return None
73
+
74
+ @staticmethod
75
+ def extract_compiled_fn_info(name: str, fn_globals: dict) -> Optional[CompiledFnInfo]:
76
+ """Inspect a __compiled_fn_xxx from fn.__globals__.
77
+
78
+ Handles three backend types:
79
+ eager: wrapper -> closure[0]=GraphModule.forward (bound method)
80
+ inductor: wrapper -> closure[0]=aot_forward -> ... -> CompiledFxGraph
81
+ magi_compile: MagiSerializableFunction -> split_gm -> PiecewiseBackend(s)
82
+ """
83
+ obj = fn_globals.get(name)
84
+ if obj is None:
85
+ return None
86
+
87
+ magi_info = Introspector._try_extract_magi_info(name, obj)
88
+ if magi_info is not None:
89
+ return magi_info
90
+
91
+ info = CompiledFnInfo(name=name, backend="eager")
92
+
93
+ gm = Introspector._find_graph_module(obj)
94
+ if gm is not None:
95
+ Introspector._fill_graph_module_info(info, gm)
96
+
97
+ cfx = Introspector._find_compiled_fx_graph(obj)
98
+ if cfx is not None:
99
+ info.backend = "inductor"
100
+ Introspector._fill_compiled_fx_graph_info(info, cfx)
101
+
102
+ return info
103
+
104
    @staticmethod
    def _fill_graph_module_info(info: CompiledFnInfo, gm) -> None:
        """Populate *info*'s code fields from an ``fx.GraphModule``.

        Each probe is independent and best-effort: a failure leaves the
        corresponding field at its previous value without affecting the rest.
        """
        try:
            info.readable_code = gm.print_readable(print_output=False)
        except Exception:
            pass
        try:
            info.graph_module_code = str(gm.code) if hasattr(gm, "code") else None
        except Exception:
            pass
        try:
            # print_tabular writes to a file object; capture into a string.
            buf = io.StringIO()
            gm.graph.print_tabular(file=buf)
            info.fx_graph_tabular = buf.getvalue()
        except Exception:
            pass
120
+
121
    @staticmethod
    def _fill_compiled_fx_graph_info(info: CompiledFnInfo, cfx) -> None:
        """Copy inductor artifacts from a ``CompiledFxGraph`` onto *info*.

        Each attribute is probed independently; missing attributes
        (these presumably vary across torch versions) are silently skipped.
        """
        try:
            info.source_code = cfx.source_code
        except Exception:
            pass
        try:
            info.inductor_post_grad_graph = cfx.inductor_post_grad_graph_str
        except Exception:
            pass
        try:
            info.cache_key = cfx.cache_key
        except Exception:
            pass
        try:
            info.runnable_graph_str = cfx.runnable_graph_str
        except Exception:
            pass
139
+
140
+ # -- Magi backend introspection ----------------------------------------
141
+
142
    @staticmethod
    def _try_extract_magi_info(name: str, obj) -> Optional[CompiledFnInfo]:
        """Detect a MagiSerializableFunction and walk its hierarchy.

        MagiSerializableFunction hierarchy::

            .graph_module   → fx.GraphModule (full graph before splitting)
            .optimized_call → split_gm (fx.GraphModule with PiecewiseBackend submodules)
                .submod_N   → PiecewiseBackend
                    .graph  → fx.GraphModule (the subgraph)
                    .compiled_graph_for_general_shape → inductor compiled output

        Dynamo wraps the backend result in a DisableContext closure, so the
        MagiSerializableFunction may live one level deep in the closure chain.
        Returns ``None`` when *obj* is not a Magi-compiled artifact.
        """
        # Duck-type detection: anything with both attributes is treated as
        # a MagiSerializableFunction, directly or inside a closure cell.
        msf = obj if (hasattr(obj, "graph_module") and hasattr(obj, "optimized_call")) else None
        if msf is None and callable(obj) and getattr(obj, "__closure__", None):
            for cell in obj.__closure__:
                try:
                    val = cell.cell_contents
                except ValueError:
                    # Empty (not-yet-filled) closure cell.
                    continue
                if hasattr(val, "graph_module") and hasattr(val, "optimized_call"):
                    msf = val
                    break
        if msf is None:
            return None
        obj = msf

        import torch.fx

        info = CompiledFnInfo(name=name, backend="magi_compile")

        full_gm = getattr(obj, "graph_module", None)
        if isinstance(full_gm, torch.fx.GraphModule):
            Introspector._fill_graph_module_info(info, full_gm)

        split_gm = getattr(obj, "optimized_call", None)

        # In FULL cudagraph mode, optimized_call is a wrapper function whose
        # __dict__ carries the GraphModule's attributes (via __dict__.update).
        # Unwrap to find the actual GraphModule for print_readable / named_children.
        actual_gm = split_gm if isinstance(split_gm, torch.fx.GraphModule) else None
        if actual_gm is None and split_gm is not None:
            actual_gm = Introspector._find_graph_module_deep(split_gm)

        info.cudagraph_mode = Introspector._detect_cudagraph_mode(split_gm, actual_gm)

        if actual_gm is not None:
            try:
                info.split_graph_readable = actual_gm.print_readable(print_output=False)
            except Exception:
                pass

            # PiecewiseCompileInterpreter replaces submodules via __dict__,
            # so named_children() still sees the original GraphModules while
            # __dict__ contains the PiecewiseBackend (or cudagraph wrapper).
            # In FULL cudagraph mode, those __dict__ entries are copied onto
            # the wrapper function, so we look up runtime objects from
            # split_gm (the wrapper) rather than actual_gm.
            runtime_source = split_gm if split_gm is not None else actual_gm
            for sub_name, original_gm in actual_gm.named_children():
                runtime_obj = runtime_source.__dict__.get(sub_name, original_gm)
                sg_info = Introspector._extract_subgraph_info(sub_name, runtime_obj, original_gm)
                if sg_info is not None:
                    info.subgraph_infos.append(sg_info)

        # Deterministic ordering for writers/diffs (no-op when empty).
        info.subgraph_infos.sort(key=lambda s: s.name)

        return info
211
+
212
    @staticmethod
    def _extract_subgraph_info(sub_name: str, runtime_obj, original_gm=None) -> Optional[SubgraphInfo]:
        """Extract info from one submodule of the split graph.

        Args:
            sub_name: The submodule name (e.g. "submod_0").
            runtime_obj: The actual runtime object — PiecewiseBackend,
                cudagraph wrapper, or the original GraphModule.
            original_gm: The original GraphModule before replacement
                (from ``_modules``).

        Returns:
            A ``SubgraphInfo``, or ``None`` when neither a PiecewiseBackend
            nor a GraphModule can be identified.
        """
        import torch.fx

        piecewise = Introspector._unwrap_piecewise_backend(runtime_obj)

        if piecewise is not None:
            # Compiled (non-splitting) subgraph: record its FX graph and try
            # to recover the inductor kernel source.
            sg = SubgraphInfo(name=sub_name, is_splitting_graph=False)
            inner_gm = getattr(piecewise, "graph", None)
            if isinstance(inner_gm, torch.fx.GraphModule):
                Introspector._fill_subgraph_gm_info(sg, inner_gm)

            compiled = getattr(piecewise, "compiled_graph_for_general_shape", None)
            if compiled is not None:
                sg.inductor_code = Introspector._try_extract_inductor_source(compiled)

            if sg.inductor_code is None:
                # Fallback: read the source from the on-disk artifact cache.
                sg.inductor_code = Introspector._read_artifact_source_from_piecewise(piecewise)

            return sg

        # No PiecewiseBackend: treat the submodule as a splitting-graph
        # GraphModule (prefer the pre-replacement original when available).
        gm = original_gm if isinstance(original_gm, torch.fx.GraphModule) else None
        if gm is None and isinstance(runtime_obj, torch.fx.GraphModule):
            gm = runtime_obj

        if gm is not None:
            sg = SubgraphInfo(name=sub_name, is_splitting_graph=True)
            Introspector._fill_subgraph_gm_info(sg, gm)
            return sg

        return None
251
+
252
@staticmethod
def _unwrap_piecewise_backend(obj):
    """Find a PiecewiseBackend from obj, unwrapping closures/wrappers if needed."""

    def _looks_like_piecewise(candidate) -> bool:
        # Duck-typed: a PiecewiseBackend exposes both of these attributes.
        return hasattr(candidate, "graph") and hasattr(candidate, "compiled_graph_for_general_shape")

    if _looks_like_piecewise(obj):
        return obj

    closure = getattr(obj, "__closure__", None) if callable(obj) else None
    for cell in closure or ():
        try:
            contents = cell.cell_contents
        except ValueError:
            # Cell not yet populated.
            continue
        if _looks_like_piecewise(contents):
            return contents
    return None
267
+
268
@staticmethod
def _fill_subgraph_gm_info(sg: SubgraphInfo, gm) -> None:
    """Best-effort population of sg's readable/code/tabular fields from a GraphModule.

    Each extraction is independently guarded; a failure leaves the
    corresponding field untouched (None).
    """
    try:
        sg.readable_code = gm.print_readable(print_output=False)
    except Exception:
        pass
    try:
        if hasattr(gm, "code"):
            sg.graph_module_code = str(gm.code)
        else:
            sg.graph_module_code = None
    except Exception:
        pass
    try:
        stream = io.StringIO()
        gm.graph.print_tabular(file=stream)
        sg.fx_graph_tabular = stream.getvalue()
    except Exception:
        pass
284
+
285
@staticmethod
def _try_extract_inductor_source(compiled) -> Optional[str]:
    """Try to extract inductor kernel source from a compiled graph object.

    Handles CompiledFxGraph, CompiledArtifact, and closure-wrapped variants.
    """
    # 1) Direct attribute lookup (CompiledArtifact-style objects).
    for attr_name in ("source_code", "_source_code"):
        candidate = getattr(compiled, attr_name, None)
        if isinstance(candidate, str) and candidate:
            return candidate

    # 2) A CompiledFxGraph hidden in the closure chain.
    fx_graph = Introspector._find_compiled_fx_graph(compiled)
    if fx_graph is not None:
        try:
            return fx_graph.source_code
        except Exception:
            pass

    # 3) Last resort: anything with a print_readable() method.
    if hasattr(compiled, "print_readable"):
        try:
            return compiled.print_readable(print_output=False)
        except Exception:
            pass

    return None
310
+
311
@staticmethod
def _read_artifact_source_from_piecewise(piecewise) -> Optional[str]:
    """Read Inductor-generated source from the saved artifact directory.

    PiecewiseBackend stores a compiler_manager whose cache maps
    CacheEntry(runtime_shape, graph_index, backend_name) → CacheHandle(key, path).
    The artifact at CacheHandle.path is an unpacked directory containing
    ``py/*.py`` — the full Inductor output code.
    """
    try:
        manager = getattr(piecewise, "compiler_manager", None)
        if manager is None:
            return None
        cache = getattr(manager, "cache", None)
        graph_index = getattr(piecewise, "piecewise_compile_index", None)
        if not cache or graph_index is None:
            return None

        # Match the general-shape entry (runtime_shape is None) for this
        # piecewise graph index.
        for entry, handle in cache.items():
            if entry.graph_index != graph_index or entry.runtime_shape is not None:
                continue
            artifact_path = getattr(handle, "path", None)
            if artifact_path:
                return Introspector._read_py_from_artifact(artifact_path)
        return None
    except Exception:
        # Cache layout is an internal detail; introspection must not raise.
        return None
339
+
340
@staticmethod
def _read_py_from_artifact(artifact_path: str) -> Optional[str]:
    """Read the Inductor-generated Python wrapper from an artifact directory.

    The unpacked artifact layout varies across PyTorch versions; the
    wrapper ``.py`` file has been observed under ``yb/`` and ``py/``.
    Known directories are tried first, then the remaining immediate
    subdirectories are scanned (the previous implementation duplicated
    the read logic and redundantly re-scanned ``yb``/``py`` in the
    fallback pass).

    Args:
        artifact_path: Path to the unpacked artifact directory.

    Returns:
        The text of the first ``*.py`` file found, or None.
    """

    def _first_py_text(directory: Path) -> Optional[str]:
        # Best-effort: only the lexicographically first .py file is tried,
        # matching the original behavior.
        if not directory.is_dir():
            return None
        py_files = sorted(directory.glob("*.py"))
        if not py_files:
            return None
        try:
            return py_files[0].read_text(encoding="utf-8")
        except Exception:
            return None

    root = Path(artifact_path)
    if not root.is_dir():
        return None

    known_dirs = ("yb", "py")
    for name in known_dirs:
        text = _first_py_text(root / name)
        if text is not None:
            return text

    # Fallback: scan all other immediate subdirectories.
    for child in sorted(root.iterdir()):
        if child.name in known_dirs:
            continue  # already tried above
        text = _first_py_text(child)
        if text is not None:
            return text
    return None
372
+
373
@staticmethod
def _detect_cudagraph_mode(split_gm, actual_gm) -> str:
    """Detect cudagraph wrapping mode from the split graph structure.

    - FULL: split_gm itself is a cudagraph wrapper (not a GraphModule),
      with __qualname__ containing 'Athena_CUDAGraph_full'.
    - PIECEWISE: split_gm is a GraphModule, but its __dict__ submodules are
      cudagraph wrappers with __qualname__ 'Athena_CUDAGraph_piecewise'.
    - NONE: no cudagraph wrapping detected.
    """
    full_prefix = "Athena_CUDAGraph_full"
    piecewise_prefix = "Athena_CUDAGraph_piecewise"

    def _qualname_of(obj) -> str:
        return getattr(obj, "__qualname__", "") or ""

    if _qualname_of(split_gm).startswith(full_prefix):
        return "FULL"

    if actual_gm is not None:
        submodules = (
            value for key, value in actual_gm.__dict__.items() if key.startswith("submod_")
        )
        if any(_qualname_of(sub).startswith(piecewise_prefix) for sub in submodules):
            return "PIECEWISE"

    return "NONE"
398
+
399
@staticmethod
def _find_graph_module_deep(obj, _depth: int = 0, _max_depth: int = 4) -> Optional[Any]:
    """Recursively walk the closure chain looking for a ``torch.fx.GraphModule``.

    Needed for FULL cudagraph mode, where the split GraphModule is wrapped
    by ``gen_wrap_func_for_cudagraph`` (+ ``@instrument_nvtx``) and thus
    sits 2-3 levels deep in nested closures.
    """
    import torch.fx

    if isinstance(obj, torch.fx.GraphModule):
        return obj
    if _depth >= _max_depth:
        return None
    closure = getattr(obj, "__closure__", None) if callable(obj) else None
    if not closure:
        return None
    for cell in closure:
        try:
            candidate = cell.cell_contents
        except ValueError:
            # Cell not yet populated.
            continue
        if isinstance(candidate, torch.fx.GraphModule):
            return candidate
        if callable(candidate):
            nested = Introspector._find_graph_module_deep(candidate, _depth + 1, _max_depth)
            if nested is not None:
                return nested
    return None
427
+
428
@staticmethod
def _find_graph_module(obj) -> Optional[Any]:
    """Walk closure chain to find a torch.fx.GraphModule."""
    import torch.fx

    def _as_graph_module(candidate):
        # Either the object itself, or a method bound to a GraphModule.
        if isinstance(candidate, torch.fx.GraphModule):
            return candidate
        bound_to = getattr(candidate, "__self__", None)
        if isinstance(bound_to, torch.fx.GraphModule):
            return bound_to
        return None

    direct = _as_graph_module(obj)
    if direct is not None:
        return direct

    if not callable(obj):
        return None
    for cell in getattr(obj, "__closure__", None) or ():
        try:
            contents = cell.cell_contents
        except ValueError:
            continue
        found = _as_graph_module(contents)
        if found is not None:
            return found
    return None
449
+
450
@staticmethod
def _find_compiled_fx_graph(obj, _depth: int = 0) -> Optional[Any]:
    """Walk closure chain (up to 4 levels) to find a CompiledFxGraph."""
    if _depth > 4:
        return None
    try:
        from torch._inductor.codecache import CompiledFxGraph
    except ImportError:
        return None

    if isinstance(obj, CompiledFxGraph):
        return obj
    closure = getattr(obj, "__closure__", None) if callable(obj) else None
    if not closure:
        return None
    for cell in closure:
        try:
            contents = cell.cell_contents
        except ValueError:
            # Cell not yet populated.
            continue
        if isinstance(contents, CompiledFxGraph):
            return contents
        if callable(contents):
            nested = Introspector._find_compiled_fx_graph(contents, _depth + 1)
            if nested is not None:
                return nested
    return None
475
+
476
@staticmethod
def build_entry_info(entry, index: int, fn_globals: dict) -> EntryInfo:
    """Build an EntryInfo from a CacheEntry."""
    transformed_code = entry.code
    decompiled_src = safe_decompile(transformed_code)

    # __compiled_fn_* globals map to backend-compiled graphs.
    compiled_fns = []
    for name in transformed_code.co_names:
        if not name.startswith("__compiled"):
            continue
        cf_info = Introspector.extract_compiled_fn_info(name, fn_globals)
        if cf_info:
            compiled_fns.append(cf_info)

    # __resume_* globals are recursively introspected as functions.
    resume_fns = []
    for name in transformed_code.co_names:
        if not name.startswith("__resume"):
            continue
        resume_fn = fn_globals.get(name)
        if resume_fn is None or not hasattr(resume_fn, "__code__"):
            continue
        resume_info = Introspector.build_function_info(resume_fn, fn_globals=fn_globals)
        resume_info.name = name
        resume_fns.append(resume_info)

    guard = Introspector.extract_guard_info(entry)

    return EntryInfo(
        index=index,
        dynamo_code=transformed_code,
        decompiled_source=decompiled_src,
        guard=guard,
        compiled_fns=compiled_fns,
        resume_fns=resume_fns,
    )
508
+
509
@staticmethod
def build_function_info(fn, fn_globals: Optional[dict] = None) -> FunctionInfo:
    """Build full FunctionInfo by walking CacheEntry chain."""
    if fn_globals is None:
        fn_globals = getattr(fn, "__globals__", {})

    # Accept either a function or a bare code object.
    code_obj = getattr(fn, "__code__", fn)
    source = safe_decompile(code_obj)

    entries = [
        Introspector.build_entry_info(raw_entry, i, fn_globals)
        for i, raw_entry in enumerate(Introspector.get_cache_entries(fn))
    ]

    return FunctionInfo(
        name=code_obj.co_name,
        original_code=code_obj,
        original_source=source,
        entries=entries,
    )
pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/model.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Data model for structured compilation output.
16
+
17
+ These dataclasses represent the full compilation state that
18
+ torch.compile produces, organized to reflect the actual runtime
19
+ structure: CacheEntry linked list, fn/resume recursion,
20
+ compiled_fn → backend mapping, and guard trees.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import dataclasses
26
+ import dis
27
+ import inspect
28
+ import io
29
+ from types import CodeType
30
+ from typing import Dict, List, Optional
31
+
32
+
33
def format_code_info(code: CodeType) -> str:
    """Format key attributes of a CodeType for debugging."""
    out: List[str] = []
    out.append(f"co_name:          {code.co_name}")
    # co_qualname only exists on Python 3.11+.
    if hasattr(code, "co_qualname"):
        out.append(f"co_qualname:      {code.co_qualname}")
    out.append(f"co_filename:      {code.co_filename}")
    out.append(f"co_firstlineno:   {code.co_firstlineno}")
    out.append(f"co_argcount:      {code.co_argcount}")
    out.append(f"co_kwonlyargcount:{code.co_kwonlyargcount}")
    out.append(f"co_varnames:      {code.co_varnames}")
    out.append(f"co_freevars:      {code.co_freevars}")
    out.append(f"co_cellvars:      {code.co_cellvars}")
    out.append(f"co_names:         {code.co_names}")
    flag_bits = code.co_flags
    active_flags = [name for name, bit in _CODE_FLAGS.items() if flag_bits & bit]
    out.append(f"co_flags:         0x{flag_bits:04x} ({' | '.join(active_flags) if active_flags else 'none'})")
    out.append(f"co_stacksize:     {code.co_stacksize}")
    out.append("")
    out.append("co_consts:")
    for idx, const in enumerate(code.co_consts):
        out.append(f"  [{idx:3d}] {type(const).__name__:12s} {_safe_repr(const)}")
    out.append("")
    out.append("dis:")
    sink = io.StringIO()
    dis.dis(code, file=sink)
    out.append(sink.getvalue())
    return "\n".join(out)
61
+
62
+
63
# Code-object flag bits surfaced by format_code_info, keyed by flag name.
_CODE_FLAGS = {
    flag_name: getattr(inspect, flag_name)
    for flag_name in (
        "CO_OPTIMIZED",
        "CO_NEWLOCALS",
        "CO_VARARGS",
        "CO_VARKEYWORDS",
        "CO_NESTED",
        "CO_GENERATOR",
        "CO_COROUTINE",
        "CO_ASYNC_GENERATOR",
    )
}
73
+
74
+
75
def _safe_repr(obj, max_len: int = 120) -> str:
    """repr(obj), guarded against raising and truncated to max_len characters."""
    try:
        text = repr(obj)
    except Exception:
        text = f"<repr failed: {type(obj).__name__}>"
    if len(text) <= max_len:
        return text
    return text[: max_len - 3] + "..."
83
+
84
+
85
@dataclasses.dataclass
class GuardNode:
    """One node in the guard tree (mirrors RootGuardManager / GuardManager)."""

    type_name: str
    leaf_guards: List[str]
    children: List["GuardNode"] = dataclasses.field(default_factory=list)

    def format(self, depth: int = 0, max_depth: int = 32) -> str:
        """Render this node and its children (to max_depth), indented by depth."""
        pad = " " * depth
        rendered = [
            f"{pad}[{self.type_name}] ({len(self.leaf_guards)} leaf guards, {len(self.children)} children)"
        ]
        rendered.extend(f"{pad}  LEAF: {g}" for g in self.leaf_guards)
        if depth < max_depth:
            for idx, child in enumerate(self.children):
                rendered.append(f"{pad}  child[{idx}]:")
                rendered.append(child.format(depth + 2, max_depth))
        elif self.children:
            # Depth cap reached: summarize instead of recursing.
            rendered.append(f"{pad}  ... ({len(self.children)} children omitted)")
        return "\n".join(rendered)
105
+
106
+
107
@dataclasses.dataclass
class SubgraphInfo:
    """One piecewise subgraph in the magi split pipeline."""

    name: str
    is_splitting_graph: bool = False
    readable_code: Optional[str] = None
    graph_module_code: Optional[str] = None
    fx_graph_tabular: Optional[str] = None
    inductor_code: Optional[str] = None

    def format(self) -> str:
        """Best available representation: inductor > readable > GM code > stub."""
        for candidate in (self.inductor_code, self.readable_code, self.graph_module_code):
            if candidate:
                return candidate
        tag = "splitting_op" if self.is_splitting_graph else "compiled"
        return f"# {self.name} ({tag})\n"
127
+
128
+
129
@dataclasses.dataclass
class CompiledFnInfo:
    """What __compiled_fn_xxx actually points to in the backend."""

    name: str
    backend: str  # "eager", "inductor", or "magi_compile"
    cudagraph_mode: Optional[str] = None  # "NONE", "PIECEWISE", "FULL" (magi_compile only)
    readable_code: Optional[str] = None
    graph_module_code: Optional[str] = None
    fx_graph_tabular: Optional[str] = None
    source_code: Optional[str] = None
    inductor_post_grad_graph: Optional[str] = None
    runnable_graph_str: Optional[str] = None
    cache_key: Optional[str] = None
    split_graph_readable: Optional[str] = None
    subgraph_infos: List["SubgraphInfo"] = dataclasses.field(default_factory=list)

    def format(self) -> str:
        """Full content for writing to file (compiled output)."""
        for candidate in (self.source_code, self.readable_code, self.graph_module_code):
            if candidate:
                return candidate
        return f"# {self.name} (backend={self.backend})\n"

    def format_summary(self) -> str:
        """Short summary for overview / full_code."""
        cg_part = f", cudagraph={self.cudagraph_mode}" if self.cudagraph_mode else ""
        lines = [f"{self.name} (backend={self.backend}{cg_part})"]
        if self.cache_key:
            lines.append(f"  cache_key: {self.cache_key}")
        if self.graph_module_code:
            lines.append("  GraphModule.code:")
            lines.extend(f"    {code_line}" for code_line in self.graph_module_code.strip().splitlines())
        if self.subgraph_infos:
            lines.append(f"  piecewise subgraphs: {len(self.subgraph_infos)}")
            for sg in self.subgraph_infos:
                kind = "splitting_op" if sg.is_splitting_graph else "compiled"
                lines.append(f"    {sg.name} ({kind})")
        return "\n".join(lines)
175
+
176
+
177
@dataclasses.dataclass
class GuardInfo:
    """Guard information for a CacheEntry."""

    tree: Optional[GuardNode] = None
    closure_vars: Optional[Dict[str, str]] = None

    def format(self) -> str:
        """Render the guard tree plus (at most 8) closure variables."""
        parts = []
        if self.tree:
            parts.append(self.tree.format())
        if self.closure_vars:
            parts.append("  closure_vars:")
            # Cap at 8 entries to keep the summary short.
            for key, val in list(self.closure_vars.items())[:8]:
                parts.append(f"    {key} = {val}")
        return "\n".join(parts)
193
+
194
+
195
@dataclasses.dataclass
class EntryInfo:
    """One CacheEntry in the linked list."""

    index: int
    dynamo_code: Optional[CodeType] = None
    decompiled_source: str = ""
    guard: Optional[GuardInfo] = None
    compiled_fns: List[CompiledFnInfo] = dataclasses.field(default_factory=list)
    resume_fns: List["FunctionInfo"] = dataclasses.field(default_factory=list)

    def format(self, indent: int = 0) -> str:
        """Render this entry's source, compiled fns, guards, and resume fns."""
        pad = " " * indent
        out = [f"{pad}entry[{self.index}]:"]
        if self.decompiled_source:
            out.append(f"{pad}  dynamo_code (decompiled):")
            out.extend(f"{pad}    {src_line}" for src_line in self.decompiled_source.splitlines())
        if self.compiled_fns:
            out.append(f"{pad}  compiled functions:")
            out.extend(cf.format_summary() for cf in self.compiled_fns)
        if self.guard:
            out.append(f"{pad}  guards:")
            out.append(self.guard.format())
        if self.resume_fns:
            out.append(f"{pad}  resume functions:")
            out.extend(rf.format(indent + 2) for rf in self.resume_fns)
        return "\n".join(out)
225
+
226
+
227
@dataclasses.dataclass
class FunctionInfo:
    """A compiled function and its CacheEntry chain."""

    name: str
    original_code: Optional[CodeType] = None
    original_source: str = ""
    entries: List[EntryInfo] = dataclasses.field(default_factory=list)

    def format(self, indent: int = 0) -> str:
        """Render the function header followed by each cache entry."""
        pad = " " * indent
        out = [f"{pad}{self.name}: {len(self.entries)} cache entries"]
        out.extend(entry.format(indent + 1) for entry in self.entries)
        return "\n".join(out)
pkgs/MagiCompiler/magi_compiler/magi_depyf/inspect/result.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """CaptureResult — structured data model for one compilation event."""
16
+
17
+ from __future__ import annotations
18
+
19
+ import dataclasses
20
+ import time
21
+ from types import CodeType
22
+ from typing import List, Optional
23
+
24
+
25
@dataclasses.dataclass
class CaptureResult:
    """Data captured from a single ``torch.compile`` bytecode event.

    - original_code: the user's original function code
    - dynamo_code: the code after Dynamo transformation (with __compiled_fn / __resume calls)
    - decompiled_source: dynamo_code decompiled back to Python source
    - fn_globals: the function's global namespace (for post-hoc introspection)
    """

    function_name: str
    original_code: CodeType
    dynamo_code: CodeType
    decompiled_source: str
    fn_globals: Optional[dict] = None
    guards: List[str] = dataclasses.field(default_factory=list)
    graph_source: Optional[str] = None
    timestamp: float = dataclasses.field(default_factory=time.time)

    def summary(self) -> str:
        """One-line overview: function, original code name, guard count, graph presence."""
        graph_flag = "yes" if self.graph_source else "no"
        return (
            f"[{self.function_name}] "
            f"original={self.original_code.co_name}, "
            f"guards={len(self.guards)}, "
            f"graph={graph_flag}"
        )