From b3a5bb6a8d79be963341367f1ebd66440d884ad8 Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Wed, 17 Sep 2025 17:51:35 -0700
Subject: [PATCH 01/10] Refactor setup.py for lazy loading and build
 optimization

- Introduced lazy imports for heavy dependencies like `torch` and `torch.utils.cpp_extension` to reduce initial import overhead.
- Replaced the existing `TorchAOBuildExt` class with `LazyTorchAOBuildExt` to defer submodule checks and extension discovery until build time.
- Updated the `setup()` function to set `ext_modules` to an empty list, deferring extension discovery for performance improvements.
- Enhanced debug output for build processes based on environment variables.

This refactor aims to streamline the build process and improve performance during package setup.
---
 setup.py | 189 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 105 insertions(+), 84 deletions(-)

diff --git a/setup.py b/setup.py
index fd4ee9f40f..88d4963f8f 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@
 from typing import List, Optional
 
 from setuptools import Extension, find_packages, setup
+from setuptools.command.build_ext import build_ext as _setuptools_build_ext
 
 current_date = datetime.now().strftime("%Y%m%d")
 
@@ -97,17 +98,7 @@ def read_version(file_path="version.txt"):
 def use_debug_mode():
     return os.getenv("DEBUG", "0") == "1"
 
-
-import torch
-from torch.utils.cpp_extension import (
-    CUDA_HOME,
-    IS_WINDOWS,
-    ROCM_HOME,
-    BuildExtension,
-    CppExtension,
-    CUDAExtension,
-    _get_cuda_arch_flags,
-)
+# Heavy imports (torch, torch.utils.cpp_extension) are deferred to build time
 
 
 class BuildOptions:
@@ -139,6 +130,7 @@ def __init__(self):
             "TORCHAO_BUILD_EXPERIMENTAL_MPS", default=False
         )
         if self.build_experimental_mps:
+            import torch  # Lazy import
             assert is_macos, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires macOS"
             assert is_arm64, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires arm64"
             assert torch.mps.is_available(), (
@@ -264,6 +256,9 @@ def get_cutlass_build_flags():
     """Determine which CUTLASS kernels to build based on CUDA version.
     SM90a: CUDA 12.6+, SM100a: CUDA 12.8+
     """
+    # Lazy import torch and helper; only needed when building CUDA extensions
+    import torch
+    from torch.utils.cpp_extension import _get_cuda_arch_flags
     # Try nvcc then torch version
     cuda_version = get_cuda_version_from_nvcc() or torch.version.cuda
 
@@ -290,64 +285,69 @@ def get_cutlass_build_flags():
         )
 
 
-# BuildExtension is a subclass of from setuptools.command.build_ext.build_ext
-class TorchAOBuildExt(BuildExtension):
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
-    def build_extensions(self):
-        cmake_extensions = [
-            ext for ext in self.extensions if isinstance(ext, CMakeExtension)
-        ]
-        other_extensions = [
-            ext for ext in self.extensions if not isinstance(ext, CMakeExtension)
-        ]
-        for ext in cmake_extensions:
-            self.build_cmake(ext)
-
-        # Use BuildExtension to build other extensions
-        self.extensions = other_extensions
-        super().build_extensions()
-
-        self.extensions = other_extensions + cmake_extensions
-
-    def build_cmake(self, ext):
-        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
-
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-
-        # Get the expected extension file name that Python will look for
-        # We force CMake to use this library name
-        ext_filename = os.path.basename(self.get_ext_filename(ext.name))
-        ext_basename = os.path.splitext(ext_filename)[0]
-
-        print(
-            "CMAKE COMMANG",
-            [
-                "cmake",
-                ext.cmake_lists_dir,
-            ]
-            + ext.cmake_args
-            + [
-                "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
-                "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
-            ],
-        )
+class LazyTorchAOBuildExt(_setuptools_build_ext):
+    def run(self):
+        # Import heavy torch build only when actually running build_ext
+        from torch.utils.cpp_extension import BuildExtension as _BuildExtension
+
+        class _TorchAOBuildExt(_BuildExtension):
+            def run(self_inner):
+                if os.getenv("USE_CPP", "1") != "0":
+                    check_submodules()
+                if not self_inner.distribution.ext_modules:
+                    self_inner.distribution.ext_modules = get_extensions()
+                super(_TorchAOBuildExt, self_inner).run()
+
+            def build_extensions(self_inner):
+                cmake_extensions = [
+                    ext for ext in self_inner.extensions if isinstance(ext, CMakeExtension)
+                ]
+                other_extensions = [
+                    ext for ext in self_inner.extensions if not isinstance(ext, CMakeExtension)
+                ]
+                for ext in cmake_extensions:
+                    self_inner.build_cmake(ext)
+
+                self_inner.extensions = other_extensions
+                super(_TorchAOBuildExt, self_inner).build_extensions()
+                self_inner.extensions = other_extensions + cmake_extensions
+
+            def build_cmake(self_inner, ext):
+                extdir = os.path.abspath(os.path.dirname(self_inner.get_ext_fullpath(ext.name)))
+                if not os.path.exists(self_inner.build_temp):
+                    os.makedirs(self_inner.build_temp)
+                ext_filename = os.path.basename(self_inner.get_ext_filename(ext.name))
+                ext_basename = os.path.splitext(ext_filename)[0]
+                if os.getenv("VERBOSE_BUILD", "0") == "1" or use_debug_mode():
+                    print(
+                        "CMAKE COMMAND",
+                        [
+                            "cmake",
+                            ext.cmake_lists_dir,
+                        ]
+                        + ext.cmake_args
+                        + [
+                            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
+                            "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
+                        ],
+                    )
+                subprocess.check_call(
+                    [
+                        "cmake",
+                        ext.cmake_lists_dir,
+                    ]
+                    + ext.cmake_args
+                    + [
+                        "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
+                        "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
+                    ],
+                    cwd=self_inner.build_temp,
+                )
+                subprocess.check_call(["cmake", "--build", "."], cwd=self_inner.build_temp)
 
-        subprocess.check_call(
-            [
-                "cmake",
-                ext.cmake_lists_dir,
-            ]
-            + ext.cmake_args
-            + [
-                "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
-                "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
-            ],
-            cwd=self.build_temp,
-        )
-        subprocess.check_call(["cmake", "--build", "."], cwd=self.build_temp)
+        # Morph this instance into the real BuildExtension subclass and run
+        self.__class__ = _TorchAOBuildExt
+        return _TorchAOBuildExt.run(self)
 
 
 class CMakeExtension(Extension):
@@ -371,6 +371,16 @@ def get_extensions():
     if debug_mode:
         print("Compiling in debug mode")
 
+    # Heavy imports moved here to minimize setup.py import overhead
+    import torch
+    from torch.utils.cpp_extension import (
+        CUDA_HOME,
+        IS_WINDOWS,
+        ROCM_HOME,
+        CppExtension,
+        CUDAExtension,
+    )
+
     if CUDA_HOME is None and torch.version.cuda:
         print("CUDA toolkit is not available. Skipping compilation of CUDA extensions")
         print(
@@ -452,11 +462,13 @@ def get_extensions():
         found_col16 = False
         found_vec_ext = False
         found_outer_vec = False
-        print("ROCM_HOME", ROCM_HOME)
+        if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+            print("ROCM_HOME", ROCM_HOME)
         hipblaslt_headers = list(
             glob.glob(os.path.join(ROCM_HOME, "include", "hipblaslt", "hipblaslt.h"))
         )
-        print("hipblaslt_headers", hipblaslt_headers)
+        if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+            print("hipblaslt_headers", hipblaslt_headers)
         for header in hipblaslt_headers:
             with open(header) as f:
                 text = f.read()
@@ -468,17 +480,22 @@ def get_extensions():
                     found_outer_vec = True
         if found_col16:
             extra_compile_args["cxx"].append("-DHIPBLASLT_HAS_ORDER_COL16")
-            print("hipblaslt found extended col order enums")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt found extended col order enums")
         else:
-            print("hipblaslt does not have extended col order enums")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt does not have extended col order enums")
         if found_outer_vec:
             extra_compile_args["cxx"].append("-DHIPBLASLT_OUTER_VEC")
-            print("hipblaslt found outer vec")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt found outer vec")
         elif found_vec_ext:
             extra_compile_args["cxx"].append("-DHIPBLASLT_VEC_EXT")
-            print("hipblaslt found vec ext")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt found vec ext")
         else:
-            print("hipblaslt does not have vec ext")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt does not have vec ext")
 
     # Get base directory and source paths
     curdir = os.path.dirname(os.path.curdir)
@@ -641,7 +658,8 @@ def get_extensions():
 
     ext_modules = []
     if len(sources) > 0:
-        print("SOURCES", sources)
+        if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+            print("SOURCES", sources)
         # Double-check to ensure mx_fp_cutlass_kernels.cu is not in sources
         sources = [
             s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
@@ -735,9 +753,13 @@ def get_extensions():
         def bool_to_on_off(value):
             return "ON" if value else "OFF"
 
-        from distutils.sysconfig import get_python_lib
+        import importlib.util
 
-        torch_dir = get_python_lib() + "/torch/share/cmake/Torch"
+        spec = importlib.util.find_spec("torch")
+        if spec is None or spec.origin is None:
+            raise RuntimeError("Unable to locate 'torch' package for CMake config")
+        torch_pkg_dir = os.path.dirname(spec.origin)
+        torch_dir = os.path.join(torch_pkg_dir, "share", "cmake", "Torch")
 
         ext_modules.append(
             CMakeExtension(
@@ -762,24 +784,23 @@ def bool_to_on_off(value):
     return ext_modules
 
 
-# Only check submodules if we're going to build C++ extensions
-if use_cpp != "0":
-    check_submodules()
+# Defer submodule checks to build time via build_ext
 
 setup(
     name="torchao",
     version=version + version_suffix,
-    packages=find_packages(exclude=["benchmarks", "benchmarks.*"]),
+    packages=find_packages(include=["torchao*"]),
     include_package_data=True,
     package_data={
         "torchao.kernel.configs": ["*.pkl"],
     },
-    ext_modules=get_extensions(),
+    # Defer extension discovery to build time for performance
+    ext_modules=[],
     extras_require={"dev": read_requirements("dev-requirements.txt")},
     description="Package for applying ao techniques to GPU models",
     long_description=open("README.md", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
     url="https://github.com/pytorch/ao",
-    cmdclass={"build_ext": TorchAOBuildExt},
+    cmdclass={"build_ext": LazyTorchAOBuildExt},
     options={"bdist_wheel": {"py_limited_api": "cp39"}},
 )

From 90843da4a0436d1b7cd6f8fdcd6e4e2e79f3629f Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Wed, 17 Sep 2025 17:56:15 -0700
Subject: [PATCH 02/10] Enhance code readability in setup.py

- Added blank lines for improved separation of code blocks.
- Reformatted list comprehensions for better clarity.
- Adjusted line breaks in function calls to enhance readability.

These changes aim to make the code more maintainable and easier to navigate.
---
 setup.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 88d4963f8f..8b9bc70423 100644
--- a/setup.py
+++ b/setup.py
@@ -98,6 +98,7 @@ def read_version(file_path="version.txt"):
 def use_debug_mode():
     return os.getenv("DEBUG", "0") == "1"
 
+
 # Heavy imports (torch, torch.utils.cpp_extension) are deferred to build time
 
 
@@ -131,6 +132,7 @@ def __init__(self):
         )
         if self.build_experimental_mps:
             import torch  # Lazy import
+
             assert is_macos, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires macOS"
             assert is_arm64, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires arm64"
             assert torch.mps.is_available(), (
@@ -259,6 +261,7 @@ def get_cutlass_build_flags():
     # Lazy import torch and helper; only needed when building CUDA extensions
     import torch
     from torch.utils.cpp_extension import _get_cuda_arch_flags
+
     # Try nvcc then torch version
     cuda_version = get_cuda_version_from_nvcc() or torch.version.cuda
 
@@ -300,10 +303,14 @@ def run(self_inner):
 
             def build_extensions(self_inner):
                 cmake_extensions = [
-                    ext for ext in self_inner.extensions if isinstance(ext, CMakeExtension)
+                    ext
+                    for ext in self_inner.extensions
+                    if isinstance(ext, CMakeExtension)
                 ]
                 other_extensions = [
-                    ext for ext in self_inner.extensions if not isinstance(ext, CMakeExtension)
+                    ext
+                    for ext in self_inner.extensions
+                    if not isinstance(ext, CMakeExtension)
                 ]
                 for ext in cmake_extensions:
                     self_inner.build_cmake(ext)
@@ -313,7 +320,9 @@ def build_extensions(self_inner):
                 self_inner.extensions = other_extensions + cmake_extensions
 
             def build_cmake(self_inner, ext):
-                extdir = os.path.abspath(os.path.dirname(self_inner.get_ext_fullpath(ext.name)))
+                extdir = os.path.abspath(
+                    os.path.dirname(self_inner.get_ext_fullpath(ext.name))
+                )
                 if not os.path.exists(self_inner.build_temp):
                     os.makedirs(self_inner.build_temp)
                 ext_filename = os.path.basename(self_inner.get_ext_filename(ext.name))
@@ -343,7 +352,9 @@ def build_cmake(self_inner, ext):
                     ],
                     cwd=self_inner.build_temp,
                 )
-                subprocess.check_call(["cmake", "--build", "."], cwd=self_inner.build_temp)
+                subprocess.check_call(
+                    ["cmake", "--build", "."], cwd=self_inner.build_temp
+                )
 
         # Morph this instance into the real BuildExtension subclass and run
         self.__class__ = _TorchAOBuildExt

From 94e495b49edf806951d2126e0e6e72c77f130b90 Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Wed, 17 Sep 2025 21:01:35 -0700
Subject: [PATCH 03/10] Add environment variable to disable PEP 517 build
 isolation in CI workflow

- Introduced the `PIP_NO_BUILD_ISOLATION` environment variable in the `build_wheels_linux.yml` workflow so that the `setuptools` installed by `pre_build_script.sh` is visible during the build.

This change aims to improve the build process by allowing the use of the correct version of `setuptools` without isolation issues.
---
 .github/workflows/build_wheels_linux.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build_wheels_linux.yml b/.github/workflows/build_wheels_linux.yml
index f164ed03c5..0b0a29de3f 100644
--- a/.github/workflows/build_wheels_linux.yml
+++ b/.github/workflows/build_wheels_linux.yml
@@ -39,6 +39,10 @@ jobs:
     permissions:
       id-token: write
       contents: read
+    env:
+      # Ensure pip does not use PEP 517 build isolation so that
+      # setuptools installed in pre_build_script.sh is visible to the build.
+      PIP_NO_BUILD_ISOLATION: "1"
     uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
     with:
       # Set the ref to an empty string instead of the default nightly because

From f954b7821a2f1d7ba42083cff6c5f2bb5cf19212 Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Wed, 17 Sep 2025 21:10:03 -0700
Subject: [PATCH 04/10] Update CI workflow and environment script for
 PIP_NO_BUILD_ISOLATION

- Removed the `PIP_NO_BUILD_ISOLATION` environment variable from the `build_wheels_linux.yml` workflow.
- Added the `PIP_NO_BUILD_ISOLATION` export to the `env_var_script_linux.sh` to ensure pre-installed tools are accessible during the build process.

These changes aim to streamline the build environment and maintain consistency in the usage of environment variables.
---
 .github/workflows/build_wheels_linux.yml | 4 ----
 packaging/env_var_script_linux.sh        | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build_wheels_linux.yml b/.github/workflows/build_wheels_linux.yml
index 0b0a29de3f..f164ed03c5 100644
--- a/.github/workflows/build_wheels_linux.yml
+++ b/.github/workflows/build_wheels_linux.yml
@@ -39,10 +39,6 @@ jobs:
     permissions:
       id-token: write
       contents: read
-    env:
-      # Ensure pip does not use PEP 517 build isolation so that
-      # setuptools installed in pre_build_script.sh is visible to the build.
-      PIP_NO_BUILD_ISOLATION: "1"
     uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
     with:
       # Set the ref to an empty string instead of the default nightly because
diff --git a/packaging/env_var_script_linux.sh b/packaging/env_var_script_linux.sh
index 3d3394fbd5..b9abe46d24 100644
--- a/packaging/env_var_script_linux.sh
+++ b/packaging/env_var_script_linux.sh
@@ -17,3 +17,7 @@ TORCH_CUDA_ARCH_LIST="8.0;8.6"
 if [[ ${CU_VERSION:-} == "cu124" ]]; then
   TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
 fi
+
+# Disable PEP 517 build isolation so tools pre-installed by pre_build_script.sh
+# are visible. pip reads PIP_NO_* booleans inverted: a falsy value disables it.
+export PIP_NO_BUILD_ISOLATION=0

From da04ff60ca0701e150e4750fab0560e92994cc31 Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Wed, 17 Sep 2025 21:24:40 -0700
Subject: [PATCH 05/10] Enhance post-build script to conditionally run
 auditwheel

- Added a check to ensure that auditwheel is only executed if the wheel contains at least one shared object (.so) file.
- Included a message to indicate when auditwheel is skipped due to the absence of shared libraries.
- Updated the wheel removal command to use `rm -f` for safer deletion.

These changes improve the robustness of the post-build process by preventing unnecessary execution of auditwheel.
---
 packaging/post_build_script.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh
index d47aacd339..47a760d088 100644
--- a/packaging/post_build_script.sh
+++ b/packaging/post_build_script.sh
@@ -14,7 +14,9 @@ if [[ "$CU_VERSION" == cu* ]]; then
 
     pushd dist
     manylinux_plat=manylinux_2_28_x86_64
-    auditwheel repair --plat "$manylinux_plat" -w . \
+    # Only run auditwheel if the wheel contains at least one shared object (.so)
+    if unzip -l "$WHEEL_NAME" | awk '{print $4}' | grep -E '\\.so($|\.)' >/dev/null 2>&1; then
+        auditwheel repair --plat "$manylinux_plat" -w . \
     --exclude libtorch.so \
     --exclude libtorch_python.so \
     --exclude libtorch_cuda.so \
@@ -24,10 +26,13 @@ if [[ "$CU_VERSION" == cu* ]]; then
     --exclude libcuda.so.* \
     --exclude libcudart.so.* \
     "${WHEEL_NAME}"
+    else
+        echo "No shared libraries detected in wheel ${WHEEL_NAME}; skipping auditwheel."
+    fi
 
     ls -lah .
     # Clean up the linux_x86_64 wheel
-    rm "${WHEEL_NAME}"
+    rm -f "${WHEEL_NAME}"
     popd
 fi
 

From b4494b9d805565948cf9bc961f9382d62cf5246e Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Wed, 17 Sep 2025 21:37:29 -0700
Subject: [PATCH 06/10] Refactor post-build script for improved wheel handling

- Updated the script to determine the original wheel file produced by the build process, ensuring that the correct wheel is used for auditwheel operations.
- Changed the wheel installation command to select the most recent wheel file from the distribution directory.
- Enhanced logging messages to reflect the changes in wheel handling.

These modifications enhance the reliability and clarity of the post-build process.
---
 packaging/post_build_script.sh | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh
index 47a760d088..ab342d4e8c 100644
--- a/packaging/post_build_script.sh
+++ b/packaging/post_build_script.sh
@@ -10,12 +10,12 @@ set -eux
 # Prepare manywheel, only for CUDA.
 # The wheel is a pure python wheel for other platforms.
 if [[ "$CU_VERSION" == cu* ]]; then
-    WHEEL_NAME=$(ls dist/)
-
     pushd dist
+    # Determine the original wheel produced by build (there should be exactly one)
+    ORIG_WHEEL=$(ls -1 *.whl | head -n 1)
     manylinux_plat=manylinux_2_28_x86_64
     # Only run auditwheel if the wheel contains at least one shared object (.so)
-    if unzip -l "$WHEEL_NAME" | awk '{print $4}' | grep -E '\\.so($|\.)' >/dev/null 2>&1; then
+    if unzip -l "$ORIG_WHEEL" | awk '{print $4}' | grep -E '\.so($|\.)' >/dev/null 2>&1; then
         auditwheel repair --plat "$manylinux_plat" -w . \
     --exclude libtorch.so \
     --exclude libtorch_python.so \
@@ -25,18 +25,16 @@ if [[ "$CU_VERSION" == cu* ]]; then
     --exclude libc10_cuda.so \
     --exclude libcuda.so.* \
     --exclude libcudart.so.* \
-    "${WHEEL_NAME}"
+        "${ORIG_WHEEL}"
     else
-        echo "No shared libraries detected in wheel ${WHEEL_NAME}; skipping auditwheel."
+        echo "No shared libraries detected in wheel ${ORIG_WHEEL}; skipping auditwheel."
     fi
 
     ls -lah .
-    # Clean up the linux_x86_64 wheel
-    rm -f "${WHEEL_NAME}"
     popd
 fi
 
-MANYWHEEL_NAME=$(ls dist/)
-# Try to install the new wheel
-pip install "dist/${MANYWHEEL_NAME}"
+INSTALL_WHEEL=$(ls -1t dist/*.whl | head -n 1)
+# Try to install the new wheel (pick most recent wheel file)
+pip install "${INSTALL_WHEEL}"
 python -c "import torchao"

From a18b08191cf29c72bda72070dc01689a5d94f646 Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Mon, 22 Sep 2025 21:31:37 -0700
Subject: [PATCH 07/10] Enhance CUDA extension handling in setup.py

- Updated the logic to skip CUDA extension compilation only if both CUDA_HOME is unset and nvcc is not found, improving compatibility with CI environments.
- Adjusted the condition for building CUDA extensions to check for the presence of either CUDA_HOME or nvcc.

These changes aim to provide clearer messaging and better support for CUDA extension compilation in various environments.
---
 setup.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 8b9bc70423..310623a0a3 100644
--- a/setup.py
+++ b/setup.py
@@ -392,8 +392,10 @@ def get_extensions():
         CUDAExtension,
     )
 
-    if CUDA_HOME is None and torch.version.cuda:
-        print("CUDA toolkit is not available. Skipping compilation of CUDA extensions")
+    # Only skip CUDA extensions if neither CUDA_HOME nor nvcc is available.
+    # In many CI environments CUDA_HOME may be unset even though nvcc is on PATH.
+    if torch.version.cuda and CUDA_HOME is None and get_cuda_version_from_nvcc() is None:
+        print("CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions")
         print(
             "If you'd like to compile CUDA extensions locally please install the cudatoolkit from https://anaconda.org/nvidia/cuda-toolkit"
         )
@@ -401,7 +403,8 @@ def get_extensions():
         print("ROCm is not available. Skipping compilation of ROCm extensions")
         print("If you'd like to compile ROCm extensions locally please install ROCm")
 
-    use_cuda = torch.version.cuda and CUDA_HOME is not None
+    # Build CUDA extensions if CUDA is available and either CUDA_HOME is set or nvcc is present
+    use_cuda = bool(torch.version.cuda) and (CUDA_HOME is not None or get_cuda_version_from_nvcc() is not None)
     use_rocm = torch.version.hip and ROCM_HOME is not None
     extension = CUDAExtension if (use_cuda or use_rocm) else CppExtension
 

From d0e86c220344f8e8cc1d4eb8da4e317809483b3b Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Mon, 22 Sep 2025 21:33:53 -0700
Subject: [PATCH 08/10] lint

---
 setup.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 310623a0a3..345fa9a31f 100644
--- a/setup.py
+++ b/setup.py
@@ -394,8 +394,14 @@ def get_extensions():
 
     # Only skip CUDA extensions if neither CUDA_HOME nor nvcc is available.
     # In many CI environments CUDA_HOME may be unset even though nvcc is on PATH.
-    if torch.version.cuda and CUDA_HOME is None and get_cuda_version_from_nvcc() is None:
-        print("CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions")
+    if (
+        torch.version.cuda
+        and CUDA_HOME is None
+        and get_cuda_version_from_nvcc() is None
+    ):
+        print(
+            "CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions"
+        )
         print(
             "If you'd like to compile CUDA extensions locally please install the cudatoolkit from https://anaconda.org/nvidia/cuda-toolkit"
         )
@@ -404,7 +410,9 @@ def get_extensions():
         print("If you'd like to compile ROCm extensions locally please install ROCm")
 
     # Build CUDA extensions if CUDA is available and either CUDA_HOME is set or nvcc is present
-    use_cuda = bool(torch.version.cuda) and (CUDA_HOME is not None or get_cuda_version_from_nvcc() is not None)
+    use_cuda = bool(torch.version.cuda) and (
+        CUDA_HOME is not None or get_cuda_version_from_nvcc() is not None
+    )
     use_rocm = torch.version.hip and ROCM_HOME is not None
     extension = CUDAExtension if (use_cuda or use_rocm) else CppExtension
 

From 1d1fa5e94ce3ce0b4350254a998edd5014355cb9 Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Sun, 28 Sep 2025 06:50:05 -0700
Subject: [PATCH 09/10] Fix CUDA extension detection logic

The recent change to CUDA extension detection was causing CUDA extensions
to not be built when CUDA_HOME is unset but nvcc is available. This was
breaking the quant_llm_linear operator registration.

The issue was that get_cuda_version_from_nvcc() could return None even
when nvcc is available, causing use_cuda to be False.

This fix adds a separate is_nvcc_available() function that simply checks
if nvcc can be executed, regardless of version parsing success.
---
 setup.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 345fa9a31f..3243a5b5ef 100644
--- a/setup.py
+++ b/setup.py
@@ -254,6 +254,15 @@ def get_cuda_version_from_nvcc():
         return None
 
 
+def is_nvcc_available():
+    """Return True if ``nvcc --version`` runs successfully, False otherwise."""
+    try:
+        subprocess.check_output(["nvcc", "--version"], stderr=subprocess.STDOUT)
+        return True
+    except (OSError, subprocess.CalledProcessError):  # missing or failing nvcc
+        return False
+
+
 def get_cutlass_build_flags():
     """Determine which CUTLASS kernels to build based on CUDA version.
     SM90a: CUDA 12.6+, SM100a: CUDA 12.8+
@@ -397,7 +406,7 @@ def get_extensions():
     if (
         torch.version.cuda
         and CUDA_HOME is None
-        and get_cuda_version_from_nvcc() is None
+        and not is_nvcc_available()
     ):
         print(
             "CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions"
@@ -411,7 +420,7 @@ def get_extensions():
 
     # Build CUDA extensions if CUDA is available and either CUDA_HOME is set or nvcc is present
     use_cuda = bool(torch.version.cuda) and (
-        CUDA_HOME is not None or get_cuda_version_from_nvcc() is not None
+        CUDA_HOME is not None or is_nvcc_available()
     )
     use_rocm = torch.version.hip and ROCM_HOME is not None
     extension = CUDAExtension if (use_cuda or use_rocm) else CppExtension

From 59af0fa0f75968eef5914b949d2fa259869aa124 Mon Sep 17 00:00:00 2001
From: Peter Yeh <peter.yeh@gmail.com>
Date: Sun, 28 Sep 2025 06:52:42 -0700
Subject: [PATCH 10/10] Format setup.py with ruff

Reformat the CUDA extension detection condition to follow ruff formatting standards.
---
 setup.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 3243a5b5ef..73217dca65 100644
--- a/setup.py
+++ b/setup.py
@@ -403,11 +403,7 @@ def get_extensions():
 
     # Only skip CUDA extensions if neither CUDA_HOME nor nvcc is available.
     # In many CI environments CUDA_HOME may be unset even though nvcc is on PATH.
-    if (
-        torch.version.cuda
-        and CUDA_HOME is None
-        and not is_nvcc_available()
-    ):
+    if torch.version.cuda and CUDA_HOME is None and not is_nvcc_available():
         print(
             "CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions"
         )