From b3a5bb6a8d79be963341367f1ebd66440d884ad8 Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Wed, 17 Sep 2025 17:51:35 -0700 Subject: [PATCH 01/10] Refactor setup.py for lazy loading and build optimization - Introduced lazy imports for heavy dependencies like `torch` and `torch.utils.cpp_extension` to reduce initial import overhead. - Replaced the existing `TorchAOBuildExt` class with `LazyTorchAOBuildExt` to defer submodule checks and extension discovery until build time. - Updated the `setup()` function to set `ext_modules` to an empty list, deferring extension discovery for performance improvements. - Enhanced debug output for build processes based on environment variables. This refactor aims to streamline the build process and improve performance during package setup. --- setup.py | 189 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 105 insertions(+), 84 deletions(-) diff --git a/setup.py b/setup.py index fd4ee9f40f..88d4963f8f 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ from typing import List, Optional from setuptools import Extension, find_packages, setup +from setuptools.command.build_ext import build_ext as _setuptools_build_ext current_date = datetime.now().strftime("%Y%m%d") @@ -97,17 +98,7 @@ def read_version(file_path="version.txt"): def use_debug_mode(): return os.getenv("DEBUG", "0") == "1" - -import torch -from torch.utils.cpp_extension import ( - CUDA_HOME, - IS_WINDOWS, - ROCM_HOME, - BuildExtension, - CppExtension, - CUDAExtension, - _get_cuda_arch_flags, -) +# Heavy imports (torch, torch.utils.cpp_extension) are deferred to build time class BuildOptions: @@ -139,6 +130,7 @@ def __init__(self): "TORCHAO_BUILD_EXPERIMENTAL_MPS", default=False ) if self.build_experimental_mps: + import torch # Lazy import assert is_macos, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires macOS" assert is_arm64, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires arm64" assert torch.mps.is_available(), ( @@ -264,6 +256,9 @@ def get_cutlass_build_flags(): """Determine which CUTLASS kernels to build based on CUDA version. SM90a: CUDA 12.6+, SM100a: CUDA 12.8+ """ + # Lazy import torch and helper; only needed when building CUDA extensions + import torch + from torch.utils.cpp_extension import _get_cuda_arch_flags # Try nvcc then torch version cuda_version = get_cuda_version_from_nvcc() or torch.version.cuda @@ -290,64 +285,69 @@ def get_cutlass_build_flags(): ) -# BuildExtension is a subclass of from setuptools.command.build_ext.build_ext -class TorchAOBuildExt(BuildExtension): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - def build_extensions(self): - cmake_extensions = [ - ext for ext in self.extensions if isinstance(ext, CMakeExtension) - ] - other_extensions = [ - ext for ext in self.extensions if not isinstance(ext, CMakeExtension) - ] - for ext in cmake_extensions: - self.build_cmake(ext) - - # Use BuildExtension to build other extensions - self.extensions = other_extensions - super().build_extensions() - - self.extensions = other_extensions + cmake_extensions - - def build_cmake(self, ext): - extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) - - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) - - # Get the expected extension file name that Python will look for - # We force CMake to use this library name - ext_filename = os.path.basename(self.get_ext_filename(ext.name)) - ext_basename = os.path.splitext(ext_filename)[0] - - print( - "CMAKE COMMANG", - [ - "cmake", - ext.cmake_lists_dir, - ] - + ext.cmake_args - + [ - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, - "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename, - ], - ) +class LazyTorchAOBuildExt(_setuptools_build_ext): + def run(self): + # Import heavy torch build only when actually running build_ext + from torch.utils.cpp_extension import BuildExtension as _BuildExtension + + class _TorchAOBuildExt(_BuildExtension): + def run(self_inner): + if os.getenv("USE_CPP", "1") != "0": + check_submodules() + if not self_inner.distribution.ext_modules: + self_inner.distribution.ext_modules = get_extensions() + super(_TorchAOBuildExt, self_inner).run() + + def build_extensions(self_inner): + cmake_extensions = [ + ext for ext in self_inner.extensions if isinstance(ext, CMakeExtension) + ] + other_extensions = [ + ext for ext in self_inner.extensions if not isinstance(ext, CMakeExtension) + ] + for ext in cmake_extensions: + self_inner.build_cmake(ext) + + self_inner.extensions = other_extensions + super(_TorchAOBuildExt, self_inner).build_extensions() + self_inner.extensions = other_extensions + cmake_extensions + + def build_cmake(self_inner, ext): + extdir = os.path.abspath(os.path.dirname(self_inner.get_ext_fullpath(ext.name))) + if not os.path.exists(self_inner.build_temp): + os.makedirs(self_inner.build_temp) + ext_filename = os.path.basename(self_inner.get_ext_filename(ext.name)) + ext_basename = os.path.splitext(ext_filename)[0] + if os.getenv("VERBOSE_BUILD", "0") == "1" or use_debug_mode(): + print( + "CMAKE COMMAND", + [ + "cmake", + ext.cmake_lists_dir, + ] + + ext.cmake_args + + [ + "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, + "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename, + ], + ) + subprocess.check_call( + [ + "cmake", + ext.cmake_lists_dir, + ] + + ext.cmake_args + + [ + "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, + "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename, + ], + cwd=self_inner.build_temp, + ) + subprocess.check_call(["cmake", "--build", "."], cwd=self_inner.build_temp) - subprocess.check_call( - [ - "cmake", - ext.cmake_lists_dir, - ] - + ext.cmake_args - + [ - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, - "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename, - ], - cwd=self.build_temp, - ) - subprocess.check_call(["cmake", "--build", "."], cwd=self.build_temp) + # Morph this instance into the real BuildExtension subclass and run + self.__class__ = _TorchAOBuildExt + return _TorchAOBuildExt.run(self) class CMakeExtension(Extension): @@ -371,6 +371,16 @@ def get_extensions(): if debug_mode: print("Compiling in debug mode") + # Heavy imports moved here to minimize setup.py import overhead + import torch + from torch.utils.cpp_extension import ( + CUDA_HOME, + IS_WINDOWS, + ROCM_HOME, + CppExtension, + CUDAExtension, + ) + if CUDA_HOME is None and torch.version.cuda: print("CUDA toolkit is not available. Skipping compilation of CUDA extensions") print( @@ -452,11 +462,13 @@ def get_extensions(): found_col16 = False found_vec_ext = False found_outer_vec = False - print("ROCM_HOME", ROCM_HOME) + if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode: + print("ROCM_HOME", ROCM_HOME) hipblaslt_headers = list( glob.glob(os.path.join(ROCM_HOME, "include", "hipblaslt", "hipblaslt.h")) ) - print("hipblaslt_headers", hipblaslt_headers) + if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode: + print("hipblaslt_headers", hipblaslt_headers) for header in hipblaslt_headers: with open(header) as f: text = f.read() @@ -468,17 +480,22 @@ def get_extensions(): found_outer_vec = True if found_col16: extra_compile_args["cxx"].append("-DHIPBLASLT_HAS_ORDER_COL16") - print("hipblaslt found extended col order enums") + if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode: + print("hipblaslt found extended col order enums") else: - print("hipblaslt does not have extended col order enums") + if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode: + print("hipblaslt does not have extended col order enums") if found_outer_vec: extra_compile_args["cxx"].append("-DHIPBLASLT_OUTER_VEC") - print("hipblaslt found outer vec") + if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode: + print("hipblaslt found outer vec") elif found_vec_ext: extra_compile_args["cxx"].append("-DHIPBLASLT_VEC_EXT") - print("hipblaslt found vec ext") + if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode: + print("hipblaslt found vec ext") else: - print("hipblaslt does not have vec ext") + if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode: + print("hipblaslt does not have vec ext") # Get base directory and source paths curdir = os.path.dirname(os.path.curdir) @@ -641,7 +658,8 @@ def get_extensions(): ext_modules = [] if len(sources) > 0: - print("SOURCES", sources) + if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode: + print("SOURCES", sources) # Double-check to ensure mx_fp_cutlass_kernels.cu is not in sources sources = [ s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu" @@ -735,9 +753,13 @@ def get_extensions(): def bool_to_on_off(value): return "ON" if value else "OFF" - from distutils.sysconfig import get_python_lib + import importlib.util - torch_dir = get_python_lib() + "/torch/share/cmake/Torch" + spec = importlib.util.find_spec("torch") + if spec is None or spec.origin is None: + raise RuntimeError("Unable to locate 'torch' package for CMake config") + torch_pkg_dir = os.path.dirname(spec.origin) + torch_dir = os.path.join(torch_pkg_dir, "share", "cmake", "Torch") ext_modules.append( CMakeExtension( @@ -762,24 +784,23 @@ def bool_to_on_off(value): return ext_modules -# Only check submodules if we're going to build C++ extensions -if use_cpp != "0": - check_submodules() +# Defer submodule checks to build time via build_ext setup( name="torchao", version=version + version_suffix, - packages=find_packages(exclude=["benchmarks", "benchmarks.*"]), + packages=find_packages(include=["torchao*"]), include_package_data=True, package_data={ "torchao.kernel.configs": ["*.pkl"], }, - ext_modules=get_extensions(), + # Defer extension discovery to build time for performance + ext_modules=[], extras_require={"dev": read_requirements("dev-requirements.txt")}, description="Package for applying ao techniques to GPU models", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", url="https://github.com/pytorch/ao", - cmdclass={"build_ext": TorchAOBuildExt}, + cmdclass={"build_ext": LazyTorchAOBuildExt}, options={"bdist_wheel": {"py_limited_api": "cp39"}}, ) From 90843da4a0436d1b7cd6f8fdcd6e4e2e79f3629f Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Wed, 17 Sep 2025 17:56:15 -0700 Subject: [PATCH 02/10] Enhance code readability in setup.py - Added blank lines for improved separation of code blocks. - Reformatted list comprehensions for better clarity. - Adjusted line breaks in function calls to enhance readability. These changes aim to make the code more maintainable and easier to navigate. --- setup.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 88d4963f8f..8b9bc70423 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,7 @@ def read_version(file_path="version.txt"): def use_debug_mode(): return os.getenv("DEBUG", "0") == "1" + # Heavy imports (torch, torch.utils.cpp_extension) are deferred to build time @@ -131,6 +132,7 @@ def __init__(self): ) if self.build_experimental_mps: import torch # Lazy import + assert is_macos, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires macOS" assert is_arm64, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires arm64" assert torch.mps.is_available(), ( @@ -259,6 +261,7 @@ def get_cutlass_build_flags(): # Lazy import torch and helper; only needed when building CUDA extensions import torch from torch.utils.cpp_extension import _get_cuda_arch_flags + # Try nvcc then torch version cuda_version = get_cuda_version_from_nvcc() or torch.version.cuda @@ -300,10 +303,14 @@ def run(self_inner): def build_extensions(self_inner): cmake_extensions = [ - ext for ext in self_inner.extensions if isinstance(ext, CMakeExtension) + ext + for ext in self_inner.extensions + if isinstance(ext, CMakeExtension) ] other_extensions = [ - ext for ext in self_inner.extensions if not isinstance(ext, CMakeExtension) + ext + for ext in self_inner.extensions + if not isinstance(ext, CMakeExtension) ] for ext in cmake_extensions: self_inner.build_cmake(ext) @@ -313,7 +320,9 @@ def build_extensions(self_inner): self_inner.extensions = other_extensions + cmake_extensions def build_cmake(self_inner, ext): - extdir = os.path.abspath(os.path.dirname(self_inner.get_ext_fullpath(ext.name))) + extdir = os.path.abspath( + os.path.dirname(self_inner.get_ext_fullpath(ext.name)) + ) if not os.path.exists(self_inner.build_temp): os.makedirs(self_inner.build_temp) ext_filename = os.path.basename(self_inner.get_ext_filename(ext.name)) @@ -343,7 +352,9 @@ def build_cmake(self_inner, ext): ], cwd=self_inner.build_temp, ) - subprocess.check_call(["cmake", "--build", "."], cwd=self_inner.build_temp) + subprocess.check_call( + ["cmake", "--build", "."], cwd=self_inner.build_temp + ) # Morph this instance into the real BuildExtension subclass and run self.__class__ = _TorchAOBuildExt From 94e495b49edf806951d2126e0e6e72c77f130b90 Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Wed, 17 Sep 2025 21:01:35 -0700 Subject: [PATCH 03/10] Add environment variable to disable PEP 517 build isolation in CI workflow - Introduced the `PIP_NO_BUILD_ISOLATION` environment variable in the `build_wheels_linux.yml` workflow to ensure that the `setuptools` installed in the `pre_build_script.sh` is accessible during the build process. This change aims to improve the build process by allowing the use of the correct version of `setuptools` without isolation issues. --- .github/workflows/build_wheels_linux.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build_wheels_linux.yml b/.github/workflows/build_wheels_linux.yml index f164ed03c5..0b0a29de3f 100644 --- a/.github/workflows/build_wheels_linux.yml +++ b/.github/workflows/build_wheels_linux.yml @@ -39,6 +39,10 @@ jobs: permissions: id-token: write contents: read + env: + # Ensure pip does not use PEP 517 build isolation so that + # setuptools installed in pre_build_script.sh is visible to the build. + PIP_NO_BUILD_ISOLATION: "1" uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main with: # Set the ref to an empty string instead of the default nightly because From f954b7821a2f1d7ba42083cff6c5f2bb5cf19212 Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Wed, 17 Sep 2025 21:10:03 -0700 Subject: [PATCH 04/10] Update CI workflow and environment script for PIP_NO_BUILD_ISOLATION - Removed the `PIP_NO_BUILD_ISOLATION` environment variable from the `build_wheels_linux.yml` workflow. - Added the `PIP_NO_BUILD_ISOLATION` export to the `env_var_script_linux.sh` to ensure pre-installed tools are accessible during the build process. These changes aim to streamline the build environment and maintain consistency in the usage of environment variables. --- .github/workflows/build_wheels_linux.yml | 4 ---- packaging/env_var_script_linux.sh | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_wheels_linux.yml b/.github/workflows/build_wheels_linux.yml index 0b0a29de3f..f164ed03c5 100644 --- a/.github/workflows/build_wheels_linux.yml +++ b/.github/workflows/build_wheels_linux.yml @@ -39,10 +39,6 @@ jobs: permissions: id-token: write contents: read - env: - # Ensure pip does not use PEP 517 build isolation so that - # setuptools installed in pre_build_script.sh is visible to the build. - PIP_NO_BUILD_ISOLATION: "1" uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main with: # Set the ref to an empty string instead of the default nightly because diff --git a/packaging/env_var_script_linux.sh b/packaging/env_var_script_linux.sh index 3d3394fbd5..b9abe46d24 100644 --- a/packaging/env_var_script_linux.sh +++ b/packaging/env_var_script_linux.sh @@ -17,3 +17,7 @@ TORCH_CUDA_ARCH_LIST="8.0;8.6" if [[ ${CU_VERSION:-} == "cu124" ]]; then TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0" fi + +# Ensure pip does not use PEP 517 build isolation so that pre-installed +# tools from pre_build_script.sh (setuptools, wheel) are visible to the build. +export PIP_NO_BUILD_ISOLATION=1 From da04ff60ca0701e150e4750fab0560e92994cc31 Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Wed, 17 Sep 2025 21:24:40 -0700 Subject: [PATCH 05/10] Enhance post-build script to conditionally run auditwheel - Added a check to ensure that auditwheel is only executed if the wheel contains at least one shared object (.so) file. - Included a message to indicate when auditwheel is skipped due to the absence of shared libraries. - Updated the wheel removal command to use `rm -f` for safer deletion. These changes improve the robustness of the post-build process by preventing unnecessary execution of auditwheel. --- packaging/post_build_script.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh index d47aacd339..47a760d088 100644 --- a/packaging/post_build_script.sh +++ b/packaging/post_build_script.sh @@ -14,7 +14,9 @@ if [[ "$CU_VERSION" == cu* ]]; then pushd dist manylinux_plat=manylinux_2_28_x86_64 - auditwheel repair --plat "$manylinux_plat" -w . \ + # Only run auditwheel if the wheel contains at least one shared object (.so) + if unzip -l "$WHEEL_NAME" | awk '{print $4}' | grep -E '\\.so($|\.)' >/dev/null 2>&1; then + auditwheel repair --plat "$manylinux_plat" -w . \ --exclude libtorch.so \ --exclude libtorch_python.so \ --exclude libtorch_cuda.so \ @@ -24,10 +26,13 @@ if [[ "$CU_VERSION" == cu* ]]; then --exclude libcuda.so.* \ --exclude libcudart.so.* \ "${WHEEL_NAME}" + else + echo "No shared libraries detected in wheel ${WHEEL_NAME}; skipping auditwheel." + fi ls -lah . # Clean up the linux_x86_64 wheel - rm "${WHEEL_NAME}" + rm -f "${WHEEL_NAME}" popd fi From b4494b9d805565948cf9bc961f9382d62cf5246e Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Wed, 17 Sep 2025 21:37:29 -0700 Subject: [PATCH 06/10] Refactor post-build script for improved wheel handling - Updated the script to determine the original wheel file produced by the build process, ensuring that the correct wheel is used for auditwheel operations. - Changed the wheel installation command to select the most recent wheel file from the distribution directory. - Enhanced logging messages to reflect the changes in wheel handling. These modifications enhance the reliability and clarity of the post-build process. --- packaging/post_build_script.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh index 47a760d088..ab342d4e8c 100644 --- a/packaging/post_build_script.sh +++ b/packaging/post_build_script.sh @@ -10,12 +10,12 @@ set -eux # Prepare manywheel, only for CUDA. # The wheel is a pure python wheel for other platforms. if [[ "$CU_VERSION" == cu* ]]; then - WHEEL_NAME=$(ls dist/) - pushd dist + # Determine the original wheel produced by build (there should be exactly one) + ORIG_WHEEL=$(ls -1 *.whl | head -n 1) manylinux_plat=manylinux_2_28_x86_64 # Only run auditwheel if the wheel contains at least one shared object (.so) - if unzip -l "$WHEEL_NAME" | awk '{print $4}' | grep -E '\\.so($|\.)' >/dev/null 2>&1; then + if unzip -l "$ORIG_WHEEL" | awk '{print $4}' | grep -E '\\.so($|\.)' >/dev/null 2>&1; then auditwheel repair --plat "$manylinux_plat" -w . \ --exclude libtorch.so \ --exclude libtorch_python.so \ @@ -25,18 +25,16 @@ if [[ "$CU_VERSION" == cu* ]]; then --exclude libc10_cuda.so \ --exclude libcuda.so.* \ --exclude libcudart.so.* \ - "${WHEEL_NAME}" + "${ORIG_WHEEL}" else - echo "No shared libraries detected in wheel ${WHEEL_NAME}; skipping auditwheel." + echo "No shared libraries detected in wheel ${ORIG_WHEEL}; skipping auditwheel." fi ls -lah . - # Clean up the linux_x86_64 wheel - rm -f "${WHEEL_NAME}" popd fi -MANYWHEEL_NAME=$(ls dist/) -# Try to install the new wheel -pip install "dist/${MANYWHEEL_NAME}" +INSTALL_WHEEL=$(ls -1t dist/*.whl | head -n 1) +# Try to install the new wheel (pick most recent wheel file) +pip install "${INSTALL_WHEEL}" python -c "import torchao" From a18b08191cf29c72bda72070dc01689a5d94f646 Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Mon, 22 Sep 2025 21:31:37 -0700 Subject: [PATCH 07/10] Enhance CUDA extension handling in setup.py - Updated the logic to skip CUDA extension compilation only if both CUDA_HOME is unset and nvcc is not found, improving compatibility with CI environments. - Adjusted the condition for building CUDA extensions to check for the presence of either CUDA_HOME or nvcc. These changes aim to provide clearer messaging and better support for CUDA extension compilation in various environments. --- setup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 8b9bc70423..310623a0a3 100644 --- a/setup.py +++ b/setup.py @@ -392,8 +392,10 @@ def get_extensions(): CUDAExtension, ) - if CUDA_HOME is None and torch.version.cuda: - print("CUDA toolkit is not available. Skipping compilation of CUDA extensions") + # Only skip CUDA extensions if neither CUDA_HOME nor nvcc is available. + # In many CI environments CUDA_HOME may be unset even though nvcc is on PATH. + if torch.version.cuda and CUDA_HOME is None and get_cuda_version_from_nvcc() is None: + print("CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions") print( "If you'd like to compile CUDA extensions locally please install the cudatoolkit from https://anaconda.org/nvidia/cuda-toolkit" ) @@ -401,7 +403,8 @@ def get_extensions(): print("ROCm is not available. Skipping compilation of ROCm extensions") print("If you'd like to compile ROCm extensions locally please install ROCm") - use_cuda = torch.version.cuda and CUDA_HOME is not None + # Build CUDA extensions if CUDA is available and either CUDA_HOME is set or nvcc is present + use_cuda = bool(torch.version.cuda) and (CUDA_HOME is not None or get_cuda_version_from_nvcc() is not None) use_rocm = torch.version.hip and ROCM_HOME is not None extension = CUDAExtension if (use_cuda or use_rocm) else CppExtension From d0e86c220344f8e8cc1d4eb8da4e317809483b3b Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Mon, 22 Sep 2025 21:33:53 -0700 Subject: [PATCH 08/10] lint --- setup.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 310623a0a3..345fa9a31f 100644 --- a/setup.py +++ b/setup.py @@ -394,8 +394,14 @@ def get_extensions(): # Only skip CUDA extensions if neither CUDA_HOME nor nvcc is available. # In many CI environments CUDA_HOME may be unset even though nvcc is on PATH. - if torch.version.cuda and CUDA_HOME is None and get_cuda_version_from_nvcc() is None: - print("CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions") + if ( + torch.version.cuda + and CUDA_HOME is None + and get_cuda_version_from_nvcc() is None + ): + print( + "CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions" + ) print( "If you'd like to compile CUDA extensions locally please install the cudatoolkit from https://anaconda.org/nvidia/cuda-toolkit" ) @@ -404,7 +410,9 @@ def get_extensions(): print("If you'd like to compile ROCm extensions locally please install ROCm") # Build CUDA extensions if CUDA is available and either CUDA_HOME is set or nvcc is present - use_cuda = bool(torch.version.cuda) and (CUDA_HOME is not None or get_cuda_version_from_nvcc() is not None) + use_cuda = bool(torch.version.cuda) and ( + CUDA_HOME is not None or get_cuda_version_from_nvcc() is not None + ) use_rocm = torch.version.hip and ROCM_HOME is not None extension = CUDAExtension if (use_cuda or use_rocm) else CppExtension From 1d1fa5e94ce3ce0b4350254a998edd5014355cb9 Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Sun, 28 Sep 2025 06:50:05 -0700 Subject: [PATCH 09/10] Fix CUDA extension detection logic The recent change to CUDA extension detection was causing CUDA extensions to not be built when CUDA_HOME is unset but nvcc is available. This was breaking the quant_llm_linear operator registration. The issue was that get_cuda_version_from_nvcc() could return None even when nvcc is available, causing use_cuda to be False. This fix adds a separate is_nvcc_available() function that simply checks if nvcc can be executed, regardless of version parsing success. --- setup.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 345fa9a31f..3243a5b5ef 100644 --- a/setup.py +++ b/setup.py @@ -254,6 +254,15 @@ def get_cuda_version_from_nvcc(): return None +def is_nvcc_available(): + """Check if nvcc is available on the system.""" + try: + subprocess.check_output(["nvcc", "--version"], stderr=subprocess.STDOUT) + return True + except: + return False + + def get_cutlass_build_flags(): """Determine which CUTLASS kernels to build based on CUDA version. SM90a: CUDA 12.6+, SM100a: CUDA 12.8+ @@ -397,7 +406,7 @@ def get_extensions(): if ( torch.version.cuda and CUDA_HOME is None - and get_cuda_version_from_nvcc() is None + and not is_nvcc_available() ): print( "CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions" @@ -411,7 +420,7 @@ def get_extensions(): # Build CUDA extensions if CUDA is available and either CUDA_HOME is set or nvcc is present use_cuda = bool(torch.version.cuda) and ( - CUDA_HOME is not None or get_cuda_version_from_nvcc() is not None + CUDA_HOME is not None or is_nvcc_available() ) use_rocm = torch.version.hip and ROCM_HOME is not None extension = CUDAExtension if (use_cuda or use_rocm) else CppExtension From 59af0fa0f75968eef5914b949d2fa259869aa124 Mon Sep 17 00:00:00 2001 From: Peter Yeh Date: Sun, 28 Sep 2025 06:52:42 -0700 Subject: [PATCH 10/10] Format setup.py with ruff Reformat the CUDA extension detection condition to follow ruff formatting standards. --- setup.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 3243a5b5ef..73217dca65 100644 --- a/setup.py +++ b/setup.py @@ -403,11 +403,7 @@ def get_extensions(): # Only skip CUDA extensions if neither CUDA_HOME nor nvcc is available. # In many CI environments CUDA_HOME may be unset even though nvcc is on PATH. - if ( - torch.version.cuda - and CUDA_HOME is None - and not is_nvcc_available() - ): + if torch.version.cuda and CUDA_HOME is None and not is_nvcc_available(): print( "CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions" )