-
Notifications
You must be signed in to change notification settings - Fork 690
Enforce minimum NCCL version for cuBLASMp #2857
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -98,6 +98,39 @@ set(CUTLASS_TOOLS_INCLUDE_DIR | |
| # Python | ||
| find_package(Python COMPONENTS Interpreter Development.Module REQUIRED) | ||
|
|
||
| function(find_nccl_version OUT_VERSION OUT_INCLUDE_DIR) | ||
| find_path(_nvte_nccl_include_dir | ||
| NAMES nccl.h | ||
| PATH_SUFFIXES include | ||
| REQUIRED) | ||
|
Comment on lines
+102
to
+105
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
find_path(_nvte_nccl_include_dir
NAMES nccl.h
HINTS ${CUBLASMP_DIR}
PATH_SUFFIXES include
NO_CACHE
REQUIRED) |
||
|
|
||
| file(STRINGS "${_nvte_nccl_include_dir}/nccl.h" _nvte_nccl_major_line | ||
| REGEX "^#define NCCL_MAJOR[ \t]+[0-9]+$") | ||
| file(STRINGS "${_nvte_nccl_include_dir}/nccl.h" _nvte_nccl_minor_line | ||
| REGEX "^#define NCCL_MINOR[ \t]+[0-9]+$") | ||
| file(STRINGS "${_nvte_nccl_include_dir}/nccl.h" _nvte_nccl_patch_line | ||
| REGEX "^#define NCCL_PATCH[ \t]+[0-9]+$") | ||
|
|
||
| string(REGEX REPLACE "^#define NCCL_MAJOR[ \t]+([0-9]+)$" "\\1" | ||
| _nvte_nccl_major "${_nvte_nccl_major_line}") | ||
| string(REGEX REPLACE "^#define NCCL_MINOR[ \t]+([0-9]+)$" "\\1" | ||
| _nvte_nccl_minor "${_nvte_nccl_minor_line}") | ||
| string(REGEX REPLACE "^#define NCCL_PATCH[ \t]+([0-9]+)$" "\\1" | ||
| _nvte_nccl_patch "${_nvte_nccl_patch_line}") | ||
|
|
||
| if ("${_nvte_nccl_major}" STREQUAL "" | ||
| OR "${_nvte_nccl_minor}" STREQUAL "" | ||
| OR "${_nvte_nccl_patch}" STREQUAL "") | ||
| message(FATAL_ERROR | ||
| "Failed to parse NCCL version from ${_nvte_nccl_include_dir}/nccl.h") | ||
| endif() | ||
|
|
||
| set(${OUT_VERSION} | ||
| "${_nvte_nccl_major}.${_nvte_nccl_minor}.${_nvte_nccl_patch}" | ||
| PARENT_SCOPE) | ||
| set(${OUT_INCLUDE_DIR} "${_nvte_nccl_include_dir}" PARENT_SCOPE) | ||
| endfunction() | ||
|
|
||
| # Configure Transformer Engine library | ||
| include_directories(${PROJECT_SOURCE_DIR}/..) | ||
| set(transformer_engine_SOURCES) | ||
|
|
@@ -290,6 +323,7 @@ option(NVTE_WITH_CUBLASMP "Use cuBLASMp for tensor parallel GEMMs" OFF) | |
| if (NVTE_WITH_CUBLASMP) | ||
| target_compile_definitions(transformer_engine PRIVATE NVTE_WITH_CUBLASMP) | ||
| target_include_directories(transformer_engine PRIVATE ${CUBLASMP_DIR}/include) | ||
| find_nccl_version(NCCL_VERSION NCCL_INCLUDE_DIR) | ||
| find_library(CUBLASMP_LIB | ||
| NAMES cublasmp libcublasmp | ||
| PATHS ${CUBLASMP_DIR} | ||
|
|
@@ -299,8 +333,14 @@ if (NVTE_WITH_CUBLASMP) | |
| NAMES nccl libnccl | ||
| PATH_SUFFIXES lib | ||
| REQUIRED) | ||
| target_link_libraries(transformer_engine PUBLIC ${NCCL_LIB} ${CUBLASMP_LIB}) | ||
| message(STATUS "Using cuBLASMp at: ${CUBLASMP_DIR}") | ||
| if (NCCL_VERSION VERSION_LESS 2.29.0) | ||
| message(FATAL_ERROR | ||
| "NVTE_WITH_CUBLASMP requires NCCL >= 2.29.0, but found NCCL ${NCCL_VERSION} " | ||
| "in ${NCCL_INCLUDE_DIR}/nccl.h") | ||
| endif() | ||
| target_link_libraries(transformer_engine PUBLIC ${NCCL_LIB} ${CUBLASMP_LIB}) | ||
| message(STATUS "Using cuBLASMp at: ${CUBLASMP_DIR}") | ||
| message(STATUS "Using NCCL ${NCCL_VERSION} at: ${NCCL_LIB}") | ||
| endif() | ||
|
|
||
| # Number of philox4x32 rounds for stochastic rounding (build-time constant). | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
find_pathmay locate the wrongnccl.hfind_pathhere has noPATHSorHINTS, so it searches CMake's default system paths rather than the NCCL that will actually be linked. On systems where both a system NCCL and aCUBLASMP_DIR-bundled NCCL exist, the header discovered could belong to a different version than theNCCL_LIBfound by the laterfind_librarycall, making the version check unreliable.Consider extracting a hint from
CUBLASMP_DIR, e.g.:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this suggestion from Greptile sensible? My understanding was that cuBLASMp does not come pre-packaged with it's own NCCL so I don't think this makes sense.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're correct—my suggestion doesn't make sense. If cuBLASMp doesn't bundle NCCL, then
${CUBLASMP_DIR}won't containnccl.h.The potential issue I was flagging (that
find_pathandfind_librarycould locate different NCCL installations on systems with multiple versions) is real in principle, but the fix would be to accept a user-providedNCCL_DIRor derive the include path from the library location found byfind_library, not to look inCUBLASMP_DIR.For most practical scenarios where NCCL is installed in a single standard location, the current approach should work fine—both
find_pathandfind_librarywill discover the same installation through CMake's default search paths.Tip: You can customize Greptile's behavior for this repo with
.greptile/rulesand.greptile/config.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@vcherepanov-nv I think Greptile's latest reply here does have a point here that a user's NCCL dir may differ from the system installation, for example, if they are using NCCL from a pip package. I'm not sure if NCCL from pip is supported with NVTE_WITH_CUBLASMP, but if it is we should probably allow the user to provide a NCCL_DIR hint (unless we have a better way of choosing between pip packages and system installations in CMake)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think existing find_library approach would find pip-installed NCCL. So whatever find_path finds, it's likely whatever find_library finds as well. Until now no one asked for a custom path to NCCL, so I'd like to avoid adding any extra configurable paths, unless absolutely necessary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Meaning - less options to configure, less options to misconfigure.