diff --git a/tile_engine/ops/CMakeLists.txt b/tile_engine/ops/CMakeLists.txt
index 6f82e1b07af..51129deeeb5 100644
--- a/tile_engine/ops/CMakeLists.txt
+++ b/tile_engine/ops/CMakeLists.txt
@@ -4,4 +4,5 @@
 add_subdirectory(gemm)
 add_subdirectory(gemm_multi_d)
 add_subdirectory(gemm_preshuffle)
+add_subdirectory(pooling)
 add_subdirectory(gemm_streamk)
diff --git a/tile_engine/ops/commons/test_benchmark.sh b/tile_engine/ops/commons/test_benchmark.sh
index e2e0324da8c..6f8dd3e0cc5 100755
--- a/tile_engine/ops/commons/test_benchmark.sh
+++ b/tile_engine/ops/commons/test_benchmark.sh
@@ -3,29 +3,88 @@
 # SPDX-License-Identifier: MIT
 
 
-# Test script for tile engine GEMM benchmarks
-# This script demonstrates how to run the new individual benchmark executables
+# Test script for tile engine benchmarks (GEMM and Pooling)
+# This script demonstrates how to run the individual benchmark executables
 
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
 NC='\033[0m' # No Color
 
+# Default operation type (gemm, pool, or all)
+OP_TYPE="all"
+
+# Parse command line arguments
+show_help() {
+    echo "Usage: $0 [OPTIONS] [build_directory]"
+    echo ""
+    echo "Options:"
+    echo "  --gemm        Test only GEMM benchmarks"
+    echo "  --pool        Test only Pooling benchmarks"
+    echo "  --all         Test all benchmarks (default)"
+    echo "  --verify      Enable verification"
+    echo "  --help        Show this help message"
+    echo ""
+    echo "Examples:"
+    echo "  $0                           # Test all benchmarks, auto-detect build dir"
+    echo "  $0 --pool                    # Test only pooling benchmarks"
+    echo "  $0 --gemm /path/to/build     # Test GEMM with specific build dir"
+    echo "  $0 --verify --pool           # Test pooling with verification"
+}
+
+VERIFY_FLAG=""
+BUILD_DIR=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --gemm)
+            OP_TYPE="gemm"
+            shift
+            ;;
+        --pool)
+            OP_TYPE="pool"
+            shift
+            ;;
+        --all)
+            OP_TYPE="all"
+            shift
+            ;;
+        --verify)
+            VERIFY_FLAG="-verify=1"
+            shift
+            ;;
+        --help|-h)
+            show_help
+            exit 0
+            ;;
+        *)
+            BUILD_DIR="$1"
+            shift
+            ;;
+    esac
+done
+
 # Find the build directory
-if [ -z "$1" ]; then
-    # Try to find build directory automatically
-    BUILD_DIR=$(find /root/workspace/composable_kernel -name "test_gemm_fix" -type d 2>/dev/null | head -1)
+if [ -z "$BUILD_DIR" ]; then
+    # Try common build directory locations
+    for dir in "/root/workspace/composable_kernel/build" "$HOME/composable_kernel/build" "$(pwd)/build"; do
+        if [ -d "$dir/bin" ]; then
+            BUILD_DIR="$dir"
+            break
+        fi
+    done
+    
     if [ -z "$BUILD_DIR" ]; then
-        echo -e "${RED}Error: Could not find build directory. Please provide it as first argument.${NC}"
-        echo "Usage: $0 <build_directory>"
+        echo -e "${RED}Error: Could not find build directory. Please provide it as argument.${NC}"
+        echo "Usage: $0 [--gemm|--pool|--all] <build_directory>"
         exit 1
     fi
-else
-    BUILD_DIR="$1"
 fi
 
 echo -e "${GREEN}Using build directory: $BUILD_DIR${NC}"
+echo -e "${GREEN}Operation type: $OP_TYPE${NC}"
 
 # Check if bin directory exists
 if [ ! -d "$BUILD_DIR/bin" ]; then
@@ -33,73 +92,219 @@ if [ ! -d "$BUILD_DIR/bin" ]; then
     exit 1
 fi
 
-# Find all benchmark executables
-echo -e "${YELLOW}Finding benchmark executables...${NC}"
-BENCHMARKS=$(find "$BUILD_DIR/bin" -name "benchmark_gemm_*" -type f 2>/dev/null)
-
-if [ -z "$BENCHMARKS" ]; then
-    echo -e "${RED}No benchmark executables found in $BUILD_DIR/bin${NC}"
-    echo "Please build some benchmarks first with:"
-    echo "  cd $BUILD_DIR"
-    echo "  make benchmark_gemm_<kernel_name>"
-    exit 1
-fi
-
-# Count benchmarks
-NUM_BENCHMARKS=$(echo "$BENCHMARKS" | wc -l)
-echo -e "${GREEN}Found $NUM_BENCHMARKS benchmark executable(s)${NC}"
-
-# Test sizes
-SIZES=(512 1024 2048)
-
 # Results file
 RESULTS_FILE="benchmark_results_$(date +%Y%m%d_%H%M%S).csv"
-
-echo -e "${YELLOW}Running benchmarks...${NC}"
 echo "Results will be saved to: $RESULTS_FILE"
 
-# Run each benchmark
-COUNTER=0
-for BENCH in $BENCHMARKS; do
-    COUNTER=$((COUNTER + 1))
-    BENCH_NAME=$(basename "$BENCH")
-    echo -e "\n${GREEN}[$COUNTER/$NUM_BENCHMARKS] Running: $BENCH_NAME${NC}"
+# ============================================================================
+# GEMM Benchmark Functions
+# ============================================================================
+
+run_gemm_benchmarks() {
+    echo -e "\n${BLUE}========================================${NC}"
+    echo -e "${BLUE}       GEMM BENCHMARKS${NC}"
+    echo -e "${BLUE}========================================${NC}"
+    
+    # Find all GEMM benchmark executables
+    GEMM_BENCHMARKS=$(find "$BUILD_DIR/bin" -name "benchmark_gemm_*" -type f -executable 2>/dev/null | sort)
+    
+    if [ -z "$GEMM_BENCHMARKS" ]; then
+        echo -e "${YELLOW}No GEMM benchmark executables found in $BUILD_DIR/bin${NC}"
+        echo "Build with: make benchmark_gemm_<kernel_name>"
+        return 0
+    fi
+    
+    NUM_GEMM=$(echo "$GEMM_BENCHMARKS" | wc -l)
+    echo -e "${GREEN}Found $NUM_GEMM GEMM benchmark executable(s)${NC}"
+    
+    # Test sizes for GEMM
+    GEMM_SIZES=(512 1024 2048)
     
-    for SIZE in "${SIZES[@]}"; do
-        echo -e "  Testing size: ${SIZE}x${SIZE}x${SIZE}"
+    COUNTER=0
+    GEMM_PASSED=0
+    GEMM_FAILED=0
+    
+    for BENCH in $GEMM_BENCHMARKS; do
+        COUNTER=$((COUNTER + 1))
+        BENCH_NAME=$(basename "$BENCH")
+        echo -e "\n${GREEN}[GEMM $COUNTER/$NUM_GEMM] Running: $BENCH_NAME${NC}"
         
-        # Run with verification
-        "$BENCH" -m=$SIZE -n=$SIZE -k=$SIZE -verify=2 -warmup=10 -repeat=20 \
-                 -csv_filename="$RESULTS_FILE" -csv_format=simple \
-                 2>&1 | grep -E "(Time:|Performance:|Verification:|Error)"
+        for SIZE in "${GEMM_SIZES[@]}"; do
+            echo -e "  Testing size: ${SIZE}x${SIZE}x${SIZE}"
+            
+            # Run benchmark
+            if "$BENCH" -m=$SIZE -n=$SIZE -k=$SIZE $VERIFY_FLAG -warmup=5 -repeat=10 2>&1 | \
+               grep -E "(Time:|Performance:|Verification:|Error|TFLOPS|latency)" | head -5; then
+                GEMM_PASSED=$((GEMM_PASSED + 1))
+            else
+                echo -e "  ${RED}Benchmark failed or no output!${NC}"
+                GEMM_FAILED=$((GEMM_FAILED + 1))
+            fi
+        done
+    done
+    
+    echo -e "\n${GREEN}GEMM Summary: $GEMM_PASSED passed, $GEMM_FAILED failed${NC}"
+}
+
+# ============================================================================
+# Pooling Benchmark Functions
+# ============================================================================
+
+run_pool_benchmarks() {
+    echo -e "\n${BLUE}========================================${NC}"
+    echo -e "${BLUE}       POOLING BENCHMARKS${NC}"
+    echo -e "${BLUE}========================================${NC}"
+    
+    # Find all Pooling benchmark executables
+    POOL_BENCHMARKS=$(find "$BUILD_DIR/bin" -name "benchmark_pool*" -type f -executable 2>/dev/null | sort)
+    
+    if [ -z "$POOL_BENCHMARKS" ]; then
+        echo -e "${YELLOW}No Pooling benchmark executables found in $BUILD_DIR/bin${NC}"
+        echo "Build with: make benchmark_pool_all or make benchmark_pool2d or make benchmark_pool3d"
+        return 0
+    fi
+    
+    NUM_POOL=$(echo "$POOL_BENCHMARKS" | wc -l)
+    echo -e "${GREEN}Found $NUM_POOL Pooling benchmark executable(s)${NC}"
+    
+    COUNTER=0
+    POOL_PASSED=0
+    POOL_FAILED=0
+    
+    # Test configurations for pooling
+    # Format: "description|args"
+    POOL2D_TESTS=(
+        "Small 2D (64x64)|      -N=1 -H=64  -W=64  -C=32  -Y=3 -X=3 -Sy=2 -Sx=2"
+        "Medium 2D (224x224)|   -N=2 -H=224 -W=224 -C=64  -Y=3 -X=3 -Sy=2 -Sx=2"
+        "Large 2D (512x512)|    -N=1 -H=512 -W=512 -C=128 -Y=3 -X=3 -Sy=2 -Sx=2"
+    )
+    
+    POOL3D_TESTS=(
+        "Small 3D (32x32x32)|   -N=1 -D=32 -H=32 -W=32 -C=32  -Z=3 -Y=3 -X=3 -Sz=2 -Sy=2 -Sx=2"
+        "Medium 3D (56x56x32)|  -N=2 -D=32 -H=56 -W=56 -C=64  -Z=3 -Y=3 -X=3 -Sz=2 -Sy=2 -Sx=2"
+        "Large 3D (64x64x64)|   -N=1 -D=64 -H=64 -W=64 -C=128 -Z=3 -Y=3 -X=3 -Sz=2 -Sy=2 -Sx=2"
+    )
+    
+    for BENCH in $POOL_BENCHMARKS; do
+        COUNTER=$((COUNTER + 1))
+        BENCH_NAME=$(basename "$BENCH")
+        echo -e "\n${GREEN}[Pool $COUNTER/$NUM_POOL] Running: $BENCH_NAME${NC}"
         
-        if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo -e "  ${RED}Benchmark failed!${NC}"
+        # Determine if 2D or 3D based on name
+        if [[ "$BENCH_NAME" == *"pool2d"* ]]; then
+            TESTS=("${POOL2D_TESTS[@]}")
+            POOL_TYPE="2D"
+        elif [[ "$BENCH_NAME" == *"pool3d"* ]]; then
+            TESTS=("${POOL3D_TESTS[@]}")
+            POOL_TYPE="3D"
+        else
+            echo -e "  ${YELLOW}Unknown pool type, skipping...${NC}"
+            continue
         fi
+        
+        for TEST in "${TESTS[@]}"; do
+            # Parse test description and args
+            DESC=$(echo "$TEST" | cut -d'|' -f1 | xargs)
+            ARGS=$(echo "$TEST" | cut -d'|' -f2 | xargs)
+            
+            echo -e "  Testing: ${DESC}"
+            
+            # Run benchmark
+            OUTPUT=$("$BENCH" $ARGS $VERIFY_FLAG -warmup=5 -repeat=10 2>&1)
+            EXIT_CODE=$?
+            
+            if [ $EXIT_CODE -eq 0 ]; then
+                # Try to extract and display key metrics
+                echo "$OUTPUT" | grep -iE "(latency|bandwidth|tflops|time|performance|pass)" | head -3
+                if [ -z "$(echo "$OUTPUT" | grep -i "error\|fail\|wrong")" ]; then
+                    echo -e "    ${GREEN}PASS${NC}"
+                    POOL_PASSED=$((POOL_PASSED + 1))
+                else
+                    echo -e "    ${RED}FAIL (verification error)${NC}"
+                    POOL_FAILED=$((POOL_FAILED + 1))
+                fi
+            else
+                echo -e "    ${RED}FAIL (exit code: $EXIT_CODE)${NC}"
+                # Show error output
+                echo "$OUTPUT" | grep -iE "(error|fail|wrong|unsupported)" | head -3
+                POOL_FAILED=$((POOL_FAILED + 1))
+            fi
+        done
     done
-done
+    
+    echo -e "\n${GREEN}Pooling Summary: $POOL_PASSED passed, $POOL_FAILED failed${NC}"
+}
 
-echo -e "\n${GREEN}Benchmark testing complete!${NC}"
-echo "Results saved to: $RESULTS_FILE"
+# ============================================================================
+# Main Execution
+# ============================================================================
 
-# Show summary if CSV file exists
-if [ -f "$RESULTS_FILE" ]; then
-    echo -e "\n${YELLOW}Summary of results:${NC}"
-    echo "Number of tests: $(tail -n +2 "$RESULTS_FILE" | wc -l)"
-    echo "Successful tests: $(grep -c "true" "$RESULTS_FILE")"
-    echo "Failed tests: $(grep -c "false" "$RESULTS_FILE")"
-fi
+echo -e "${YELLOW}Starting ckTileEngine benchmark tests...${NC}"
+echo "=============================================="
+
+TOTAL_START=$(date +%s)
+
+case $OP_TYPE in
+    gemm)
+        run_gemm_benchmarks
+        ;;
+    pool)
+        run_pool_benchmarks
+        ;;
+    all)
+        run_gemm_benchmarks
+        run_pool_benchmarks
+        ;;
+esac
+
+TOTAL_END=$(date +%s)
+TOTAL_TIME=$((TOTAL_END - TOTAL_START))
+
+echo -e "\n${BLUE}========================================${NC}"
+echo -e "${BLUE}       FINAL SUMMARY${NC}"
+echo -e "${BLUE}========================================${NC}"
+echo -e "Total execution time: ${TOTAL_TIME} seconds"
+echo -e "Operation type tested: $OP_TYPE"
+
+# ============================================================================
+# Example Commands
+# ============================================================================
 
-# Example of running a specific benchmark with different options
 echo -e "\n${YELLOW}Example commands for manual testing:${NC}"
-echo "# Basic run:"
-echo "$BUILD_DIR/bin/benchmark_gemm_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -m=1024 -n=1024 -k=1024"
-echo ""
-echo "# With CPU verification:"
-echo "$BUILD_DIR/bin/benchmark_gemm_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -m=1024 -n=1024 -k=1024 -verify=1"
-echo ""
-echo "# JSON output for parsing:"
-echo "$BUILD_DIR/bin/benchmark_gemm_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -m=1024 -n=1024 -k=1024 -json_output=true"
-echo ""
-echo "# Performance testing with TFLOPS metric:"
-echo "$BUILD_DIR/bin/benchmark_gemm_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -m=4096 -n=4096 -k=4096 -warmup=100 -repeat=200 -metric=1"
+
+if [[ "$OP_TYPE" == "gemm" ]] || [[ "$OP_TYPE" == "all" ]]; then
+    echo -e "\n${GREEN}GEMM Examples:${NC}"
+    SAMPLE_GEMM=$(find "$BUILD_DIR/bin" -name "benchmark_gemm_*" -type f 2>/dev/null | head -1)
+    if [ -n "$SAMPLE_GEMM" ]; then
+        echo "# Basic GEMM run:"
+        echo "$SAMPLE_GEMM -m=1024 -n=1024 -k=1024"
+        echo ""
+        echo "# GEMM with CPU verification:"
+        echo "$SAMPLE_GEMM -m=1024 -n=1024 -k=1024 -verify=1"
+    fi
+fi
+
+if [[ "$OP_TYPE" == "pool" ]] || [[ "$OP_TYPE" == "all" ]]; then
+    echo -e "\n${GREEN}Pooling Examples:${NC}"
+    SAMPLE_POOL2D=$(find "$BUILD_DIR/bin" -name "benchmark_pool2d_*" -type f 2>/dev/null | head -1)
+    SAMPLE_POOL3D=$(find "$BUILD_DIR/bin" -name "benchmark_pool3d_*" -type f 2>/dev/null | head -1)
+    
+    if [ -n "$SAMPLE_POOL2D" ]; then
+        echo "# 2D Pooling (NHWC format):"
+        echo "$SAMPLE_POOL2D -N=2 -H=224 -W=224 -C=64 -Y=3 -X=3 -Sy=2 -Sx=2"
+        echo ""
+        echo "# 2D Pooling with verification:"
+        echo "$SAMPLE_POOL2D -N=2 -H=224 -W=224 -C=64 -Y=3 -X=3 -Sy=2 -Sx=2 -verify=1"
+    fi
+    
+    if [ -n "$SAMPLE_POOL3D" ]; then
+        echo ""
+        echo "# 3D Pooling (NDHWC format):"
+        echo "$SAMPLE_POOL3D -N=2 -D=32 -H=56 -W=56 -C=64 -Z=3 -Y=3 -X=3 -Sz=2 -Sy=2 -Sx=2"
+        echo ""
+        echo "# 3D Pooling with verification:"
+        echo "$SAMPLE_POOL3D -N=2 -D=32 -H=56 -W=56 -C=64 -Z=3 -Y=3 -X=3 -Sz=2 -Sy=2 -Sx=2 -verify=1"
+    fi
+fi
+
+echo -e "\n${GREEN}Testing complete!${NC}"
diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md
index ce62f8dca5a..55af6153524 100644
--- a/tile_engine/ops/gemm/README.md
+++ b/tile_engine/ops/gemm/README.md
@@ -187,7 +187,8 @@ python gemm_instance_builder.py \
     --datatype fp16 \
     --layout rcr \
     --config_json configs/user_provided_config.json \
-    --gen_all_individual
+    --gen_all_individual \
+    --gpu_target gfx942
 ```
 
 #### gemm_instance_builder_parallel.py
diff --git a/tile_engine/ops/pooling/CMakeLists.txt b/tile_engine/ops/pooling/CMakeLists.txt
new file mode 100644
index 00000000000..e906a36fbb0
--- /dev/null
+++ b/tile_engine/ops/pooling/CMakeLists.txt
@@ -0,0 +1,271 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+set(POOL_DATATYPE "fp16;fp32" CACHE STRING "List of datatypes for Pool (semicolon-separated)")
+set(POOL_REDUCE_OP "max;avg" CACHE STRING "List of reduce operations for Pool (semicolon-separated)")
+set(POOL_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)")
+option(ENABLE_CCACHE_POOL "Enable ccache for Pool ops compilation" OFF)
+
+# Store the directory path for use in functions
+set(POOL_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR})
+
+# Function to create individual Pool targets
+function(create_individual_pool_target datatype reduce_op trait block_config config_json)
+    # Use the parent scope POOL_GPU_TARGETS_INDIVIDUAL variable
+    if(NOT POOL_GPU_TARGETS_INDIVIDUAL)
+        message(WARNING "Skipping individual Pool target ${datatype}_${reduce_op}_${trait}_${block_config}: No supported GPU targets")
+        return()
+    endif()
+
+    # Parse block configuration: format is block_mxblock_n_warp_mxwarp_n_thread_tile_mxthread_tile_n
+    string(REPLACE "_" ";" config_groups ${block_config})
+    list(GET config_groups 0 block_dims)       # e.g., 128x1
+    list(GET config_groups 1 warp_dims)        # e.g., 1x1
+    list(GET config_groups 2 thread_tile_dims) # e.g., 2x1
+
+    # Parse block dimensions
+    string(REPLACE "x" ";" block_parts ${block_dims})
+    list(GET block_parts 0 block_m)
+    list(GET block_parts 1 block_n)
+
+    # Parse warp dimensions
+    string(REPLACE "x" ";" warp_parts ${warp_dims})
+    list(GET warp_parts 0 warp_m)
+    list(GET warp_parts 1 warp_n)
+
+    # Parse thread tile dimensions
+    string(REPLACE "x" ";" thread_tile_parts ${thread_tile_dims})
+    list(GET thread_tile_parts 0 thread_tile_m)
+    list(GET thread_tile_parts 1 thread_tile_n)
+
+    # Parse trait combo to get individual parts
+    string(REPLACE "_" ";" trait_parts ${trait})
+    list(GET trait_parts 0 output_index)
+    list(GET trait_parts 1 propagate_nan)
+    list(GET trait_parts 2 pool_dim)
+
+    # Create trait string without pool_dim for filename (to match Python generator)
+    set(trait_for_filename "${output_index}_${propagate_nan}")
+
+    set(target_name "benchmark_pool${pool_dim}d_${datatype}_${reduce_op}_${trait}_${block_config}")
+    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${reduce_op}")
+
+    # Generate the single instance header for this kernel (filename without pool_dim in trait)
+    set(instance_header "${working_path}/pool_single_${pool_dim}d_${datatype}_${reduce_op}_${trait_for_filename}_${block_config}.hpp")
+
+    # Add custom command to generate the header file at build time
+    add_custom_command(
+        OUTPUT ${instance_header}
+        COMMAND ${Python3_EXECUTABLE} ${POOL_SOURCE_DIR}/pool_instance_builder.py
+                --working_path ${working_path}
+                --datatype ${datatype}
+                --reduce_op ${reduce_op}
+                --config_json ${config_json}
+                --gen_single
+                --kernel_name "pool${pool_dim}d_${datatype}_${reduce_op}_${trait_for_filename}_${block_config}"
+                --block_config "${block_config}"
+                --trait_combo "${trait}"
+                --gpu_target "${POOL_GPU_TARGETS_INDIVIDUAL}"
+        DEPENDS ${POOL_SOURCE_DIR}/pool_instance_builder.py ${config_json}
+        COMMENT "Generating ${instance_header}"
+    )
+
+    # Create the executable
+    add_executable(${target_name}
+        EXCLUDE_FROM_ALL
+        ${POOL_SOURCE_DIR}/pool_benchmark_single.cpp
+        ${instance_header}
+    )
+
+    # Set GPU architectures
+    set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${POOL_GPU_TARGETS_INDIVIDUAL})
+
+    # Set compile definitions
+    target_compile_definitions(${target_name} PRIVATE
+        POOL_SINGLE_INSTANCE_HPP="${instance_header}"
+    )
+
+    # Include directories
+    target_include_directories(${target_name} PRIVATE
+        ${POOL_SOURCE_DIR}
+        ${working_path}
+    )
+
+    # Compile options
+    target_compile_options(${target_name} PRIVATE
+        -Wno-undefined-func-template
+        -Wno-float-equal
+        --offload-compress
+        -include ${instance_header}
+    )
+
+    # Add to collection targets
+    add_dependencies(benchmark_pool_all ${target_name})
+    add_dependencies(benchmark_pool_${datatype} ${target_name})
+    add_dependencies(benchmark_pool_${reduce_op} ${target_name})
+    add_dependencies(benchmark_pool_${datatype}_${reduce_op} ${target_name})
+    add_dependencies(benchmark_pool${pool_dim}d ${target_name})
+endfunction()
+
+# Function to build individual Pool targets
+function(build_individual_pool_targets datatype reduce_op)
+    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${reduce_op}")
+
+    # Choose config file
+    if(DEFINED ENV{POOL_CONFIG_FILE} AND NOT "$ENV{POOL_CONFIG_FILE}" STREQUAL "")
+        set(config_filename "$ENV{POOL_CONFIG_FILE}")
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
+        message(VERBOSE "  Using config from environment variable: ${config_filename}")
+    elseif(NOT "${POOL_CONFIG_FILE}" STREQUAL "")
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${POOL_CONFIG_FILE}")
+        message(VERBOSE "  Using custom config: ${POOL_CONFIG_FILE}")
+    else()
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
+        message(VERBOSE "  Using default config")
+    endif()
+
+    # Check if config file exists
+    if(NOT EXISTS ${json_blob})
+        message(FATAL_ERROR "Config file not found: ${json_blob}")
+    endif()
+
+    # Determine number of workers
+    if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL})
+        set(num_workers $ENV{CMAKE_BUILD_PARALLEL_LEVEL})
+    else()
+        cmake_host_system_information(RESULT num_cores QUERY NUMBER_OF_LOGICAL_CORES)
+        math(EXPR num_workers "${num_cores}")
+        if(num_workers GREATER 8)
+            set(num_workers 8)
+        endif()
+    endif()
+
+    # Generate individual kernel files
+    message(VERBOSE "Generating individual kernels for ${datatype} ${reduce_op} using ${num_workers} workers...")
+    message(VERBOSE "  Working path: ${working_path}")
+    message(VERBOSE "  Config file: ${json_blob}")
+
+    # Create working directory first
+    file(MAKE_DIRECTORY ${working_path})
+
+    # List the kernels (fast operation)
+    message(VERBOSE "  Listing kernel configurations...")
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/pool_instance_builder.py
+                --working_path ${working_path}
+                --datatype ${datatype}
+                --reduce_op ${reduce_op}
+                --config_json ${json_blob}
+                --gpu_target ${POOL_GPU_TARGETS_INDIVIDUAL}
+                --list_kernels
+        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+        RESULT_VARIABLE ret
+        OUTPUT_VARIABLE list_output
+        ERROR_VARIABLE list_error
+    )
+
+    if(NOT ret EQUAL 0)
+        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${reduce_op}: ${list_error}")
+    endif()
+
+    # Read kernel count
+    if(EXISTS ${working_path}/pool_kernel_count.txt)
+        file(READ ${working_path}/pool_kernel_count.txt kernel_count)
+        string(STRIP "${kernel_count}" kernel_count)
+        message(VERBOSE "  Found ${kernel_count} kernel configurations")
+    else()
+        message(FATAL_ERROR "Kernel count file not found")
+    endif()
+
+    # Read kernel list and create targets
+    if(EXISTS ${working_path}/pool_kernel_list.txt)
+        file(STRINGS ${working_path}/pool_kernel_list.txt kernel_lines)
+        foreach(line IN LISTS kernel_lines)
+            # Parse line: kernel_name|block_config|trait_combo
+            string(REPLACE "|" ";" parts "${line}")
+            list(GET parts 0 kernel_name)
+            list(GET parts 1 block_config)
+            list(GET parts 2 trait_combo)
+
+            # Create individual target
+            create_individual_pool_target("${datatype}" "${reduce_op}" "${trait_combo}" "${block_config}" "${json_blob}")
+        endforeach()
+    else()
+        message(FATAL_ERROR "Kernel list file not found")
+    endif()
+endfunction()
+
+# Main build logic
+message(VERBOSE "=== Starting Tile Engine Pool Configuration ===")
+message(VERBOSE "POOL_DATATYPE: ${POOL_DATATYPE}")
+message(VERBOSE "POOL_REDUCE_OP: ${POOL_REDUCE_OP}")
+message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+
+# Filter GPU targets
+set(POOL_GPU_TARGETS_INDIVIDUAL "")
+set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201")
+
+foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
+    if(target IN_LIST DESIRED_TARGETS)
+        list(APPEND POOL_GPU_TARGETS_INDIVIDUAL ${target})
+        message(VERBOSE "  Adding GPU target: ${target}")
+    endif()
+endforeach()
+
+# Skip build if no matching targets found
+if(NOT POOL_GPU_TARGETS_INDIVIDUAL)
+    message(WARNING "Skipping Tile Engine Pool build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+else()
+    message(VERBOSE "Building individual Pool targets for GPU targets: ${POOL_GPU_TARGETS_INDIVIDUAL}")
+
+    # Set up job pools
+    set_property(GLOBAL PROPERTY JOB_POOLS
+        compile_heavy=4
+        compile_normal=16
+    )
+
+    # Enable compiler cache if requested
+    if(ENABLE_CCACHE_POOL)
+        find_program(CCACHE_PROGRAM ccache)
+        if(CCACHE_PROGRAM)
+            set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
+            message(VERBOSE "Using ccache for faster compilation")
+        else()
+            message(WARNING "ccache requested but not found")
+        endif()
+    else()
+        message(VERBOSE "ccache disabled for Pool ops (use -DENABLE_CCACHE_POOL=ON to enable)")
+    endif()
+
+    # Create master collection targets
+    add_custom_target(benchmark_pool_all)
+
+    # Create datatype collection targets
+    foreach(dt IN LISTS POOL_DATATYPE)
+        add_custom_target(benchmark_pool_${dt})
+    endforeach()
+
+    # Create reduce_op collection targets
+    foreach(op IN LISTS POOL_REDUCE_OP)
+        add_custom_target(benchmark_pool_${op})
+    endforeach()
+
+    # Create combined collection targets
+    foreach(dt IN LISTS POOL_DATATYPE)
+        foreach(op IN LISTS POOL_REDUCE_OP)
+            add_custom_target(benchmark_pool_${dt}_${op})
+        endforeach()
+    endforeach()
+
+    # Create pool dimension targets
+    add_custom_target(benchmark_pool2d)
+    add_custom_target(benchmark_pool3d)
+
+    # Build individual targets for each datatype/reduce_op combination
+    foreach(dt IN LISTS POOL_DATATYPE)
+        foreach(op IN LISTS POOL_REDUCE_OP)
+            build_individual_pool_targets(${dt} ${op})
+        endforeach()
+    endforeach()
+endif()
+
diff --git a/tile_engine/ops/pooling/README.md b/tile_engine/ops/pooling/README.md
new file mode 100644
index 00000000000..9495b5f0b52
--- /dev/null
+++ b/tile_engine/ops/pooling/README.md
@@ -0,0 +1,381 @@
+# CK Tile Engine Pool Operations
+
+## Overview
+
+The CK Tile Engine Pool module provides a comprehensive system for generating, building, and benchmarking pooling kernels (2D and 3D) with various configurations. It supports multiple data types, reduce operations (max, min, average), and optimization strategies. The system follows the same architecture as the GEMM module with individual kernel compilation for better build parallelism and targeted testing capabilities.
+
+## Table of Contents
+
+1. [Build System Architecture](#build-system-architecture)
+2. [Build Instructions](#build-instructions)
+3. [Running Benchmarks](#running-benchmarks)
+4. [Configuration System](#configuration-system)
+5. [Scripts and Tools](#scripts-and-tools)
+6. [Command Line Options](#command-line-options)
+7. [Understanding Kernel Names](#understanding-kernel-names)
+8. [Troubleshooting](#troubleshooting)
+9. [Performance Tips](#performance-tips)
+
+## Build System Architecture
+
+### Individual Kernel Compilation
+
+The tile engine benchmark system compiles each kernel configuration into a separate executable. This provides:
+- Better build parallelism
+- Faster incremental builds
+- More targeted testing
+- Easier debugging of specific configurations
+
+Each benchmark executable follows the naming pattern:
+```
+benchmark_pool<dim>d_<dtype>_<reduce_op>_<output_index>_<propagate_nan>_<block_config>
+```
+
+## Build Instructions
+
+### Prerequisites
+- ROCm installation
+- CMake 3.16 or higher
+- C++17 compatible compiler
+- Python 3.6 or higher
+
+### Basic Build
+
+```bash
+# In the root of composable kernel, create build directory
+mkdir build && cd build
+
+# Configure with specific datatypes and reduce operations
+# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942)
+../script/cmake-ck-dev.sh ../ [Arch] -DPOOL_DATATYPE="fp16;fp32" -DPOOL_REDUCE_OP="max;avg"
+
+# Build specific benchmarks
+make benchmark_pool_fp16_max -j
+```
+
+### Configuration Options
+
+The build system supports several configuration options:
+
+#### Using Custom Config Files
+```bash
+# Method 1: CMake variable (config file must be in configs/ directory)
+cmake -DPOOL_CONFIG_FILE=my_custom_config.json ...
+
+# Method 2: Environment variable (takes precedence over CMake variable)
+export POOL_CONFIG_FILE=my_custom_config.json
+cmake ...
+```
+
+#### Config File Priority Order
+1. **Environment variable** `POOL_CONFIG_FILE` (highest priority)
+2. **CMake variable** `POOL_CONFIG_FILE`
+3. **Default config** (default_config.json)
+
+**Note**: All custom config files must be placed in the `tile_engine/ops/pooling/configs/` directory.
+
+### Example Build Commands
+
+```bash
+# Build for gfx942 with fp16 datatype, max reduce operation
+mkdir build && cd build
+../script/cmake-ck-dev.sh ../ gfx942 -DPOOL_DATATYPE="fp16;fp32" -DPOOL_REDUCE_OP="max;avg"
+make benchmark_pool_fp16_max -j
+make benchmark_pool_fp32_avg -j
+```
+
+### Building Individual Kernels
+
+```bash
+# Build a specific kernel configuration
+make benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
+
+# Build all fp16 max pooling benchmarks
+make benchmark_pool_fp16_max -j$(nproc)
+
+# Build all 3D pooling benchmarks
+make benchmark_pool3d -j$(nproc)
+```
+
+### Rebuilding After Configuration Changes
+
+If you modify the configuration file, you must rebuild:
+```bash
+rm -rf tile_engine/ && make benchmark_pool_[Datatype]_[ReduceOp] -j
+```
+
+## Running Benchmarks
+
+### Individual Kernel Execution
+
+```bash
+cd /path/to/build/directory
+./bin/benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1 \
+    -N=2 -D=30 -H=30 -W=30 -C=32 \
+    -Z=2 -Y=2 -X=2 \
+    -Sz=2 -Sy=2 -Sx=2 \
+    -verify=1
+```
+
+### Using the Benchmark Python Script
+
+```bash
+# Run benchmark sweep
+python pool_benchmark.py /path/to/build \
+    --problem-sizes "2,30,30,30,32" "4,64,64,64,64" \
+    --window-sizes "2,2,2" "3,3,3" \
+    --stride-sizes "2,2,2" \
+    --pool-dim 3 \
+    --verify \
+    --json results.json
+```
+
+## Configuration System
+
+### Configuration Files
+
+The system uses JSON configuration files to specify kernel parameters:
+
+- `configs/default_config.json` - Default configurations
+
+### Configuration Structure
+
+```json
+{
+    "block_config": {
+        "block_m": {"values": [64, 128, 256]},
+        "block_n": {"values": [1]},
+        "warp_m": {"values": [1, 2]},
+        "warp_n": {"values": [1]},
+        "thread_tile_m": {"values": [1, 2, 4]},
+        "thread_tile_n": {"values": [1]}
+    },
+    "trait_config": {
+        "output_index": {"values": [true, false]},
+        "propagate_nan": {"values": [false]},
+        "pool_dim": {"values": [2, 3]}
+    },
+    "k_block_per_cu": 1
+}
+```
+
+### Configuration Parameters
+
+- **block_m/block_n**: Block tile dimensions for output
+- **warp_m/warp_n**: Number of warps per block
+- **thread_tile_m/thread_tile_n**: Thread tile sizes
+- **output_index**: Whether to output indices (for max/min pooling)
+- **propagate_nan**: Whether to propagate NaN values
+- **pool_dim**: Pooling dimension (2 for 2D, 3 for 3D)
+
+## Scripts and Tools
+
+### Python Scripts
+
+#### pool_instance_builder.py
+**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files.
+
+**Key Features**:
+- Generates individual kernel header files for separate compilation
+- Supports multiple data types (fp16, fp32, bf16)
+- Validates block configurations for correctness
+- Creates CMake integration files
+
+**Usage**:
+```bash
+python pool_instance_builder.py \
+    --working_path ./generated \
+    --datatype fp16 \
+    --reduce_op max \
+    --config_json configs/default_config.json \
+    --gen_all_individual \
+    --gpu_target gfx942
+```
+
+#### pool_benchmark.py
+**Purpose**: Python script for running and analyzing pool benchmarks.
+
+**Features**:
+- Automated benchmark execution
+- Performance data collection
+- Result analysis and reporting
+- CSV and JSON export
+
+**Usage**:
+```bash
+python pool_benchmark.py /path/to/build \
+    --problem-sizes "2,30,30,30,32" \
+    --window-sizes "2,2,2" \
+    --verbose \
+    --json results.json
+```
+
+## Command Line Options
+
+All benchmark executables support the following options:
+
+### Tensor Dimensions
+- `-N=<value>` - Batch size (default: 2)
+- `-D=<value>` - Depth dimension for 3D pooling (default: 30)
+- `-H=<value>` - Height dimension (default: 30)
+- `-W=<value>` - Width dimension (default: 30)
+- `-C=<value>` - Channel dimension (default: 32)
+
+### Window Parameters
+- `-Z=<value>` - Window depth (default: 2)
+- `-Y=<value>` - Window height (default: 2)
+- `-X=<value>` - Window width (default: 2)
+
+### Stride Parameters
+- `-Sz=<value>` - Stride depth (default: 2)
+- `-Sy=<value>` - Stride height (default: 2)
+- `-Sx=<value>` - Stride width (default: 2)
+
+### Dilation Parameters
+- `-Dz=<value>` - Dilation depth (default: 1)
+- `-Dy=<value>` - Dilation height (default: 1)
+- `-Dx=<value>` - Dilation width (default: 1)
+
+### Padding Parameters
+- `-LeftPz=<value>` - Left padding depth (default: 0)
+- `-LeftPy=<value>` - Left padding height (default: 0)
+- `-LeftPx=<value>` - Left padding width (default: 0)
+- `-RightPz=<value>` - Right padding depth (default: 0)
+- `-RightPy=<value>` - Right padding height (default: 0)
+- `-RightPx=<value>` - Right padding width (default: 0)
+
+### Pool Dimension
+- `-pool_dim=<2|3>` - Pooling dimension (default: 3)
+
+### Verification
+- `-verify=<0|1>` - Verification mode
+  - 0: No verification
+  - 1: CPU verification (default)
+
+### Performance Testing
+- `-warmup=<value>` - Warmup iterations (default: 20)
+- `-repeat=<value>` - Benchmark iterations (default: 100)
+- `-timer=<true|false>` - Use GPU timer (default: true)
+- `-flush_cache=<true|false>` - Flush cache between runs (default: true)
+- `-rotating_count=<value>` - Cache rotation count (default: 1000)
+
+### Initialization
+- `-init=<0|1|2>` - Tensor initialization method
+  - 0: Random values [-5, 5] (default)
+  - 1: Linear sequence
+  - 2: Constant value (1.0)
+
+### Output Options
+- `-log=<true|false>` - Enable verbose logging (default: false)
+- `-metric=<0|1|2>` - Performance metric
+  - 0: Latency in ms
+  - 1: TFLOPS
+  - 2: Bandwidth in GB/s (default)
+- `-json_output=<true|false>` - JSON format output (default: false)
+- `-csv_filename=<filename>` - Save results to CSV
+
+## Understanding Kernel Names
+
+The kernel naming convention encodes the configuration:
+
+```
+benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
+              ^^^^ ^^^^ ^^^ ^^^^ ^^^^^ ^^^^^ ^^^ ^^^
+              |    |    |   |    |     |     |   |
+              |    |    |   |    |     |     |   Thread tile (MxN)
+              |    |    |   |    |     |     Warp config (MxN)
+              |    |    |   |    |     Block tile (MxN)
+              |    |    |   |    Propagate NaN
+              |    |    |   Output Index
+              |    |    Reduce operation
+              |    Data type
+              Pool dimension (2D or 3D)
+```
+
+### Components:
+- **Pool dimension**: 2d, 3d
+- **Data type**: fp16, fp32, bf16
+- **Reduce op**: max, min, avg
+- **Output Index**: True/False (whether to output argmax/argmin)
+- **Propagate NaN**: True/False
+- **Block config**: Block_MxBlock_N_Warp_MxWarp_N_ThreadTile_MxThreadTile_N
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Kernel not found**
+   - Ensure the specific benchmark executable is built
+   - Check the build directory bin/ folder
+
+2. **Verification failures**
+   - Check tensor dimensions are valid for the window/stride configuration
+   - Verify padding values are reasonable
+
+3. **Build failures**
+   - Check GPU architecture compatibility
+   - Ensure ROCm is properly installed
+   - Verify configuration file syntax
+
+4. **Performance variations**
+   - Increase warmup iterations
+   - Disable CPU frequency scaling
+   - Use GPU timer for accurate measurements
+
+### Debug Options
+
+Enable verbose logging:
+```bash
+./bin/benchmark_pool... -log=true -verify=1
+```
+
+## Performance Tips
+
+1. **Optimal Problem Sizes**: Use sizes that are multiples of block dimensions
+2. **Warmup**: Use at least 20-50 warmup iterations
+3. **GPU Timer**: Always use `-timer=true` for accurate measurements
+4. **Cache Management**: Enable cache flushing for consistent results
+5. **Output Index**: Disable output index if not needed (reduces memory bandwidth)
+
+## Integration Examples
+
+### Python Integration
+
+```python
+import subprocess
+import json
+
+# Run benchmark with JSON output
+result = subprocess.run([
+    './bin/benchmark_pool3d_fp16_max_...', 
+    '-N=2', '-D=30', '-H=30', '-W=30', '-C=32',
+    '-json_output=true'
+], capture_output=True, text=True)
+
+# Parse results
+data = json.loads(result.stdout)
+print(f"Bandwidth: {data['bandwidth_gb_s']} GB/s")
+```
+
+### Batch Testing Script
+
+```bash
+#!/bin/bash
+SIZES="32 64 128 256"
+for size in $SIZES; do
+    echo "Testing HxW=${size}x${size}"
+    ./bin/benchmark_pool... -H=$size -W=$size \
+        -verify=1 -csv_filename=results.csv
+done
+```
+
+## Contributing
+
+When adding new features or configurations:
+1. Update the instance builder (`pool_instance_builder.py`)
+2. Update configuration examples in `configs/`
+3. Document new command-line options in this README
+4. Add appropriate tests
+
+For more information about the Composable Kernel project, visit the main repository documentation.
+
diff --git a/tile_engine/ops/pooling/configs/default_config.json b/tile_engine/ops/pooling/configs/default_config.json
new file mode 100644
index 00000000000..6eb97c901f6
--- /dev/null
+++ b/tile_engine/ops/pooling/configs/default_config.json
@@ -0,0 +1,34 @@
+{
+    "block_config": {
+        "block_m": {
+            "values": [64, 128, 256]
+        },
+        "block_n": {
+            "values": [1]
+        },
+        "warp_m": {
+            "values": [1]
+        },
+        "warp_n": {
+            "values": [1]
+        },
+        "thread_tile_m": {
+            "values": [1, 2, 4]
+        },
+        "thread_tile_n": {
+            "values": [1]
+        }
+    },
+    "trait_config": {
+        "output_index": {
+            "values": [true, false]
+        },
+        "propagate_nan": {
+            "values": [false]
+        },
+        "pool_dim": {
+            "values": [2, 3]
+        }
+    },
+    "k_block_per_cu": 1
+}
diff --git a/tile_engine/ops/pooling/pool_benchmark.hpp b/tile_engine/ops/pooling/pool_benchmark.hpp
new file mode 100644
index 00000000000..f8d83077e27
--- /dev/null
+++ b/tile_engine/ops/pooling/pool_benchmark.hpp
@@ -0,0 +1,195 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <stdexcept>
+#include <iomanip>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "pool_common.hpp"
+
+enum class Metric
+{
+    LATENCY   = 0,
+    TFLOPS    = 1,
+    BANDWIDTH = 2
+};
+
+inline constexpr auto get_metric_name(Metric m)
+{
+    switch(m)
+    {
+    case Metric::LATENCY: return "latency";
+    case Metric::TFLOPS: return "tflops";
+    case Metric::BANDWIDTH: return "bandwidth";
+    default: throw std::invalid_argument("Unsupported metric type");
+    }
+}
+
+struct PoolProblem
+{
+    std::string inDType;
+    std::string outDType;
+    std::string computeDType;
+    std::string indexDType;
+    std::string blockShape;
+    std::string reduceOp;
+
+    int poolDim;
+    int N, D, H, W, C;
+
+    int windowZ, windowY, windowX;
+    int strideZ, strideY, strideX;
+    int dilationZ, dilationY, dilationX;
+    int leftPadZ, leftPadY, leftPadX;
+    int rightPadZ, rightPadY, rightPadX;
+
+    bool outputIndex;
+    bool propagateNan;
+
+    friend std::ostream& operator<<(std::ostream& os, const PoolProblem& problem)
+    {
+        os << "{\n"
+           << "    \"inDType\": \"" << problem.inDType << "\",\n"
+           << "    \"outDType\": \"" << problem.outDType << "\",\n"
+           << "    \"computeDType\": \"" << problem.computeDType << "\",\n"
+           << "    \"indexDType\": \"" << problem.indexDType << "\",\n"
+           << "    \"blockShape\": \"" << problem.blockShape << "\",\n"
+           << "    \"reduceOp\": \"" << problem.reduceOp << "\",\n"
+           << "    \"poolDim\": " << problem.poolDim << ",\n"
+           << "    \"N\": " << problem.N << ",\n"
+           << "    \"D\": " << problem.D << ",\n"
+           << "    \"H\": " << problem.H << ",\n"
+           << "    \"W\": " << problem.W << ",\n"
+           << "    \"C\": " << problem.C << ",\n"
+           << "    \"windowZ\": " << problem.windowZ << ",\n"
+           << "    \"windowY\": " << problem.windowY << ",\n"
+           << "    \"windowX\": " << problem.windowX << ",\n"
+           << "    \"strideZ\": " << problem.strideZ << ",\n"
+           << "    \"strideY\": " << problem.strideY << ",\n"
+           << "    \"strideX\": " << problem.strideX << ",\n"
+           << "    \"dilationZ\": " << problem.dilationZ << ",\n"
+           << "    \"dilationY\": " << problem.dilationY << ",\n"
+           << "    \"dilationX\": " << problem.dilationX << ",\n"
+           << "    \"leftPadZ\": " << problem.leftPadZ << ",\n"
+           << "    \"leftPadY\": " << problem.leftPadY << ",\n"
+           << "    \"leftPadX\": " << problem.leftPadX << ",\n"
+           << "    \"rightPadZ\": " << problem.rightPadZ << ",\n"
+           << "    \"rightPadY\": " << problem.rightPadY << ",\n"
+           << "    \"rightPadX\": " << problem.rightPadX << ",\n"
+           << "    \"outputIndex\": " << (problem.outputIndex ? "true" : "false") << ",\n"
+           << "    \"propagateNan\": " << (problem.propagateNan ? "true" : "false") << "\n"
+           << "}";
+        return os;
+    }
+};
+
+struct PerformanceResult
+{
+    double latency_;
+    double tflops_;
+    double bandwidth_;
+
+    static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m)
+    {
+        switch(m)
+        {
+        case Metric::LATENCY: return a.latency_ < b.latency_;
+        case Metric::TFLOPS: return a.tflops_ > b.tflops_;
+        case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_;
+        default: throw std::invalid_argument("Unsupported metric type");
+        }
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result)
+    {
+        os << "{\n"
+           << "   \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_
+           << ",\n"
+           << "   \"tflops(TFlops)\": " << result.tflops_ << ",\n"
+           << "   \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n"
+           << "}";
+        return os;
+    }
+};
+
+struct KernelInstance
+{
+    std::string name_;
+    PoolProblem problem_;
+    PerformanceResult perf_result_;
+
+    static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m)
+    {
+        return PerformanceResult::compare(a.perf_result_, b.perf_result_, m);
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj)
+    {
+        os << "{\n"
+           << " \"name\": \"" << obj.name_ << "\",\n"
+           << " \"problem\": " << obj.problem_ << ",\n"
+           << " \"perf_result\": " << obj.perf_result_ << "\n"
+           << "}";
+        return os;
+    }
+};
+
+struct Setting
+{
+    int n_warmup_;
+    int n_repeat_;
+    bool is_gpu_timer_;
+    int verify_;
+    int init_method_;
+    bool log_;
+    std::string csv_filename_;
+    bool flush_cache_;
+    int rotating_count_;
+    bool json_output_;
+};
+
+inline std::string get_rocm_version()
+{
+    std::ifstream version_file("/opt/rocm/.info/version");
+    if(version_file.is_open())
+    {
+        std::string version;
+        std::getline(version_file, version);
+        return version;
+    }
+    return "Unknown";
+}
+
+/// @brief Function to compare the results of the device and host computations
+template <typename OutDataType>
+bool compare_pool_results(std::string instanceName,
+                          ck_tile::HostTensor<OutDataType>& out_dev_result,
+                          ck_tile::HostTensor<OutDataType>& out_host_result)
+{
+    bool pass = ck_tile::check_err(out_dev_result, out_host_result, "Error: Incorrect results!");
+
+    std::cout << "For " << instanceName
+              << " verification result is: " << (pass ? "correct" : "fail") << std::endl;
+
+    return pass;
+}
+
+template <typename IndexDataType>
+bool compare_pool_index_results(std::string instanceName,
+                                ck_tile::HostTensor<IndexDataType>& out_index_dev_result,
+                                ck_tile::HostTensor<IndexDataType>& out_index_host_result)
+{
+    bool pass = ck_tile::check_err(
+        out_index_dev_result, out_index_host_result, "Error: Incorrect index results!");
+
+    std::cout << "For " << instanceName
+              << " index verification result is: " << (pass ? "correct" : "fail") << std::endl;
+
+    return pass;
+}
diff --git a/tile_engine/ops/pooling/pool_benchmark.py b/tile_engine/ops/pooling/pool_benchmark.py
new file mode 100644
index 00000000000..0141e49f0f9
--- /dev/null
+++ b/tile_engine/ops/pooling/pool_benchmark.py
@@ -0,0 +1,624 @@
+#!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+import sys
+import json
+import subprocess
+import argparse
+import csv
+import time
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+
+
+class PoolBenchmark:
+    def __init__(self, build_dir: str, verbose: bool = False):
+        self.build_dir = Path(build_dir)
+        self.verbose = verbose
+        self.results = []
+
+    def discover_kernels(self) -> List[Path]:
+        """Find all benchmark_pool_* executables in the build directory"""
+        bin_dir = self.build_dir / "bin"
+        if not bin_dir.exists():
+            print(f"Error: Binary directory {bin_dir} does not exist")
+            return []
+
+        kernels = list(bin_dir.glob("benchmark_pool*"))
+        if self.verbose:
+            print(f"Found {len(kernels)} kernel executables")
+            for k in kernels:
+                print(f"  - {k.name}")
+        return kernels
+
+    def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]:
+        """Extract comprehensive kernel information from filename"""
+        name = kernel_path.stem
+
+        # Initialize with basic info
+        info = {
+            "executable": str(kernel_path),
+            "name": name,
+            "data_type": "unknown",
+            "reduce_op": "unknown",
+            "pool_dim": 0,
+            "output_index": False,
+            "propagate_nan": False,
+        }
+
+        # Parse the kernel name pattern:
+        # benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
+        parts = name.split("_")
+
+        if len(parts) >= 3:
+            # Extract pool dimension (e.g., pool3d -> 3)
+            if "pool2d" in parts[1]:
+                info["pool_dim"] = 2
+            elif "pool3d" in parts[1]:
+                info["pool_dim"] = 3
+
+            # Extract data type
+            info["data_type"] = parts[2] if len(parts) > 2 else "unknown"
+
+            # Extract reduce op
+            info["reduce_op"] = parts[3] if len(parts) > 3 else "unknown"
+
+            # Extract flags
+            if len(parts) > 4:
+                info["output_index"] = parts[4] == "True"
+            if len(parts) > 5:
+                info["propagate_nan"] = parts[5] == "True"
+
+        # Extract block configuration
+        config_info = self.parse_block_config(name)
+        info.update(config_info)
+
+        # Generate config ID
+        info["config_id"] = self.generate_config_id(info)
+
+        return info
+
+    def parse_block_config(self, kernel_name: str) -> Dict:
+        """Parse block configuration from kernel name"""
+        config = {
+            "block_sizes": {"block_m": 0, "block_n": 0},
+            "warp_config": {"warp_m": 0, "warp_n": 0},
+            "thread_tile": {"thread_tile_m": 0, "thread_tile_n": 0},
+        }
+
+        parts = kernel_name.split("_")
+
+        # Look for dimension patterns (e.g., 128x1)
+        dimension_groups = []
+        for part in parts:
+            if "x" in part and len(part.split("x")) == 2:
+                try:
+                    dims = [int(x) for x in part.split("x")]
+                    if all(d >= 0 for d in dims):
+                        dimension_groups.append(dims)
+                except ValueError:
+                    continue
+
+        # Assign dimensions based on order
+        if len(dimension_groups) >= 3:
+            config["block_sizes"]["block_m"] = dimension_groups[0][0]
+            config["block_sizes"]["block_n"] = dimension_groups[0][1]
+            config["warp_config"]["warp_m"] = dimension_groups[1][0]
+            config["warp_config"]["warp_n"] = dimension_groups[1][1]
+            config["thread_tile"]["thread_tile_m"] = dimension_groups[2][0]
+            config["thread_tile"]["thread_tile_n"] = dimension_groups[2][1]
+        elif len(dimension_groups) == 2:
+            config["block_sizes"]["block_m"] = dimension_groups[0][0]
+            config["block_sizes"]["block_n"] = dimension_groups[0][1]
+            config["warp_config"]["warp_m"] = dimension_groups[1][0]
+            config["warp_config"]["warp_n"] = dimension_groups[1][1]
+        elif len(dimension_groups) == 1:
+            config["block_sizes"]["block_m"] = dimension_groups[0][0]
+            config["block_sizes"]["block_n"] = dimension_groups[0][1]
+
+        return config
+
+    def generate_config_id(self, info: Dict) -> str:
+        """Generate a compact config ID from kernel info"""
+        parts = [
+            f"pool{info.get('pool_dim', 0)}d",
+            info.get("data_type", "unk"),
+            info.get("reduce_op", "unk"),
+        ]
+
+        block_sizes = info.get("block_sizes", {})
+        if block_sizes.get("block_m", 0) > 0:
+            block_str = f"{block_sizes['block_m']}x{block_sizes['block_n']}"
+            parts.append(block_str)
+
+        return "_".join(parts)
+
+    def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]:
+        """Run a single kernel with given parameters"""
+        results_dir = self.build_dir / "results"
+        results_dir.mkdir(exist_ok=True)
+
+        json_file = results_dir / f"{kernel_path.stem}.json"
+
+        cmd = [str(kernel_path)]
+
+        for key, value in params.items():
+            cmd.append(f"-{key}={value}")
+
+        cmd.append("-json_output=true")
+
+        if self.verbose:
+            print(f"Running: {' '.join(cmd)}")
+
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+
+            if result.returncode != 0:
+                print(f"Error running {kernel_path.name}: {result.stderr}")
+                return None
+
+            output = result.stdout.strip()
+            if output:
+                with open(json_file, "w") as f:
+                    f.write(output)
+
+                return self.parse_json_file(json_file)
+            else:
+                print(f"No output from {kernel_path.name}")
+                return None
+
+        except subprocess.TimeoutExpired:
+            print(f"Timeout running {kernel_path.name}")
+            return None
+        except Exception as e:
+            print(f"Error running {kernel_path.name}: {e}")
+            return None
+
+    def parse_json_file(self, json_file: Path) -> Optional[Dict]:
+        """Parse JSON data from individual kernel output file"""
+        try:
+            with open(json_file, "r") as f:
+                content = f.read().strip()
+
+            data = json.loads(content)
+
+            result = data.copy()
+            if "perf_result" in data:
+                perf = data["perf_result"]
+                result["time_ms"] = perf.get("latency(ms)", 0)
+                result["tflops"] = perf.get("tflops(TFlops)", 0)
+                result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0)
+
+            return result
+
+        except json.JSONDecodeError as e:
+            if self.verbose:
+                print(f"Failed to parse JSON from {json_file}: {e}")
+            return None
+        except Exception as e:
+            if self.verbose:
+                print(f"Error reading JSON file {json_file}: {e}")
+            return None
+
+    def benchmark_problem_size(
+        self,
+        kernels: List[Path],
+        N: int,
+        D: int,
+        H: int,
+        W: int,
+        C: int,
+        window_z: int = 2,
+        window_y: int = 2,
+        window_x: int = 2,
+        stride_z: int = 2,
+        stride_y: int = 2,
+        stride_x: int = 2,
+        pool_dim: int = 3,
+        verify: int = 0,
+        warmup: int = 20,
+        repeat: int = 100,
+        flush_cache: bool = True,
+        rotating_count: int = 1000,
+    ) -> List[Dict]:
+        """Benchmark all kernels for a specific problem size"""
+        results = []
+
+        params = {
+            "N": N,
+            "D": D,
+            "H": H,
+            "W": W,
+            "C": C,
+            "Z": window_z,
+            "Y": window_y,
+            "X": window_x,
+            "Sz": stride_z,
+            "Sy": stride_y,
+            "Sx": stride_x,
+            "pool_dim": pool_dim,
+            "verify": verify,
+            "warmup": warmup,
+            "repeat": repeat,
+            "flush_cache": str(flush_cache).lower(),
+            "rotating_count": rotating_count,
+        }
+
+        print(f"\nBenchmarking N={N}, D={D}, H={H}, W={W}, C={C}")
+        print(
+            f"  Window: {window_z}x{window_y}x{window_x}, Stride: {stride_z}x{stride_y}x{stride_x}"
+        )
+
+        for kernel_path in kernels:
+            kernel_info = self.extract_kernel_info(kernel_path)
+            result = self.run_kernel(kernel_path, params)
+
+            if result:
+                structured_result = {
+                    "name": kernel_info["name"],
+                    "config_id": kernel_info["config_id"],
+                    "problem": result.get("problem", {}),
+                    "perf_result": result.get("perf_result", {}),
+                    "config": {
+                        "data_type": kernel_info["data_type"],
+                        "reduce_op": kernel_info["reduce_op"],
+                        "pool_dim": kernel_info["pool_dim"],
+                        "output_index": kernel_info["output_index"],
+                        "propagate_nan": kernel_info["propagate_nan"],
+                        "block_sizes": kernel_info.get("block_sizes", {}),
+                        "warp_config": kernel_info.get("warp_config", {}),
+                        "thread_tile": kernel_info.get("thread_tile", {}),
+                    },
+                    "executable": kernel_info["executable"],
+                    "time_ms": result.get("time_ms", 0),
+                    "tflops": result.get("tflops", 0),
+                    "bandwidth_gb_s": result.get("bandwidth_gb_s", 0),
+                }
+
+                results.append(structured_result)
+
+                if self.verbose:
+                    print(
+                        f"  {kernel_info['config_id']}: {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms"
+                    )
+
+        return results
+
+    def find_best_kernel(
+        self, results: List[Dict], metric: str = "bandwidth_gb_s"
+    ) -> Optional[Dict]:
+        """Find the best performing kernel based on metric"""
+        if not results:
+            return None
+
+        if metric == "bandwidth_gb_s":
+            return max(results, key=lambda x: x.get("bandwidth_gb_s", 0))
+        elif metric == "time_ms":
+            return min(results, key=lambda x: x.get("time_ms", float("inf")))
+        elif metric == "tflops":
+            return max(results, key=lambda x: x.get("tflops", 0))
+        else:
+            raise ValueError(f"Unknown metric: {metric}")
+
+    def benchmark_sweep(
+        self,
+        problem_sizes: List[Tuple[int, int, int, int, int]],  # N, D, H, W, C
+        window_sizes: List[Tuple[int, int, int]] = [(2, 2, 2)],
+        stride_sizes: List[Tuple[int, int, int]] = [(2, 2, 2)],
+        pool_dim: int = 3,
+        verify: bool = False,
+        warmup: int = 20,
+        repeat: int = 100,
+        flush_cache: bool = True,
+        rotating_count: int = 1000,
+    ) -> Dict:
+        """Run comprehensive benchmark sweep"""
+        kernels = self.discover_kernels()
+        if not kernels:
+            print("No kernels found!")
+            return {}
+
+        all_results = []
+        best_kernels = {}
+
+        for N, D, H, W, C in problem_sizes:
+            for wz, wy, wx in window_sizes:
+                for sz, sy, sx in stride_sizes:
+                    results = self.benchmark_problem_size(
+                        kernels,
+                        N,
+                        D,
+                        H,
+                        W,
+                        C,
+                        window_z=wz,
+                        window_y=wy,
+                        window_x=wx,
+                        stride_z=sz,
+                        stride_y=sy,
+                        stride_x=sx,
+                        pool_dim=pool_dim,
+                        verify=1 if verify else 0,
+                        warmup=warmup,
+                        repeat=repeat,
+                        flush_cache=flush_cache,
+                        rotating_count=rotating_count,
+                    )
+
+                    all_results.extend(results)
+
+                    best = self.find_best_kernel(results)
+                    if best:
+                        key = (
+                            f"N{N}_D{D}_H{H}_W{W}_C{C}_w{wz}x{wy}x{wx}_s{sz}x{sy}x{sx}"
+                        )
+                        best_kernels[key] = best
+                        print(
+                            f"Best for {key}: {best['name']} ({best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)"
+                        )
+
+        self.results = all_results
+        return best_kernels
+
+    def export_csv(self, filename: str):
+        """Export all results to CSV"""
+        if not self.results:
+            print("No results to export")
+            return
+
+        all_keys = set()
+        for result in self.results:
+            all_keys.update(result.keys())
+
+        fieldnames = sorted(all_keys)
+
+        with open(filename, "w", newline="") as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(self.results)
+
+        print(f"Results exported to {filename}")
+
+    def export_best_kernels(self, best_kernels: Dict, filename: str):
+        """Export best kernel selections to file"""
+        with open(filename, "w") as f:
+            f.write("# Best kernel selections for pooling\n")
+            f.write("# Format: problem_size -> kernel_name (bandwidth, latency)\n\n")
+
+            for key, kernel in sorted(best_kernels.items()):
+                f.write(
+                    f"{key}: {kernel['name']} ({kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n"
+                )
+
+        print(f"Best kernels exported to {filename}")
+
+    def export_json(self, filename: str, best_kernels: Dict = None):
+        """Export all results and best kernels to JSON"""
+        from datetime import datetime
+
+        successful_results = [r for r in self.results if r.get("bandwidth_gb_s", 0) > 0]
+
+        bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results]
+        latency_values = [
+            r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0
+        ]
+
+        # Performance breakdown by kernel type
+        reduce_op_stats = {}
+        data_type_stats = {}
+
+        for result in successful_results:
+            config = result.get("config", {})
+
+            reduce_op = config.get("reduce_op", "unknown")
+            if reduce_op not in reduce_op_stats:
+                reduce_op_stats[reduce_op] = {
+                    "count": 0,
+                    "avg_bandwidth": 0,
+                    "best_bandwidth": 0,
+                }
+            reduce_op_stats[reduce_op]["count"] += 1
+            reduce_op_stats[reduce_op]["best_bandwidth"] = max(
+                reduce_op_stats[reduce_op]["best_bandwidth"],
+                result.get("bandwidth_gb_s", 0),
+            )
+
+            data_type = config.get("data_type", "unknown")
+            if data_type not in data_type_stats:
+                data_type_stats[data_type] = {
+                    "count": 0,
+                    "avg_bandwidth": 0,
+                    "best_bandwidth": 0,
+                }
+            data_type_stats[data_type]["count"] += 1
+            data_type_stats[data_type]["best_bandwidth"] = max(
+                data_type_stats[data_type]["best_bandwidth"],
+                result.get("bandwidth_gb_s", 0),
+            )
+
+        output_data = {
+            "benchmark_metadata": {
+                "timestamp": datetime.now().isoformat(),
+                "total_kernels_tested": len(self.results),
+                "unique_kernels": len(
+                    set(r.get("name", "unknown") for r in self.results)
+                ),
+                "successful_runs": len(successful_results),
+                "failed_runs": len(self.results) - len(successful_results),
+            },
+            "performance_summary": {
+                "bandwidth_stats": {
+                    "best_gb_s": max(bandwidth_values, default=0),
+                    "average_gb_s": sum(bandwidth_values) / len(bandwidth_values)
+                    if bandwidth_values
+                    else 0,
+                    "min_gb_s": min(bandwidth_values, default=0),
+                },
+                "latency_stats": {
+                    "best_ms": min(latency_values, default=0),
+                    "average_ms": sum(latency_values) / len(latency_values)
+                    if latency_values
+                    else 0,
+                    "max_ms": max(latency_values, default=0),
+                },
+                "kernel_type_breakdown": {
+                    "by_reduce_op": reduce_op_stats,
+                    "by_data_type": data_type_stats,
+                },
+                "total_problem_configurations": len(best_kernels)
+                if best_kernels
+                else 0,
+            },
+            "kernel_results": self.results,
+            "best_kernels_by_problem": best_kernels or {},
+        }
+
+        with open(filename, "w") as f:
+            json.dump(output_data, f, indent=2)
+
+        print(f"JSON results exported to {filename}")
+        print(f"  - Total kernels: {len(self.results)}")
+        print(f"  - Successful runs: {len(successful_results)}")
+        print(f"  - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
+        print(f"  - Best latency: {min(latency_values, default=0):.2f}ms")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Pool Kernel Benchmarking Tool")
+    parser.add_argument(
+        "build_dir", help="Build directory containing kernel executables"
+    )
+    parser.add_argument(
+        "--problem-sizes",
+        nargs="+",
+        default=["2,30,30,30,32", "4,64,64,64,64", "8,128,128,128,128"],
+        help="Problem sizes as N,D,H,W,C tuples",
+    )
+    parser.add_argument(
+        "--window-sizes",
+        nargs="+",
+        default=["2,2,2", "3,3,3"],
+        help="Window sizes as Z,Y,X tuples",
+    )
+    parser.add_argument(
+        "--stride-sizes",
+        nargs="+",
+        default=["2,2,2"],
+        help="Stride sizes as Z,Y,X tuples",
+    )
+    parser.add_argument(
+        "--pool-dim", type=int, default=3, help="Pooling dimension (2 or 3)"
+    )
+    parser.add_argument("--verify", action="store_true", help="Enable verification")
+    parser.add_argument(
+        "--csv", default="pool_benchmark_results.csv", help="CSV output filename"
+    )
+    parser.add_argument(
+        "--best", default="best_pool_kernels.txt", help="Best kernels output filename"
+    )
+    parser.add_argument("--verbose", action="store_true", help="Verbose output")
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        default=20,
+        help="Number of warmup iterations (default: 20)",
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=100,
+        help="Number of benchmark iterations (default: 100)",
+    )
+    parser.add_argument(
+        "--flush-cache",
+        action="store_true",
+        default=True,
+        help="Enable cache flushing (default: True)",
+    )
+    parser.add_argument(
+        "--rotating-count",
+        type=int,
+        default=1000,
+        help="Number of iterations to rotate cache (default: 1000)",
+    )
+    parser.add_argument("--json", help="JSON output filename (optional)")
+
+    args = parser.parse_args()
+
+    # Parse problem sizes
+    problem_sizes = []
+    for size_str in args.problem_sizes:
+        try:
+            parts = list(map(int, size_str.split(",")))
+            if len(parts) == 5:
+                problem_sizes.append(tuple(parts))
+            else:
+                print(f"Invalid problem size: {size_str} (expected N,D,H,W,C)")
+                return 1
+        except ValueError:
+            print(f"Invalid problem size: {size_str}")
+            return 1
+
+    # Parse window sizes
+    window_sizes = []
+    for size_str in args.window_sizes:
+        try:
+            parts = list(map(int, size_str.split(",")))
+            if len(parts) == 3:
+                window_sizes.append(tuple(parts))
+            else:
+                print(f"Invalid window size: {size_str} (expected Z,Y,X)")
+                return 1
+        except ValueError:
+            print(f"Invalid window size: {size_str}")
+            return 1
+
+    # Parse stride sizes
+    stride_sizes = []
+    for size_str in args.stride_sizes:
+        try:
+            parts = list(map(int, size_str.split(",")))
+            if len(parts) == 3:
+                stride_sizes.append(tuple(parts))
+            else:
+                print(f"Invalid stride size: {size_str} (expected Z,Y,X)")
+                return 1
+        except ValueError:
+            print(f"Invalid stride size: {size_str}")
+            return 1
+
+    # Create benchmark instance
+    benchmark = PoolBenchmark(args.build_dir, verbose=args.verbose)
+
+    # Run benchmark sweep
+    print("Starting Pool kernel benchmark sweep...")
+    start_time = time.time()
+
+    best_kernels = benchmark.benchmark_sweep(
+        problem_sizes=problem_sizes,
+        window_sizes=window_sizes,
+        stride_sizes=stride_sizes,
+        pool_dim=args.pool_dim,
+        verify=args.verify,
+        warmup=args.warmup,
+        repeat=args.repeat,
+        flush_cache=args.flush_cache,
+        rotating_count=args.rotating_count,
+    )
+
+    elapsed_time = time.time() - start_time
+    print(f"\nBenchmark completed in {elapsed_time:.2f} seconds")
+
+    # Export results
+    benchmark.export_csv(args.csv)
+    benchmark.export_best_kernels(best_kernels, args.best)
+
+    if args.json:
+        benchmark.export_json(args.json, best_kernels)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tile_engine/ops/pooling/pool_benchmark_single.cpp b/tile_engine/ops/pooling/pool_benchmark_single.cpp
new file mode 100644
index 00000000000..d4adb1cbb84
--- /dev/null
+++ b/tile_engine/ops/pooling/pool_benchmark_single.cpp
@@ -0,0 +1,459 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <iostream>
+#include <functional>
+#include <tuple>
+#include <exception>
+#include <sstream>
+#include <vector>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/host/reference/reference_pool.hpp"
+#include "pool_benchmark.hpp"
+#include "pool_common.hpp"
+
+// The kernel header is included via the compile command line with -include flag
+// It defines: InDataType, OutDataType, ComputeDataType, IndexDataType,
+//             ReduceOpType, Kernel, Problem, OUTPUT_INDEX, PROPAGATE_NAN,
+//             KERNEL_NAME, BLOCK_SHAPE_NAME, REDUCE_OP_NAME
+
+// Create argument parser
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("N", "2", "Batch size N dimension. Default is 2.")
+        .insert("D", "30", "Depth D dimension (for 3D pooling). Default is 30.")
+        .insert("H", "30", "Height H dimension. Default is 30.")
+        .insert("W", "30", "Width W dimension. Default is 30.")
+        .insert("C", "32", "Channel C dimension. Default is 32.")
+        .insert("Z", "2", "Window depth Z dimension. Default is 2.")
+        .insert("Y", "2", "Window height Y dimension. Default is 2.")
+        .insert("X", "2", "Window width X dimension. Default is 2.")
+        .insert("Sz", "2", "Window stride depth. Default is 2.")
+        .insert("Sy", "2", "Window stride height. Default is 2.")
+        .insert("Sx", "2", "Window stride width. Default is 2.")
+        .insert("Dz", "1", "Window dilation depth. Default is 1.")
+        .insert("Dy", "1", "Window dilation height. Default is 1.")
+        .insert("Dx", "1", "Window dilation width. Default is 1.")
+        .insert("LeftPz", "0", "Left padding depth. Default is 0.")
+        .insert("LeftPy", "0", "Left padding height. Default is 0.")
+        .insert("LeftPx", "0", "Left padding width. Default is 0.")
+        .insert("RightPz", "0", "Right padding depth. Default is 0.")
+        .insert("RightPy", "0", "Right padding height. Default is 0.")
+        .insert("RightPx", "0", "Right padding width. Default is 0.")
+        .insert("verify",
+                "0",
+                "The type of validation. Set to 0 for no validation, 1 for validation on CPU. "
+                "Default is 0.")
+        .insert(
+            "log", "false", "Whether output kernel instance information or not. Default is false")
+        .insert("warmup", "20", "The number of warmup iterations. Default is 20.")
+        .insert("repeat", "100", "The number of benchmark iterations. Default is 100.")
+        .insert("timer", "true", "Whether to use GPU timer. Default is true.")
+        .insert(
+            "init",
+            "0",
+            "The method of tensor initialization. 0=random, 1=linear, 2=constant(1). Default is 0.")
+        .insert("json_output",
+                "false",
+                "Whether to output results in JSON format only. Default is false");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <bool IsPool3D>
+void run_benchmark(const ck_tile::ArgParser& arg_parser)
+{
+    const ck_tile::index_t N = arg_parser.get_int("N");
+    const ck_tile::index_t H = arg_parser.get_int("H");
+    const ck_tile::index_t W = arg_parser.get_int("W");
+    const ck_tile::index_t C = arg_parser.get_int("C");
+
+    const ck_tile::index_t Y = arg_parser.get_int("Y");
+    const ck_tile::index_t X = arg_parser.get_int("X");
+
+    const ck_tile::index_t Sy = arg_parser.get_int("Sy");
+    const ck_tile::index_t Sx = arg_parser.get_int("Sx");
+
+    const ck_tile::index_t Dy = arg_parser.get_int("Dy");
+    const ck_tile::index_t Dx = arg_parser.get_int("Dx");
+
+    const ck_tile::index_t LeftPy  = arg_parser.get_int("LeftPy");
+    const ck_tile::index_t LeftPx  = arg_parser.get_int("LeftPx");
+    const ck_tile::index_t RightPy = arg_parser.get_int("RightPy");
+    const ck_tile::index_t RightPx = arg_parser.get_int("RightPx");
+
+    const int warmup        = arg_parser.get_int("warmup");
+    const int repeat        = arg_parser.get_int("repeat");
+    const int do_validation = arg_parser.get_int("verify");
+    const int init_method   = arg_parser.get_int("init");
+    const bool log          = arg_parser.get_bool("log");
+    const bool json_output  = arg_parser.get_bool("json_output");
+
+    if constexpr(IsPool3D)
+    {
+        // 3D Pooling (NDHWC layout)
+        const ck_tile::index_t D = arg_parser.get_int("D");
+        const ck_tile::index_t Z = arg_parser.get_int("Z");
+
+        const ck_tile::index_t Sz = arg_parser.get_int("Sz");
+        const ck_tile::index_t Dz = arg_parser.get_int("Dz");
+
+        const ck_tile::index_t LeftPz  = arg_parser.get_int("LeftPz");
+        const ck_tile::index_t RightPz = arg_parser.get_int("RightPz");
+
+        // Calculate effective window sizes
+        const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
+        const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
+        const ck_tile::index_t Xs = (X - 1) * Dx + 1;
+
+        // Calculate output dimensions
+        const ck_tile::index_t Do = (D + LeftPz + RightPz - Zs) / Sz + 1;
+        const ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1;
+        const ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1;
+
+        if(log)
+        {
+            std::cout << "3D Pooling: N=" << N << ", D=" << D << ", H=" << H << ", W=" << W
+                      << ", C=" << C << std::endl;
+            std::cout << "Window: Z=" << Z << ", Y=" << Y << ", X=" << X << std::endl;
+            std::cout << "Stride: Sz=" << Sz << ", Sy=" << Sy << ", Sx=" << Sx << std::endl;
+            std::cout << "Output: Do=" << Do << ", Ho=" << Ho << ", Wo=" << Wo << std::endl;
+        }
+
+        // Create shapes using ck_tile::make_tuple
+        const auto input_shape   = ck_tile::make_tuple(N, D, H, W, C);
+        const auto output_shape  = ck_tile::make_tuple(N, Do, Ho, Wo, C);
+        const auto input_strides = ck_tile::make_tuple(D * H * W * C, H * W * C, W * C, C, 1);
+        const auto output_strides =
+            ck_tile::make_tuple(Do * Ho * Wo * C, Ho * Wo * C, Wo * C, C, 1);
+        const auto window_lengths   = ck_tile::make_tuple(Z, Y, X);
+        const auto window_strides   = ck_tile::make_tuple(Sz, Sy, Sx);
+        const auto window_dilations = ck_tile::make_tuple(Dz, Dy, Dx);
+        const auto input_left_pads  = ck_tile::make_tuple(LeftPz, LeftPy, LeftPx);
+        const auto input_right_pads = ck_tile::make_tuple(RightPz, RightPy, RightPx);
+
+        // Allocate host tensors
+        ck_tile::HostTensor<InDataType> in({N, D, H, W, C},
+                                           {D * H * W * C, H * W * C, W * C, C, 1});
+        ck_tile::HostTensor<OutDataType> out({N, Do, Ho, Wo, C},
+                                             {Do * Ho * Wo * C, Ho * Wo * C, Wo * C, C, 1});
+        ck_tile::HostTensor<IndexDataType> out_index(
+            OUTPUT_INDEX ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                    static_cast<std::size_t>(Do),
+                                                    static_cast<std::size_t>(Ho),
+                                                    static_cast<std::size_t>(Wo),
+                                                    static_cast<std::size_t>(C)}
+                         : std::vector<std::size_t>{1});
+
+        // Initialize input
+        if(init_method == 0)
+        {
+            ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(in);
+        }
+        else if(init_method == 1)
+        {
+            ck_tile::FillMonotonicSeq<InDataType>{}(in);
+        }
+        else
+        {
+            ck_tile::FillConstant<InDataType>{static_cast<InDataType>(1)}(in);
+        }
+
+        // Allocate device memory
+        ck_tile::DeviceMem in_buf(in.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem out_buf(out.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem out_index_buf(OUTPUT_INDEX ? out_index.get_element_space_size_in_bytes()
+                                                      : 0);
+
+        in_buf.ToDevice(in.data());
+
+        // Create host arguments
+        auto host_args = ck_tile::PoolHostArgs<decltype(input_shape), decltype(window_lengths)>{
+            static_cast<InDataType*>(in_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_buf.GetDeviceBuffer()),
+            OUTPUT_INDEX ? static_cast<IndexDataType*>(out_index_buf.GetDeviceBuffer()) : nullptr,
+            input_shape,
+            output_shape,
+            input_strides,
+            output_strides,
+            window_lengths,
+            window_strides,
+            window_dilations,
+            input_left_pads,
+            input_right_pads};
+
+        auto kernel_args = Kernel::MakeKernelArgs(host_args);
+
+        // Validate arguments
+        if(!Kernel::IsSupportedArgument(kernel_args))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping pooling kernel!");
+        }
+
+        constexpr ck_tile::index_t kBlockPerCu = 1;
+        const ck_tile::index_t kBlockSize      = Kernel::BlockSize();
+        const ck_tile::index_t kGridSize       = Kernel::CalculateGridSize(kernel_args);
+
+        if(log)
+        {
+            std::cout << "Launching kernel: " << KERNEL_NAME << std::endl;
+            std::cout << "Grid size: " << kGridSize << ", Block size: " << kBlockSize << std::endl;
+        }
+
+        // Launch kernel
+        float ave_time = ck_tile::launch_kernel(
+            ck_tile::stream_config{nullptr, true, log ? 1 : 0, warmup, repeat},
+            ck_tile::make_kernel<kBlockPerCu>(Kernel{}, kGridSize, kBlockSize, 0, kernel_args));
+
+        // Calculate performance metrics
+        std::size_t num_bytes =
+            sizeof(InDataType) * N * D * H * W * C + sizeof(OutDataType) * N * Do * Ho * Wo * C;
+        float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+        // Output results
+        if(json_output)
+        {
+            std::cout << "{\n"
+                      << " \"name\": \"" << KERNEL_NAME << "\",\n"
+                      << " \"problem\": {\n"
+                      << "    \"N\": " << N << ",\n"
+                      << "    \"D\": " << D << ",\n"
+                      << "    \"H\": " << H << ",\n"
+                      << "    \"W\": " << W << ",\n"
+                      << "    \"C\": " << C << ",\n"
+                      << "    \"windowZ\": " << Z << ",\n"
+                      << "    \"windowY\": " << Y << ",\n"
+                      << "    \"windowX\": " << X << "\n"
+                      << " },\n"
+                      << " \"perf_result\": {\n"
+                      << "   \"latency(ms)\": " << ave_time << ",\n"
+                      << "   \"bandwidth(GB/s)\": " << gb_per_sec << "\n"
+                      << " }\n"
+                      << "}" << std::endl;
+        }
+        else
+        {
+            std::cout << "Kernel: " << KERNEL_NAME << std::endl;
+            std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+        }
+
+        // Verification (if requested)
+        if(do_validation)
+        {
+            out_buf.FromDevice(out.data());
+
+            ck_tile::HostTensor<OutDataType> out_ref({N, Do, Ho, Wo, C},
+                                                     {Do * Ho * Wo * C, Ho * Wo * C, Wo * C, C, 1});
+            ck_tile::HostTensor<IndexDataType> out_ref_index(
+                OUTPUT_INDEX ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                        static_cast<std::size_t>(Do),
+                                                        static_cast<std::size_t>(Ho),
+                                                        static_cast<std::size_t>(Wo),
+                                                        static_cast<std::size_t>(C)}
+                             : std::vector<std::size_t>{1});
+
+            ck_tile::reference_pool3d<InDataType,
+                                      ComputeDataType,
+                                      OutDataType,
+                                      IndexDataType,
+                                      ReduceOpType,
+                                      decltype(input_shape),
+                                      decltype(window_lengths),
+                                      OUTPUT_INDEX>(
+                in, out_ref, out_ref_index, kernel_args, ReduceOpType{});
+
+            bool pass = ck_tile::check_err(out, out_ref);
+            if(OUTPUT_INDEX)
+            {
+                out_index_buf.FromDevice(out_index.data());
+                pass = pass && ck_tile::check_err(out_index, out_ref_index);
+            }
+
+            std::cout << "Verification: " << (pass ? "PASSED" : "FAILED") << std::endl;
+        }
+    }
+    else
+    {
+        // 2D Pooling (NHWC layout)
+        const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
+        const ck_tile::index_t Xs = (X - 1) * Dx + 1;
+
+        const ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1;
+        const ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1;
+
+        if(log)
+        {
+            std::cout << "2D Pooling: N=" << N << ", H=" << H << ", W=" << W << ", C=" << C
+                      << std::endl;
+            std::cout << "Window: Y=" << Y << ", X=" << X << std::endl;
+            std::cout << "Stride: Sy=" << Sy << ", Sx=" << Sx << std::endl;
+            std::cout << "Output: Ho=" << Ho << ", Wo=" << Wo << std::endl;
+        }
+
+        const auto input_shape      = ck_tile::make_tuple(N, H, W, C);
+        const auto output_shape     = ck_tile::make_tuple(N, Ho, Wo, C);
+        const auto input_strides    = ck_tile::make_tuple(H * W * C, W * C, C, 1);
+        const auto output_strides   = ck_tile::make_tuple(Ho * Wo * C, Wo * C, C, 1);
+        const auto window_lengths   = ck_tile::make_tuple(Y, X);
+        const auto window_strides   = ck_tile::make_tuple(Sy, Sx);
+        const auto window_dilations = ck_tile::make_tuple(Dy, Dx);
+        const auto input_left_pads  = ck_tile::make_tuple(LeftPy, LeftPx);
+        const auto input_right_pads = ck_tile::make_tuple(RightPy, RightPx);
+
+        ck_tile::HostTensor<InDataType> in({N, H, W, C}, {H * W * C, W * C, C, 1});
+        ck_tile::HostTensor<OutDataType> out({N, Ho, Wo, C}, {Ho * Wo * C, Wo * C, C, 1});
+        ck_tile::HostTensor<IndexDataType> out_index(
+            OUTPUT_INDEX ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                    static_cast<std::size_t>(Ho),
+                                                    static_cast<std::size_t>(Wo),
+                                                    static_cast<std::size_t>(C)}
+                         : std::vector<std::size_t>{1});
+
+        if(init_method == 0)
+        {
+            ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(in);
+        }
+        else if(init_method == 1)
+        {
+            ck_tile::FillMonotonicSeq<InDataType>{}(in);
+        }
+        else
+        {
+            ck_tile::FillConstant<InDataType>{static_cast<InDataType>(1)}(in);
+        }
+
+        ck_tile::DeviceMem in_buf(in.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem out_buf(out.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem out_index_buf(OUTPUT_INDEX ? out_index.get_element_space_size_in_bytes()
+                                                      : 0);
+
+        in_buf.ToDevice(in.data());
+
+        auto host_args = ck_tile::PoolHostArgs<decltype(input_shape), decltype(window_lengths)>{
+            static_cast<InDataType*>(in_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_buf.GetDeviceBuffer()),
+            OUTPUT_INDEX ? static_cast<IndexDataType*>(out_index_buf.GetDeviceBuffer()) : nullptr,
+            input_shape,
+            output_shape,
+            input_strides,
+            output_strides,
+            window_lengths,
+            window_strides,
+            window_dilations,
+            input_left_pads,
+            input_right_pads};
+
+        auto kernel_args = Kernel::MakeKernelArgs(host_args);
+
+        if(!Kernel::IsSupportedArgument(kernel_args))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping pooling kernel!");
+        }
+
+        constexpr ck_tile::index_t kBlockPerCu = 1;
+        const ck_tile::index_t kBlockSize      = Kernel::BlockSize();
+        const ck_tile::index_t kGridSize       = Kernel::CalculateGridSize(kernel_args);
+
+        if(log)
+        {
+            std::cout << "Launching kernel: " << KERNEL_NAME << std::endl;
+            std::cout << "Grid size: " << kGridSize << ", Block size: " << kBlockSize << std::endl;
+        }
+
+        float ave_time = ck_tile::launch_kernel(
+            ck_tile::stream_config{nullptr, true, log ? 1 : 0, warmup, repeat},
+            ck_tile::make_kernel<kBlockPerCu>(Kernel{}, kGridSize, kBlockSize, 0, kernel_args));
+
+        std::size_t num_bytes =
+            sizeof(InDataType) * N * H * W * C + sizeof(OutDataType) * N * Ho * Wo * C;
+        float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+        if(json_output)
+        {
+            std::cout << "{\n"
+                      << " \"name\": \"" << KERNEL_NAME << "\",\n"
+                      << " \"problem\": {\n"
+                      << "    \"N\": " << N << ",\n"
+                      << "    \"H\": " << H << ",\n"
+                      << "    \"W\": " << W << ",\n"
+                      << "    \"C\": " << C << ",\n"
+                      << "    \"windowY\": " << Y << ",\n"
+                      << "    \"windowX\": " << X << "\n"
+                      << " },\n"
+                      << " \"perf_result\": {\n"
+                      << "   \"latency(ms)\": " << ave_time << ",\n"
+                      << "   \"bandwidth(GB/s)\": " << gb_per_sec << "\n"
+                      << " }\n"
+                      << "}" << std::endl;
+        }
+        else
+        {
+            std::cout << "Kernel: " << KERNEL_NAME << std::endl;
+            std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+        }
+
+        if(do_validation)
+        {
+            out_buf.FromDevice(out.data());
+
+            ck_tile::HostTensor<OutDataType> out_ref({N, Ho, Wo, C}, {Ho * Wo * C, Wo * C, C, 1});
+            ck_tile::HostTensor<IndexDataType> out_ref_index(
+                OUTPUT_INDEX ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                        static_cast<std::size_t>(Ho),
+                                                        static_cast<std::size_t>(Wo),
+                                                        static_cast<std::size_t>(C)}
+                             : std::vector<std::size_t>{1});
+
+            ck_tile::reference_pool2d<InDataType,
+                                      ComputeDataType,
+                                      OutDataType,
+                                      IndexDataType,
+                                      ReduceOpType,
+                                      decltype(input_shape),
+                                      decltype(window_lengths),
+                                      OUTPUT_INDEX>(
+                in, out_ref, out_ref_index, kernel_args, ReduceOpType{});
+
+            bool pass = ck_tile::check_err(out, out_ref);
+            if(OUTPUT_INDEX)
+            {
+                out_index_buf.FromDevice(out_index.data());
+                pass = pass && ck_tile::check_err(out_index, out_ref_index);
+            }
+
+            std::cout << "Verification: " << (pass ? "PASSED" : "FAILED") << std::endl;
+        }
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    try
+    {
+        auto [result, parser] = create_args(argc, argv);
+        if(!result)
+            return EXIT_FAILURE;
+
+        // POOL_DIM is defined in the generated header (2 or 3)
+        if constexpr(POOL_DIM == 3)
+        {
+            run_benchmark<true>(parser);
+        }
+        else
+        {
+            run_benchmark<false>(parser);
+        }
+
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << "\n";
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tile_engine/ops/pooling/pool_common.hpp b/tile_engine/ops/pooling/pool_common.hpp
new file mode 100644
index 00000000000..cd6db6ecf33
--- /dev/null
+++ b/tile_engine/ops/pooling/pool_common.hpp
@@ -0,0 +1,69 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/pk_int4.hpp"
+
+//[TODO] This can be moved to commons
+// DataTypeTraits for all supported types
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr const char* name = "bf16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::fp8_t>
+{
+    static constexpr const char* name = "fp8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf8_t>
+{
+    static constexpr const char* name = "bf8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::int8_t>
+{
+    static constexpr const char* name = "int8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::int32_t>
+{
+    static constexpr const char* name = "int32";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::pk_int4_t>
+{
+    static constexpr const char* name = "pk_int4_t";
+};
diff --git a/tile_engine/ops/pooling/pool_instance_builder.py b/tile_engine/ops/pooling/pool_instance_builder.py
new file mode 100644
index 00000000000..06d6f31b750
--- /dev/null
+++ b/tile_engine/ops/pooling/pool_instance_builder.py
@@ -0,0 +1,625 @@
+#!/usr/bin/env python
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+import os
+import json
+import argparse
+import itertools
+import multiprocessing
+import concurrent.futures
+from pathlib import Path
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+
+def get_dtype_string(dtype):
+    """Convert dtype name to C++ type string"""
+    dtype_map = {
+        "fp16": "ck_tile::half_t",
+        "fp32": "float",
+        "bf16": "ck_tile::bf16_t",
+        "fp8": "ck_tile::fp8_t",
+        "bf8": "ck_tile::bf8_t",
+        "int8": "ck_tile::int8_t",
+        "int32": "ck_tile::int32_t",
+        "index_t": "ck_tile::index_t",
+    }
+    return dtype_map.get(dtype, dtype)
+
+
+def get_reduce_op_string(reduce_op):
+    """Convert reduce op name to C++ type string"""
+    reduce_op_map = {
+        "max": "ck_tile::ReduceOp::Max",
+        "min": "ck_tile::ReduceOp::Min",
+        "add": "ck_tile::ReduceOp::Add",
+        "avg": "ck_tile::ReduceOp::Add",  # Average uses Add and divides later
+    }
+    return reduce_op_map.get(reduce_op.lower(), "ck_tile::ReduceOp::Max")
+
+
+class PoolKernelBuilder:
+    def __init__(self, working_path, gpu_target, datatype, reduce_op, config_json=None):
+        self.working_path = Path(working_path)
+        self.gpu_target = gpu_target
+        self.datatype = datatype
+        self.reduce_op = reduce_op
+        self.config_json = config_json
+
+        # Create working directory if it doesn't exist
+        self.working_path.mkdir(parents=True, exist_ok=True)
+
+        # Load configuration
+        if config_json and os.path.exists(config_json):
+            with open(config_json, "r") as f:
+                self.config = json.load(f)
+        else:
+            # Default configuration
+            self.config = self._get_default_config()
+
+    def _get_default_config(self):
+        """Return default configuration for pooling kernels"""
+        return {
+            "block_config": {
+                "block_m": {"values": [64, 128, 256]},
+                "block_n": {"values": [1]},
+                "warp_m": {"values": [1, 2]},
+                "warp_n": {"values": [1]},
+                "thread_tile_m": {"values": [1, 2, 4]},
+                "thread_tile_n": {"values": [1]},
+            },
+            "trait_config": {
+                "output_index": {"values": [True, False]},
+                "propagate_nan": {"values": [False]},
+                "pool_dim": {"values": [2, 3]},
+            },
+            "k_block_per_cu": 1,
+        }
+
+    def write_kernel_list(self):
+        """Write kernel list to file for CMake to read"""
+        block_configs = self._get_block_configs()
+        trait_combos = self._generate_trait_combinations()
+
+        kernel_list = []
+        for block_config in block_configs:
+            for trait_combo in trait_combos:
+                output_index, propagate_nan, pool_dim = trait_combo
+
+                # Create kernel name
+                kernel_name = f"pool{pool_dim}d_{self.datatype}_{self.reduce_op}"
+                kernel_name += f"_{str(output_index).capitalize()}_{str(propagate_nan).capitalize()}"
+
+                # Create block configuration string
+                block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
+                block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
+                block_str += (
+                    f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
+                )
+
+                kernel_name += f"_{block_str}"
+
+                kernel_list.append(
+                    {
+                        "name": kernel_name,
+                        "block_config": block_config,
+                        "trait_combo": trait_combo,
+                    }
+                )
+
+        # Write kernel count
+        with open(self.working_path / "pool_kernel_count.txt", "w") as f:
+            f.write(str(len(kernel_list)))
+
+        # Write kernel list
+        with open(self.working_path / "pool_kernel_list.txt", "w") as f:
+            for kernel in kernel_list:
+                block_config = kernel["block_config"]
+                trait_combo = kernel["trait_combo"]
+
+                block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
+                block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
+                block_str += (
+                    f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
+                )
+
+                trait_str = "_".join(str(x) for x in trait_combo)
+
+                f.write(f"{kernel['name']}|{block_str}|{trait_str}\n")
+
+        print(f"Listed {len(kernel_list)} kernel configurations")
+
+    def _get_block_configs(self):
+        """Get block configurations for the current datatype"""
+        block_config = self.config["block_config"]
+
+        block_m_values = block_config.get("block_m").get("values")
+        block_n_values = block_config.get("block_n").get("values")
+        warp_m_values = block_config.get("warp_m").get("values")
+        warp_n_values = block_config.get("warp_n").get("values")
+        thread_tile_m_values = block_config.get("thread_tile_m").get("values")
+        thread_tile_n_values = block_config.get("thread_tile_n").get("values")
+
+        configs = []
+        for block_m in block_m_values:
+            for block_n in block_n_values:
+                for warp_m in warp_m_values:
+                    for warp_n in warp_n_values:
+                        for thread_tile_m in thread_tile_m_values:
+                            for thread_tile_n in thread_tile_n_values:
+                                if self._validate_block_config(
+                                    block_m,
+                                    block_n,
+                                    warp_m,
+                                    warp_n,
+                                    thread_tile_m,
+                                    thread_tile_n,
+                                ):
+                                    configs.append(
+                                        {
+                                            "block_m": block_m,
+                                            "block_n": block_n,
+                                            "warp_m": warp_m,
+                                            "warp_n": warp_n,
+                                            "thread_tile_m": thread_tile_m,
+                                            "thread_tile_n": thread_tile_n,
+                                        }
+                                    )
+        return configs
+
+    def _validate_block_config(
+        self, block_m, block_n, warp_m, warp_n, thread_tile_m, thread_tile_n
+    ):
+        """Validate that block configuration is reasonable"""
+        if block_m <= 0 or block_n <= 0:
+            return False
+        if warp_m <= 0 or warp_n <= 0:
+            return False
+        if thread_tile_m <= 0 or thread_tile_n <= 0:
+            return False
+
+        # Warp size is 64 for AMD GPUs
+        warp_size = 64
+
+        # Calculate warp tile sizes
+        warp_tile_m = block_m // warp_m
+        warp_tile_n = block_n // warp_n
+
+        if warp_tile_m <= 0 or warp_tile_n <= 0:
+            return False
+
+        # Check block_m is divisible by warp_m
+        if block_m % warp_m != 0:
+            return False
+        if block_n % warp_n != 0:
+            return False
+
+        # Check thread tile fits in warp tile
+        if warp_tile_m % thread_tile_m != 0:
+            return False
+        if warp_tile_n % thread_tile_n != 0:
+            return False
+
+        # Critical constraint from pool_shape.hpp:
+        # (Warp_M * Warp_N / ThreadTile_M / ThreadTile_N) % warp_size == 0
+        # This means threads_per_warp must be a multiple of warp_size (typically equal to it)
+        threads_per_warp = (warp_tile_m * warp_tile_n) // (
+            thread_tile_m * thread_tile_n
+        )
+        if threads_per_warp % warp_size != 0:
+            return False
+
+        # threads_per_warp should not be too large (usually exactly warp_size)
+        if threads_per_warp > warp_size * 4:
+            return False
+
+        return True
+
+    def _generate_trait_combinations(self):
+        """Generate all combinations of traits"""
+        trait_config = self.config["trait_config"]
+
+        output_index_values = trait_config.get("output_index").get("values")
+        propagate_nan_values = trait_config.get("propagate_nan").get("values")
+        pool_dim_values = trait_config.get("pool_dim").get("values")
+
+        all_combinations = list(
+            itertools.product(
+                output_index_values,
+                propagate_nan_values,
+                pool_dim_values,
+            )
+        )
+
+        return all_combinations
+
+    def _generate_kernel_instance(
+        self, block_config, trait_combo, k_block_per_cu, is_header=True
+    ):
+        """Generate a single kernel instance"""
+        output_index, propagate_nan, pool_dim = trait_combo
+
+        # Create kernel name
+        kernel_name = f"pool{pool_dim}d_{self.datatype}_{self.reduce_op}"
+        kernel_name += (
+            f"_{str(output_index).capitalize()}_{str(propagate_nan).capitalize()}"
+        )
+
+        # Create block configuration string
+        block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
+        block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
+        block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
+
+        kernel_name += f"_{block_str}"
+
+        # Determine output type (same as input for pooling)
+        out_type = self.datatype
+        compute_type = "fp32"  # Always use fp32 for compute
+        index_type = "index_t"
+
+        # Calculate warp tile sizes
+        warp_tile_m = block_config["block_m"] // block_config["warp_m"]
+        warp_tile_n = block_config["block_n"] // block_config["warp_n"]
+
+        # Generate kernel instance code
+        pragma_line = "#pragma once\n" if is_header else ""
+        instance_code = f"""// Generated kernel instance for {kernel_name}
+{pragma_line}
+#include <cstdint>
+#include <utility>
+#include <tuple>
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/pooling.hpp"
+#include "ck_tile/ops/pooling/kernel/pool_kernel.hpp"
+#include "ck_tile/ops/pooling/pipeline/pool_problem.hpp"
+#include "ck_tile/ops/pooling/pipeline/pool_shape.hpp"
+
+using InDataType = {get_dtype_string(self.datatype)};
+using OutDataType = {get_dtype_string(out_type)};
+using ComputeDataType = {get_dtype_string(compute_type)};
+using IndexDataType = {get_dtype_string(index_type)};
+
+// Reduce operation
+using ReduceOpType = {get_reduce_op_string(self.reduce_op)};
+
+// Kernel name for display
+constexpr const char* KERNEL_NAME = "{kernel_name}";
+constexpr const char* BLOCK_SHAPE_NAME = "{block_str}";
+constexpr const char* REDUCE_OP_NAME = "{self.reduce_op}";
+
+// Flags and dimensions
+constexpr bool OUTPUT_INDEX = {"true" if output_index else "false"};
+constexpr bool PROPAGATE_NAN = {"true" if propagate_nan else "false"};
+constexpr int POOL_DIM = {pool_dim};
+
+// Block configuration
+using BlockWarps = ck_tile::sequence<{block_config["warp_m"]}, {block_config["warp_n"]}>;
+using BlockTile = ck_tile::sequence<{block_config["block_m"]}, {block_config["block_n"]}>;
+using WarpTile = ck_tile::sequence<{warp_tile_m}, {warp_tile_n}>;
+using ThreadTile = ck_tile::sequence<{block_config["thread_tile_m"]}, {block_config["thread_tile_n"]}>;
+
+using PoolBlockShape = ck_tile::PoolShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
+
+// Pool problem definition
+using Problem = ck_tile::PoolProblem<InDataType,
+                                     OutDataType,
+                                     ComputeDataType,
+                                     IndexDataType,
+                                     ReduceOpType,
+                                     OUTPUT_INDEX,
+                                     PROPAGATE_NAN,
+                                     PoolBlockShape>;
+
+// Pool kernel type
+using Kernel = ck_tile::PoolKernel<Problem>;
+
+// Shape types for {pool_dim}D pooling
+"""
+        if pool_dim == 3:
+            instance_code += """// 3D pooling shapes (N, D, H, W, C)
+using TensorShapeType = decltype(ck_tile::make_tuple(
+    ck_tile::index_t{}, ck_tile::index_t{}, ck_tile::index_t{}, 
+    ck_tile::index_t{}, ck_tile::index_t{}));
+// Window shape (Z, Y, X)
+using WindowShapeType = decltype(ck_tile::make_tuple(
+    ck_tile::index_t{}, ck_tile::index_t{}, ck_tile::index_t{}));
+"""
+        else:
+            instance_code += """// 2D pooling shapes (N, H, W, C)
+using TensorShapeType = decltype(ck_tile::make_tuple(
+    ck_tile::index_t{}, ck_tile::index_t{}, 
+    ck_tile::index_t{}, ck_tile::index_t{}));
+// Window shape (Y, X)
+using WindowShapeType = decltype(ck_tile::make_tuple(
+    ck_tile::index_t{}, ck_tile::index_t{}));
+"""
+
+        instance_code += f"""
+// Wrapper for simplified launch interface
+struct SelectedKernel {{
+    template <typename TensorShape, typename WindowShape>
+    static float launch(const ck_tile::PoolHostArgs<TensorShape, WindowShape>& args,
+                        const ck_tile::stream_config& stream) {{
+        auto kernel_args = Kernel::MakeKernelArgs(
+            const_cast<ck_tile::PoolHostArgs<TensorShape, WindowShape>&>(args));
+        
+        if (!Kernel::IsSupportedArgument(kernel_args)) {{
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping pooling kernel!");
+        }}
+        
+        constexpr ck_tile::index_t kBlockPerCu = {k_block_per_cu};
+        const ck_tile::index_t kBlockSize = Kernel::BlockSize();
+        const ck_tile::index_t kGridSize = Kernel::CalculateGridSize(kernel_args);
+        
+        if(stream.log_level_ > 0) {{
+            std::cout << "Launching kernel: " << KERNEL_NAME << '\\n'
+                      << "grid: " << kGridSize
+                      << ", blocks: " << kBlockSize
+                      << std::endl;
+        }}
+        
+        // Launch kernel
+        float ave_time = ck_tile::launch_kernel(
+            stream,
+            ck_tile::make_kernel<kBlockPerCu>(Kernel{{}}, kGridSize, kBlockSize, 0, kernel_args));
+        
+        return ave_time;
+    }}
+}};
+"""
+        return kernel_name, instance_code
+
+    def run(self, num_workers=None):
+        """Run the builder to generate individual kernel files"""
+        self.generate_individual(num_workers)
+
+    def generate_individual(self, num_workers=None):
+        """Generate individual kernel files for separate compilation"""
+        if num_workers is None:
+            num_workers = min(multiprocessing.cpu_count(), 8)
+
+        block_configs = self._get_block_configs()
+        trait_combos = self._generate_trait_combinations()
+        k_block_per_cu = self.config.get("k_block_per_cu", 1)
+
+        # Prepare work items
+        work_items = []
+        for block_config in block_configs:
+            for trait_combo in trait_combos:
+                work_items.append(
+                    (
+                        block_config,
+                        trait_combo,
+                        k_block_per_cu,
+                        self.working_path,
+                        self.gpu_target,
+                        self.datatype,
+                        self.reduce_op,
+                        self.config_json,
+                    )
+                )
+
+        print(
+            f"Generating {len(work_items)} individual kernel files using {num_workers} workers..."
+        )
+        print(f"  Block configs: {len(block_configs)}")
+        print(f"  Trait combinations: {len(trait_combos)}")
+        print(f"  Total kernels: {len(work_items)}")
+
+        # Process work items
+        kernel_list = []
+        completed = 0
+
+        with concurrent.futures.ProcessPoolExecutor(
+            max_workers=num_workers
+        ) as executor:
+            future_to_item = {
+                executor.submit(_generate_single_kernel_individual, item): item
+                for item in work_items
+            }
+
+            for future in concurrent.futures.as_completed(future_to_item):
+                completed += 1
+                if completed % 10 == 0 or completed == len(work_items):
+                    print(
+                        f"  Progress: {completed}/{len(work_items)} kernels generated"
+                    )
+                try:
+                    result = future.result()
+                    if result:
+                        kernel_list.append(result)
+                except Exception as exc:
+                    item = future_to_item[future]
+                    print(f"Kernel generation failed for {item}: {exc}")
+
+        # Sort kernel list
+        kernel_list.sort(key=lambda x: x[0])
+
+        # Generate CMake include file
+        self._generate_cmake_individual_targets(kernel_list)
+
+        print(
+            f"Generated {len(kernel_list)} individual kernel files in {self.working_path}"
+        )
+
+    def _generate_cmake_individual_targets(self, kernel_list):
+        """Generate CMake include file that creates individual targets"""
+        cmake_code = f"""# Generated CMake file for individual Pool targets
+# Datatype: {self.datatype}, ReduceOp: {self.reduce_op}
+
+"""
+        for kernel_name, trait_combo, block_config in kernel_list:
+            block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
+            block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
+            block_str += (
+                f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
+            )
+
+            trait_str = "_".join(str(x) for x in trait_combo)
+
+            cmake_code += f'create_individual_pool_target("{self.datatype}" "{self.reduce_op}" "{trait_str}" "{block_str}")\n'
+
+        with open(self.working_path / "pool_individual_targets.cmake", "w") as f:
+            f.write(cmake_code)
+
+
+def _generate_single_kernel_individual(work_item):
+    """Worker function to generate a single individual kernel file"""
+    (
+        block_config,
+        trait_combo,
+        k_block_per_cu,
+        working_path,
+        gpu_target,
+        datatype,
+        reduce_op,
+        config_json,
+    ) = work_item
+
+    # Create a temporary builder instance
+    builder = PoolKernelBuilder(
+        working_path, gpu_target, datatype, reduce_op, config_json
+    )
+
+    try:
+        kernel_name, instance_code = builder._generate_kernel_instance(
+            block_config, trait_combo, k_block_per_cu
+        )
+
+        # Create simplified filename
+        simplified_name = kernel_name
+        if simplified_name.startswith("pool"):
+            simplified_name = simplified_name[4:]  # Remove "pool" prefix
+
+        # Write individual header file
+        header_file = working_path / f"pool_single_{simplified_name}.hpp"
+        with open(header_file, "w") as f:
+            f.write(instance_code)
+
+        return (kernel_name, trait_combo, block_config)
+    except Exception as e:
+        print(f"Error generating individual kernel: {e}")
+        return None
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Pool kernel instance builder with parallel support"
+    )
+    parser.add_argument("--working_path", required=True, help="Working directory path")
+    parser.add_argument(
+        "--gpu_target",
+        required=True,
+        help="GPU target architecture",
+    )
+    parser.add_argument(
+        "--datatype",
+        required=True,
+        choices=["fp16", "fp32", "bf16"],
+        help="Data type",
+    )
+    parser.add_argument(
+        "--reduce_op",
+        required=True,
+        choices=["max", "min", "avg"],
+        help="Reduce operation",
+    )
+    parser.add_argument("--config_json", help="Configuration JSON file")
+    parser.add_argument(
+        "--num_workers", type=int, help="Number of parallel workers (default: auto)"
+    )
+    parser.add_argument(
+        "--gen_all_individual",
+        action="store_true",
+        help="Generate individual kernel files",
+    )
+    parser.add_argument(
+        "--gen_single", action="store_true", help="Generate a single kernel file"
+    )
+    parser.add_argument("--kernel_name", help="Kernel name for single generation")
+    parser.add_argument(
+        "--block_config", help="Block configuration string for single generation"
+    )
+    parser.add_argument(
+        "--trait_combo", help="Trait combination string for single generation"
+    )
+    parser.add_argument(
+        "--list_kernels",
+        action="store_true",
+        help="List kernel configurations without generating files",
+    )
+
+    args = parser.parse_args()
+
+    # Create builder
+    builder = PoolKernelBuilder(
+        args.working_path,
+        args.gpu_target,
+        args.datatype,
+        args.reduce_op,
+        args.config_json,
+    )
+
+    if args.list_kernels:
+        builder.write_kernel_list()
+    elif args.gen_single:
+        # Generate a single kernel file
+        if not args.kernel_name or not args.block_config or not args.trait_combo:
+            parser.error(
+                "--gen_single requires --kernel_name, --block_config, and --trait_combo"
+            )
+
+        # Parse block config
+        block_parts = args.block_config.split("_")
+        block_dims = block_parts[0].split("x")
+        warp_dims = block_parts[1].split("x")
+        thread_tile_dims = block_parts[2].split("x")
+
+        block_config = {
+            "block_m": int(block_dims[0]),
+            "block_n": int(block_dims[1]),
+            "warp_m": int(warp_dims[0]),
+            "warp_n": int(warp_dims[1]),
+            "thread_tile_m": int(thread_tile_dims[0]),
+            "thread_tile_n": int(thread_tile_dims[1]),
+        }
+
+        # Parse trait combo
+        trait_parts = args.trait_combo.split("_")
+        trait_combo = (
+            trait_parts[0] == "True",  # output_index
+            trait_parts[1] == "True",  # propagate_nan
+            int(trait_parts[2]),  # pool_dim
+        )
+
+        k_block_per_cu = builder.config.get("k_block_per_cu", 1)
+
+        # Generate the kernel
+        kernel_name, instance_code = builder._generate_kernel_instance(
+            block_config, trait_combo, k_block_per_cu
+        )
+
+        # Write the file
+        simplified_name = kernel_name
+        if simplified_name.startswith("pool"):
+            simplified_name = simplified_name[4:]
+
+        header_file = builder.working_path / f"pool_single_{simplified_name}.hpp"
+        with open(header_file, "w") as f:
+            f.write(instance_code)
+
+        print(f"Generated {header_file}")
+
+    elif args.gen_all_individual:
+        builder.run(args.num_workers)
+    else:
+        parser.error(
+            "Must specify one of: --list_kernels, --gen_all_individual, or --gen_single"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tile_engine/ops/pooling/pool_profiler.hpp b/tile_engine/ops/pooling/pool_profiler.hpp
new file mode 100644
index 00000000000..66bf953abe4
--- /dev/null
+++ b/tile_engine/ops/pooling/pool_profiler.hpp
@@ -0,0 +1,404 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+
+#include "ck_tile/host/device_prop.hpp"
+#include "ck_tile/ops/pooling.hpp"
+#include "ck_tile/host/reference/reference_pool.hpp"
+#include "pool_benchmark.hpp"
+
+class PoolProfiler
+{
+    public:
+    static PoolProfiler& instance(Setting setting)
+    {
+        static PoolProfiler instance{setting};
+        return instance;
+    }
+
+    // Overload for single kernel benchmarking
+    template <typename TensorShape, typename WindowShape>
+    void benchmark(PoolProblem& pool_problem,
+                   std::function<float(const ck_tile::PoolHostArgs<TensorShape, WindowShape>&,
+                                       const ck_tile::stream_config&)> kernel_func)
+    {
+        // Create a vector with a single callable that returns both name and time
+        std::vector<std::function<std::tuple<std::string, float>(
+            ck_tile::PoolHostArgs<TensorShape, WindowShape>&, const ck_tile::stream_config&)>>
+            callables;
+
+        callables.push_back([kernel_func](ck_tile::PoolHostArgs<TensorShape, WindowShape>& args,
+                                          const ck_tile::stream_config& stream) {
+            float time = kernel_func(args, stream);
+            return std::make_tuple(std::string(KERNEL_NAME), time);
+        });
+
+        benchmark(pool_problem, callables);
+    }
+
+    template <typename TensorShape, typename WindowShape>
+    void benchmark(
+        PoolProblem& pool_problem,
+        std::vector<std::function<std::tuple<std::string, float>(
+            ck_tile::PoolHostArgs<TensorShape, WindowShape>&, const ck_tile::stream_config&)>>&
+            callables)
+    {
+        // Calculate output dimensions based on pool dimension
+        const ck_tile::index_t N = pool_problem.N;
+        const ck_tile::index_t D = pool_problem.D;
+        const ck_tile::index_t H = pool_problem.H;
+        const ck_tile::index_t W = pool_problem.W;
+        const ck_tile::index_t C = pool_problem.C;
+
+        const ck_tile::index_t Z = pool_problem.windowZ;
+        const ck_tile::index_t Y = pool_problem.windowY;
+        const ck_tile::index_t X = pool_problem.windowX;
+
+        const ck_tile::index_t Sz = pool_problem.strideZ;
+        const ck_tile::index_t Sy = pool_problem.strideY;
+        const ck_tile::index_t Sx = pool_problem.strideX;
+
+        const ck_tile::index_t Dz = pool_problem.dilationZ;
+        const ck_tile::index_t Dy = pool_problem.dilationY;
+        const ck_tile::index_t Dx = pool_problem.dilationX;
+
+        const ck_tile::index_t LeftPz  = pool_problem.leftPadZ;
+        const ck_tile::index_t LeftPy  = pool_problem.leftPadY;
+        const ck_tile::index_t LeftPx  = pool_problem.leftPadX;
+        const ck_tile::index_t RightPz = pool_problem.rightPadZ;
+        const ck_tile::index_t RightPy = pool_problem.rightPadY;
+        const ck_tile::index_t RightPx = pool_problem.rightPadX;
+
+        // Calculate effective window sizes
+        const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
+        const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
+        const ck_tile::index_t Xs = (X - 1) * Dx + 1;
+
+        // Calculate output dimensions
+        const ck_tile::index_t Do = (D + LeftPz + RightPz - Zs) / Sz + 1;
+        const ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1;
+        const ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1;
+
+        // Create input/output tensors based on pool dimension (3D: NDHWC, 2D: NHWC)
+        ck_tile::HostTensor<InDataType> in_tensor(
+            pool_problem.poolDim == 3 ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                 static_cast<std::size_t>(D),
+                                                                 static_cast<std::size_t>(H),
+                                                                 static_cast<std::size_t>(W),
+                                                                 static_cast<std::size_t>(C)}
+                                      : std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                 static_cast<std::size_t>(H),
+                                                                 static_cast<std::size_t>(W),
+                                                                 static_cast<std::size_t>(C)});
+
+        ck_tile::HostTensor<OutDataType> out_tensor(
+            pool_problem.poolDim == 3 ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                 static_cast<std::size_t>(Do),
+                                                                 static_cast<std::size_t>(Ho),
+                                                                 static_cast<std::size_t>(Wo),
+                                                                 static_cast<std::size_t>(C)}
+                                      : std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                 static_cast<std::size_t>(Ho),
+                                                                 static_cast<std::size_t>(Wo),
+                                                                 static_cast<std::size_t>(C)});
+
+        ck_tile::HostTensor<OutDataType> out_host_result(
+            pool_problem.poolDim == 3 ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                 static_cast<std::size_t>(Do),
+                                                                 static_cast<std::size_t>(Ho),
+                                                                 static_cast<std::size_t>(Wo),
+                                                                 static_cast<std::size_t>(C)}
+                                      : std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                 static_cast<std::size_t>(Ho),
+                                                                 static_cast<std::size_t>(Wo),
+                                                                 static_cast<std::size_t>(C)});
+
+        ck_tile::HostTensor<IndexDataType> out_index_tensor(
+            pool_problem.outputIndex ? (pool_problem.poolDim == 3
+                                            ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                       static_cast<std::size_t>(Do),
+                                                                       static_cast<std::size_t>(Ho),
+                                                                       static_cast<std::size_t>(Wo),
+                                                                       static_cast<std::size_t>(C)}
+                                            : std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                       static_cast<std::size_t>(Ho),
+                                                                       static_cast<std::size_t>(Wo),
+                                                                       static_cast<std::size_t>(C)})
+                                     : std::vector<std::size_t>{1});
+
+        ck_tile::HostTensor<IndexDataType> out_index_host_result(
+            pool_problem.outputIndex ? (pool_problem.poolDim == 3
+                                            ? std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                       static_cast<std::size_t>(Do),
+                                                                       static_cast<std::size_t>(Ho),
+                                                                       static_cast<std::size_t>(Wo),
+                                                                       static_cast<std::size_t>(C)}
+                                            : std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                                       static_cast<std::size_t>(Ho),
+                                                                       static_cast<std::size_t>(Wo),
+                                                                       static_cast<std::size_t>(C)})
+                                     : std::vector<std::size_t>{1});
+
+        // Initialize input tensor
+        if(setting_.init_method_ == 0)
+        {
+            ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(in_tensor);
+        }
+        else if(setting_.init_method_ == 1)
+        {
+            ck_tile::FillMonotonicSeq<InDataType>{}(in_tensor);
+        }
+        else if(setting_.init_method_ == 2)
+        {
+            ck_tile::FillConstant<InDataType>{static_cast<InDataType>(1)}(in_tensor);
+        }
+        else
+        {
+            in_tensor.SetZero();
+        }
+
+        // Allocate device memory
+        ck_tile::DeviceMem in_dev_buf(in_tensor.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem out_dev_buf(out_tensor.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem out_index_dev_buf(
+            pool_problem.outputIndex ? out_index_tensor.get_element_space_size_in_bytes() : 0);
+
+        in_dev_buf.ToDevice(in_tensor.data());
+        out_dev_buf.SetZero();
+        if(pool_problem.outputIndex)
+        {
+            out_index_dev_buf.SetZero();
+        }
+
+        // Create shapes for host args
+        TensorShape input_shape, output_shape, input_strides, output_strides;
+        WindowShape window_lengths, window_strides, window_dilations, input_left_pads,
+            input_right_pads;
+
+        // Create host arguments
+        ck_tile::PoolHostArgs<TensorShape, WindowShape> pool_args{
+            in_dev_buf.GetDeviceBuffer(),
+            out_dev_buf.GetDeviceBuffer(),
+            pool_problem.outputIndex ? out_index_dev_buf.GetDeviceBuffer() : nullptr,
+            input_shape,
+            output_shape,
+            input_strides,
+            output_strides,
+            window_lengths,
+            window_strides,
+            window_dilations,
+            input_left_pads,
+            input_right_pads};
+
+        // Run reference if verification is enabled
+        // (Reference computation would be added here based on pool dimension)
+
+        for(auto& callable : callables)
+        {
+            auto kernel_run_result = callable(pool_args,
+                                              ck_tile::stream_config{nullptr,
+                                                                     true,
+                                                                     setting_.log_,
+                                                                     setting_.n_warmup_,
+                                                                     setting_.n_repeat_,
+                                                                     setting_.is_gpu_timer_,
+                                                                     setting_.flush_cache_,
+                                                                     setting_.rotating_count_});
+            process_result(pool_problem,
+                           out_dev_buf,
+                           out_host_result,
+                           out_tensor,
+                           out_index_dev_buf,
+                           out_index_host_result,
+                           out_index_tensor,
+                           kernel_run_result);
+        }
+    }
+
+    void process_result(const PoolProblem& pool_problem,
+                        ck_tile::DeviceMem& out_dev_buf,
+                        ck_tile::HostTensor<OutDataType>& out_host_result,
+                        ck_tile::HostTensor<OutDataType>& out_dev_result,
+                        ck_tile::DeviceMem& out_index_dev_buf,
+                        ck_tile::HostTensor<IndexDataType>& out_index_host_result,
+                        ck_tile::HostTensor<IndexDataType>& out_index_dev_result,
+                        const std::tuple<std::string, float>& kernel_run_result)
+    {
+        auto [name, avg_time] = kernel_run_result;
+
+        KernelInstance kernel_instance{name, pool_problem, {-1.0f, -1.0f, -1.0f}};
+
+        // Compute performance metrics
+        const ck_tile::index_t N  = pool_problem.N;
+        const ck_tile::index_t D  = pool_problem.D;
+        const ck_tile::index_t H  = pool_problem.H;
+        const ck_tile::index_t W  = pool_problem.W;
+        const ck_tile::index_t C  = pool_problem.C;
+        const ck_tile::index_t Z  = pool_problem.windowZ;
+        const ck_tile::index_t Y  = pool_problem.windowY;
+        const ck_tile::index_t X  = pool_problem.windowX;
+        const ck_tile::index_t Sz = pool_problem.strideZ;
+        const ck_tile::index_t Sy = pool_problem.strideY;
+        const ck_tile::index_t Sx = pool_problem.strideX;
+        const ck_tile::index_t Dz = pool_problem.dilationZ;
+        const ck_tile::index_t Dy = pool_problem.dilationY;
+        const ck_tile::index_t Dx = pool_problem.dilationX;
+
+        const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
+        const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
+        const ck_tile::index_t Xs = (X - 1) * Dx + 1;
+
+        const ck_tile::index_t Do =
+            (D + pool_problem.leftPadZ + pool_problem.rightPadZ - Zs) / Sz + 1;
+        const ck_tile::index_t Ho =
+            (H + pool_problem.leftPadY + pool_problem.rightPadY - Ys) / Sy + 1;
+        const ck_tile::index_t Wo =
+            (W + pool_problem.leftPadX + pool_problem.rightPadX - Xs) / Sx + 1;
+
+        // Calculate FLOPs: for pooling, we count one compare/add per window element per output
+        // element
+        std::size_t window_size =
+            static_cast<std::size_t>(Z) * static_cast<std::size_t>(Y) * static_cast<std::size_t>(X);
+        std::size_t output_elements = static_cast<std::size_t>(N) * static_cast<std::size_t>(Do) *
+                                      static_cast<std::size_t>(Ho) * static_cast<std::size_t>(Wo) *
+                                      static_cast<std::size_t>(C);
+        std::size_t flop = output_elements * window_size;
+
+        // Calculate memory bandwidth
+        std::size_t num_byte =
+            sizeof(InDataType) * N * D * H * W * C + sizeof(OutDataType) * N * Do * Ho * Wo * C;
+
+        // Update performance results
+        kernel_instance.perf_result_.latency_   = avg_time;
+        kernel_instance.perf_result_.tflops_    = static_cast<float>(flop) / 1.E9 / avg_time;
+        kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time;
+
+        if(setting_.log_ > 0 && !setting_.json_output_)
+        {
+            std::cout << kernel_instance << std::endl;
+        }
+
+        // Verify result
+        out_dev_buf.FromDevice(out_dev_result.data());
+
+        bool verified_correct = true;
+        if(setting_.verify_)
+        {
+            verified_correct = compare_pool_results(name, out_dev_result, out_host_result);
+            if(pool_problem.outputIndex)
+            {
+                out_index_dev_buf.FromDevice(out_index_dev_result.data());
+                verified_correct =
+                    verified_correct &&
+                    compare_pool_index_results(name, out_index_dev_result, out_index_host_result);
+            }
+        }
+
+        if(verified_correct)
+        {
+            kernel_instances_.emplace_back(kernel_instance);
+        }
+        else
+        {
+            std::cout << "Verification failed, skip kernel: " << name << std::endl;
+        }
+
+        // Clear tensors
+        out_dev_buf.SetZero();
+        out_dev_result.SetZero();
+    }
+
+    KernelInstance select_best_instance(Metric metric)
+    {
+        if(kernel_instances_.empty())
+            throw std::runtime_error("Empty instances");
+
+        auto kernel_instance = *std::max_element(kernel_instances_.begin(),
+                                                 kernel_instances_.end(),
+                                                 [metric](const auto& a, const auto& b) {
+                                                     return PerformanceResult::compare(
+                                                         b.perf_result_, a.perf_result_, metric);
+                                                 });
+
+        if(setting_.json_output_)
+        {
+            // Output clean JSON only
+            std::cout << kernel_instance << std::endl;
+        }
+        else
+        {
+            std::cout << "**********************************" << std::endl;
+            std::cout << "According to given metrics: " << get_metric_name(metric) << "\n"
+                      << "Current kernel performance is: " << kernel_instance << std::endl;
+            std::cout << "**********************************" << std::endl;
+        }
+
+        if(!setting_.csv_filename_.empty())
+        {
+            std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app);
+
+            if(!file.is_open())
+            {
+                std::cerr << "Warning: Failed to open CSV file for writing." << std::endl;
+            }
+            else
+            {
+                if(file.tellp() == 0)
+                {
+                    file << "rocm_version,device_name,"
+                         << "in_dtype,out_dtype,compute_dtype,index_dtype,"
+                         << "block_shape,reduce_op,pool_dim," << "N,D,H,W,C,"
+                         << "window_z,window_y,window_x," << "stride_z,stride_y,stride_x,"
+                         << "dilation_z,dilation_y,dilation_x,"
+                         << "left_pad_z,left_pad_y,left_pad_x,"
+                         << "right_pad_z,right_pad_y,right_pad_x," << "output_index,propagate_nan,"
+                         << "name," << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n";
+                }
+
+                const auto& problem = kernel_instance.problem_;
+                const auto& name    = kernel_instance.name_;
+                const auto& perf    = kernel_instance.perf_result_;
+
+                file << get_rocm_version() << "," << ck_tile::get_device_name() << ","
+                     << problem.inDType << "," << problem.outDType << "," << problem.computeDType
+                     << "," << problem.indexDType << "," << problem.blockShape << ","
+                     << problem.reduceOp << "," << problem.poolDim << "," << problem.N << ","
+                     << problem.D << "," << problem.H << "," << problem.W << "," << problem.C << ","
+                     << problem.windowZ << "," << problem.windowY << "," << problem.windowX << ","
+                     << problem.strideZ << "," << problem.strideY << "," << problem.strideX << ","
+                     << problem.dilationZ << "," << problem.dilationY << "," << problem.dilationX
+                     << "," << problem.leftPadZ << "," << problem.leftPadY << ","
+                     << problem.leftPadX << "," << problem.rightPadZ << "," << problem.rightPadY
+                     << "," << problem.rightPadX << "," << problem.outputIndex << ","
+                     << problem.propagateNan << "," << name << "," << std::fixed
+                     << std::setprecision(4) << perf.latency_ << "," << std::fixed
+                     << std::setprecision(4) << perf.tflops_ << "," << std::fixed
+                     << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric)
+                     << "\n";
+
+                if(!file)
+                {
+                    std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl;
+                }
+            }
+        }
+
+        return kernel_instance;
+    }
+
+    PoolProfiler(const PoolProfiler&)            = delete;
+    PoolProfiler& operator=(const PoolProfiler&) = delete;
+
+    private:
+    ~PoolProfiler() { kernel_instances_.clear(); }
+    PoolProfiler(Setting setting) : setting_(setting) {}
+
+    Setting setting_;
+
+    std::vector<KernelInstance> kernel_instances_;
+};