diff --git a/CMakeLists.txt b/CMakeLists.txt index d8ef04736..5e9b1f717 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,11 @@ OPTION(ENABLE_OPENGL "Enable OpenGL support" ON) OPTION(ENABLE_VAAPI "Enable VA-API support" ON) OPTION(ENABLE_TEGRAJPEG "Enable Tegra HW JPEG support" ON) OPTION(ENABLE_PROFILING "Collect profiling stats (memory consuming)" OFF) +IF(APPLE) + OPTION(ENABLE_METAL "Enable Metal GPU depth processing (Apple platforms)" ON) +ELSE() + OPTION(ENABLE_METAL "Enable Metal GPU depth processing (Apple platforms)" OFF) +ENDIF() IF(ENABLE_PROFILING) SET(LIBFREENECT2_WITH_PROFILING 1) @@ -337,6 +342,54 @@ IF(ENABLE_OPENCL) ENDIF(OpenCL_FOUND) ENDIF(ENABLE_OPENCL) +SET(HAVE_Metal "disabled (non-Apple)") +IF(APPLE AND ENABLE_METAL) + SET(HAVE_Metal no) + + SET(METAL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/metal_depth_packet_processor.metal") + SET(METAL_AIR "${CMAKE_CURRENT_BINARY_DIR}/metal_depth_packet_processor.air") + SET(METAL_LIB "${CMAKE_CURRENT_BINARY_DIR}/default.metallib") + + ADD_CUSTOM_COMMAND( + OUTPUT ${METAL_AIR} + COMMAND xcrun -sdk macosx metal -c ${METAL_SOURCE} -o ${METAL_AIR} + DEPENDS ${METAL_SOURCE} + COMMENT "Compiling Metal shader to AIR" + VERBATIM + ) + + ADD_CUSTOM_COMMAND( + OUTPUT ${METAL_LIB} + COMMAND xcrun -sdk macosx metallib ${METAL_AIR} -o ${METAL_LIB} + DEPENDS ${METAL_AIR} + COMMENT "Linking Metal library" + VERBATIM + ) + + ADD_CUSTOM_TARGET(MetalShaders ALL DEPENDS ${METAL_LIB}) + + SET(LIBFREENECT2_WITH_METAL_SUPPORT 1) + SET(HAVE_Metal yes) + + LIST(APPEND SOURCES + src/metal_depth_packet_processor.mm + ) + + LIST(APPEND LIBRARIES + "-framework Metal" + "-framework Foundation" + ) + + # Mark the .mm file for Objective-C++ compilation (CMake handles this via the + # extension, but the explicit property guards against edge cases). + SET_SOURCE_FILES_PROPERTIES(src/metal_depth_packet_processor.mm + PROPERTIES COMPILE_FLAGS "-x objective-c++" + ) + + # Install the compiled metallib alongside the dylib. + INSTALL(FILES ${METAL_LIB} DESTINATION lib) +ENDIF(APPLE AND ENABLE_METAL) + SET(HAVE_CUDA disabled) IF(ENABLE_CUDA) FIND_PACKAGE(CUDA) @@ -417,6 +470,11 @@ GENERATE_RESOURCES(${RESOURCES_INC_FILE} ${MY_DIR} ${RESOURCES}) ADD_DEFINITIONS(-DRESOURCES_INC) ADD_LIBRARY(freenect2 ${SOURCES}) + +IF(APPLE AND ENABLE_METAL AND LIBFREENECT2_WITH_METAL_SUPPORT) + ADD_DEPENDENCIES(freenect2 MetalShaders) +ENDIF() + SET_TARGET_PROPERTIES(freenect2 PROPERTIES CXX_VISIBILITY_PRESET hidden VISIBILITY_INLINES_HIDDEN 1 diff --git a/include/internal/libfreenect2/depth_packet_processor.h b/include/internal/libfreenect2/depth_packet_processor.h index ad74ed1d2..8d5d90a64 100644 --- a/include/internal/libfreenect2/depth_packet_processor.h +++ b/include/internal/libfreenect2/depth_packet_processor.h @@ -280,6 +280,31 @@ class CudaKdeDepthPacketProcessor : public DepthPacketProcessor }; #endif // LIBFREENECT2_WITH_CUDA_SUPPORT +#ifdef LIBFREENECT2_WITH_METAL_SUPPORT +class MetalDepthPacketProcessorImpl; + +/** Depth packet processor using Apple Metal GPU compute. */ +class MetalDepthPacketProcessor : public DepthPacketProcessor +{ +public: + MetalDepthPacketProcessor(const int deviceIndex = -1); + virtual ~MetalDepthPacketProcessor(); + virtual void setConfiguration(const libfreenect2::DepthPacketProcessor::Config &config); + + virtual void loadP0TablesFromCommandResponse(unsigned char *buffer, size_t buffer_length); + + virtual void loadXZTables(const float *xtable, const float *ztable); + virtual void loadLookupTable(const short *lut); + + virtual bool good(); + virtual const char *name() { return "Metal"; } + + virtual void process(const DepthPacket &packet); +private: + MetalDepthPacketProcessorImpl *impl_; +}; +#endif // LIBFREENECT2_WITH_METAL_SUPPORT + class DumpDepthPacketProcessor : public DepthPacketProcessor { public: diff --git a/include/libfreenect2/config.h.in b/include/libfreenect2/config.h.in index 35a0f638c..92227f5ff 100644 --- a/include/libfreenect2/config.h.in +++ b/include/libfreenect2/config.h.in @@ -44,6 +44,8 @@ #cmakedefine LIBFREENECT2_WITH_OPENCL_SUPPORT #cmakedefine LIBFREENECT2_OPENCL_ICD_LOADER_IS_OLD +#cmakedefine LIBFREENECT2_WITH_METAL_SUPPORT + #cmakedefine LIBFREENECT2_WITH_CUDA_SUPPORT #cmakedefine LIBFREENECT2_WITH_VT_SUPPORT diff --git a/include/libfreenect2/packet_pipeline.h b/include/libfreenect2/packet_pipeline.h index 08d6540e8..1c43a4c1e 100644 --- a/include/libfreenect2/packet_pipeline.h +++ b/include/libfreenect2/packet_pipeline.h @@ -156,6 +156,18 @@ class LIBFREENECT2_API CudaKdePacketPipeline : public PacketPipeline }; #endif // LIBFREENECT2_WITH_CUDA_SUPPORT +#ifdef LIBFREENECT2_WITH_METAL_SUPPORT +/** Pipeline with Metal GPU depth processing (Apple platforms). */ +class LIBFREENECT2_API MetalPacketPipeline : public PacketPipeline +{ +protected: + const int deviceId; +public: + MetalPacketPipeline(const int deviceId = -1); + virtual ~MetalPacketPipeline(); +}; +#endif // LIBFREENECT2_WITH_METAL_SUPPORT + ///@} } /* namespace libfreenect2 */ #endif /* PACKET_PIPELINE_H_ */ diff --git a/src/libfreenect2.cpp b/src/libfreenect2.cpp index 631a5780c..b10a868f3 100644 --- a/src/libfreenect2.cpp +++ b/src/libfreenect2.cpp @@ -1101,6 +1101,10 @@ PacketPipeline *createPacketPipelineByName(std::string name) #if defined(LIBFREENECT2_WITH_OPENCL_SUPPORT) if (name == "cl") return new OpenCLPacketPipeline(); +#endif +#if defined(LIBFREENECT2_WITH_METAL_SUPPORT) + if (name == "metal") + return new MetalPacketPipeline(); #endif if (name == "cpu") return new CpuPacketPipeline(); @@ -1123,6 +1127,8 @@ PacketPipeline *createDefaultPacketPipeline() return new OpenGLPacketPipeline(); #elif defined(LIBFREENECT2_WITH_CUDA_SUPPORT) return new CudaPacketPipeline(); +#elif defined(LIBFREENECT2_WITH_METAL_SUPPORT) + return new MetalPacketPipeline(); #elif defined(LIBFREENECT2_WITH_OPENCL_SUPPORT) return new OpenCLPacketPipeline(); #else diff --git a/src/metal_depth_packet_processor.metal b/src/metal_depth_packet_processor.metal new file mode 100644 index 000000000..2beee9635 --- /dev/null +++ b/src/metal_depth_packet_processor.metal @@ -0,0 +1,502 @@ +/* + * This file is part of the OpenKinect Project. http://www.openkinect.org + * + * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file + * for details. + * + * This code is licensed to you under the terms of the Apache License, version + * 2.0, or, at your option, the terms of the GNU General Public License, + * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses, + * or the following URLs: + * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * If you redistribute this file in source form, modified or unmodified, you + * may: + * 1) Leave this header intact and distribute it under the same terms, + * accompanying it with the APACHE20 and GPL20 files, or + * 2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or + * 3) Delete the GPL v2 clause and accompany it with the APACHE20 file + * In all cases you must keep the copyright notice intact and include a copy + * of the CONTRIB file. + * + * Binary distributions must follow the binary distribution requirements of + * either License. + */ + +/** @file metal_depth_packet_processor.metal Metal compute kernels for depth processing. */ + +#include +using namespace metal; + +/** Parameters struct passed as a constant buffer to all kernels. + * Mirrors libfreenect2::DepthPacketProcessor::Parameters. */ +struct MetalDepthParams +{ + float ab_multiplier; + float ab_multiplier_per_frq0; + float ab_multiplier_per_frq1; + float ab_multiplier_per_frq2; + float ab_output_multiplier; + float padding0[3]; + + float phase_in_rad0; + float phase_in_rad1; + float phase_in_rad2; + float padding1; + + float joint_bilateral_ab_threshold; + float joint_bilateral_max_edge; + float joint_bilateral_exp; + float joint_bilateral_threshold; /* precomputed: (ab_threshold^2)/(ab_multiplier^2) */ + + float gaussian_kernel0; + float gaussian_kernel1; + float gaussian_kernel2; + float gaussian_kernel3; + float gaussian_kernel4; + float gaussian_kernel5; + float gaussian_kernel6; + float gaussian_kernel7; + float gaussian_kernel8; + float padding2[3]; + + float phase_offset; + float unambigious_dist; + float individual_ab_threshold; + float ab_threshold; + float ab_confidence_slope; + float ab_confidence_offset; + float min_dealias_confidence; + float max_dealias_confidence; + + float edge_ab_avg_min_value; + float edge_ab_std_dev_threshold; + float edge_close_delta_threshold; + float edge_far_delta_threshold; + float edge_max_delta_threshold; + float edge_avg_delta_threshold; + float max_edge_count; + float padding3; + + float min_depth; + float max_depth; + float padding4[2]; +}; + +/******************************************************************************* + * Decode a packed 11-bit pixel measurement from the raw IR packet buffer. + ******************************************************************************/ +static float decodePixelMeasurement( + device const ushort *data, + device const short *lut11to16, + const uint sub, + const uint x, + const uint y) +{ + uint row_idx = (424u * sub + y) * 352u; + uint idx = (((x >> 2u) + ((x << 7u) & 0x180u)) * 11u) & 0xffffffffu; + + uint col_idx = idx >> 4u; + uint upper_bytes = idx & 15u; + uint lower_bytes = 16u - upper_bytes; + + uint data_idx0 = row_idx + col_idx; + uint data_idx1 = row_idx + col_idx + 1u; + + uint packed = (x < 1u || 510u < x || col_idx > 352u) ? 0u : + ((uint(data[data_idx0]) >> upper_bytes) | (uint(data[data_idx1]) << lower_bytes)) & 2047u; + + return float(lut11to16[packed]); +} + +/******************************************************************************* + * Process pixel stage 1: phase unwrapping from raw IR data. + * + * One thread per pixel (linear index i = y * 512 + x). + * Reads raw packet data and p0 calibration tables, outputs complex IR vectors + * (a, b) and amplitude (n) for three modulation frequencies, plus a quick IR + * image for monitoring. + ******************************************************************************/ +kernel void processPixelStage1( + device const short *lut11to16 [[ buffer(0) ]], + device const float *z_table [[ buffer(1) ]], + device const float3 *p0_table [[ buffer(2) ]], + device const ushort *data [[ buffer(3) ]], + device float3 *a_out [[ buffer(4) ]], + device float3 *b_out [[ buffer(5) ]], + device float3 *n_out [[ buffer(6) ]], + device float *ir_out [[ buffer(7) ]], + constant MetalDepthParams ¶ms [[ buffer(8) ]], + uint i [[ thread_position_in_grid ]]) +{ + const uint x = i % 512u; + const uint y = i / 512u; + + /* The raw frame rows are stored in a rearranged order: + * bottom half of the sensor is stored first, top half second. + * y_in maps the output row index to the correct input row. */ + const uint y_tmp = 423u - y; + const uint y_in = (y_tmp < 212u) ? y_tmp + 212u : 423u - y_tmp; + + /* Pixel validity: z_table[i] <= 0 means no calibration data. */ + const bool invalid = (0.0f >= z_table[i]); + + /* Per-pixel phase offsets from calibration tables. */ + const float3 p0 = p0_table[i]; + + /* Phase vector for the three modulation frequencies. */ + const float3 phase = float3(params.phase_in_rad0, params.phase_in_rad1, params.phase_in_rad2); + + /* Compute sin/cos for each frequency combined with the p0 offset. */ + float3 p0x_cos, p0y_cos, p0z_cos; + float3 p0x_sin = -sincos(phase + p0.x, p0x_cos); + float3 p0y_sin = -sincos(phase + p0.y, p0y_cos); + float3 p0z_sin = -sincos(phase + p0.z, p0z_cos); + + /* Decode the nine raw measurements (3 sub-frames per frequency). */ + const float3 v0 = float3(decodePixelMeasurement(data, lut11to16, 0u, x, y_in), + decodePixelMeasurement(data, lut11to16, 1u, x, y_in), + decodePixelMeasurement(data, lut11to16, 2u, x, y_in)); + const float3 v1 = float3(decodePixelMeasurement(data, lut11to16, 3u, x, y_in), + decodePixelMeasurement(data, lut11to16, 4u, x, y_in), + decodePixelMeasurement(data, lut11to16, 5u, x, y_in)); + const float3 v2 = float3(decodePixelMeasurement(data, lut11to16, 6u, x, y_in), + decodePixelMeasurement(data, lut11to16, 7u, x, y_in), + decodePixelMeasurement(data, lut11to16, 8u, x, y_in)); + + /* Per-frequency multipliers for the ab (amplitude-bias) computation. */ + const float3 ab_mult = float3(params.ab_multiplier_per_frq0, + params.ab_multiplier_per_frq1, + params.ab_multiplier_per_frq2); + + /* Compute complex IR vectors a (real) and b (imaginary). */ + float3 a = float3(dot(v0, p0x_cos), dot(v1, p0y_cos), dot(v2, p0z_cos)) * ab_mult; + float3 b = float3(dot(v0, p0x_sin), dot(v1, p0y_sin), dot(v2, p0z_sin)) * ab_mult; + + /* Zero out invalid pixels. */ + a = select(a, float3(0.0f), invalid); + b = select(b, float3(0.0f), invalid); + float3 n = sqrt(a * a + b * b); + + /* Detect saturated measurements (raw value == 32767 in any sub-frame). */ + const bool sat0 = any(v0 == float3(32767.0f)); + const bool sat1 = any(v1 == float3(32767.0f)); + const bool sat2 = any(v2 == float3(32767.0f)); + const bool3 saturated = bool3(sat0, sat1, sat2); + + /* Zero a/b for saturated frequencies; IR reports saturation as 65535. */ + a_out[i] = select(a, float3(0.0f), saturated); + b_out[i] = select(b, float3(0.0f), saturated); + n_out[i] = n; + + float3 n_or_sat = select(n, float3(65535.0f), saturated); + ir_out[i] = min(dot(n_or_sat, float3(0.333333333f * params.ab_multiplier * params.ab_output_multiplier)), 65535.0f); +} + +/******************************************************************************* + * Filter pixel stage 1: joint bilateral filter on the complex IR vectors. + * + * Smooths the (a, b) outputs from stage 1 while preserving edges. + * Also computes the max_edge_test flag used by stage 2 edge filtering. + ******************************************************************************/ +kernel void filterPixelStage1( + device const float3 *a [[ buffer(0) ]], + device const float3 *b [[ buffer(1) ]], + device const float3 *n [[ buffer(2) ]], + device float3 *a_out [[ buffer(3) ]], + device float3 *b_out [[ buffer(4) ]], + device uchar *max_edge_test [[ buffer(5) ]], + constant MetalDepthParams ¶ms [[ buffer(6) ]], + uint i [[ thread_position_in_grid ]]) +{ + const uint x = i % 512u; + const uint y = i / 512u; + + const float3 self_a = a[i]; + const float3 self_b = b[i]; + + const float gaussian[9] = { + params.gaussian_kernel0, params.gaussian_kernel1, params.gaussian_kernel2, + params.gaussian_kernel3, params.gaussian_kernel4, params.gaussian_kernel5, + params.gaussian_kernel6, params.gaussian_kernel7, params.gaussian_kernel8 + }; + + /* Border pixels: pass through without filtering and mark as valid edge. */ + if(x < 1u || y < 1u || x > 510u || y > 422u) + { + a_out[i] = self_a; + b_out[i] = self_b; + max_edge_test[i] = 1; + return; + } + + float3 threshold = float3(params.joint_bilateral_threshold); + float3 joint_bilateral_exp = float3(params.joint_bilateral_exp); + + const float3 self_norm = n[i]; + const float3 self_normalized_a = self_a / self_norm; + const float3 self_normalized_b = self_b / self_norm; + + /* If the centre pixel's signal is too weak, disable distance weighting. */ + const bool3 c0 = self_norm * self_norm < threshold; + threshold = select(threshold, float3(0.0f), c0); + joint_bilateral_exp = select(joint_bilateral_exp, float3(0.0f), c0); + + float3 weight_acc = float3(0.0f); + float3 weighted_a_acc = float3(0.0f); + float3 weighted_b_acc = float3(0.0f); + float3 dist_acc = float3(0.0f); + + for(int yi = -1, j = 0; yi < 2; ++yi) + { + uint i_other = uint(int(y) + yi) * 512u + x - 1u; + + for(int xi = -1; xi < 2; ++xi, ++j, ++i_other) + { + const float3 other_a = a[i_other]; + const float3 other_b = b[i_other]; + const float3 other_norm = n[i_other]; + const float3 other_normalized_a = other_a / other_norm; + const float3 other_normalized_b = other_b / other_norm; + + const bool3 c1 = other_norm * other_norm < threshold; + + const float3 dist = 0.5f * (1.0f - (self_normalized_a * other_normalized_a + + self_normalized_b * other_normalized_b)); + const float3 weight = select(gaussian[j] * exp(-1.442695f * joint_bilateral_exp * dist), + float3(0.0f), c1); + + weighted_a_acc += weight * other_a; + weighted_b_acc += weight * other_b; + weight_acc += weight; + dist_acc += select(dist, float3(0.0f), c1); + } + } + + const bool3 c2 = weight_acc > float3(0.0f); + a_out[i] = select(float3(0.0f), weighted_a_acc / weight_acc, c2); + b_out[i] = select(float3(0.0f), weighted_b_acc / weight_acc, c2); + + max_edge_test[i] = all(dist_acc < float3(params.joint_bilateral_max_edge)) ? 1u : 0u; +} + +/******************************************************************************* + * Process pixel stage 2: depth calculation from unwrapped phase. + * + * Implements three-frequency phase disambiguation and converts the final phase + * to depth in millimetres using the x/z calibration tables. + ******************************************************************************/ +kernel void processPixelStage2( + device const float3 *a_in [[ buffer(0) ]], + device const float3 *b_in [[ buffer(1) ]], + device const float *x_table [[ buffer(2) ]], + device const float *z_table [[ buffer(3) ]], + device float *depth [[ buffer(4) ]], + device float *ir_sums [[ buffer(5) ]], + constant MetalDepthParams ¶ms [[ buffer(6) ]], + uint i [[ thread_position_in_grid ]]) +{ + float3 a = a_in[i]; + float3 b = b_in[i]; + + float3 phase = atan2(b, a); + phase = select(phase, phase + 2.0f * M_PI_F, phase < float3(0.0f)); + phase = select(phase, float3(0.0f), isnan(phase)); + float3 ir = sqrt(a * a + b * b) * params.ab_multiplier; + + float ir_sum = ir.x + ir.y + ir.z; + float ir_min = min(ir.x, min(ir.y, ir.z)); + + float phase_final = 0.0f; + + if(ir_min >= params.individual_ab_threshold && ir_sum >= params.ab_threshold) + { + float3 t = phase / (2.0f * M_PI_F) * float3(3.0f, 15.0f, 2.0f); + + float t0 = t.x; + float t1 = t.y; + float t2 = t.z; + + float t5 = (floor((t1 - t0) * 0.333333f + 0.5f) * 3.0f + t0); + float t3 = (-t2 + t5); + float t4 = t3 * 2.0f; + + bool c1 = t4 >= -t4; + + float f1 = c1 ? 2.0f : -2.0f; + float f2 = c1 ? 0.5f : -0.5f; + t3 *= f2; + t3 = (t3 - floor(t3)) * f1; + + bool c2 = 0.5f < abs(t3) && abs(t3) < 1.5f; + + float t6 = c2 ? t5 + 15.0f : t5; + float t7 = c2 ? t1 + 15.0f : t1; + + float t8 = (floor((-t2 + t6) * 0.5f + 0.5f) * 2.0f + t2) * 0.5f; + + t6 *= 0.333333f; + t7 *= 0.066667f; + + float t9 = (t8 + t6 + t7); + float t10 = t9 * 0.333333f; + + t6 *= 2.0f * M_PI_F; + t7 *= 2.0f * M_PI_F; + t8 *= 2.0f * M_PI_F; + + float t8_new = t7 * 0.826977f - t8 * 0.110264f; + float t6_new = t8 * 0.551318f - t6 * 0.826977f; + float t7_new = t6 * 0.110264f - t7 * 0.551318f; + + t8 = t8_new; + t6 = t6_new; + t7 = t7_new; + + float norm = t8 * t8 + t6 * t6 + t7 * t7; + float mask = t9 >= 0.0f ? 1.0f : 0.0f; + t10 *= mask; + + bool slope_positive = 0 < params.ab_confidence_slope; + + float ir_max = max(ir.x, max(ir.y, ir.z)); + float ir_x = slope_positive ? ir_min : ir_max; + + ir_x = log(ir_x); + ir_x = (ir_x * params.ab_confidence_slope * 0.301030f + params.ab_confidence_offset) * 3.321928f; + ir_x = exp(ir_x); + ir_x = clamp(ir_x, params.min_dealias_confidence, params.max_dealias_confidence); + ir_x *= ir_x; + + float mask2 = ir_x >= norm ? 1.0f : 0.0f; + float t11 = t10 * mask2; + + float mask3 = params.max_dealias_confidence * params.max_dealias_confidence >= norm ? 1.0f : 0.0f; + t10 *= mask3; + phase_final = t11; + } + + float zmultiplier = z_table[i]; + float xmultiplier = x_table[i]; + + phase_final = 0.0f < phase_final ? phase_final + params.phase_offset : phase_final; + + float depth_linear = zmultiplier * phase_final; + float max_depth = phase_final * params.unambigious_dist * 2.0f; + + bool cond1 = 0.0f < depth_linear && 0.0f < max_depth; + + xmultiplier = (xmultiplier * 90.0f) / (max_depth * max_depth * 8192.0f); + + float depth_fit = depth_linear / (-depth_linear * xmultiplier + 1.0f); + depth_fit = depth_fit < 0.0f ? 0.0f : depth_fit; + + float d = cond1 ? depth_fit : depth_linear; + depth[i] = d; + ir_sums[i] = ir_sum; +} + +/******************************************************************************* + * Filter pixel stage 2: edge-aware depth filter. + * + * Removes depth measurements at depth discontinuities where the IR signal + * variance indicates an unreliable reading. + ******************************************************************************/ +kernel void filterPixelStage2( + device const float *depth [[ buffer(0) ]], + device const float *ir_sums [[ buffer(1) ]], + device const uchar *max_edge_test [[ buffer(2) ]], + device float *filtered [[ buffer(3) ]], + constant MetalDepthParams ¶ms [[ buffer(4) ]], + uint i [[ thread_position_in_grid ]]) +{ + const uint x = i % 512u; + const uint y = i / 512u; + + const float raw_depth = depth[i]; + const float ir_sum = ir_sums[i]; + const uchar edge_test = max_edge_test[i]; + + if(raw_depth >= params.min_depth && raw_depth <= params.max_depth) + { + if(x < 1u || y < 1u || x > 510u || y > 422u) + { + filtered[i] = raw_depth; + } + else + { + float ir_sum_acc = ir_sum; + float squared_ir_sum_acc = ir_sum * ir_sum; + float min_depth = raw_depth; + float max_depth = raw_depth; + + for(int yi = -1; yi < 2; ++yi) + { + uint i_other = uint(int(y) + yi) * 512u + x - 1u; + + for(int xi = -1; xi < 2; ++xi, ++i_other) + { + if(i_other == i) + { + continue; + } + + const float raw_depth_other = depth[i_other]; + const float ir_sum_other = ir_sums[i_other]; + + ir_sum_acc += ir_sum_other; + squared_ir_sum_acc += ir_sum_other * ir_sum_other; + + if(0.0f < raw_depth_other) + { + min_depth = min(min_depth, raw_depth_other); + max_depth = max(max_depth, raw_depth_other); + } + } + } + + float tmp0 = sqrt(squared_ir_sum_acc * 9.0f - ir_sum_acc * ir_sum_acc) / 9.0f; + float edge_avg = max(ir_sum_acc / 9.0f, params.edge_ab_avg_min_value); + tmp0 /= edge_avg; + + float abs_min_diff = abs(raw_depth - min_depth); + float abs_max_diff = abs(raw_depth - max_depth); + + float avg_diff = (abs_min_diff + abs_max_diff) * 0.5f; + float max_abs_diff = max(abs_min_diff, abs_max_diff); + + bool cond0 = + 0.0f < raw_depth && + tmp0 >= params.edge_ab_std_dev_threshold && + params.edge_close_delta_threshold < abs_min_diff && + params.edge_far_delta_threshold < abs_max_diff && + params.edge_max_delta_threshold < max_abs_diff && + params.edge_avg_delta_threshold < avg_diff; + + if(!cond0) + { + if(edge_test != 0) + { + /* tmp1 and edge_count would be used for a more sophisticated edge + * count filter; currently edge_count is always 0 so this path + * always passes depth through. */ + float edge_count = 0.0f; + filtered[i] = edge_count > params.max_edge_count ? 0.0f : raw_depth; + } + else + { + filtered[i] = 0.0f; + } + } + else + { + filtered[i] = 0.0f; + } + } + } + else + { + filtered[i] = 0.0f; + } +} diff --git a/src/metal_depth_packet_processor.mm b/src/metal_depth_packet_processor.mm new file mode 100644 index 000000000..c719846e2 --- /dev/null +++ b/src/metal_depth_packet_processor.mm @@ -0,0 +1,763 @@ +/* + * This file is part of the OpenKinect Project. http://www.openkinect.org + * + * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file + * for details. + * + * This code is licensed to you under the terms of the Apache License, version + * 2.0, or, at your option, the terms of the GNU General Public License, + * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses, + * or the following URLs: + * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * If you redistribute this file in source form, modified or unmodified, you + * may: + * 1) Leave this header intact and distribute it under the same terms, + * accompanying it with the APACHE20 and GPL20 files, or + * 2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or + * 3) Delete the GPL v2 clause and accompany it with the APACHE20 file + * In all cases you must keep the copyright notice intact and include a copy + * of the CONTRIB file. + * + * Binary distributions must follow the binary distribution requirements of + * either License. + */ + +/** @file metal_depth_packet_processor.mm Metal GPU depth packet processor host code. */ + +#include +#include +#include + +#import +#import + +#include +#include +#include + +#define _USE_MATH_DEFINES +#include + +namespace libfreenect2 +{ + +static const size_t IMAGE_SIZE = 512 * 424; +static const size_t IMAGE_WIDTH = 512; +static const size_t IMAGE_HEIGHT = 424; +static const size_t LUT_SIZE = 2048; + +/** Parameters struct laid out to match the Metal shader's MetalDepthParams. */ +struct MetalDepthParamsBuffer +{ + float ab_multiplier; + float ab_multiplier_per_frq0; + float ab_multiplier_per_frq1; + float ab_multiplier_per_frq2; + float ab_output_multiplier; + float padding0[3]; + + float phase_in_rad0; + float phase_in_rad1; + float phase_in_rad2; + float padding1; + + float joint_bilateral_ab_threshold; + float joint_bilateral_max_edge; + float joint_bilateral_exp; + float joint_bilateral_threshold; + + float gaussian_kernel0; + float gaussian_kernel1; + float gaussian_kernel2; + float gaussian_kernel3; + float gaussian_kernel4; + float gaussian_kernel5; + float gaussian_kernel6; + float gaussian_kernel7; + float gaussian_kernel8; + float padding2[3]; + + float phase_offset; + float unambigious_dist; + float individual_ab_threshold; + float ab_threshold; + float ab_confidence_slope; + float ab_confidence_offset; + float min_dealias_confidence; + float max_dealias_confidence; + + float edge_ab_avg_min_value; + float edge_ab_std_dev_threshold; + float edge_close_delta_threshold; + float edge_far_delta_threshold; + float edge_max_delta_threshold; + float edge_avg_delta_threshold; + float max_edge_count; + float padding3; + + float min_depth; + float max_depth; + float padding4[2]; +}; + +/** Populate a MetalDepthParamsBuffer from a DepthPacketProcessor::Parameters + * and config values (MinDepth/MaxDepth in metres, converted to mm here). */ +static void fillParamsBuffer(MetalDepthParamsBuffer &dst, + const DepthPacketProcessor::Parameters &p, + const DepthPacketProcessor::Config &cfg) +{ + dst.ab_multiplier = p.ab_multiplier; + dst.ab_multiplier_per_frq0 = p.ab_multiplier_per_frq[0]; + dst.ab_multiplier_per_frq1 = p.ab_multiplier_per_frq[1]; + dst.ab_multiplier_per_frq2 = p.ab_multiplier_per_frq[2]; + dst.ab_output_multiplier = p.ab_output_multiplier; + dst.padding0[0] = dst.padding0[1] = dst.padding0[2] = 0.0f; + + dst.phase_in_rad0 = p.phase_in_rad[0]; + dst.phase_in_rad1 = p.phase_in_rad[1]; + dst.phase_in_rad2 = p.phase_in_rad[2]; + dst.padding1 = 0.0f; + + dst.joint_bilateral_ab_threshold = p.joint_bilateral_ab_threshold; + dst.joint_bilateral_max_edge = p.joint_bilateral_max_edge; + dst.joint_bilateral_exp = p.joint_bilateral_exp; + /* Precomputed threshold used by filterPixelStage1. */ + dst.joint_bilateral_threshold = (p.joint_bilateral_ab_threshold * p.joint_bilateral_ab_threshold) + / (p.ab_multiplier * p.ab_multiplier); + + dst.gaussian_kernel0 = p.gaussian_kernel[0]; + dst.gaussian_kernel1 = p.gaussian_kernel[1]; + dst.gaussian_kernel2 = p.gaussian_kernel[2]; + dst.gaussian_kernel3 = p.gaussian_kernel[3]; + dst.gaussian_kernel4 = p.gaussian_kernel[4]; + dst.gaussian_kernel5 = p.gaussian_kernel[5]; + dst.gaussian_kernel6 = p.gaussian_kernel[6]; + dst.gaussian_kernel7 = p.gaussian_kernel[7]; + dst.gaussian_kernel8 = p.gaussian_kernel[8]; + dst.padding2[0] = dst.padding2[1] = dst.padding2[2] = 0.0f; + + dst.phase_offset = p.phase_offset; + dst.unambigious_dist = p.unambigious_dist; + dst.individual_ab_threshold = p.individual_ab_threshold; + dst.ab_threshold = p.ab_threshold; + dst.ab_confidence_slope = p.ab_confidence_slope; + dst.ab_confidence_offset = p.ab_confidence_offset; + dst.min_dealias_confidence = p.min_dealias_confidence; + dst.max_dealias_confidence = p.max_dealias_confidence; + + dst.edge_ab_avg_min_value = p.edge_ab_avg_min_value; + dst.edge_ab_std_dev_threshold = p.edge_ab_std_dev_threshold; + dst.edge_close_delta_threshold = p.edge_close_delta_threshold; + dst.edge_far_delta_threshold = p.edge_far_delta_threshold; + dst.edge_max_delta_threshold = p.edge_max_delta_threshold; + dst.edge_avg_delta_threshold = p.edge_avg_delta_threshold; + dst.max_edge_count = p.max_edge_count; + dst.padding3 = 0.0f; + + /* Config values are in metres; shaders expect millimetres. */ + dst.min_depth = cfg.MinDepth * 1000.0f; + dst.max_depth = cfg.MaxDepth * 1000.0f; + dst.padding4[0] = dst.padding4[1] = 0.0f; +} + +/** PIMPL implementation struct holding all Metal objects. */ +class MetalDepthPacketProcessorImpl +{ +public: + id device; + id command_queue; + + id pipeline_stage1; + id pipeline_filter_stage1; + id pipeline_stage2; + id pipeline_filter_stage2; + + /* Static lookup/calibration buffers. */ + id buf_lut11to16; + id buf_p0_table; /* float3 (16-byte aligned), IMAGE_SIZE entries */ + id buf_x_table; + id buf_z_table; + + /* Per-frame input buffer (raw USB packet). */ + id buf_packet; + + /* Intermediate GPU buffers. */ + id buf_a; + id buf_b; + id buf_n; + id buf_ir; + id buf_a_filtered; + id buf_b_filtered; + id buf_edge_test; + id buf_depth; + id buf_ir_sum; + id buf_filtered; + + /* Parameters constant buffer. */ + id buf_params; + + /* Output frames. */ + Frame *ir_frame; + Frame *depth_frame; + + DepthPacketProcessor::Parameters params; + DepthPacketProcessor::Config config; + + bool device_initialized; + bool runtime_ok; + + MetalDepthPacketProcessorImpl() + : device(nil) + , command_queue(nil) + , pipeline_stage1(nil) + , pipeline_filter_stage1(nil) + , pipeline_stage2(nil) + , pipeline_filter_stage2(nil) + , buf_lut11to16(nil) + , buf_p0_table(nil) + , buf_x_table(nil) + , buf_z_table(nil) + , buf_packet(nil) + , buf_a(nil) + , buf_b(nil) + , buf_n(nil) + , buf_ir(nil) + , buf_a_filtered(nil) + , buf_b_filtered(nil) + , buf_edge_test(nil) + , buf_depth(nil) + , buf_ir_sum(nil) + , buf_filtered(nil) + , buf_params(nil) + , ir_frame(NULL) + , depth_frame(NULL) + , device_initialized(false) + , runtime_ok(true) + { + device_initialized = init(); + if(device_initialized) + { + newIrFrame(); + newDepthFrame(); + } + } + + ~MetalDepthPacketProcessorImpl() + { + delete ir_frame; + delete depth_frame; + + /* ARC / manual release: set to nil to release all Metal objects. */ + buf_params = nil; + buf_filtered = nil; + buf_ir_sum = nil; + buf_depth = nil; + buf_edge_test = nil; + buf_b_filtered = nil; + buf_a_filtered = nil; + buf_ir = nil; + buf_n = nil; + buf_b = nil; + buf_a = nil; + buf_packet = nil; + buf_z_table = nil; + buf_x_table = nil; + buf_p0_table = nil; + buf_lut11to16 = nil; + pipeline_filter_stage2 = nil; + pipeline_stage2 = nil; + pipeline_filter_stage1 = nil; + pipeline_stage1 = nil; + command_queue = nil; + device = nil; + } + + /** Create a new (empty) IR output frame. */ + void newIrFrame() + { + delete ir_frame; + ir_frame = new Frame(IMAGE_WIDTH, IMAGE_HEIGHT, 4); + ir_frame->format = Frame::Float; + } + + /** Create a new (empty) depth output frame. */ + void newDepthFrame() + { + delete depth_frame; + depth_frame = new Frame(IMAGE_WIDTH, IMAGE_HEIGHT, 4); + depth_frame->format = Frame::Float; + } + + /** Allocate a shared-storage MTLBuffer (zero-copy on Apple Silicon). */ + id makeBuffer(size_t size) + { + id buf = [device newBufferWithLength:size + options:MTLResourceStorageModeShared]; + if(!buf) + { + LOG_ERROR << "MetalDepthPacketProcessor: failed to allocate MTLBuffer of size " << size; + } + return buf; + } + + /** Locate the compiled Metal library alongside the running dylib. + * + * Strategy: walk up the dylib path to find the build/install directory, + * then look for default.metallib next to it or in a known relative path. + * Falls back to the executable's bundle or current working directory. */ + id loadMetalLibrary() + { + NSError *error = nil; + id lib = nil; + + /* 1. Try to load from the same directory as this dylib. */ + Dl_info info; + if(dladdr((void *)&loadMetalLibrary_stub, &info) && info.dli_fname) + { + NSString *dylib_path = [NSString stringWithUTF8String:info.dli_fname]; + NSString *dylib_dir = [dylib_path stringByDeletingLastPathComponent]; + NSString *metallib_path = [dylib_dir stringByAppendingPathComponent:@"default.metallib"]; + + lib = [device newLibraryWithURL:[NSURL fileURLWithPath:metallib_path] error:&error]; + if(lib) + { + LOG_INFO << "MetalDepthPacketProcessor: loaded Metal library from " << [metallib_path UTF8String]; + return lib; + } + } + + /* 2. Try the main bundle (useful when linked into an app). */ + lib = [device newDefaultLibraryWithBundle:[NSBundle mainBundle] error:&error]; + if(lib) + { + LOG_INFO << "MetalDepthPacketProcessor: loaded Metal library from main bundle"; + return lib; + } + + /* 3. Try the default library of the device (works if the .metal was compiled + * into the app target itself). */ + lib = [device newDefaultLibrary]; + if(lib) + { + LOG_INFO << "MetalDepthPacketProcessor: using device default Metal library"; + return lib; + } + + LOG_ERROR << "MetalDepthPacketProcessor: could not find default.metallib. " + << "Make sure it is installed alongside libfreenect2.dylib."; + return nil; + } + + /** Dummy static function whose address is used for dladdr dylib path lookup. */ + static void loadMetalLibrary_stub() {} + + /** Initialise the Metal device, command queue, pipelines, and buffers. */ + bool init() + { + @autoreleasepool + { + device = MTLCreateSystemDefaultDevice(); + if(!device) + { + LOG_ERROR << "MetalDepthPacketProcessor: no Metal device available."; + return false; + } + + LOG_INFO << "MetalDepthPacketProcessor: using device " << [[device name] UTF8String]; + + command_queue = [device newCommandQueue]; + if(!command_queue) + { + LOG_ERROR << "MetalDepthPacketProcessor: failed to create command queue."; + return false; + } + + id library = loadMetalLibrary(); + if(!library) + return false; + + if(!buildPipelines(library)) + return false; + + if(!allocateBuffers()) + return false; + + return true; + } + } + + /** Build the four compute pipeline states from the Metal library. */ + bool buildPipelines(id library) + { + NSError *error = nil; + + id fn_stage1 = [library newFunctionWithName:@"processPixelStage1"]; + if(!fn_stage1) + { + LOG_ERROR << "MetalDepthPacketProcessor: kernel 'processPixelStage1' not found."; + return false; + } + pipeline_stage1 = [device newComputePipelineStateWithFunction:fn_stage1 error:&error]; + if(!pipeline_stage1) + { + LOG_ERROR << "MetalDepthPacketProcessor: failed to build processPixelStage1 pipeline: " + << [[error localizedDescription] UTF8String]; + return false; + } + + id fn_filter1 = [library newFunctionWithName:@"filterPixelStage1"]; + if(!fn_filter1) + { + LOG_ERROR << "MetalDepthPacketProcessor: kernel 'filterPixelStage1' not found."; + return false; + } + pipeline_filter_stage1 = [device newComputePipelineStateWithFunction:fn_filter1 error:&error]; + if(!pipeline_filter_stage1) + { + LOG_ERROR << "MetalDepthPacketProcessor: failed to build filterPixelStage1 pipeline: " + << [[error localizedDescription] UTF8String]; + return false; + } + + id fn_stage2 = [library newFunctionWithName:@"processPixelStage2"]; + if(!fn_stage2) + { + LOG_ERROR << "MetalDepthPacketProcessor: kernel 'processPixelStage2' not found."; + return false; + } + pipeline_stage2 = [device newComputePipelineStateWithFunction:fn_stage2 error:&error]; + if(!pipeline_stage2) + { + LOG_ERROR << "MetalDepthPacketProcessor: failed to build processPixelStage2 pipeline: " + << [[error localizedDescription] UTF8String]; + return false; + } + + id fn_filter2 = [library newFunctionWithName:@"filterPixelStage2"]; + if(!fn_filter2) + { + LOG_ERROR << "MetalDepthPacketProcessor: kernel 'filterPixelStage2' not found."; + return false; + } + pipeline_filter_stage2 = [device newComputePipelineStateWithFunction:fn_filter2 error:&error]; + if(!pipeline_filter_stage2) + { + LOG_ERROR << "MetalDepthPacketProcessor: failed to build filterPixelStage2 pipeline: " + << [[error localizedDescription] UTF8String]; + return false; + } + + return true; + } + + /** Allocate all Metal buffers. */ + bool allocateBuffers() + { + /* float3 in Metal is stored 16-byte aligned (4 floats per element). */ + const size_t float3_size = 4 * sizeof(float); + + buf_lut11to16 = makeBuffer(LUT_SIZE * sizeof(short)); + buf_p0_table = makeBuffer(IMAGE_SIZE * float3_size); + buf_x_table = makeBuffer(IMAGE_SIZE * sizeof(float)); + buf_z_table = makeBuffer(IMAGE_SIZE * sizeof(float)); + + /* Raw packet: 10 sub-frames, each 424 rows * 352 ushorts. */ + buf_packet = makeBuffer(((IMAGE_SIZE * 11) / 16) * 10 * sizeof(unsigned short)); + + buf_a = makeBuffer(IMAGE_SIZE * float3_size); + buf_b = makeBuffer(IMAGE_SIZE * float3_size); + buf_n = makeBuffer(IMAGE_SIZE * float3_size); + buf_ir = makeBuffer(IMAGE_SIZE * sizeof(float)); + buf_a_filtered = makeBuffer(IMAGE_SIZE * float3_size); + buf_b_filtered = makeBuffer(IMAGE_SIZE * float3_size); + buf_edge_test = makeBuffer(IMAGE_SIZE * sizeof(uint8_t)); + buf_depth = makeBuffer(IMAGE_SIZE * sizeof(float)); + buf_ir_sum = makeBuffer(IMAGE_SIZE * sizeof(float)); + buf_filtered = makeBuffer(IMAGE_SIZE * sizeof(float)); + buf_params = makeBuffer(sizeof(MetalDepthParamsBuffer)); + + /* Verify that all allocations succeeded. */ + if(!buf_lut11to16 || !buf_p0_table || !buf_x_table || !buf_z_table || + !buf_packet || !buf_a || !buf_b || !buf_n || !buf_ir || + !buf_a_filtered || !buf_b_filtered || !buf_edge_test || + !buf_depth || !buf_ir_sum || !buf_filtered || !buf_params) + { + LOG_ERROR << "MetalDepthPacketProcessor: buffer allocation failed."; + return false; + } + + return true; + } + + /** Upload processing parameters to the GPU constant buffer. */ + void uploadParams() + { + MetalDepthParamsBuffer *dst = (MetalDepthParamsBuffer *)[buf_params contents]; + fillParamsBuffer(*dst, params, config); + } + + /** Dispatch one compute pass using threadgroupsWithRemainder (non-uniform). */ + void dispatchKernel(id enc, + id pso) + { + /* One thread per pixel over the 512x424 image. */ + MTLSize threads_per_grid = MTLSizeMake(IMAGE_SIZE, 1, 1); + /* 64-thread threadgroups work well on all Apple GPU generations. */ + NSUInteger tg_size = MIN(64u, [pso maxTotalThreadsPerThreadgroup]); + MTLSize threads_per_tg = MTLSizeMake(tg_size, 1, 1); + /* Non-uniform dispatch: handles IMAGE_SIZE not divisible by tg_size. */ + [enc dispatchThreads:threads_per_grid threadsPerThreadgroup:threads_per_tg]; + } + + /** Run the full depth processing pipeline for one packet. + * + * Stages dispatched: + * 1. processPixelStage1 + * 2. filterPixelStage1 (only when bilateral filter is enabled) + * 3. processPixelStage2 + * 4. filterPixelStage2 (only when edge-aware filter is enabled) + * + * The call blocks until all GPU work is complete so that the output + * frames are ready when process() returns. */ + bool run(const DepthPacket &packet) + { + @autoreleasepool + { + /* Upload raw packet data (zero-copy on unified memory). */ + memcpy([buf_packet contents], packet.buffer, MIN(packet.buffer_length, [buf_packet length])); + + /* Upload current parameters. */ + uploadParams(); + + id cmd = [command_queue commandBuffer]; + if(!cmd) + { + LOG_ERROR << "MetalDepthPacketProcessor: failed to create command buffer."; + return false; + } + + /* ------------------------------------------------------------------ */ + /* Stage 1: processPixelStage1 */ + /* ------------------------------------------------------------------ */ + { + id enc = [cmd computeCommandEncoder]; + [enc setComputePipelineState:pipeline_stage1]; + [enc setBuffer:buf_lut11to16 offset:0 atIndex:0]; + [enc setBuffer:buf_z_table offset:0 atIndex:1]; + [enc setBuffer:buf_p0_table offset:0 atIndex:2]; + [enc setBuffer:buf_packet offset:0 atIndex:3]; + [enc setBuffer:buf_a offset:0 atIndex:4]; + [enc setBuffer:buf_b offset:0 atIndex:5]; + [enc setBuffer:buf_n offset:0 atIndex:6]; + [enc setBuffer:buf_ir offset:0 atIndex:7]; + [enc setBuffer:buf_params offset:0 atIndex:8]; + dispatchKernel(enc, pipeline_stage1); + [enc endEncoding]; + } + + /* ------------------------------------------------------------------ */ + /* Stage 1 filter (optional) */ + /* ------------------------------------------------------------------ */ + if(config.EnableBilateralFilter) + { + id enc = [cmd computeCommandEncoder]; + [enc setComputePipelineState:pipeline_filter_stage1]; + [enc setBuffer:buf_a offset:0 atIndex:0]; + [enc setBuffer:buf_b offset:0 atIndex:1]; + [enc setBuffer:buf_n offset:0 atIndex:2]; + [enc setBuffer:buf_a_filtered offset:0 atIndex:3]; + [enc setBuffer:buf_b_filtered offset:0 atIndex:4]; + [enc setBuffer:buf_edge_test offset:0 atIndex:5]; + [enc setBuffer:buf_params offset:0 atIndex:6]; + dispatchKernel(enc, pipeline_filter_stage1); + [enc endEncoding]; + } + + /* ------------------------------------------------------------------ */ + /* Stage 2: processPixelStage2 */ + /* ------------------------------------------------------------------ */ + { + id enc = [cmd computeCommandEncoder]; + [enc setComputePipelineState:pipeline_stage2]; + /* Use filtered a/b if bilateral filter was run, otherwise raw. */ + id a_src = config.EnableBilateralFilter ? buf_a_filtered : buf_a; + id b_src = config.EnableBilateralFilter ? buf_b_filtered : buf_b; + [enc setBuffer:a_src offset:0 atIndex:0]; + [enc setBuffer:b_src offset:0 atIndex:1]; + [enc setBuffer:buf_x_table offset:0 atIndex:2]; + [enc setBuffer:buf_z_table offset:0 atIndex:3]; + [enc setBuffer:buf_depth offset:0 atIndex:4]; + [enc setBuffer:buf_ir_sum offset:0 atIndex:5]; + [enc setBuffer:buf_params offset:0 atIndex:6]; + dispatchKernel(enc, pipeline_stage2); + [enc endEncoding]; + } + + /* ------------------------------------------------------------------ */ + /* Stage 2 filter (optional) */ + /* ------------------------------------------------------------------ */ + if(config.EnableEdgeAwareFilter) + { + id enc = [cmd computeCommandEncoder]; + [enc setComputePipelineState:pipeline_filter_stage2]; + [enc setBuffer:buf_depth offset:0 atIndex:0]; + [enc setBuffer:buf_ir_sum offset:0 atIndex:1]; + [enc setBuffer:buf_edge_test offset:0 atIndex:2]; + [enc setBuffer:buf_filtered offset:0 atIndex:3]; + [enc setBuffer:buf_params offset:0 atIndex:4]; + dispatchKernel(enc, pipeline_filter_stage2); + [enc endEncoding]; + } + + /* Commit and wait — process() must return with frames ready. */ + [cmd commit]; + [cmd waitUntilCompleted]; + + if([cmd status] == MTLCommandBufferStatusError) + { + LOG_ERROR << "MetalDepthPacketProcessor: command buffer execution error: " + << [[[cmd error] localizedDescription] UTF8String]; + return false; + } + + /* Copy results into output frames (zero-copy on shared storage). */ + const float *depth_src = config.EnableEdgeAwareFilter + ? (const float *)[buf_filtered contents] + : (const float *)[buf_depth contents]; + const float *ir_src = (const float *)[buf_ir contents]; + + memcpy(ir_frame->data, ir_src, IMAGE_SIZE * sizeof(float)); + memcpy(depth_frame->data, depth_src, IMAGE_SIZE * sizeof(float)); + + return true; + } + } +}; + +/* -------------------------------------------------------------------------- */ +/* MetalDepthPacketProcessor public interface */ +/* -------------------------------------------------------------------------- */ + +MetalDepthPacketProcessor::MetalDepthPacketProcessor(const int /*deviceIndex*/) + : impl_(new MetalDepthPacketProcessorImpl()) +{ +} + +MetalDepthPacketProcessor::~MetalDepthPacketProcessor() +{ + delete impl_; +} + +void MetalDepthPacketProcessor::setConfiguration(const libfreenect2::DepthPacketProcessor::Config &config) +{ + DepthPacketProcessor::setConfiguration(config); + impl_->config = config; + /* Re-upload parameters on next process() call via uploadParams(). */ +} + +void MetalDepthPacketProcessor::loadP0TablesFromCommandResponse(unsigned char *buffer, + size_t buffer_length) +{ + if(!impl_->device_initialized) + { + LOG_ERROR << "MetalDepthPacketProcessor: not initialized."; + return; + } + + libfreenect2::protocol::P0TablesResponse *p0table = + (libfreenect2::protocol::P0TablesResponse *)buffer; + + if(buffer_length < sizeof(libfreenect2::protocol::P0TablesResponse)) + { + LOG_ERROR << "P0Table response too short!"; + return; + } + + /* Convert uint16 p0 values to float radians and pack into float3 (float4) + * layout matching the Metal shader buffer expectation. */ + float *p0_dst = (float *)[impl_->buf_p0_table contents]; + + for(int r = 0; r < 424; ++r) + { + float *it = p0_dst + r * 512 * 4; /* 4 floats per float3 slot */ + const uint16_t *it0 = &p0table->p0table0[r * 512]; + const uint16_t *it1 = &p0table->p0table1[r * 512]; + const uint16_t *it2 = &p0table->p0table2[r * 512]; + + for(int c = 0; c < 512; ++c, it += 4, ++it0, ++it1, ++it2) + { + it[0] = -((float)*it0) * 0.000031f * (float)M_PI; + it[1] = -((float)*it1) * 0.000031f * (float)M_PI; + it[2] = -((float)*it2) * 0.000031f * (float)M_PI; + it[3] = 0.0f; + } + } +} + +void MetalDepthPacketProcessor::loadXZTables(const float *xtable, const float *ztable) +{ + if(!impl_->device_initialized) + { + LOG_ERROR << "MetalDepthPacketProcessor: not initialized."; + return; + } + + memcpy([impl_->buf_x_table contents], xtable, TABLE_SIZE * sizeof(float)); + memcpy([impl_->buf_z_table contents], ztable, TABLE_SIZE * sizeof(float)); +} + +void MetalDepthPacketProcessor::loadLookupTable(const short *lut) +{ + if(!impl_->device_initialized) + { + LOG_ERROR << "MetalDepthPacketProcessor: not initialized."; + return; + } + + memcpy([impl_->buf_lut11to16 contents], lut, LUT_SIZE * sizeof(short)); +} + +bool MetalDepthPacketProcessor::good() +{ + return impl_->device_initialized && impl_->runtime_ok; +} + +void MetalDepthPacketProcessor::process(const DepthPacket &packet) +{ + if(!listener_) + return; + + if(!impl_->device_initialized) + { + LOG_ERROR << "MetalDepthPacketProcessor: not initialized, dropping packet."; + return; + } + + impl_->ir_frame->timestamp = packet.timestamp; + impl_->depth_frame->timestamp = packet.timestamp; + impl_->ir_frame->sequence = packet.sequence; + impl_->depth_frame->sequence = packet.sequence; + + impl_->runtime_ok = impl_->run(packet); + + if(!impl_->runtime_ok) + { + impl_->ir_frame->status = 1; + impl_->depth_frame->status = 1; + } + + if(listener_->onNewFrame(Frame::Ir, impl_->ir_frame)) + { + impl_->ir_frame = NULL; // listener took ownership; don't let newIrFrame() delete it + impl_->newIrFrame(); + } + if(listener_->onNewFrame(Frame::Depth, impl_->depth_frame)) + { + impl_->depth_frame = NULL; // listener took ownership; don't let newDepthFrame() delete it + impl_->newDepthFrame(); + } +} + +} /* namespace libfreenect2 */ diff --git a/src/packet_pipeline.cpp b/src/packet_pipeline.cpp index e5d7ca315..f441a10c8 100644 --- a/src/packet_pipeline.cpp +++ b/src/packet_pipeline.cpp @@ -178,6 +178,15 @@ CudaKdePacketPipeline::CudaKdePacketPipeline(const int deviceId) : deviceId(devi CudaPacketPipeline::~CudaPacketPipeline() { } #endif // LIBFREENECT2_WITH_CUDA_SUPPORT +#ifdef LIBFREENECT2_WITH_METAL_SUPPORT +MetalPacketPipeline::MetalPacketPipeline(const int deviceId) : deviceId(deviceId) +{ + comp_->initialize(getDefaultRgbPacketProcessor(), new MetalDepthPacketProcessor(deviceId)); +} + +MetalPacketPipeline::~MetalPacketPipeline() { } +#endif // LIBFREENECT2_WITH_METAL_SUPPORT + DumpPacketPipeline::DumpPacketPipeline() { RgbPacketProcessor *rgb = new DumpRgbPacketProcessor();