diff --git a/CMakeLists.txt b/CMakeLists.txt
index d8ef04736..5e9b1f717 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,11 @@ OPTION(ENABLE_OPENGL "Enable OpenGL support" ON)
 OPTION(ENABLE_VAAPI "Enable VA-API support" ON)
 OPTION(ENABLE_TEGRAJPEG "Enable Tegra HW JPEG support" ON)
 OPTION(ENABLE_PROFILING "Collect profiling stats (memory consuming)" OFF)
+IF(APPLE)
+  OPTION(ENABLE_METAL "Enable Metal GPU depth processing (Apple platforms)" ON)
+ELSE()
+  OPTION(ENABLE_METAL "Enable Metal GPU depth processing (Apple platforms)" OFF)
+ENDIF()
 
 IF(ENABLE_PROFILING)
   SET(LIBFREENECT2_WITH_PROFILING 1)
@@ -337,6 +342,54 @@ IF(ENABLE_OPENCL)
   ENDIF(OpenCL_FOUND)
 ENDIF(ENABLE_OPENCL)
 
+SET(HAVE_Metal "disabled (non-Apple)")
+IF(APPLE AND ENABLE_METAL)
+  SET(HAVE_Metal no)
+
+  SET(METAL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/metal_depth_packet_processor.metal")
+  SET(METAL_AIR    "${CMAKE_CURRENT_BINARY_DIR}/metal_depth_packet_processor.air")
+  SET(METAL_LIB    "${CMAKE_CURRENT_BINARY_DIR}/default.metallib")
+
+  ADD_CUSTOM_COMMAND(
+    OUTPUT ${METAL_AIR}
+    COMMAND xcrun -sdk macosx metal -c ${METAL_SOURCE} -o ${METAL_AIR}
+    DEPENDS ${METAL_SOURCE}
+    COMMENT "Compiling Metal shader to AIR"
+    VERBATIM
+  )
+
+  ADD_CUSTOM_COMMAND(
+    OUTPUT ${METAL_LIB}
+    COMMAND xcrun -sdk macosx metallib ${METAL_AIR} -o ${METAL_LIB}
+    DEPENDS ${METAL_AIR}
+    COMMENT "Linking Metal library"
+    VERBATIM
+  )
+
+  ADD_CUSTOM_TARGET(MetalShaders ALL DEPENDS ${METAL_LIB})
+
+  SET(LIBFREENECT2_WITH_METAL_SUPPORT 1)
+  SET(HAVE_Metal yes)
+
+  LIST(APPEND SOURCES
+    src/metal_depth_packet_processor.mm
+  )
+
+  LIST(APPEND LIBRARIES
+    "-framework Metal"
+    "-framework Foundation"
+  )
+
+  # Mark the .mm file for Objective-C++ compilation (CMake handles this via the
+  # extension, but the explicit property guards against edge cases).
+  SET_SOURCE_FILES_PROPERTIES(src/metal_depth_packet_processor.mm
+    PROPERTIES COMPILE_FLAGS "-x objective-c++"
+  )
+
+  # Install the compiled metallib alongside the dylib.
+  INSTALL(FILES ${METAL_LIB} DESTINATION lib)
+ENDIF(APPLE AND ENABLE_METAL)
+
 SET(HAVE_CUDA disabled)
 IF(ENABLE_CUDA)
   FIND_PACKAGE(CUDA)
@@ -417,6 +470,11 @@ GENERATE_RESOURCES(${RESOURCES_INC_FILE} ${MY_DIR} ${RESOURCES})
 
 ADD_DEFINITIONS(-DRESOURCES_INC)
 ADD_LIBRARY(freenect2 ${SOURCES})
+
+IF(APPLE AND ENABLE_METAL AND LIBFREENECT2_WITH_METAL_SUPPORT)
+  ADD_DEPENDENCIES(freenect2 MetalShaders)
+ENDIF()
+
 SET_TARGET_PROPERTIES(freenect2 PROPERTIES
   CXX_VISIBILITY_PRESET hidden
   VISIBILITY_INLINES_HIDDEN 1
diff --git a/include/internal/libfreenect2/depth_packet_processor.h b/include/internal/libfreenect2/depth_packet_processor.h
index ad74ed1d2..8d5d90a64 100644
--- a/include/internal/libfreenect2/depth_packet_processor.h
+++ b/include/internal/libfreenect2/depth_packet_processor.h
@@ -280,6 +280,31 @@ class CudaKdeDepthPacketProcessor : public DepthPacketProcessor
 };
 #endif // LIBFREENECT2_WITH_CUDA_SUPPORT
 
+#ifdef LIBFREENECT2_WITH_METAL_SUPPORT
+class MetalDepthPacketProcessorImpl;
+
+/** Depth packet processor using Apple Metal GPU compute. */
+class MetalDepthPacketProcessor : public DepthPacketProcessor
+{
+public:
+  MetalDepthPacketProcessor(const int deviceIndex = -1);
+  virtual ~MetalDepthPacketProcessor();
+  virtual void setConfiguration(const libfreenect2::DepthPacketProcessor::Config &config);
+
+  virtual void loadP0TablesFromCommandResponse(unsigned char *buffer, size_t buffer_length);
+
+  virtual void loadXZTables(const float *xtable, const float *ztable);
+  virtual void loadLookupTable(const short *lut);
+
+  virtual bool good();
+  virtual const char *name() { return "Metal"; }
+
+  virtual void process(const DepthPacket &packet);
+private:
+  MetalDepthPacketProcessorImpl *impl_;
+};
+#endif // LIBFREENECT2_WITH_METAL_SUPPORT
+
 class DumpDepthPacketProcessor : public DepthPacketProcessor
 {
  public:
diff --git a/include/libfreenect2/config.h.in b/include/libfreenect2/config.h.in
index 35a0f638c..92227f5ff 100644
--- a/include/libfreenect2/config.h.in
+++ b/include/libfreenect2/config.h.in
@@ -44,6 +44,8 @@
 #cmakedefine LIBFREENECT2_WITH_OPENCL_SUPPORT
 #cmakedefine LIBFREENECT2_OPENCL_ICD_LOADER_IS_OLD
 
+#cmakedefine LIBFREENECT2_WITH_METAL_SUPPORT
+
 #cmakedefine LIBFREENECT2_WITH_CUDA_SUPPORT
 
 #cmakedefine LIBFREENECT2_WITH_VT_SUPPORT
diff --git a/include/libfreenect2/packet_pipeline.h b/include/libfreenect2/packet_pipeline.h
index 08d6540e8..1c43a4c1e 100644
--- a/include/libfreenect2/packet_pipeline.h
+++ b/include/libfreenect2/packet_pipeline.h
@@ -156,6 +156,18 @@ class LIBFREENECT2_API CudaKdePacketPipeline : public PacketPipeline
 };
 #endif // LIBFREENECT2_WITH_CUDA_SUPPORT
 
+#ifdef LIBFREENECT2_WITH_METAL_SUPPORT
+/** Pipeline with Metal GPU depth processing (Apple platforms). */
+class LIBFREENECT2_API MetalPacketPipeline : public PacketPipeline
+{
+protected:
+  const int deviceId;
+public:
+  MetalPacketPipeline(const int deviceId = -1);
+  virtual ~MetalPacketPipeline();
+};
+#endif // LIBFREENECT2_WITH_METAL_SUPPORT
+
 ///@}
 } /* namespace libfreenect2 */
 #endif /* PACKET_PIPELINE_H_ */
diff --git a/src/libfreenect2.cpp b/src/libfreenect2.cpp
index 631a5780c..b10a868f3 100644
--- a/src/libfreenect2.cpp
+++ b/src/libfreenect2.cpp
@@ -1101,6 +1101,10 @@ PacketPipeline *createPacketPipelineByName(std::string name)
 #if defined(LIBFREENECT2_WITH_OPENCL_SUPPORT)
   if (name == "cl")
     return new OpenCLPacketPipeline();
+#endif
+#if defined(LIBFREENECT2_WITH_METAL_SUPPORT)
+  if (name == "metal")
+    return new MetalPacketPipeline();
 #endif
   if (name == "cpu")
     return new CpuPacketPipeline();
@@ -1123,6 +1127,8 @@ PacketPipeline *createDefaultPacketPipeline()
   return new OpenGLPacketPipeline();
 #elif defined(LIBFREENECT2_WITH_CUDA_SUPPORT)
   return new CudaPacketPipeline();
+#elif defined(LIBFREENECT2_WITH_METAL_SUPPORT)
+  return new MetalPacketPipeline();
 #elif defined(LIBFREENECT2_WITH_OPENCL_SUPPORT)
   return new OpenCLPacketPipeline();
 #else
diff --git a/src/metal_depth_packet_processor.metal b/src/metal_depth_packet_processor.metal
new file mode 100644
index 000000000..2beee9635
--- /dev/null
+++ b/src/metal_depth_packet_processor.metal
@@ -0,0 +1,502 @@
+/*
+ * This file is part of the OpenKinect Project. http://www.openkinect.org
+ *
+ * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file
+ * for details.
+ *
+ * This code is licensed to you under the terms of the Apache License, version
+ * 2.0, or, at your option, the terms of the GNU General Public License,
+ * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses,
+ * or the following URLs:
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * If you redistribute this file in source form, modified or unmodified, you
+ * may:
+ *   1) Leave this header intact and distribute it under the same terms,
+ *      accompanying it with the APACHE20 and GPL20 files, or
+ *   2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or
+ *   3) Delete the GPL v2 clause and accompany it with the APACHE20 file
+ * In all cases you must keep the copyright notice intact and include a copy
+ * of the CONTRIB file.
+ *
+ * Binary distributions must follow the binary distribution requirements of
+ * either License.
+ */
+
+/** @file metal_depth_packet_processor.metal Metal compute kernels for depth processing. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+/** Parameters struct passed as a constant buffer to all kernels.
+ * Mirrors libfreenect2::DepthPacketProcessor::Parameters. */
+struct MetalDepthParams
+{
+  float ab_multiplier;
+  float ab_multiplier_per_frq0;
+  float ab_multiplier_per_frq1;
+  float ab_multiplier_per_frq2;
+  float ab_output_multiplier;
+  float padding0[3];
+
+  float phase_in_rad0;
+  float phase_in_rad1;
+  float phase_in_rad2;
+  float padding1;
+
+  float joint_bilateral_ab_threshold;
+  float joint_bilateral_max_edge;
+  float joint_bilateral_exp;
+  float joint_bilateral_threshold; /* precomputed: (ab_threshold^2)/(ab_multiplier^2) */
+
+  float gaussian_kernel0;
+  float gaussian_kernel1;
+  float gaussian_kernel2;
+  float gaussian_kernel3;
+  float gaussian_kernel4;
+  float gaussian_kernel5;
+  float gaussian_kernel6;
+  float gaussian_kernel7;
+  float gaussian_kernel8;
+  float padding2[3];
+
+  float phase_offset;
+  float unambigious_dist;
+  float individual_ab_threshold;
+  float ab_threshold;
+  float ab_confidence_slope;
+  float ab_confidence_offset;
+  float min_dealias_confidence;
+  float max_dealias_confidence;
+
+  float edge_ab_avg_min_value;
+  float edge_ab_std_dev_threshold;
+  float edge_close_delta_threshold;
+  float edge_far_delta_threshold;
+  float edge_max_delta_threshold;
+  float edge_avg_delta_threshold;
+  float max_edge_count;
+  float padding3;
+
+  float min_depth;
+  float max_depth;
+  float padding4[2];
+};
+
+/*******************************************************************************
+ * Decode a packed 11-bit pixel measurement from the raw IR packet buffer.
+ ******************************************************************************/
+static float decodePixelMeasurement(
+    device const ushort *data,
+    device const short *lut11to16,
+    const uint sub,
+    const uint x,
+    const uint y)
+{
+  uint row_idx = (424u * sub + y) * 352u;
+  uint idx = (((x >> 2u) + ((x << 7u) & 0x180u)) * 11u) & 0xffffffffu;
+
+  uint col_idx = idx >> 4u;
+  uint upper_bytes = idx & 15u;
+  uint lower_bytes = 16u - upper_bytes;
+
+  uint data_idx0 = row_idx + col_idx;
+  uint data_idx1 = row_idx + col_idx + 1u;
+
+  uint packed = (x < 1u || 510u < x || col_idx > 352u) ? 0u :
+      ((uint(data[data_idx0]) >> upper_bytes) | (uint(data[data_idx1]) << lower_bytes)) & 2047u;
+
+  return float(lut11to16[packed]);
+}
+
+/*******************************************************************************
+ * Process pixel stage 1: phase unwrapping from raw IR data.
+ *
+ * One thread per pixel (linear index i = y * 512 + x).
+ * Reads raw packet data and p0 calibration tables, outputs complex IR vectors
+ * (a, b) and amplitude (n) for three modulation frequencies, plus a quick IR
+ * image for monitoring.
+ ******************************************************************************/
+kernel void processPixelStage1(
+    device const short   *lut11to16   [[ buffer(0) ]],
+    device const float   *z_table     [[ buffer(1) ]],
+    device const float3  *p0_table    [[ buffer(2) ]],
+    device const ushort  *data        [[ buffer(3) ]],
+    device float3        *a_out       [[ buffer(4) ]],
+    device float3        *b_out       [[ buffer(5) ]],
+    device float3        *n_out       [[ buffer(6) ]],
+    device float         *ir_out      [[ buffer(7) ]],
+    constant MetalDepthParams &params [[ buffer(8) ]],
+    uint i [[ thread_position_in_grid ]])
+{
+  const uint x = i % 512u;
+  const uint y = i / 512u;
+
+  /* The raw frame rows are stored in a rearranged order:
+   * bottom half of the sensor is stored first, top half second.
+   * y_in maps the output row index to the correct input row. */
+  const uint y_tmp = 423u - y;
+  const uint y_in = (y_tmp < 212u) ? y_tmp + 212u : 423u - y_tmp;
+
+  /* Pixel validity: z_table[i] <= 0 means no calibration data. */
+  const bool invalid = (0.0f >= z_table[i]);
+
+  /* Per-pixel phase offsets from calibration tables. */
+  const float3 p0 = p0_table[i];
+
+  /* Phase vector for the three modulation frequencies. */
+  const float3 phase = float3(params.phase_in_rad0, params.phase_in_rad1, params.phase_in_rad2);
+
+  /* Compute sin/cos for each frequency combined with the p0 offset. */
+  float3 p0x_cos, p0y_cos, p0z_cos;
+  float3 p0x_sin = -sincos(phase + p0.x, p0x_cos);
+  float3 p0y_sin = -sincos(phase + p0.y, p0y_cos);
+  float3 p0z_sin = -sincos(phase + p0.z, p0z_cos);
+
+  /* Decode the nine raw measurements (3 sub-frames per frequency). */
+  const float3 v0 = float3(decodePixelMeasurement(data, lut11to16, 0u, x, y_in),
+                           decodePixelMeasurement(data, lut11to16, 1u, x, y_in),
+                           decodePixelMeasurement(data, lut11to16, 2u, x, y_in));
+  const float3 v1 = float3(decodePixelMeasurement(data, lut11to16, 3u, x, y_in),
+                           decodePixelMeasurement(data, lut11to16, 4u, x, y_in),
+                           decodePixelMeasurement(data, lut11to16, 5u, x, y_in));
+  const float3 v2 = float3(decodePixelMeasurement(data, lut11to16, 6u, x, y_in),
+                           decodePixelMeasurement(data, lut11to16, 7u, x, y_in),
+                           decodePixelMeasurement(data, lut11to16, 8u, x, y_in));
+
+  /* Per-frequency multipliers for the ab (amplitude-bias) computation. */
+  const float3 ab_mult = float3(params.ab_multiplier_per_frq0,
+                                params.ab_multiplier_per_frq1,
+                                params.ab_multiplier_per_frq2);
+
+  /* Compute complex IR vectors a (real) and b (imaginary). */
+  float3 a = float3(dot(v0, p0x_cos), dot(v1, p0y_cos), dot(v2, p0z_cos)) * ab_mult;
+  float3 b = float3(dot(v0, p0x_sin), dot(v1, p0y_sin), dot(v2, p0z_sin)) * ab_mult;
+
+  /* Zero out invalid pixels. */
+  a = select(a, float3(0.0f), invalid);
+  b = select(b, float3(0.0f), invalid);
+  float3 n = sqrt(a * a + b * b);
+
+  /* Detect saturated measurements (raw value == 32767 in any sub-frame). */
+  const bool sat0 = any(v0 == float3(32767.0f));
+  const bool sat1 = any(v1 == float3(32767.0f));
+  const bool sat2 = any(v2 == float3(32767.0f));
+  const bool3 saturated = bool3(sat0, sat1, sat2);
+
+  /* Zero a/b for saturated frequencies; IR reports saturation as 65535. */
+  a_out[i] = select(a, float3(0.0f), saturated);
+  b_out[i] = select(b, float3(0.0f), saturated);
+  n_out[i] = n;
+
+  float3 n_or_sat = select(n, float3(65535.0f), saturated);
+  ir_out[i] = min(dot(n_or_sat, float3(0.333333333f * params.ab_multiplier * params.ab_output_multiplier)), 65535.0f);
+}
+
+/*******************************************************************************
+ * Filter pixel stage 1: joint bilateral filter on the complex IR vectors.
+ *
+ * Smooths the (a, b) outputs from stage 1 while preserving edges.
+ * Also computes the max_edge_test flag used by stage 2 edge filtering.
+ ******************************************************************************/
+kernel void filterPixelStage1(
+    device const float3 *a           [[ buffer(0) ]],
+    device const float3 *b           [[ buffer(1) ]],
+    device const float3 *n           [[ buffer(2) ]],
+    device float3       *a_out       [[ buffer(3) ]],
+    device float3       *b_out       [[ buffer(4) ]],
+    device uchar        *max_edge_test [[ buffer(5) ]],
+    constant MetalDepthParams &params [[ buffer(6) ]],
+    uint i [[ thread_position_in_grid ]])
+{
+  const uint x = i % 512u;
+  const uint y = i / 512u;
+
+  const float3 self_a = a[i];
+  const float3 self_b = b[i];
+
+  const float gaussian[9] = {
+    params.gaussian_kernel0, params.gaussian_kernel1, params.gaussian_kernel2,
+    params.gaussian_kernel3, params.gaussian_kernel4, params.gaussian_kernel5,
+    params.gaussian_kernel6, params.gaussian_kernel7, params.gaussian_kernel8
+  };
+
+  /* Border pixels: pass through without filtering and mark as valid edge. */
+  if(x < 1u || y < 1u || x > 510u || y > 422u)
+  {
+    a_out[i] = self_a;
+    b_out[i] = self_b;
+    max_edge_test[i] = 1;
+    return;
+  }
+
+  float3 threshold = float3(params.joint_bilateral_threshold);
+  float3 joint_bilateral_exp = float3(params.joint_bilateral_exp);
+
+  const float3 self_norm = n[i];
+  const float3 self_normalized_a = self_a / self_norm;
+  const float3 self_normalized_b = self_b / self_norm;
+
+  /* If the centre pixel's signal is too weak, disable distance weighting. */
+  const bool3 c0 = self_norm * self_norm < threshold;
+  threshold = select(threshold, float3(0.0f), c0);
+  joint_bilateral_exp = select(joint_bilateral_exp, float3(0.0f), c0);
+
+  float3 weight_acc = float3(0.0f);
+  float3 weighted_a_acc = float3(0.0f);
+  float3 weighted_b_acc = float3(0.0f);
+  float3 dist_acc = float3(0.0f);
+
+  for(int yi = -1, j = 0; yi < 2; ++yi)
+  {
+    uint i_other = uint(int(y) + yi) * 512u + x - 1u;
+
+    for(int xi = -1; xi < 2; ++xi, ++j, ++i_other)
+    {
+      const float3 other_a = a[i_other];
+      const float3 other_b = b[i_other];
+      const float3 other_norm = n[i_other];
+      const float3 other_normalized_a = other_a / other_norm;
+      const float3 other_normalized_b = other_b / other_norm;
+
+      const bool3 c1 = other_norm * other_norm < threshold;
+
+      const float3 dist = 0.5f * (1.0f - (self_normalized_a * other_normalized_a +
+                                           self_normalized_b * other_normalized_b));
+      const float3 weight = select(gaussian[j] * exp(-1.442695f * joint_bilateral_exp * dist),
+                                   float3(0.0f), c1);
+
+      weighted_a_acc += weight * other_a;
+      weighted_b_acc += weight * other_b;
+      weight_acc += weight;
+      dist_acc += select(dist, float3(0.0f), c1);
+    }
+  }
+
+  const bool3 c2 = weight_acc > float3(0.0f);
+  a_out[i] = select(float3(0.0f), weighted_a_acc / weight_acc, c2);
+  b_out[i] = select(float3(0.0f), weighted_b_acc / weight_acc, c2);
+
+  max_edge_test[i] = all(dist_acc < float3(params.joint_bilateral_max_edge)) ? 1u : 0u;
+}
+
+/*******************************************************************************
+ * Process pixel stage 2: depth calculation from unwrapped phase.
+ *
+ * Implements three-frequency phase disambiguation and converts the final phase
+ * to depth in millimetres using the x/z calibration tables.
+ ******************************************************************************/
+kernel void processPixelStage2(
+    device const float3 *a_in        [[ buffer(0) ]],
+    device const float3 *b_in        [[ buffer(1) ]],
+    device const float  *x_table     [[ buffer(2) ]],
+    device const float  *z_table     [[ buffer(3) ]],
+    device float        *depth       [[ buffer(4) ]],
+    device float        *ir_sums     [[ buffer(5) ]],
+    constant MetalDepthParams &params [[ buffer(6) ]],
+    uint i [[ thread_position_in_grid ]])
+{
+  float3 a = a_in[i];
+  float3 b = b_in[i];
+
+  float3 phase = atan2(b, a);
+  phase = select(phase, phase + 2.0f * M_PI_F, phase < float3(0.0f));
+  phase = select(phase, float3(0.0f), isnan(phase));
+  float3 ir = sqrt(a * a + b * b) * params.ab_multiplier;
+
+  float ir_sum = ir.x + ir.y + ir.z;
+  float ir_min = min(ir.x, min(ir.y, ir.z));
+
+  float phase_final = 0.0f;
+
+  if(ir_min >= params.individual_ab_threshold && ir_sum >= params.ab_threshold)
+  {
+    float3 t = phase / (2.0f * M_PI_F) * float3(3.0f, 15.0f, 2.0f);
+
+    float t0 = t.x;
+    float t1 = t.y;
+    float t2 = t.z;
+
+    float t5 = (floor((t1 - t0) * 0.333333f + 0.5f) * 3.0f + t0);
+    float t3 = (-t2 + t5);
+    float t4 = t3 * 2.0f;
+
+    bool c1 = t4 >= -t4;
+
+    float f1 = c1 ? 2.0f : -2.0f;
+    float f2 = c1 ? 0.5f : -0.5f;
+    t3 *= f2;
+    t3 = (t3 - floor(t3)) * f1;
+
+    bool c2 = 0.5f < abs(t3) && abs(t3) < 1.5f;
+
+    float t6 = c2 ? t5 + 15.0f : t5;
+    float t7 = c2 ? t1 + 15.0f : t1;
+
+    float t8 = (floor((-t2 + t6) * 0.5f + 0.5f) * 2.0f + t2) * 0.5f;
+
+    t6 *= 0.333333f;
+    t7 *= 0.066667f;
+
+    float t9 = (t8 + t6 + t7);
+    float t10 = t9 * 0.333333f;
+
+    t6 *= 2.0f * M_PI_F;
+    t7 *= 2.0f * M_PI_F;
+    t8 *= 2.0f * M_PI_F;
+
+    float t8_new = t7 * 0.826977f - t8 * 0.110264f;
+    float t6_new = t8 * 0.551318f - t6 * 0.826977f;
+    float t7_new = t6 * 0.110264f - t7 * 0.551318f;
+
+    t8 = t8_new;
+    t6 = t6_new;
+    t7 = t7_new;
+
+    float norm = t8 * t8 + t6 * t6 + t7 * t7;
+    float mask = t9 >= 0.0f ? 1.0f : 0.0f;
+    t10 *= mask;
+
+    bool slope_positive = 0 < params.ab_confidence_slope;
+
+    float ir_max = max(ir.x, max(ir.y, ir.z));
+    float ir_x = slope_positive ? ir_min : ir_max;
+
+    ir_x = log(ir_x);
+    ir_x = (ir_x * params.ab_confidence_slope * 0.301030f + params.ab_confidence_offset) * 3.321928f;
+    ir_x = exp(ir_x);
+    ir_x = clamp(ir_x, params.min_dealias_confidence, params.max_dealias_confidence);
+    ir_x *= ir_x;
+
+    float mask2 = ir_x >= norm ? 1.0f : 0.0f;
+    float t11 = t10 * mask2;
+
+    float mask3 = params.max_dealias_confidence * params.max_dealias_confidence >= norm ? 1.0f : 0.0f;
+    t10 *= mask3;
+    phase_final = t11;
+  }
+
+  float zmultiplier = z_table[i];
+  float xmultiplier = x_table[i];
+
+  phase_final = 0.0f < phase_final ? phase_final + params.phase_offset : phase_final;
+
+  float depth_linear = zmultiplier * phase_final;
+  float max_depth = phase_final * params.unambigious_dist * 2.0f;
+
+  bool cond1 = 0.0f < depth_linear && 0.0f < max_depth;
+
+  xmultiplier = (xmultiplier * 90.0f) / (max_depth * max_depth * 8192.0f);
+
+  float depth_fit = depth_linear / (-depth_linear * xmultiplier + 1.0f);
+  depth_fit = depth_fit < 0.0f ? 0.0f : depth_fit;
+
+  float d = cond1 ? depth_fit : depth_linear;
+  depth[i] = d;
+  ir_sums[i] = ir_sum;
+}
+
+/*******************************************************************************
+ * Filter pixel stage 2: edge-aware depth filter.
+ *
+ * Removes depth measurements at depth discontinuities where the IR signal
+ * variance indicates an unreliable reading.
+ ******************************************************************************/
+kernel void filterPixelStage2(
+    device const float *depth        [[ buffer(0) ]],
+    device const float *ir_sums      [[ buffer(1) ]],
+    device const uchar *max_edge_test [[ buffer(2) ]],
+    device float       *filtered     [[ buffer(3) ]],
+    constant MetalDepthParams &params [[ buffer(4) ]],
+    uint i [[ thread_position_in_grid ]])
+{
+  const uint x = i % 512u;
+  const uint y = i / 512u;
+
+  const float raw_depth = depth[i];
+  const float ir_sum = ir_sums[i];
+  const uchar edge_test = max_edge_test[i];
+
+  if(raw_depth >= params.min_depth && raw_depth <= params.max_depth)
+  {
+    if(x < 1u || y < 1u || x > 510u || y > 422u)
+    {
+      filtered[i] = raw_depth;
+    }
+    else
+    {
+      float ir_sum_acc = ir_sum;
+      float squared_ir_sum_acc = ir_sum * ir_sum;
+      float min_depth = raw_depth;
+      float max_depth = raw_depth;
+
+      for(int yi = -1; yi < 2; ++yi)
+      {
+        uint i_other = uint(int(y) + yi) * 512u + x - 1u;
+
+        for(int xi = -1; xi < 2; ++xi, ++i_other)
+        {
+          if(i_other == i)
+          {
+            continue;
+          }
+
+          const float raw_depth_other = depth[i_other];
+          const float ir_sum_other = ir_sums[i_other];
+
+          ir_sum_acc += ir_sum_other;
+          squared_ir_sum_acc += ir_sum_other * ir_sum_other;
+
+          if(0.0f < raw_depth_other)
+          {
+            min_depth = min(min_depth, raw_depth_other);
+            max_depth = max(max_depth, raw_depth_other);
+          }
+        }
+      }
+
+      float tmp0 = sqrt(squared_ir_sum_acc * 9.0f - ir_sum_acc * ir_sum_acc) / 9.0f;
+      float edge_avg = max(ir_sum_acc / 9.0f, params.edge_ab_avg_min_value);
+      tmp0 /= edge_avg;
+
+      float abs_min_diff = abs(raw_depth - min_depth);
+      float abs_max_diff = abs(raw_depth - max_depth);
+
+      float avg_diff = (abs_min_diff + abs_max_diff) * 0.5f;
+      float max_abs_diff = max(abs_min_diff, abs_max_diff);
+
+      bool cond0 =
+          0.0f < raw_depth &&
+          tmp0 >= params.edge_ab_std_dev_threshold &&
+          params.edge_close_delta_threshold < abs_min_diff &&
+          params.edge_far_delta_threshold < abs_max_diff &&
+          params.edge_max_delta_threshold < max_abs_diff &&
+          params.edge_avg_delta_threshold < avg_diff;
+
+      if(!cond0)
+      {
+        if(edge_test != 0)
+        {
+          /* tmp1 and edge_count would be used for a more sophisticated edge
+           * count filter; currently edge_count is always 0 so this path
+           * always passes depth through. */
+          float edge_count = 0.0f;
+          filtered[i] = edge_count > params.max_edge_count ? 0.0f : raw_depth;
+        }
+        else
+        {
+          filtered[i] = 0.0f;
+        }
+      }
+      else
+      {
+        filtered[i] = 0.0f;
+      }
+    }
+  }
+  else
+  {
+    filtered[i] = 0.0f;
+  }
+}
diff --git a/src/metal_depth_packet_processor.mm b/src/metal_depth_packet_processor.mm
new file mode 100644
index 000000000..c719846e2
--- /dev/null
+++ b/src/metal_depth_packet_processor.mm
@@ -0,0 +1,763 @@
+/*
+ * This file is part of the OpenKinect Project. http://www.openkinect.org
+ *
+ * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file
+ * for details.
+ *
+ * This code is licensed to you under the terms of the Apache License, version
+ * 2.0, or, at your option, the terms of the GNU General Public License,
+ * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses,
+ * or the following URLs:
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * If you redistribute this file in source form, modified or unmodified, you
+ * may:
+ *   1) Leave this header intact and distribute it under the same terms,
+ *      accompanying it with the APACHE20 and GPL20 files, or
+ *   2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or
+ *   3) Delete the GPL v2 clause and accompany it with the APACHE20 file
+ * In all cases you must keep the copyright notice intact and include a copy
+ * of the CONTRIB file.
+ *
+ * Binary distributions must follow the binary distribution requirements of
+ * either License.
+ */
+
+/** @file metal_depth_packet_processor.mm Metal GPU depth packet processor host code. */
+
+#include <libfreenect2/depth_packet_processor.h>
+#include <libfreenect2/protocol/response.h>
+#include <libfreenect2/logging.h>
+
+#import <Metal/Metal.h>
+#import <Foundation/Foundation.h>
+
+#include <string>
+#include <dlfcn.h>
+#include <cstring>
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+namespace libfreenect2
+{
+
+static const size_t IMAGE_SIZE = 512 * 424;
+static const size_t IMAGE_WIDTH = 512;
+static const size_t IMAGE_HEIGHT = 424;
+static const size_t LUT_SIZE = 2048;
+
+/** Parameters struct laid out to match the Metal shader's MetalDepthParams. */
+struct MetalDepthParamsBuffer
+{
+  float ab_multiplier;
+  float ab_multiplier_per_frq0;
+  float ab_multiplier_per_frq1;
+  float ab_multiplier_per_frq2;
+  float ab_output_multiplier;
+  float padding0[3];
+
+  float phase_in_rad0;
+  float phase_in_rad1;
+  float phase_in_rad2;
+  float padding1;
+
+  float joint_bilateral_ab_threshold;
+  float joint_bilateral_max_edge;
+  float joint_bilateral_exp;
+  float joint_bilateral_threshold;
+
+  float gaussian_kernel0;
+  float gaussian_kernel1;
+  float gaussian_kernel2;
+  float gaussian_kernel3;
+  float gaussian_kernel4;
+  float gaussian_kernel5;
+  float gaussian_kernel6;
+  float gaussian_kernel7;
+  float gaussian_kernel8;
+  float padding2[3];
+
+  float phase_offset;
+  float unambigious_dist;
+  float individual_ab_threshold;
+  float ab_threshold;
+  float ab_confidence_slope;
+  float ab_confidence_offset;
+  float min_dealias_confidence;
+  float max_dealias_confidence;
+
+  float edge_ab_avg_min_value;
+  float edge_ab_std_dev_threshold;
+  float edge_close_delta_threshold;
+  float edge_far_delta_threshold;
+  float edge_max_delta_threshold;
+  float edge_avg_delta_threshold;
+  float max_edge_count;
+  float padding3;
+
+  float min_depth;
+  float max_depth;
+  float padding4[2];
+};
+
+/** Populate a MetalDepthParamsBuffer from a DepthPacketProcessor::Parameters
+ *  and config values (MinDepth/MaxDepth in metres, converted to mm here). */
+static void fillParamsBuffer(MetalDepthParamsBuffer &dst,
+                             const DepthPacketProcessor::Parameters &p,
+                             const DepthPacketProcessor::Config &cfg)
+{
+  dst.ab_multiplier = p.ab_multiplier;
+  dst.ab_multiplier_per_frq0 = p.ab_multiplier_per_frq[0];
+  dst.ab_multiplier_per_frq1 = p.ab_multiplier_per_frq[1];
+  dst.ab_multiplier_per_frq2 = p.ab_multiplier_per_frq[2];
+  dst.ab_output_multiplier = p.ab_output_multiplier;
+  dst.padding0[0] = dst.padding0[1] = dst.padding0[2] = 0.0f;
+
+  dst.phase_in_rad0 = p.phase_in_rad[0];
+  dst.phase_in_rad1 = p.phase_in_rad[1];
+  dst.phase_in_rad2 = p.phase_in_rad[2];
+  dst.padding1 = 0.0f;
+
+  dst.joint_bilateral_ab_threshold = p.joint_bilateral_ab_threshold;
+  dst.joint_bilateral_max_edge = p.joint_bilateral_max_edge;
+  dst.joint_bilateral_exp = p.joint_bilateral_exp;
+  /* Precomputed threshold used by filterPixelStage1. */
+  dst.joint_bilateral_threshold = (p.joint_bilateral_ab_threshold * p.joint_bilateral_ab_threshold)
+                                  / (p.ab_multiplier * p.ab_multiplier);
+
+  dst.gaussian_kernel0 = p.gaussian_kernel[0];
+  dst.gaussian_kernel1 = p.gaussian_kernel[1];
+  dst.gaussian_kernel2 = p.gaussian_kernel[2];
+  dst.gaussian_kernel3 = p.gaussian_kernel[3];
+  dst.gaussian_kernel4 = p.gaussian_kernel[4];
+  dst.gaussian_kernel5 = p.gaussian_kernel[5];
+  dst.gaussian_kernel6 = p.gaussian_kernel[6];
+  dst.gaussian_kernel7 = p.gaussian_kernel[7];
+  dst.gaussian_kernel8 = p.gaussian_kernel[8];
+  dst.padding2[0] = dst.padding2[1] = dst.padding2[2] = 0.0f;
+
+  dst.phase_offset = p.phase_offset;
+  dst.unambigious_dist = p.unambigious_dist;
+  dst.individual_ab_threshold = p.individual_ab_threshold;
+  dst.ab_threshold = p.ab_threshold;
+  dst.ab_confidence_slope = p.ab_confidence_slope;
+  dst.ab_confidence_offset = p.ab_confidence_offset;
+  dst.min_dealias_confidence = p.min_dealias_confidence;
+  dst.max_dealias_confidence = p.max_dealias_confidence;
+
+  dst.edge_ab_avg_min_value = p.edge_ab_avg_min_value;
+  dst.edge_ab_std_dev_threshold = p.edge_ab_std_dev_threshold;
+  dst.edge_close_delta_threshold = p.edge_close_delta_threshold;
+  dst.edge_far_delta_threshold = p.edge_far_delta_threshold;
+  dst.edge_max_delta_threshold = p.edge_max_delta_threshold;
+  dst.edge_avg_delta_threshold = p.edge_avg_delta_threshold;
+  dst.max_edge_count = p.max_edge_count;
+  dst.padding3 = 0.0f;
+
+  /* Config values are in metres; shaders expect millimetres. */
+  dst.min_depth = cfg.MinDepth * 1000.0f;
+  dst.max_depth = cfg.MaxDepth * 1000.0f;
+  dst.padding4[0] = dst.padding4[1] = 0.0f;
+}
+
+/** PIMPL implementation struct holding all Metal objects. */
+class MetalDepthPacketProcessorImpl
+{
+public:
+  id<MTLDevice>              device;
+  id<MTLCommandQueue>        command_queue;
+
+  id<MTLComputePipelineState> pipeline_stage1;
+  id<MTLComputePipelineState> pipeline_filter_stage1;
+  id<MTLComputePipelineState> pipeline_stage2;
+  id<MTLComputePipelineState> pipeline_filter_stage2;
+
+  /* Static lookup/calibration buffers. */
+  id<MTLBuffer> buf_lut11to16;
+  id<MTLBuffer> buf_p0_table;   /* float3 (16-byte aligned), IMAGE_SIZE entries */
+  id<MTLBuffer> buf_x_table;
+  id<MTLBuffer> buf_z_table;
+
+  /* Per-frame input buffer (raw USB packet). */
+  id<MTLBuffer> buf_packet;
+
+  /* Intermediate GPU buffers. */
+  id<MTLBuffer> buf_a;
+  id<MTLBuffer> buf_b;
+  id<MTLBuffer> buf_n;
+  id<MTLBuffer> buf_ir;
+  id<MTLBuffer> buf_a_filtered;
+  id<MTLBuffer> buf_b_filtered;
+  id<MTLBuffer> buf_edge_test;
+  id<MTLBuffer> buf_depth;
+  id<MTLBuffer> buf_ir_sum;
+  id<MTLBuffer> buf_filtered;
+
+  /* Parameters constant buffer. */
+  id<MTLBuffer> buf_params;
+
+  /* Output frames. */
+  Frame *ir_frame;
+  Frame *depth_frame;
+
+  DepthPacketProcessor::Parameters params;
+  DepthPacketProcessor::Config config;
+
+  bool device_initialized;
+  bool runtime_ok;
+
+  MetalDepthPacketProcessorImpl()
+    : device(nil)
+    , command_queue(nil)
+    , pipeline_stage1(nil)
+    , pipeline_filter_stage1(nil)
+    , pipeline_stage2(nil)
+    , pipeline_filter_stage2(nil)
+    , buf_lut11to16(nil)
+    , buf_p0_table(nil)
+    , buf_x_table(nil)
+    , buf_z_table(nil)
+    , buf_packet(nil)
+    , buf_a(nil)
+    , buf_b(nil)
+    , buf_n(nil)
+    , buf_ir(nil)
+    , buf_a_filtered(nil)
+    , buf_b_filtered(nil)
+    , buf_edge_test(nil)
+    , buf_depth(nil)
+    , buf_ir_sum(nil)
+    , buf_filtered(nil)
+    , buf_params(nil)
+    , ir_frame(NULL)
+    , depth_frame(NULL)
+    , device_initialized(false)
+    , runtime_ok(true)
+  {
+    device_initialized = init();
+    if(device_initialized)
+    {
+      newIrFrame();
+      newDepthFrame();
+    }
+  }
+
+  ~MetalDepthPacketProcessorImpl()
+  {
+    delete ir_frame;
+    delete depth_frame;
+
+    /* ARC / manual release: set to nil to release all Metal objects. */
+    buf_params = nil;
+    buf_filtered = nil;
+    buf_ir_sum = nil;
+    buf_depth = nil;
+    buf_edge_test = nil;
+    buf_b_filtered = nil;
+    buf_a_filtered = nil;
+    buf_ir = nil;
+    buf_n = nil;
+    buf_b = nil;
+    buf_a = nil;
+    buf_packet = nil;
+    buf_z_table = nil;
+    buf_x_table = nil;
+    buf_p0_table = nil;
+    buf_lut11to16 = nil;
+    pipeline_filter_stage2 = nil;
+    pipeline_stage2 = nil;
+    pipeline_filter_stage1 = nil;
+    pipeline_stage1 = nil;
+    command_queue = nil;
+    device = nil;
+  }
+
+  /** Create a new (empty) IR output frame. */
+  void newIrFrame()
+  {
+    delete ir_frame;
+    ir_frame = new Frame(IMAGE_WIDTH, IMAGE_HEIGHT, 4);
+    ir_frame->format = Frame::Float;
+  }
+
+  /** Create a new (empty) depth output frame. */
+  void newDepthFrame()
+  {
+    delete depth_frame;
+    depth_frame = new Frame(IMAGE_WIDTH, IMAGE_HEIGHT, 4);
+    depth_frame->format = Frame::Float;
+  }
+
+  /** Allocate a shared-storage MTLBuffer (zero-copy on Apple Silicon). */
+  id<MTLBuffer> makeBuffer(size_t size)
+  {
+    id<MTLBuffer> buf = [device newBufferWithLength:size
+                                           options:MTLResourceStorageModeShared];
+    if(!buf)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: failed to allocate MTLBuffer of size " << size;
+    }
+    return buf;
+  }
+
+  /** Locate the compiled Metal library alongside the running dylib.
+   *
+   *  Strategy: walk up the dylib path to find the build/install directory,
+   *  then look for default.metallib next to it or in a known relative path.
+   *  Falls back to the executable's bundle or current working directory. */
+  id<MTLLibrary> loadMetalLibrary()
+  {
+    NSError *error = nil;
+    id<MTLLibrary> lib = nil;
+
+    /* 1. Try to load from the same directory as this dylib. */
+    Dl_info info;
+    if(dladdr((void *)&loadMetalLibrary_stub, &info) && info.dli_fname)
+    {
+      NSString *dylib_path = [NSString stringWithUTF8String:info.dli_fname];
+      NSString *dylib_dir = [dylib_path stringByDeletingLastPathComponent];
+      NSString *metallib_path = [dylib_dir stringByAppendingPathComponent:@"default.metallib"];
+
+      lib = [device newLibraryWithURL:[NSURL fileURLWithPath:metallib_path] error:&error];
+      if(lib)
+      {
+        LOG_INFO << "MetalDepthPacketProcessor: loaded Metal library from " << [metallib_path UTF8String];
+        return lib;
+      }
+    }
+
+    /* 2. Try the main bundle (useful when linked into an app). */
+    lib = [device newDefaultLibraryWithBundle:[NSBundle mainBundle] error:&error];
+    if(lib)
+    {
+      LOG_INFO << "MetalDepthPacketProcessor: loaded Metal library from main bundle";
+      return lib;
+    }
+
+    /* 3. Try the default library of the device (works if the .metal was compiled
+     *    into the app target itself). */
+    lib = [device newDefaultLibrary];
+    if(lib)
+    {
+      LOG_INFO << "MetalDepthPacketProcessor: using device default Metal library";
+      return lib;
+    }
+
+    LOG_ERROR << "MetalDepthPacketProcessor: could not find default.metallib. "
+              << "Make sure it is installed alongside libfreenect2.dylib.";
+    return nil;
+  }
+
+  /** Dummy static function whose address is used for dladdr dylib path lookup. */
+  static void loadMetalLibrary_stub() {}
+
+  /** Initialise the Metal device, command queue, pipelines, and buffers. */
+  bool init()
+  {
+    @autoreleasepool
+    {
+      device = MTLCreateSystemDefaultDevice();
+      if(!device)
+      {
+        LOG_ERROR << "MetalDepthPacketProcessor: no Metal device available.";
+        return false;
+      }
+
+      LOG_INFO << "MetalDepthPacketProcessor: using device " << [[device name] UTF8String];
+
+      command_queue = [device newCommandQueue];
+      if(!command_queue)
+      {
+        LOG_ERROR << "MetalDepthPacketProcessor: failed to create command queue.";
+        return false;
+      }
+
+      id<MTLLibrary> library = loadMetalLibrary();
+      if(!library)
+        return false;
+
+      if(!buildPipelines(library))
+        return false;
+
+      if(!allocateBuffers())
+        return false;
+
+      return true;
+    }
+  }
+
+  /** Build the four compute pipeline states from the Metal library. */
+  bool buildPipelines(id<MTLLibrary> library)
+  {
+    NSError *error = nil;
+
+    id<MTLFunction> fn_stage1 = [library newFunctionWithName:@"processPixelStage1"];
+    if(!fn_stage1)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: kernel 'processPixelStage1' not found.";
+      return false;
+    }
+    pipeline_stage1 = [device newComputePipelineStateWithFunction:fn_stage1 error:&error];
+    if(!pipeline_stage1)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: failed to build processPixelStage1 pipeline: "
+                << [[error localizedDescription] UTF8String];
+      return false;
+    }
+
+    id<MTLFunction> fn_filter1 = [library newFunctionWithName:@"filterPixelStage1"];
+    if(!fn_filter1)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: kernel 'filterPixelStage1' not found.";
+      return false;
+    }
+    pipeline_filter_stage1 = [device newComputePipelineStateWithFunction:fn_filter1 error:&error];
+    if(!pipeline_filter_stage1)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: failed to build filterPixelStage1 pipeline: "
+                << [[error localizedDescription] UTF8String];
+      return false;
+    }
+
+    id<MTLFunction> fn_stage2 = [library newFunctionWithName:@"processPixelStage2"];
+    if(!fn_stage2)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: kernel 'processPixelStage2' not found.";
+      return false;
+    }
+    pipeline_stage2 = [device newComputePipelineStateWithFunction:fn_stage2 error:&error];
+    if(!pipeline_stage2)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: failed to build processPixelStage2 pipeline: "
+                << [[error localizedDescription] UTF8String];
+      return false;
+    }
+
+    id<MTLFunction> fn_filter2 = [library newFunctionWithName:@"filterPixelStage2"];
+    if(!fn_filter2)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: kernel 'filterPixelStage2' not found.";
+      return false;
+    }
+    pipeline_filter_stage2 = [device newComputePipelineStateWithFunction:fn_filter2 error:&error];
+    if(!pipeline_filter_stage2)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: failed to build filterPixelStage2 pipeline: "
+                << [[error localizedDescription] UTF8String];
+      return false;
+    }
+
+    return true;
+  }
+
+  /** Allocate all Metal buffers. */
+  bool allocateBuffers()
+  {
+    /* float3 in Metal is stored 16-byte aligned (4 floats per element). */
+    const size_t float3_size = 4 * sizeof(float);
+
+    buf_lut11to16  = makeBuffer(LUT_SIZE * sizeof(short));
+    buf_p0_table   = makeBuffer(IMAGE_SIZE * float3_size);
+    buf_x_table    = makeBuffer(IMAGE_SIZE * sizeof(float));
+    buf_z_table    = makeBuffer(IMAGE_SIZE * sizeof(float));
+
+    /* Raw packet: 10 sub-frames, each 424 rows * 352 ushorts. */
+    buf_packet = makeBuffer(((IMAGE_SIZE * 11) / 16) * 10 * sizeof(unsigned short));
+
+    buf_a          = makeBuffer(IMAGE_SIZE * float3_size);
+    buf_b          = makeBuffer(IMAGE_SIZE * float3_size);
+    buf_n          = makeBuffer(IMAGE_SIZE * float3_size);
+    buf_ir         = makeBuffer(IMAGE_SIZE * sizeof(float));
+    buf_a_filtered = makeBuffer(IMAGE_SIZE * float3_size);
+    buf_b_filtered = makeBuffer(IMAGE_SIZE * float3_size);
+    buf_edge_test  = makeBuffer(IMAGE_SIZE * sizeof(uint8_t));
+    buf_depth      = makeBuffer(IMAGE_SIZE * sizeof(float));
+    buf_ir_sum     = makeBuffer(IMAGE_SIZE * sizeof(float));
+    buf_filtered   = makeBuffer(IMAGE_SIZE * sizeof(float));
+    buf_params     = makeBuffer(sizeof(MetalDepthParamsBuffer));
+
+    /* Verify that all allocations succeeded. */
+    if(!buf_lut11to16 || !buf_p0_table || !buf_x_table || !buf_z_table ||
+       !buf_packet || !buf_a || !buf_b || !buf_n || !buf_ir ||
+       !buf_a_filtered || !buf_b_filtered || !buf_edge_test ||
+       !buf_depth || !buf_ir_sum || !buf_filtered || !buf_params)
+    {
+      LOG_ERROR << "MetalDepthPacketProcessor: buffer allocation failed.";
+      return false;
+    }
+
+    return true;
+  }
+
+  /** Upload processing parameters to the GPU constant buffer. */
+  void uploadParams()
+  {
+    MetalDepthParamsBuffer *dst = (MetalDepthParamsBuffer *)[buf_params contents];
+    fillParamsBuffer(*dst, params, config);
+  }
+
+  /** Dispatch one compute pass using threadgroupsWithRemainder (non-uniform). */
+  void dispatchKernel(id<MTLComputeCommandEncoder> enc,
+                      id<MTLComputePipelineState> pso)
+  {
+    /* One thread per pixel over the 512x424 image. */
+    MTLSize threads_per_grid = MTLSizeMake(IMAGE_SIZE, 1, 1);
+    /* 64-thread threadgroups work well on all Apple GPU generations. */
+    NSUInteger tg_size = MIN(64u, [pso maxTotalThreadsPerThreadgroup]);
+    MTLSize threads_per_tg = MTLSizeMake(tg_size, 1, 1);
+    /* Non-uniform dispatch: handles IMAGE_SIZE not divisible by tg_size. */
+    [enc dispatchThreads:threads_per_grid threadsPerThreadgroup:threads_per_tg];
+  }
+
+  /** Run the full depth processing pipeline for one packet.
+   *
+   *  Stages dispatched:
+   *    1. processPixelStage1
+   *    2. filterPixelStage1  (only when bilateral filter is enabled)
+   *    3. processPixelStage2
+   *    4. filterPixelStage2  (only when edge-aware filter is enabled)
+   *
+   *  The call blocks until all GPU work is complete so that the output
+   *  frames are ready when process() returns. */
+  bool run(const DepthPacket &packet)
+  {
+    @autoreleasepool
+    {
+      /* Upload raw packet data (zero-copy on unified memory). */
+      memcpy([buf_packet contents], packet.buffer, MIN(packet.buffer_length, [buf_packet length]));
+
+      /* Upload current parameters. */
+      uploadParams();
+
+      id<MTLCommandBuffer> cmd = [command_queue commandBuffer];
+      if(!cmd)
+      {
+        LOG_ERROR << "MetalDepthPacketProcessor: failed to create command buffer.";
+        return false;
+      }
+
+      /* ------------------------------------------------------------------ */
+      /* Stage 1: processPixelStage1                                         */
+      /* ------------------------------------------------------------------ */
+      {
+        id<MTLComputeCommandEncoder> enc = [cmd computeCommandEncoder];
+        [enc setComputePipelineState:pipeline_stage1];
+        [enc setBuffer:buf_lut11to16 offset:0 atIndex:0];
+        [enc setBuffer:buf_z_table   offset:0 atIndex:1];
+        [enc setBuffer:buf_p0_table  offset:0 atIndex:2];
+        [enc setBuffer:buf_packet    offset:0 atIndex:3];
+        [enc setBuffer:buf_a         offset:0 atIndex:4];
+        [enc setBuffer:buf_b         offset:0 atIndex:5];
+        [enc setBuffer:buf_n         offset:0 atIndex:6];
+        [enc setBuffer:buf_ir        offset:0 atIndex:7];
+        [enc setBuffer:buf_params    offset:0 atIndex:8];
+        dispatchKernel(enc, pipeline_stage1);
+        [enc endEncoding];
+      }
+
+      /* ------------------------------------------------------------------ */
+      /* Stage 1 filter (optional)                                           */
+      /* ------------------------------------------------------------------ */
+      if(config.EnableBilateralFilter)
+      {
+        id<MTLComputeCommandEncoder> enc = [cmd computeCommandEncoder];
+        [enc setComputePipelineState:pipeline_filter_stage1];
+        [enc setBuffer:buf_a           offset:0 atIndex:0];
+        [enc setBuffer:buf_b           offset:0 atIndex:1];
+        [enc setBuffer:buf_n           offset:0 atIndex:2];
+        [enc setBuffer:buf_a_filtered  offset:0 atIndex:3];
+        [enc setBuffer:buf_b_filtered  offset:0 atIndex:4];
+        [enc setBuffer:buf_edge_test   offset:0 atIndex:5];
+        [enc setBuffer:buf_params      offset:0 atIndex:6];
+        dispatchKernel(enc, pipeline_filter_stage1);
+        [enc endEncoding];
+      }
+
+      /* ------------------------------------------------------------------ */
+      /* Stage 2: processPixelStage2                                         */
+      /* ------------------------------------------------------------------ */
+      {
+        id<MTLComputeCommandEncoder> enc = [cmd computeCommandEncoder];
+        [enc setComputePipelineState:pipeline_stage2];
+        /* Use filtered a/b if bilateral filter was run, otherwise raw. */
+        id<MTLBuffer> a_src = config.EnableBilateralFilter ? buf_a_filtered : buf_a;
+        id<MTLBuffer> b_src = config.EnableBilateralFilter ? buf_b_filtered : buf_b;
+        [enc setBuffer:a_src       offset:0 atIndex:0];
+        [enc setBuffer:b_src       offset:0 atIndex:1];
+        [enc setBuffer:buf_x_table offset:0 atIndex:2];
+        [enc setBuffer:buf_z_table offset:0 atIndex:3];
+        [enc setBuffer:buf_depth   offset:0 atIndex:4];
+        [enc setBuffer:buf_ir_sum  offset:0 atIndex:5];
+        [enc setBuffer:buf_params  offset:0 atIndex:6];
+        dispatchKernel(enc, pipeline_stage2);
+        [enc endEncoding];
+      }
+
+      /* ------------------------------------------------------------------ */
+      /* Stage 2 filter (optional)                                           */
+      /* ------------------------------------------------------------------ */
+      if(config.EnableEdgeAwareFilter)
+      {
+        id<MTLComputeCommandEncoder> enc = [cmd computeCommandEncoder];
+        [enc setComputePipelineState:pipeline_filter_stage2];
+        [enc setBuffer:buf_depth      offset:0 atIndex:0];
+        [enc setBuffer:buf_ir_sum     offset:0 atIndex:1];
+        [enc setBuffer:buf_edge_test  offset:0 atIndex:2];
+        [enc setBuffer:buf_filtered   offset:0 atIndex:3];
+        [enc setBuffer:buf_params     offset:0 atIndex:4];
+        dispatchKernel(enc, pipeline_filter_stage2);
+        [enc endEncoding];
+      }
+
+      /* Commit and wait — process() must return with frames ready. */
+      [cmd commit];
+      [cmd waitUntilCompleted];
+
+      if([cmd status] == MTLCommandBufferStatusError)
+      {
+        LOG_ERROR << "MetalDepthPacketProcessor: command buffer execution error: "
+                  << [[[cmd error] localizedDescription] UTF8String];
+        return false;
+      }
+
+      /* Copy results into output frames (zero-copy on shared storage). */
+      const float *depth_src = config.EnableEdgeAwareFilter
+                               ? (const float *)[buf_filtered contents]
+                               : (const float *)[buf_depth contents];
+      const float *ir_src = (const float *)[buf_ir contents];
+
+      memcpy(ir_frame->data,    ir_src,    IMAGE_SIZE * sizeof(float));
+      memcpy(depth_frame->data, depth_src, IMAGE_SIZE * sizeof(float));
+
+      return true;
+    }
+  }
+};
+
+/* -------------------------------------------------------------------------- */
+/* MetalDepthPacketProcessor public interface                                  */
+/* -------------------------------------------------------------------------- */
+
+MetalDepthPacketProcessor::MetalDepthPacketProcessor(const int /*deviceIndex*/)
+  : impl_(new MetalDepthPacketProcessorImpl())
+{
+}
+
+MetalDepthPacketProcessor::~MetalDepthPacketProcessor()
+{
+  delete impl_;
+}
+
+void MetalDepthPacketProcessor::setConfiguration(const libfreenect2::DepthPacketProcessor::Config &config)
+{
+  DepthPacketProcessor::setConfiguration(config);
+  impl_->config = config;
+  /* Re-upload parameters on next process() call via uploadParams(). */
+}
+
+void MetalDepthPacketProcessor::loadP0TablesFromCommandResponse(unsigned char *buffer,
+                                                                 size_t buffer_length)
+{
+  if(!impl_->device_initialized)
+  {
+    LOG_ERROR << "MetalDepthPacketProcessor: not initialized.";
+    return;
+  }
+
+  libfreenect2::protocol::P0TablesResponse *p0table =
+      (libfreenect2::protocol::P0TablesResponse *)buffer;
+
+  if(buffer_length < sizeof(libfreenect2::protocol::P0TablesResponse))
+  {
+    LOG_ERROR << "P0Table response too short!";
+    return;
+  }
+
+  /* Convert uint16 p0 values to float radians and pack into float3 (float4)
+   * layout matching the Metal shader buffer expectation. */
+  float *p0_dst = (float *)[impl_->buf_p0_table contents];
+
+  for(int r = 0; r < 424; ++r)
+  {
+    float *it = p0_dst + r * 512 * 4; /* 4 floats per float3 slot */
+    const uint16_t *it0 = &p0table->p0table0[r * 512];
+    const uint16_t *it1 = &p0table->p0table1[r * 512];
+    const uint16_t *it2 = &p0table->p0table2[r * 512];
+
+    for(int c = 0; c < 512; ++c, it += 4, ++it0, ++it1, ++it2)
+    {
+      it[0] = -((float)*it0) * 0.000031f * (float)M_PI;
+      it[1] = -((float)*it1) * 0.000031f * (float)M_PI;
+      it[2] = -((float)*it2) * 0.000031f * (float)M_PI;
+      it[3] = 0.0f;
+    }
+  }
+}
+
+void MetalDepthPacketProcessor::loadXZTables(const float *xtable, const float *ztable)
+{
+  if(!impl_->device_initialized)
+  {
+    LOG_ERROR << "MetalDepthPacketProcessor: not initialized.";
+    return;
+  }
+
+  memcpy([impl_->buf_x_table contents], xtable, TABLE_SIZE * sizeof(float));
+  memcpy([impl_->buf_z_table contents], ztable, TABLE_SIZE * sizeof(float));
+}
+
+void MetalDepthPacketProcessor::loadLookupTable(const short *lut)
+{
+  if(!impl_->device_initialized)
+  {
+    LOG_ERROR << "MetalDepthPacketProcessor: not initialized.";
+    return;
+  }
+
+  memcpy([impl_->buf_lut11to16 contents], lut, LUT_SIZE * sizeof(short));
+}
+
+bool MetalDepthPacketProcessor::good()
+{
+  return impl_->device_initialized && impl_->runtime_ok;
+}
+
+void MetalDepthPacketProcessor::process(const DepthPacket &packet)
+{
+  if(!listener_)
+    return;
+
+  if(!impl_->device_initialized)
+  {
+    LOG_ERROR << "MetalDepthPacketProcessor: not initialized, dropping packet.";
+    return;
+  }
+
+  impl_->ir_frame->timestamp = packet.timestamp;
+  impl_->depth_frame->timestamp = packet.timestamp;
+  impl_->ir_frame->sequence = packet.sequence;
+  impl_->depth_frame->sequence = packet.sequence;
+
+  impl_->runtime_ok = impl_->run(packet);
+
+  if(!impl_->runtime_ok)
+  {
+    impl_->ir_frame->status = 1;
+    impl_->depth_frame->status = 1;
+  }
+
+  if(listener_->onNewFrame(Frame::Ir, impl_->ir_frame))
+  {
+    impl_->ir_frame = NULL;   // listener took ownership; don't let newIrFrame() delete it
+    impl_->newIrFrame();
+  }
+  if(listener_->onNewFrame(Frame::Depth, impl_->depth_frame))
+  {
+    impl_->depth_frame = NULL; // listener took ownership; don't let newDepthFrame() delete it
+    impl_->newDepthFrame();
+  }
+}
+
+} /* namespace libfreenect2 */
diff --git a/src/packet_pipeline.cpp b/src/packet_pipeline.cpp
index e5d7ca315..f441a10c8 100644
--- a/src/packet_pipeline.cpp
+++ b/src/packet_pipeline.cpp
@@ -178,6 +178,15 @@ CudaKdePacketPipeline::CudaKdePacketPipeline(const int deviceId) : deviceId(devi
 CudaPacketPipeline::~CudaPacketPipeline() { }
 #endif // LIBFREENECT2_WITH_CUDA_SUPPORT
 
+#ifdef LIBFREENECT2_WITH_METAL_SUPPORT
+MetalPacketPipeline::MetalPacketPipeline(const int deviceId) : deviceId(deviceId)
+{
+  comp_->initialize(getDefaultRgbPacketProcessor(), new MetalDepthPacketProcessor(deviceId));
+}
+
+MetalPacketPipeline::~MetalPacketPipeline() { }
+#endif // LIBFREENECT2_WITH_METAL_SUPPORT
+
 DumpPacketPipeline::DumpPacketPipeline()
 {
   RgbPacketProcessor *rgb = new DumpRgbPacketProcessor();