From 2c6f904e9def9d9f543b1398385d0a55b06430da Mon Sep 17 00:00:00 2001 From: Matthew Kocher Date: Wed, 28 Jan 2026 09:37:07 -0800 Subject: [PATCH 1/2] Add cgroups v2 support for Jammy stemcells This enables warden stemcells to function on hosts that are using cgroups v2, which is increasingly common. This should not have any effects on other infrastructures as the stemcell kernel will continue to be booted with cgroups v1. --- .gitignore | 1 + .../bosh_monit/assets/monit-access-helper.sh | 24 ++++++++++++++----- .../assets/restrict-monit-api-access | 22 ++++++++++++----- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 0d92f36974..4fd9f7993c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ acceptance-tests/os-conf-release **/*.log ci/docker/VMware-ovftool-*.bundle +tmp/ diff --git a/stemcell_builder/stages/bosh_monit/assets/monit-access-helper.sh b/stemcell_builder/stages/bosh_monit/assets/monit-access-helper.sh index 3f66ca3a2d..2117b9cd05 100644 --- a/stemcell_builder/stages/bosh_monit/assets/monit-access-helper.sh +++ b/stemcell_builder/stages/bosh_monit/assets/monit-access-helper.sh @@ -13,12 +13,24 @@ monit_isolation_classid=2958295041 permit_monit_access() { - net_cls_location="$(cat /proc/self/mounts | grep ^cgroup | grep net_cls | awk '{ print $2 }' )" - net_cls_subproc="$(grep net_cls /proc/self/cgroup | awk -F ":" '{ print $3 }' )" - monit_access_cgroup="${net_cls_location}/${net_cls_subproc}/monit-api-access" + if grep -q '^0::' /proc/self/cgroup 2>/dev/null; then + # cgroupv2 (unified hierarchy) + # Create a sub-cgroup under the current process's cgroup and move into it. + # The iptables rules match on this cgroup path. + cgroup_mount="$(awk '$3 == "cgroup2" { print $2 }' /proc/self/mounts)" + current_cgroup="$(grep '^0::' /proc/self/cgroup | cut -d: -f3)" + monit_access_cgroup="${cgroup_mount}${current_cgroup}/monit-api-access" - mkdir -p "${monit_access_cgroup}" - echo "${monit_isolation_classid}" > "${monit_access_cgroup}/net_cls.classid" + mkdir -p "${monit_access_cgroup}" + echo $$ > "${monit_access_cgroup}/cgroup.procs" + else + # cgroupv1 - use net_cls classid + net_cls_location="$(cat /proc/self/mounts | grep ^cgroup | grep net_cls | awk '{ print $2 }')" + net_cls_subproc="$(grep net_cls /proc/self/cgroup | awk -F ":" '{ print $3 }')" + monit_access_cgroup="${net_cls_location}/${net_cls_subproc}/monit-api-access" - echo $$ > "${monit_access_cgroup}/tasks" + mkdir -p "${monit_access_cgroup}" + echo "${monit_isolation_classid}" > "${monit_access_cgroup}/net_cls.classid" + echo $$ > "${monit_access_cgroup}/tasks" + fi } diff --git a/stemcell_builder/stages/bosh_monit/assets/restrict-monit-api-access b/stemcell_builder/stages/bosh_monit/assets/restrict-monit-api-access index 4b25e2fbaf..bce892ee30 100644 --- a/stemcell_builder/stages/bosh_monit/assets/restrict-monit-api-access +++ b/stemcell_builder/stages/bosh_monit/assets/restrict-monit-api-access @@ -2,13 +2,23 @@ source /var/vcap/bosh/etc/monit-access-helper.sh -if iptables -t mangle -C POSTROUTING -d 127.0.0.1 -p tcp --dport 2822 \ - -m cgroup \! --cgroup "${monit_isolation_classid}" -j DROP -then - /bin/true +if grep -q '^0::' /proc/self/cgroup 2>/dev/null; then + # cgroupv2: dynamically determine the cgroup path for this process. + # The agent calls permit_monit_access() to join the monit-api-access sub-cgroup. + current_cgroup="$(grep '^0::' /proc/self/cgroup | cut -d: -f3)" + cgroup_match="--path ${current_cgroup}/monit-api-access" else + # cgroupv1: use the classid from monit-access-helper.sh + cgroup_match="--cgroup ${monit_isolation_classid}" +fi + +# Add iptables rules if they don't already exist. +# The DROP rule blocks traffic to monit (port 2822) from processes outside the monit cgroup. +# The ESTABLISHED,RELATED rule ensures existing connections aren't broken. +if ! iptables -t mangle -C POSTROUTING -d 127.0.0.1 -p tcp --dport 2822 \ + -m cgroup ! ${cgroup_match} -j DROP 2>/dev/null; then iptables -t mangle -I POSTROUTING -d 127.0.0.1 -p tcp --dport 2822 \ - -m cgroup \! --cgroup "${monit_isolation_classid}" -j DROP + -m cgroup ! ${cgroup_match} -j DROP iptables -t mangle -I POSTROUTING -d 127.0.0.1 -p tcp --dport 2822 \ - -m state --state ESTABLISHED,RELATED -j ACCEPT + -m state --state ESTABLISHED,RELATED -j ACCEPT fi From 2b272538dc47e78e86a57105b65793f096ebdeea Mon Sep 17 00:00:00 2001 From: Clay Kauzlaric Date: Wed, 4 Feb 2026 17:40:14 -0500 Subject: [PATCH 2/2] Prevent systemd-binfmt from running in containers When stemcells run as privileged containers (e.g., Docker CPI on Apple Silicon), systemd-binfmt clears the host's binfmt_misc registrations, including Rosetta, causing "exec format error" for x86_64 processes. Add a drop-in override with ConditionVirtualization=!container to skip the service in containers while preserving normal behavior on VMs. Signed-off-by: Matthew Kocher --- stemcell_builder/stages/bosh_systemd/apply.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/stemcell_builder/stages/bosh_systemd/apply.sh b/stemcell_builder/stages/bosh_systemd/apply.sh index 9edd672da8..b5cc017584 100755 --- a/stemcell_builder/stages/bosh_systemd/apply.sh +++ b/stemcell_builder/stages/bosh_systemd/apply.sh @@ -12,3 +12,14 @@ source $base_dir/lib/prelude_bosh.bash run_in_chroot $chroot " echo 'RemoveIPC=no' >> /etc/systemd/logind.conf " + +# Prevent systemd-binfmt from running in containers. +# When running in a privileged container (e.g., Docker CPI on Apple Silicon), +# this service clears the host's binfmt_misc registrations (including Rosetta), +# causing "exec format error" for all subsequent x86_64 processes. +mkdir -p $chroot/etc/systemd/system/systemd-binfmt.service.d + +cat > $chroot/etc/systemd/system/systemd-binfmt.service.d/skip-in-container.conf <