diff --git a/.gitignore b/.gitignore index 0d92f36974..4fd9f7993c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ acceptance-tests/os-conf-release **/*.log ci/docker/VMware-ovftool-*.bundle +tmp/ diff --git a/stemcell_builder/stages/bosh_monit/assets/monit-access-helper.sh b/stemcell_builder/stages/bosh_monit/assets/monit-access-helper.sh index 3f66ca3a2d..2117b9cd05 100644 --- a/stemcell_builder/stages/bosh_monit/assets/monit-access-helper.sh +++ b/stemcell_builder/stages/bosh_monit/assets/monit-access-helper.sh @@ -13,12 +13,24 @@ monit_isolation_classid=2958295041 permit_monit_access() { - net_cls_location="$(cat /proc/self/mounts | grep ^cgroup | grep net_cls | awk '{ print $2 }' )" - net_cls_subproc="$(grep net_cls /proc/self/cgroup | awk -F ":" '{ print $3 }' )" - monit_access_cgroup="${net_cls_location}/${net_cls_subproc}/monit-api-access" + if grep -q '^0::' /proc/self/cgroup 2>/dev/null; then + # cgroupv2 (unified hierarchy) + # Create a sub-cgroup under the current process's cgroup and move into it. + # The iptables rules match on this cgroup path. + cgroup_mount="$(awk '$3 == "cgroup2" { print $2 }' /proc/self/mounts)" + current_cgroup="$(grep '^0::' /proc/self/cgroup | cut -d: -f3)" + monit_access_cgroup="${cgroup_mount}${current_cgroup}/monit-api-access" - mkdir -p "${monit_access_cgroup}" - echo "${monit_isolation_classid}" > "${monit_access_cgroup}/net_cls.classid" + mkdir -p "${monit_access_cgroup}" + echo $$ > "${monit_access_cgroup}/cgroup.procs" + else + # cgroupv1 - use net_cls classid + net_cls_location="$(cat /proc/self/mounts | grep ^cgroup | grep net_cls | awk '{ print $2 }')" + net_cls_subproc="$(grep net_cls /proc/self/cgroup | awk -F ":" '{ print $3 }')" + monit_access_cgroup="${net_cls_location}/${net_cls_subproc}/monit-api-access" - echo $$ > "${monit_access_cgroup}/tasks" + mkdir -p "${monit_access_cgroup}" + echo "${monit_isolation_classid}" > "${monit_access_cgroup}/net_cls.classid" + echo $$ > "${monit_access_cgroup}/tasks" + fi } diff --git a/stemcell_builder/stages/bosh_monit/assets/restrict-monit-api-access b/stemcell_builder/stages/bosh_monit/assets/restrict-monit-api-access index 4b25e2fbaf..bce892ee30 100644 --- a/stemcell_builder/stages/bosh_monit/assets/restrict-monit-api-access +++ b/stemcell_builder/stages/bosh_monit/assets/restrict-monit-api-access @@ -2,13 +2,23 @@ source /var/vcap/bosh/etc/monit-access-helper.sh -if iptables -t mangle -C POSTROUTING -d 127.0.0.1 -p tcp --dport 2822 \ - -m cgroup \! --cgroup "${monit_isolation_classid}" -j DROP -then - /bin/true +if grep -q '^0::' /proc/self/cgroup 2>/dev/null; then + # cgroupv2: dynamically determine the cgroup path for this process. + # The agent calls permit_monit_access() to join the monit-api-access sub-cgroup. + current_cgroup="$(grep '^0::' /proc/self/cgroup | cut -d: -f3)" + cgroup_match="--path ${current_cgroup}/monit-api-access" else + # cgroupv1: use the classid from monit-access-helper.sh + cgroup_match="--cgroup ${monit_isolation_classid}" +fi + +# Add iptables rules if they don't already exist. +# The DROP rule blocks traffic to monit (port 2822) from processes outside the monit cgroup. +# The ESTABLISHED,RELATED rule ensures existing connections aren't broken. +if ! iptables -t mangle -C POSTROUTING -d 127.0.0.1 -p tcp --dport 2822 \ + -m cgroup ! ${cgroup_match} -j DROP 2>/dev/null; then iptables -t mangle -I POSTROUTING -d 127.0.0.1 -p tcp --dport 2822 \ - -m cgroup \! --cgroup "${monit_isolation_classid}" -j DROP + -m cgroup ! ${cgroup_match} -j DROP iptables -t mangle -I POSTROUTING -d 127.0.0.1 -p tcp --dport 2822 \ - -m state --state ESTABLISHED,RELATED -j ACCEPT + -m state --state ESTABLISHED,RELATED -j ACCEPT fi diff --git a/stemcell_builder/stages/bosh_systemd/apply.sh b/stemcell_builder/stages/bosh_systemd/apply.sh index 9edd672da8..b5cc017584 100755 --- a/stemcell_builder/stages/bosh_systemd/apply.sh +++ b/stemcell_builder/stages/bosh_systemd/apply.sh @@ -12,3 +12,14 @@ source $base_dir/lib/prelude_bosh.bash run_in_chroot $chroot " echo 'RemoveIPC=no' >> /etc/systemd/logind.conf " + +# Prevent systemd-binfmt from running in containers. +# When running in a privileged container (e.g., Docker CPI on Apple Silicon), +# this service clears the host's binfmt_misc registrations (including Rosetta), +# causing "exec format error" for all subsequent x86_64 processes. +mkdir -p $chroot/etc/systemd/system/systemd-binfmt.service.d + +cat > $chroot/etc/systemd/system/systemd-binfmt.service.d/skip-in-container.conf <