From 978d390c8dd0f45e66a9ac674013b512a3d15cd3 Mon Sep 17 00:00:00 2001 From: ttbombadil Date: Wed, 8 Apr 2026 16:07:51 +0200 Subject: [PATCH 1/2] fix(e2e): docker recovery retry + null-safe getSandboxStatus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root causes of E2E failures: 1. Docker Desktop becomes temporarily unresponsive after 74s of heavy Docker tests on macOS. The single 'docker info' check right after the tests was failing, causing the E2E server to start without FORCE_DOCKER=1. All 5 sandbox runners then attempted eager docker checks, found the socket broken, and fell back to local-only mode (which has race conditions). Fix: replace the one-shot 'if docker info' with a 30-second retry loop (10 × 3s) that waits for Docker Desktop to recover before starting the E2E server and setting FORCE_DOCKER=1. 2. ws.on('close') sets clientState.runner = null while handleRunMessage is awaiting runSketch(), creating a race with the subsequent clientState.runner.getSandboxStatus() call - producing 'Cannot read properties of null' errors. Fix: capture runner reference before the await gap so getSandboxStatus() always has a valid non-null reference. --- run-tests.sh | 20 +++++++++++++++++--- server/routes/simulation.ws.ts | 6 +++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/run-tests.sh b/run-tests.sh index 79bb6c32..c122748c 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -149,14 +149,28 @@ fi # --- VORBEREITUNG SERVER (Kein nummerierter Task) --- echo -e "\n${B}▸ [Vorbereitung] Server-Start${RS}" lsof -ti:3000 | xargs kill -9 2>/dev/null || true -sleep 1 + +# Docker-Stabilisierung: Nach intensiven Container-Tests kurz auf Erholung warten. +# Docker Desktop auf macOS wird nach 74s Heavy-Load temporär unresponsive; 30s Retry. 
+DOCKER_FOR_E2E=0 +for _i in {1..10}; do + if docker info > /dev/null 2>&1; then + DOCKER_FOR_E2E=1 + [ "$_i" -gt 1 ] && echo -e " ${OK} Docker nach $((_i * 3))s wiederhergestellt" + break + fi + [ "$_i" -eq 1 ] && echo -e " ${RUN} Docker antwortet nicht – warte auf Erholung (max. 30s)..." + sleep 3 +done export PORT=3000 # Server startet im Hintergrund (NODE_ENV=development für Vite-Snapshots) -# FORCE_DOCKER + DOCKER_SANDBOX_IMAGE werden gesetzt, wenn Docker verfügbar ist (s. oben) -if docker info > /dev/null 2>&1; then +# FORCE_DOCKER + DOCKER_SANDBOX_IMAGE werden gesetzt, wenn Docker stabil verfügbar ist +if [ "$DOCKER_FOR_E2E" -eq 1 ]; then + echo -e " ${OK} Docker verfügbar – E2E mit Sandbox-Unterstützung" FORCE_DOCKER=1 DOCKER_SANDBOX_IMAGE=$DOCKER_SANDBOX_IMAGE UNOSIM_SHARED_TEMP_DIR=$UNOSIM_SHARED_TEMP_DIR NODE_ENV=development npm run dev >> "$LOG_FILE" 2>&1 & else + echo -e " ${WARN} Docker nicht verfügbar – E2E im Lokal-Modus" NODE_ENV=development npm run dev >> "$LOG_FILE" 2>&1 & fi SERVER_PID=$! diff --git a/server/routes/simulation.ws.ts b/server/routes/simulation.ws.ts index d666ae5f..ba4265e8 100644 --- a/server/routes/simulation.ws.ts +++ b/server/routes/simulation.ws.ts @@ -416,6 +416,10 @@ export function registerSimulationWebSocket(httpServer: Server, deps: Simulation ); } + // Capture runner reference before await – ws-close may set clientState.runner=null + // concurrently while runSketch is awaited, causing a null-dereference on getSandboxStatus. 
+ const runnerForStatus = clientState.runner; + // Start sketch execution and publish sandbox mode once the runner has resolved try { await clientState.runner.runSketch({ @@ -436,7 +440,7 @@ export function registerSimulationWebSocket(httpServer: Server, deps: Simulation logger.error(`[Simulation] runSketch failed: ${error}`); } - const sandboxStatus = clientState.runner.getSandboxStatus(); + const sandboxStatus = runnerForStatus.getSandboxStatus(); const poolStats = pool.getStats(); const workerIndex = pool.getRunnerIndex(clientState.runner); sendMessageToClient(ws, { From 1ae5fcb92d8a5c8172ee5a91b44efd1bdcbba73b Mon Sep 17 00:00:00 2001 From: ttbombadil Date: Wed, 8 Apr 2026 16:39:23 +0200 Subject: [PATCH 2/2] fix(ci): robust pre-flight checks and docker cleanup in run-tests.sh root causes of Docker Desktop crash + E2E failures: 1. docker-compose unosim-server occupied port 3000 (restart: unless-stopped). lsof kill only removed docker-proxy, container kept running/restarting. 2. stale exited sandbox containers piled up across runs (8 found). cleanup only filtered by current ancestor image, missing old image IDs. 3. no container cleanup between Docker tests (74s) and E2E server start. Docker Desktop became overloaded and temporarily unresponsive. 
fixes: - add pre-flight section: checks node, docker, sonarqube, port 3000 - stop docker-compose unosim-server before tests (port 3000 conflict) - find_sandbox_containers() helper: name + command-based matching catches containers from any image version (including unnamed/old-ID ones) - clean stale containers in pre-flight and after Docker tests - export FORCE_DOCKER=1 (not just inline) for reliable propagation - docker recovery retry loop only when docker was available initially proven: ./run-tests.sh passes all 7 steps + sonarqube quality gate 1402 unit tests, 32 docker tests, 16 E2E tests, 0 issues --- run-tests.sh | 111 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 20 deletions(-) diff --git a/run-tests.sh b/run-tests.sh index c122748c..bd593e65 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -28,17 +28,26 @@ OK="${G}✔${RS}"; FAIL="${R}✘${RS}"; RUN="${Y}◌${RS}"; WARN="${Y}⚠${RS}" div() { printf "${D}────────────────────────────────────────────────${RS}\n"; } +# Helfer: Alle Sandbox-Container finden (Name- UND Kommando-basiert, +# damit auch namenlose Container mit alten Image-IDs erfasst werden). 
+find_sandbox_containers() { + local filter="${1:---filter status=exited}" # default: nur beendete + { + docker ps -aq $filter --filter "name=unosim-sandbox" 2>/dev/null + docker ps -a $filter --format '{{.ID}} {{.Command}}' 2>/dev/null \ + | grep 'g++ /sandbox' | awk '{print $1}' + } | sort -u +} + # Aufräum-Funktion bei Abbruch oder Ende cleanup() { if [ -n "$SERVER_PID" ]; then kill "$SERVER_PID" 2>/dev/null fi - # Sandbox-Container aufräumen (ephemeral, aus unosim-sandbox:latest entstanden) if docker info > /dev/null 2>&1; then local containers - containers=$(docker ps -aq --filter "ancestor=$DOCKER_SANDBOX_IMAGE" 2>/dev/null) + containers=$(find_sandbox_containers) if [ -n "$containers" ]; then - echo "$containers" | xargs docker stop --time 5 > /dev/null 2>&1 || true echo "$containers" | xargs docker rm -f > /dev/null 2>&1 || true fi fi @@ -106,7 +115,63 @@ div rm -f "$LOG_FILE" [ -d temp ] && rm -rf temp/* -# Pre-Flight: Altlasten bereinigen (kein nummerierter Schritt) +# ─────── PRE-FLIGHT: Systemvoraussetzungen ─────── +echo -e "\n${B}▸ [Pre-Flight] Systemvoraussetzungen${RS}" + +# Node.js / npm +if ! 
command -v npm &>/dev/null; then + echo -e " ${FAIL} npm nicht gefunden – bitte Node.js installieren" + exit 1 +fi +echo -e " ${OK} Node.js $(node -v)" + +# Docker +DOCKER_AVAILABLE=0 +if command -v docker &>/dev/null && docker info >/dev/null 2>&1; then + DOCKER_AVAILABLE=1 + echo -e " ${OK} Docker $(docker version --format '{{.Client.Version}}' 2>/dev/null)" + + # docker-compose unosim-server prüfen (Port-3000-Konflikt mit dem E2E-Dev-Server) + if docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^unosim-server$"; then + echo -e " ${WARN} docker-compose unosim-server blockiert Port 3000 – wird für Tests gestoppt" + docker stop unosim-server >/dev/null 2>&1 || true + fi + + # Stale Sandbox-Container aufräumen (Name + Kommando-basiert → fängt auch alte Image-IDs) + stale=$(find_sandbox_containers) + if [ -n "$stale" ]; then + count=$(echo "$stale" | wc -l | tr -d ' ') + echo "$stale" | xargs docker rm -f >/dev/null 2>&1 + echo -e " ${OK} $count alte Sandbox-Container bereinigt" + fi + + # Sandbox Image + if docker image inspect "$DOCKER_SANDBOX_IMAGE" >/dev/null 2>&1; then + echo -e " ${OK} Sandbox Image vorhanden" + else + echo -e " ${WARN} Sandbox Image fehlt – wird bei Bedarf gebaut" + fi +else + echo -e " ${WARN} Docker nicht verfügbar – Docker-Tests und Sandbox werden übersprungen" +fi + +# Port 3000 freigeben (nach Docker-Stop, damit kein docker-proxy mehr übrig ist) +if lsof -ti:3000 >/dev/null 2>&1; then + lsof -ti:3000 | xargs kill -9 2>/dev/null || true + sleep 1 + echo -e " ${OK} Port 3000 freigegeben" +else + echo -e " ${OK} Port 3000 frei" +fi + +# SonarQube (optional, informativ) +if [ -n "$SONAR_TOKEN" ] && curl -sf http://localhost:9000/api/system/status >/dev/null 2>&1; then + echo -e " ${OK} SonarQube erreichbar" +else + echo -e " ${D}ℹ SonarQube nicht verfügbar (optional)${RS}" +fi + +# ─────── PRE-FLIGHT: Compiler-Prozess-Leaks ─────── echo -e "\n${B}▸ [Pre-Flight] Cleanup leaked compiler processes${RS}" ./check-leaks.sh --cleanup >> 
"$LOG_FILE" 2>&1 && echo -e " ${OK} Bereinigung abgeschlossen" || true @@ -118,9 +183,7 @@ run_task "Unit-Tests" "NODE_OPTIONS='--no-warnings' npm run test:fast -- --repor parse_test_results "Tests.*passed" # 3+4. Sandbox Image Build & Docker-Tests (optional, wenn Docker verfügbar) -# HINWEIS: unosim-server:latest wird von docker compose gebaut, nicht hier. -# Nur das Sandbox-Image wird benötigt und nur wenn es noch nicht existiert. -if docker info > /dev/null 2>&1; then +if [ "$DOCKER_AVAILABLE" -eq 1 ]; then # Sandbox Image nur bauen wenn es noch nicht existiert if ! docker image inspect "$DOCKER_SANDBOX_IMAGE" > /dev/null 2>&1; then run_task "Sandbox Image Build" "docker build -f Dockerfile.sandbox -t $DOCKER_SANDBOX_IMAGE ." @@ -141,6 +204,12 @@ if docker info > /dev/null 2>&1; then tests/server/services/sandbox-lifecycle.integration.test.ts \ tests/server/services/serial-backpressure.test.ts" parse_test_results "Tests.*passed" + + # Container-Cleanup nach Docker-Tests: entlastet Docker Desktop vor E2E-Phase + stale_after=$(find_sandbox_containers) + if [ -n "$stale_after" ]; then + echo "$stale_after" | xargs docker rm -f >/dev/null 2>&1 + fi else echo -e " ${WARN} Docker nicht verfügbar – Docker-Tests werden übersprungen (Steps 3+4)" STEP=$((STEP+2)) @@ -150,25 +219,27 @@ fi echo -e "\n${B}▸ [Vorbereitung] Server-Start${RS}" lsof -ti:3000 | xargs kill -9 2>/dev/null || true -# Docker-Stabilisierung: Nach intensiven Container-Tests kurz auf Erholung warten. -# Docker Desktop auf macOS wird nach 74s Heavy-Load temporär unresponsive; 30s Retry. +# Docker-Gesundheitsprüfung: Docker Desktop auf macOS braucht nach Heavy-Load +# manchmal einige Sekunden bis der Daemon wieder stabil antwortet. 
DOCKER_FOR_E2E=0 -for _i in {1..10}; do - if docker info > /dev/null 2>&1; then - DOCKER_FOR_E2E=1 - [ "$_i" -gt 1 ] && echo -e " ${OK} Docker nach $((_i * 3))s wiederhergestellt" - break - fi - [ "$_i" -eq 1 ] && echo -e " ${RUN} Docker antwortet nicht – warte auf Erholung (max. 30s)..." - sleep 3 -done +if [ "$DOCKER_AVAILABLE" -eq 1 ]; then + for _i in {1..10}; do + if docker info > /dev/null 2>&1; then + DOCKER_FOR_E2E=1 + [ "$_i" -gt 1 ] && echo -e " ${OK} Docker nach $(( (_i - 1) * 3 ))s wiederhergestellt" + break + fi + [ "$_i" -eq 1 ] && echo -e " ${RUN} Docker antwortet nicht – warte auf Erholung (max. 30s)..." + sleep 3 + done +fi export PORT=3000 # Server startet im Hintergrund (NODE_ENV=development für Vite-Snapshots) -# FORCE_DOCKER + DOCKER_SANDBOX_IMAGE werden gesetzt, wenn Docker stabil verfügbar ist if [ "$DOCKER_FOR_E2E" -eq 1 ]; then echo -e " ${OK} Docker verfügbar – E2E mit Sandbox-Unterstützung" - FORCE_DOCKER=1 DOCKER_SANDBOX_IMAGE=$DOCKER_SANDBOX_IMAGE UNOSIM_SHARED_TEMP_DIR=$UNOSIM_SHARED_TEMP_DIR NODE_ENV=development npm run dev >> "$LOG_FILE" 2>&1 & + export FORCE_DOCKER=1 + DOCKER_SANDBOX_IMAGE=$DOCKER_SANDBOX_IMAGE UNOSIM_SHARED_TEMP_DIR=$UNOSIM_SHARED_TEMP_DIR NODE_ENV=development npm run dev >> "$LOG_FILE" 2>&1 & else echo -e " ${WARN} Docker nicht verfügbar – E2E im Lokal-Modus" NODE_ENV=development npm run dev >> "$LOG_FILE" 2>&1 &