@@ -15,16 +15,8 @@ const logger = createLogger('CopilotChatAbortAPI')
1515const GO_EXPLICIT_ABORT_TIMEOUT_MS = 3000
1616const STREAM_ABORT_SETTLE_TIMEOUT_MS = 8000
1717
18- /**
19- * POST /api/copilot/chat/abort
20- *
21- * Hang-critical: the client calls this when the user hits "stop". It
22- * fans out to Go (explicit-abort marker) and then waits up to
23- * STREAM_ABORT_SETTLE_TIMEOUT_MS (8s) for the prior chat stream to
24- * unwind. If EITHER the Go fetch or the settle-wait hangs, the user
25- * sees a "still shutting down" 409 — or worse, an unresolved Promise
26- * on the client. The spans below pinpoint which phase stalled.
27- */
18+ // POST /api/copilot/chat/abort — fires on user Stop; aborts the local
19+ // stream first, then marks the Go side aborted, then (when a chat id is
19+ // known) waits for the prior stream to settle.
2820export async function POST ( request : Request ) {
2921 return withIncomingGoSpan (
3022 request . headers ,
@@ -71,36 +63,11 @@ export async function POST(request: Request) {
7163 }
7264 if ( chatId ) rootSpan . setAttribute ( TraceAttr . ChatId , chatId )
7365
74- // ORDER MATTERS: local abort FIRST, Go explicit-abort SECOND.
75- //
76- // Sim and Go each own a separate Redis instance and do not share
77- // state through it — the only signal that crosses the service
78- // boundary is this HTTP call. So the race to win is purely
79- // Sim-internal:
80- //
81- // - `abortActiveStream` flips the AbortController (reason =
82- // AbortReason.UserStop) that's wrapped around the in-flight
83- // `fetchGo('/api/mothership', ...)` SSE stream. Once flipped,
84- // the stream throws AbortError on the next chunk read, and
85- // the lifecycle catch block's classifier sees
86- // `signal.aborted = true` with an explicit-stop reason → the
87- // root span gets stamped `cancel_reason = explicit_stop` and
88- // the `request.cancelled` event fires correctly.
89- //
90- // - If we call Go first (old order), Go's context cancels from
91- // its own explicit-abort handler, the /api/mothership stream
92- // errors with "context canceled", and Sim's catch block fires
93- // BEFORE we've flipped the local AbortController. At that
94- // point `signal.aborted` is still false, so the classifier
95- // falls through to `client_disconnect` / `unknown` and the
96- // root ends up as `outcome = error` — which is what we saw
97- // in trace 25f31730082078cef54653b1740caf12.
98- //
99- // Go's explicit-abort endpoint still runs second: it's what tells
100- // Go-side billing "this was intentional, flush the paused ledger"
101- // and is unaffected by the reorder (Go's context is already
102- // cancelled by the time we get there; the endpoint's job is
103- // billing semantics, not cancelling in-flight work).
66+ // Local abort before Go — lets the lifecycle classifier see
67+ // `signal.aborted` with an explicit-stop reason before Go's
68+ // context-canceled error propagates back. Go's endpoint runs
69+ // second for billing-ledger flush; Go's context is already
70+ // cancelled by then.
10471 const aborted = await abortActiveStream ( streamId )
10572 rootSpan . setAttribute ( TraceAttr . CopilotAbortLocalAborted , aborted )
10673
@@ -144,16 +111,12 @@ export async function POST(request: Request) {
144111 rootSpan . setAttribute ( TraceAttr . CopilotAbortGoMarkerOk , goAbortOk )
145112
146113 if ( chatId ) {
147- // `waitForPendingChatStream` blocks up to 8s waiting for the
148- // prior stream's release. It's THE single most likely stall
149- // point in this handler — isolate it so a slow unwind shows up
150- // as this child span rather than unexplained root latency.
151114 const settled = await withCopilotSpan (
152115 TraceSpan . CopilotChatAbortWaitSettle ,
153116 {
154- 'chat.id' : chatId ,
155- 'stream.id' : streamId ,
156- 'settle.timeout_ms' : STREAM_ABORT_SETTLE_TIMEOUT_MS ,
117+ [ TraceAttr . ChatId ] : chatId ,
118+ [ TraceAttr . StreamId ] : streamId ,
119+ [ TraceAttr . SettleTimeoutMs ] : STREAM_ABORT_SETTLE_TIMEOUT_MS ,
157120 } ,
158121 async ( settleSpan ) => {
159122 const start = Date . now ( )
0 commit comments