Skip to content

Commit 4ab6706

Browse files
committed
Otel
1 parent 0bb67c9 commit 4ab6706

File tree

36 files changed

+436
-1072
lines changed

36 files changed

+436
-1072
lines changed

apps/sim/app/api/billing/update-cost/route.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ export async function POST(req: NextRequest) {
4141
req.headers,
4242
TraceSpan.CopilotBillingUpdateCost,
4343
{
44-
'http.method': 'POST',
45-
'http.route': '/api/billing/update-cost',
44+
[TraceAttr.HttpMethod]: 'POST',
45+
[TraceAttr.HttpRoute]: '/api/billing/update-cost',
4646
},
4747
async (span) => updateCostInner(req, span)
4848
)
@@ -95,7 +95,6 @@ async function updateCostInner(
9595
if (!validation.success) {
9696
logger.warn(`[${requestId}] Invalid request body`, {
9797
errors: validation.error.issues,
98-
body,
9998
})
10099
span.setAttribute(TraceAttr.BillingOutcome, BillingRouteOutcome.InvalidBody)
101100
span.setAttribute(TraceAttr.HttpStatusCode, 400)

apps/sim/app/api/copilot/api-keys/validate/route.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ export async function POST(req: NextRequest) {
2626
req.headers,
2727
TraceSpan.CopilotAuthValidateApiKey,
2828
{
29-
'http.method': 'POST',
30-
'http.route': '/api/copilot/api-keys/validate',
29+
[TraceAttr.HttpMethod]: 'POST',
30+
[TraceAttr.HttpRoute]: '/api/copilot/api-keys/validate',
3131
},
3232
async (span) => {
3333
try {

apps/sim/app/api/copilot/chat/abort/route.ts

Lines changed: 10 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,8 @@ const logger = createLogger('CopilotChatAbortAPI')
1515
const GO_EXPLICIT_ABORT_TIMEOUT_MS = 3000
1616
const STREAM_ABORT_SETTLE_TIMEOUT_MS = 8000
1717

18-
/**
19-
* POST /api/copilot/chat/abort
20-
*
21-
* Hang-critical: the client calls this when the user hits "stop". It
22-
* fans out to Go (explicit-abort marker) and then waits up to
23-
* STREAM_ABORT_SETTLE_TIMEOUT_MS (8s) for the prior chat stream to
24-
* unwind. If EITHER the Go fetch or the settle-wait hangs, the user
25-
* sees a "still shutting down" 409 — or worse, an unresolved Promise
26-
* on the client. The spans below pinpoint which phase stalled.
27-
*/
18+
// POST /api/copilot/chat/abort — fires on user Stop; marks the Go
19+
// side aborted then waits for the prior stream to settle.
2820
export async function POST(request: Request) {
2921
return withIncomingGoSpan(
3022
request.headers,
@@ -71,36 +63,11 @@ export async function POST(request: Request) {
7163
}
7264
if (chatId) rootSpan.setAttribute(TraceAttr.ChatId, chatId)
7365

74-
// ORDER MATTERS: local abort FIRST, Go explicit-abort SECOND.
75-
//
76-
// Sim and Go each own a separate Redis instance and do not share
77-
// state through it — the only signal that crosses the service
78-
// boundary is this HTTP call. So the race to win is purely
79-
// Sim-internal:
80-
//
81-
// - `abortActiveStream` flips the AbortController (reason =
82-
// AbortReason.UserStop) that's wrapped around the in-flight
83-
// `fetchGo('/api/mothership', ...)` SSE stream. Once flipped,
84-
// the stream throws AbortError on the next chunk read, and
85-
// the lifecycle catch block's classifier sees
86-
// `signal.aborted = true` with an explicit-stop reason → the
87-
// root span gets stamped `cancel_reason = explicit_stop` and
88-
// the `request.cancelled` event fires correctly.
89-
//
90-
// - If we call Go first (old order), Go's context cancels from
91-
// its own explicit-abort handler, the /api/mothership stream
92-
// errors with "context canceled", and Sim's catch block fires
93-
// BEFORE we've flipped the local AbortController. At that
94-
// point `signal.aborted` is still false, so the classifier
95-
// falls through to `client_disconnect` / `unknown` and the
96-
// root ends up as `outcome = error` — which is what we saw
97-
// in trace 25f31730082078cef54653b1740caf12.
98-
//
99-
// Go's explicit-abort endpoint still runs second: it's what tells
100-
// Go-side billing "this was intentional, flush the paused ledger"
101-
// and is unaffected by the reorder (Go's context is already
102-
// cancelled by the time we get there; the endpoint's job is
103-
// billing semantics, not cancelling in-flight work).
66+
// Local abort before Go — lets the lifecycle classifier see
67+
// `signal.aborted` with an explicit-stop reason before Go's
68+
// context-canceled error propagates back. Go's endpoint runs
69+
// second for billing-ledger flush; Go's context is already
70+
// cancelled by then.
10471
const aborted = await abortActiveStream(streamId)
10572
rootSpan.setAttribute(TraceAttr.CopilotAbortLocalAborted, aborted)
10673

@@ -144,16 +111,12 @@ export async function POST(request: Request) {
144111
rootSpan.setAttribute(TraceAttr.CopilotAbortGoMarkerOk, goAbortOk)
145112

146113
if (chatId) {
147-
// `waitForPendingChatStream` blocks up to 8s waiting for the
148-
// prior stream's release. It's THE single most likely stall
149-
// point in this handler — isolate it so a slow unwind shows up
150-
// as this child span rather than unexplained root latency.
151114
const settled = await withCopilotSpan(
152115
TraceSpan.CopilotChatAbortWaitSettle,
153116
{
154-
'chat.id': chatId,
155-
'stream.id': streamId,
156-
'settle.timeout_ms': STREAM_ABORT_SETTLE_TIMEOUT_MS,
117+
[TraceAttr.ChatId]: chatId,
118+
[TraceAttr.StreamId]: streamId,
119+
[TraceAttr.SettleTimeoutMs]: STREAM_ABORT_SETTLE_TIMEOUT_MS,
157120
},
158121
async (settleSpan) => {
159122
const start = Date.now()

apps/sim/app/api/copilot/chat/stop/route.ts

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -58,27 +58,14 @@ const StopSchema = z.object({
5858
streamId: z.string(),
5959
content: z.string(),
6060
contentBlocks: z.array(ContentBlockSchema).optional(),
61-
/**
62-
* Optional because older clients may not send it, but strongly
63-
* recommended: without it, the stopped assistant message persisted
64-
* below loses its `requestId`, which breaks the "Copy request ID"
65-
* button in the UI (it's the only handle the user has for filing
66-
* bug reports about a hung / bad turn).
67-
*/
61+
// Optional for older clients; when present, flows into msg.requestId
62+
// so the UI's copy-request-ID button survives a stopped turn.
6863
requestId: z.string().optional(),
6964
})
7065

71-
/**
72-
* POST /api/copilot/chat/stop
73-
* Persists partial assistant content when the user stops a stream mid-response.
74-
* Clears conversationId so the server-side onComplete won't duplicate the message.
75-
* The chat stream lock is intentionally left alone here; it is released only once
76-
* the aborted server stream actually unwinds.
77-
*
78-
* Hang-critical: runs a DB SELECT + UPDATE + pubsub publish. A slow DB
79-
* here makes the UI look frozen after the user clicks Stop. The root
80-
* span lets us tell whether stalls are DB-bound or pubsub-bound.
81-
*/
66+
// POST /api/copilot/chat/stop — persists partial assistant content
67+
// when the user stops mid-stream. Lock release is handled by the
68+
// aborted server stream unwinding, not this handler.
8269
export async function POST(req: NextRequest) {
8370
return withIncomingGoSpan(
8471
req.headers,
@@ -152,12 +139,7 @@ export async function POST(req: NextRequest) {
152139
content,
153140
timestamp: new Date().toISOString(),
154141
contentBlocks: synthesizedStoppedBlocks,
155-
// Preserve the requestId onto the persisted aborted message
156-
// so the UI's "Copy request ID" button keeps working after
157-
// the chat history refetches and replaces the in-memory
158-
// streaming message with this persisted version. Without
159-
// this, the button blinks out ~1-2s after the user hits
160-
// Stop because the refetched message has no requestId.
142+
// Persist so the UI copy-request-id button survives refetch.
161143
...(requestId ? { requestId } : {}),
162144
})
163145
const assistantMessage: PersistedMessage = normalized
@@ -187,7 +169,10 @@ export async function POST(req: NextRequest) {
187169
} catch (error) {
188170
if (error instanceof z.ZodError) {
189171
span.setAttribute(TraceAttr.CopilotStopOutcome, CopilotStopOutcome.ValidationError)
190-
return NextResponse.json({ error: 'Invalid request' }, { status: 400 })
172+
return NextResponse.json(
173+
{ error: 'Invalid request data', details: error.errors },
174+
{ status: 400 }
175+
)
191176
}
192177
logger.error('Error stopping chat stream:', error)
193178
span.setAttribute(TraceAttr.CopilotStopOutcome, CopilotStopOutcome.InternalError)

apps/sim/app/api/copilot/chat/stream/route.ts

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { context as otelContext, SpanStatusCode, trace } from '@opentelemetry/api'
1+
import { context as otelContext, trace } from '@opentelemetry/api'
22
import { createLogger } from '@sim/logger'
33
import { type NextRequest, NextResponse } from 'next/server'
44
import { getLatestRunForStream } from '@/lib/copilot/async-runs/repository'
@@ -11,9 +11,10 @@ import {
1111
CopilotTransport,
1212
} from '@/lib/copilot/generated/trace-attribute-values-v1'
1313
import { TraceAttr } from '@/lib/copilot/generated/trace-attributes-v1'
14+
import { TraceSpan } from '@/lib/copilot/generated/trace-spans-v1'
1415
import { contextFromRequestHeaders } from '@/lib/copilot/request/go/propagation'
1516
import { authenticateCopilotRequestSessionOnly } from '@/lib/copilot/request/http'
16-
import { getCopilotTracer } from '@/lib/copilot/request/otel'
17+
import { getCopilotTracer, markSpanForError } from '@/lib/copilot/request/otel'
1718
import {
1819
checkForReplayGap,
1920
createEvent,
@@ -141,7 +142,7 @@ export async function GET(request: NextRequest) {
141142
// linking behavior; no regression.
142143
const incomingContext = contextFromRequestHeaders(request.headers)
143144
const rootSpan = getCopilotTracer().startSpan(
144-
'copilot.resume.request',
145+
TraceSpan.CopilotResumeRequest,
145146
{
146147
attributes: {
147148
[TraceAttr.CopilotTransport]: batchMode ? CopilotTransport.Batch : CopilotTransport.Stream,
@@ -267,6 +268,16 @@ async function handleResumeRequestBody({
267268
let controllerClosed = false
268269
let sawTerminalEvent = false
269270
let currentRequestId = extractRunRequestId(run)
271+
// Stamp the logical request id + chat id on the resume root as soon
272+
// as we resolve them from the run row, so TraceQL joins work on
273+
// resume legs the same way they do on the original POST.
274+
if (currentRequestId) {
275+
rootSpan.setAttribute(TraceAttr.RequestId, currentRequestId)
276+
rootSpan.setAttribute(TraceAttr.SimRequestId, currentRequestId)
277+
}
278+
if (run?.chatId) {
279+
rootSpan.setAttribute(TraceAttr.ChatId, run.chatId)
280+
}
270281

271282
const closeController = () => {
272283
if (controllerClosed) return
@@ -298,7 +309,7 @@ async function handleResumeRequestBody({
298309
const events = await readEvents(streamId, cursor)
299310
if (events.length > 0) {
300311
totalEventsFlushed += events.length
301-
logger.info('[Resume] Flushing events', {
312+
logger.debug('[Resume] Flushing events', {
302313
streamId,
303314
afterCursor: cursor,
304315
eventCount: events.length,
@@ -420,11 +431,7 @@ async function handleResumeRequestBody({
420431
reason: 'stream_replay_failed',
421432
})
422433
}
423-
rootSpan.setStatus({
424-
code: SpanStatusCode.ERROR,
425-
message: error instanceof Error ? error.message : String(error),
426-
})
427-
rootSpan.recordException(error instanceof Error ? error : new Error(String(error)))
434+
markSpanForError(rootSpan, error)
428435
} finally {
429436
request.signal.removeEventListener('abort', abortListener)
430437
closeController()

apps/sim/app/api/copilot/confirm/route.ts

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -116,25 +116,16 @@ async function updateToolCallStatus(
116116
}
117117
}
118118

119-
/**
120-
* POST /api/copilot/confirm
121-
* Accept client tool completion or detach confirmations.
122-
*
123-
* Hang-critical: this is the delivery path for client-executed tool
124-
* results. If this handler stalls (DB lock, Redis timeout, pubsub
125-
* failure), the `copilot.tool.wait_for_client_result` span on the
126-
* originating chat stream never resolves and the whole request looks
127-
* hung. The root span here gives us per-request visibility so we can
128-
* correlate a slow confirm with the chat-stream that was waiting on it
129-
* via `toolCallId`.
130-
*/
119+
// POST /api/copilot/confirm — delivery path for client-executed tool
120+
// results. Correlate via `toolCallId` when the awaiting chat stream
121+
// stalls.
131122
export async function POST(req: NextRequest) {
132123
const tracker = createRequestTracker()
133124

134125
return withIncomingGoSpan(
135126
req.headers,
136127
TraceSpan.CopilotConfirmToolResult,
137-
{ 'request.id': tracker.requestId },
128+
{ [TraceAttr.RequestId]: tracker.requestId },
138129
async (span) => {
139130
try {
140131
const { userId: authenticatedUserId, isAuthenticated } =
@@ -195,9 +186,8 @@ export async function POST(req: NextRequest) {
195186
message,
196187
})
197188
span.setAttribute(TraceAttr.CopilotConfirmOutcome, CopilotConfirmOutcome.UpdateFailed)
198-
return createBadRequestResponse(
199-
'Failed to update tool call status or tool call not found'
200-
)
189+
// DB write failed — 500, not 400. 400 is a client-shape error.
190+
return createInternalServerErrorResponse('Failed to update tool call status')
201191
}
202192

203193
span.setAttribute(TraceAttr.CopilotConfirmOutcome, CopilotConfirmOutcome.Delivered)

apps/sim/app/workspace/[workspaceId]/components/message-actions/message-actions.tsx

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,6 @@ export function MessageActions({ content, chatId, userQuery, requestId }: Messag
140140

141141
const hasContent = Boolean(content)
142142
const canSubmitFeedback = Boolean(chatId && userQuery)
143-
144-
// Render the action row whenever there's something the user can
145-
// actually act on: copy the message, or open the feedback modal
146-
// (thumbs up / down). Request ID alone is not a reason to render the
147-
// row anymore — it's only exposed from inside the thumbs-down modal,
148-
// which requires both chatId and userQuery.
149143
if (!hasContent && !canSubmitFeedback) return null
150144

151145
return (
@@ -198,15 +192,6 @@ export function MessageActions({ content, chatId, userQuery, requestId }: Messag
198192
</Tooltip.Root>
199193
</>
200194
)}
201-
{/*
202-
Intentionally NO root-row "Copy request ID" button here — it
203-
rendered as an ambiguous standalone Copy icon next to the
204-
message Copy icon, which was confusing (two indistinguishable
205-
copy buttons side by side). The request ID only needs to be
206-
grabbable from the thumbs-down feedback modal below, which is
207-
the surface we actually want people to use when reporting a
208-
bad response.
209-
*/}
210195
</div>
211196

212197
<Modal open={pendingFeedback !== null} onOpenChange={handleModalClose}>

apps/sim/app/workspace/[workspaceId]/home/components/mothership-chat/mothership-chat.tsx

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -179,26 +179,16 @@ export function MothershipChat({
179179
onOptionSelect={isLastMessage ? onSubmit : undefined}
180180
onWorkspaceResourceSelect={onWorkspaceResourceSelect}
181181
/>
182-
{/*
183-
Render the actions row whenever the assistant turn has
184-
settled (not streaming) AND there's anything to act on.
185-
We intentionally include `requestId` in the trigger so
186-
that aborted or content-less turns still surface the
187-
copy-trace-ID button — dropping the row in those cases
188-
makes it impossible for a user to grab the request ID
189-
needed for bug reports.
190-
*/}
191-
{!isThisStreaming &&
192-
(msg.content || msg.contentBlocks?.length || msg.requestId) && (
193-
<div className='mt-2.5'>
194-
<MessageActions
195-
content={msg.content}
196-
chatId={chatId}
197-
userQuery={precedingUserMsg?.content}
198-
requestId={msg.requestId}
199-
/>
200-
</div>
201-
)}
182+
{!isThisStreaming && (msg.content || msg.contentBlocks?.length) && (
183+
<div className='mt-2.5'>
184+
<MessageActions
185+
content={msg.content}
186+
chatId={chatId}
187+
userQuery={precedingUserMsg?.content}
188+
requestId={msg.requestId}
189+
/>
190+
</div>
191+
)}
202192
</div>
203193
)
204194
})}

0 commit comments

Comments
 (0)