Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 20 additions & 20 deletions cloudflare-gastown/gastown-grafana-dash-1.json
Original file line number Diff line number Diff line change
Expand Up @@ -681,8 +681,8 @@
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter GROUP BY t, label ORDER BY t LIMIT 500",
"rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter GROUP BY t, label ORDER BY t LIMIT 500",
"query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter GROUP BY t, label ORDER BY t",
"rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter GROUP BY t, label ORDER BY t",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand Down Expand Up @@ -1026,8 +1026,8 @@
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT blob1 AS event, SUM(_sample_interval) AS total, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(IF(blob5 = '', _sample_interval, 0)) AS success_count, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate FROM gastown_events WHERE $timeFilter GROUP BY event HAVING error_count > 0 ORDER BY error_count DESC LIMIT 50",
"rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS total, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(IF(blob5 = '', _sample_interval, 0)) AS success_count, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate FROM gastown_events WHERE $timeFilter GROUP BY event HAVING error_count > 0 ORDER BY error_count DESC LIMIT 50",
"query": "SELECT blob1 AS event, SUM(_sample_interval) AS total, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(IF(blob5 = '', _sample_interval, 0)) AS success_count, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate FROM gastown_events WHERE $timeFilter GROUP BY event HAVING error_count > 0 ORDER BY error_count DESC",
"rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS total, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(IF(blob5 = '', _sample_interval, 0)) AS success_count, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate FROM gastown_events WHERE $timeFilter GROUP BY event HAVING error_count > 0 ORDER BY error_count DESC",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand Down Expand Up @@ -1106,8 +1106,8 @@
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT blob5 AS error_message, blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter AND blob5 != '' GROUP BY error_message, event ORDER BY count DESC LIMIT 30",
"rawSql": "SELECT blob5 AS error_message, blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter AND blob5 != '' GROUP BY error_message, event ORDER BY count DESC LIMIT 30",
"query": "SELECT blob5 AS error_message, blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter AND blob5 != '' GROUP BY error_message, event ORDER BY count DESC",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: Keep a row cap on this high-cardinality panel

Removing the LIMIT turns the "Top Error Messages" table into an unbounded GROUP BY error_message, event over the whole time window. On production data that can return thousands of rows, which is likely to slow the ClickHouse query and make the dashboard much heavier to load.

"rawSql": "SELECT blob5 AS error_message, blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter AND blob5 != '' GROUP BY error_message, event ORDER BY count DESC",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand Down Expand Up @@ -1323,8 +1323,8 @@
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_duration FROM gastown_events WHERE $timeFilter AND blob3 IN ('http', 'trpc') GROUP BY t, label ORDER BY t LIMIT 500",
"rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_duration FROM gastown_events WHERE $timeFilter AND blob3 IN ('http', 'trpc') GROUP BY t, label ORDER BY t LIMIT 500",
"query": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_duration FROM gastown_events WHERE $timeFilter AND blob3 IN ('http', 'trpc') GROUP BY t, label ORDER BY t",
"rawSql": "SELECT $timeSeries AS t, blob1 AS label, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_duration FROM gastown_events WHERE $timeFilter AND blob3 IN ('http', 'trpc') GROUP BY t, label ORDER BY t",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand Down Expand Up @@ -1451,8 +1451,8 @@
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT blob4 AS route, blob3 AS delivery, SUM(_sample_interval) AS count, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms, MAX(double1) AS max_ms FROM gastown_events WHERE $timeFilter AND blob3 IN ('http', 'trpc') AND blob4 != '' GROUP BY route, delivery HAVING count > 5 ORDER BY avg_latency_ms DESC LIMIT 40",
"rawSql": "SELECT blob4 AS route, blob3 AS delivery, SUM(_sample_interval) AS count, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms, MAX(double1) AS max_ms FROM gastown_events WHERE $timeFilter AND blob3 IN ('http', 'trpc') AND blob4 != '' GROUP BY route, delivery HAVING count > 5 ORDER BY avg_latency_ms DESC LIMIT 40",
"query": "SELECT blob4 AS route, blob3 AS delivery, SUM(_sample_interval) AS count, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms, MAX(double1) AS max_ms FROM gastown_events WHERE $timeFilter AND blob3 IN ('http', 'trpc') AND blob4 != '' GROUP BY route, delivery HAVING count > 5 ORDER BY avg_latency_ms DESC",
"rawSql": "SELECT blob4 AS route, blob3 AS delivery, SUM(_sample_interval) AS count, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms, MAX(double1) AS max_ms FROM gastown_events WHERE $timeFilter AND blob3 IN ('http', 'trpc') AND blob4 != '' GROUP BY route, delivery HAVING count > 5 ORDER BY avg_latency_ms DESC",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand Down Expand Up @@ -1817,8 +1817,8 @@
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT blob2 AS user_id, SUM(_sample_interval) AS total_events, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms, COUNT(DISTINCT blob6) AS town_count FROM gastown_events WHERE $timeFilter AND blob2 != '' GROUP BY user_id ORDER BY total_events DESC LIMIT 25",
"rawSql": "SELECT blob2 AS user_id, SUM(_sample_interval) AS total_events, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms, COUNT(DISTINCT blob6) AS town_count FROM gastown_events WHERE $timeFilter AND blob2 != '' GROUP BY user_id ORDER BY total_events DESC LIMIT 25",
"query": "SELECT blob2 AS user_id, SUM(_sample_interval) AS total_events, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms, COUNT(DISTINCT blob6) AS town_count FROM gastown_events WHERE $timeFilter AND blob2 != '' GROUP BY user_id ORDER BY total_events DESC",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: Bound this per-user query before shipping

This panel now groups every distinct user_id in the selected window with no LIMIT. On busy towns that can explode to thousands of rows and materially increase both query time and Grafana render time; the previous top-N cap kept the panel useful without making it unbounded.

"rawSql": "SELECT blob2 AS user_id, SUM(_sample_interval) AS total_events, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms, COUNT(DISTINCT blob6) AS town_count FROM gastown_events WHERE $timeFilter AND blob2 != '' GROUP BY user_id ORDER BY total_events DESC",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand Down Expand Up @@ -1937,8 +1937,8 @@
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT blob2 AS user_id, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(_sample_interval) AS total_events, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate FROM gastown_events WHERE $timeFilter AND blob2 != '' GROUP BY user_id HAVING error_count > 0 ORDER BY error_count DESC LIMIT 25",
"rawSql": "SELECT blob2 AS user_id, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(_sample_interval) AS total_events, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate FROM gastown_events WHERE $timeFilter AND blob2 != '' GROUP BY user_id HAVING error_count > 0 ORDER BY error_count DESC LIMIT 25",
"query": "SELECT blob2 AS user_id, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(_sample_interval) AS total_events, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate FROM gastown_events WHERE $timeFilter AND blob2 != '' GROUP BY user_id HAVING error_count > 0 ORDER BY error_count DESC",
"rawSql": "SELECT blob2 AS user_id, SUM(IF(blob5 != '', _sample_interval, 0)) AS error_count, SUM(_sample_interval) AS total_events, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate FROM gastown_events WHERE $timeFilter AND blob2 != '' GROUP BY user_id HAVING error_count > 0 ORDER BY error_count DESC",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand Down Expand Up @@ -2108,12 +2108,12 @@
"editorMode": "sql",
"extrapolate": true,
"format": "table",
"formattedQuery": "/* grafana dashboard='Gastown Operations', user=admin */\nSELECT blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE timestamp >= toDateTime(1773451903) AND timestamp <= toDateTime(1773453703) GROUP BY event ORDER BY count DESC LIMIT 10",
"formattedQuery": "/* grafana dashboard='Gastown Operations', user=admin */\nSELECT blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE timestamp >= toDateTime(1773451903) AND timestamp <= toDateTime(1773453703) GROUP BY event ORDER BY count DESC",
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter GROUP BY event ORDER BY count DESC LIMIT 10",
"rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter GROUP BY event ORDER BY count DESC LIMIT 10",
"query": "SELECT blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter GROUP BY event ORDER BY count DESC",
"rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS count FROM gastown_events WHERE $timeFilter GROUP BY event ORDER BY count DESC",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand All @@ -2123,7 +2123,7 @@
"useWindowFuncForMacros": true
}
],
"title": "Top 10 Events (share of total)",
"title": "Events by Count",
"type": "table"
},
{
Expand Down Expand Up @@ -2232,8 +2232,8 @@
"interval": "",
"intervalFactor": 1,
"nullifySparse": false,
"query": "SELECT blob1 AS event, SUM(_sample_interval) AS total, SUM(IF(blob5 = '', _sample_interval, 0)) AS success, SUM(IF(blob5 != '', _sample_interval, 0)) AS errors, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms FROM gastown_events WHERE $timeFilter GROUP BY event ORDER BY total DESC LIMIT 50",
"rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS total, SUM(IF(blob5 = '', _sample_interval, 0)) AS success, SUM(IF(blob5 != '', _sample_interval, 0)) AS errors, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms FROM gastown_events WHERE $timeFilter GROUP BY event ORDER BY total DESC LIMIT 50",
"query": "SELECT blob1 AS event, SUM(_sample_interval) AS total, SUM(IF(blob5 = '', _sample_interval, 0)) AS success, SUM(IF(blob5 != '', _sample_interval, 0)) AS errors, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms FROM gastown_events WHERE $timeFilter GROUP BY event ORDER BY total DESC",
"rawSql": "SELECT blob1 AS event, SUM(_sample_interval) AS total, SUM(IF(blob5 = '', _sample_interval, 0)) AS success, SUM(IF(blob5 != '', _sample_interval, 0)) AS errors, SUM(IF(blob5 != '', _sample_interval, 0)) / SUM(_sample_interval) AS error_rate, SUM(_sample_interval * double1) / SUM(_sample_interval) AS avg_latency_ms FROM gastown_events WHERE $timeFilter GROUP BY event ORDER BY total DESC",
"refId": "A",
"round": "0s",
"showFormattedSQL": false,
Expand Down
14 changes: 14 additions & 0 deletions cloudflare-gastown/src/dos/Town.do.ts
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,20 @@ export class TownDO extends DurableObject<Env> {
return result;
}

/**
 * Refresh the container token right away, bypassing the 1-hour throttle.
 * Invoked by the user-facing tRPC mutation so an operator can manually
 * push a fresh JWT to the running container.
 *
 * @throws Error with message 'townId not set' when no town id is available.
 */
async forceRefreshContainerToken(): Promise<void> {
  const { townId } = this;
  if (!townId) {
    throw new Error('townId not set');
  }

  // Use the configured owner as the user id; fall back to the town id
  // when the town config carries no owner.
  const { owner_user_id: ownerUserId } = await this.getTownConfig();
  await dispatch.refreshContainerToken(this.env, townId, ownerUserId ?? townId);

  // Stamp the refresh time only once the dispatch call has resolved.
  this.lastContainerTokenRefreshAt = Date.now();
}

// ══════════════════════════════════════════════════════════════════
// Rig Registry
// ══════════════════════════════════════════════════════════════════
Expand Down
9 changes: 9 additions & 0 deletions cloudflare-gastown/src/trpc/router.ts
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,15 @@ export const gastownRouter = router({
return townStub.updateTownConfig(input.config);
}),

// Manual container-token refresh: runs verifyTownOwnership for the caller,
// then asks the town's Durable Object stub to force a refresh.
refreshContainerToken: gastownProcedure
  .input(z.object({ townId: z.string().uuid() }))
  .mutation(async ({ ctx, input }) => {
    const { env, userId } = ctx;
    const { townId } = input;

    // The ownership check runs before the stub is obtained or called.
    await verifyTownOwnership(env, userId, townId);

    const town = getTownDOStub(env, townId);
    await town.setTownId(townId);
    await town.forceRefreshContainerToken();
  }),

// ── Events ──────────────────────────────────────────────────────────

getBeadEvents: gastownProcedure
Expand Down
43 changes: 43 additions & 0 deletions src/app/(app)/gastown/[townId]/settings/TownSettingsPageClient.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import {
Shield,
Variable,
Layers,
RefreshCw,
Container,
} from 'lucide-react';
import { motion } from 'motion/react';

Expand All @@ -37,6 +39,7 @@ const SECTIONS = [
{ id: 'convoys', label: 'Convoys', icon: Layers },
{ id: 'merge-strategy', label: 'Merge Strategy', icon: GitPullRequest },
{ id: 'refinery', label: 'Refinery', icon: Shield },
{ id: 'container', label: 'Container', icon: Container },
] as const;

function useScrollSpy(sectionIds: readonly string[]) {
Expand Down Expand Up @@ -93,6 +96,13 @@ export function TownSettingsPageClient({ townId }: Props) {
})
);

// Manual container-token refresh; success and failure are both reported via toast.
const refreshTokenMutationOptions = trpc.gastown.refreshContainerToken.mutationOptions({
  onSuccess: () => toast.success('Container token refreshed'),
  onError: error => toast.error(`Token refresh failed: ${error.message}`),
});
const refreshToken = useMutation(refreshTokenMutationOptions);

// Local state for form fields
const [envVars, setEnvVars] = useState<EnvVarEntry[]>([]);
const [githubToken, setGithubToken] = useState('');
Expand Down Expand Up @@ -468,6 +478,39 @@ export function TownSettingsPageClient({ townId }: Props) {
</div>
</div>
</SettingsSection>

{/* ── Container ──────────────────────────────────────── */}
<SettingsSection
id="container"
title="Container"
description="Manage the town's container runtime and authentication tokens."
icon={Container}
index={6}
>
<div className="space-y-3">
<div className="flex items-center justify-between rounded-lg border border-white/[0.06] bg-white/[0.02] px-4 py-3">
<div>
<p className="text-sm text-white/70">Container Token</p>
<p className="text-[11px] text-white/30">
JWT shared by all agents in the container. Auto-refreshed hourly (8h
expiry). Force a refresh if agents are experiencing auth failures.
</p>
</div>
<Button
onClick={() => refreshToken.mutate({ townId })}
disabled={refreshToken.isPending}
variant="secondary"
size="sm"
className="ml-4 shrink-0 gap-1.5"
>
<RefreshCw
className={`size-3 ${refreshToken.isPending ? 'animate-spin' : ''}`}
/>
{refreshToken.isPending ? 'Refreshing...' : 'Refresh Token'}
</Button>
</div>
</div>
</SettingsSection>
</div>
</div>

Expand Down
14 changes: 14 additions & 0 deletions src/lib/gastown/types/router.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,13 @@ export declare const gastownRouter: import('@trpc/server').TRPCBuiltRouter<
};
meta: object;
}>;
refreshContainerToken: import('@trpc/server').TRPCMutationProcedure<{
input: {
townId: string;
};
output: void;
meta: object;
}>;
getBeadEvents: import('@trpc/server').TRPCQueryProcedure<{
input: {
rigId: string;
Expand Down Expand Up @@ -1457,6 +1464,13 @@ export declare const wrappedGastownRouter: import('@trpc/server').TRPCBuiltRoute
};
meta: object;
}>;
refreshContainerToken: import('@trpc/server').TRPCMutationProcedure<{
input: {
townId: string;
};
output: void;
meta: object;
}>;
getBeadEvents: import('@trpc/server').TRPCQueryProcedure<{
input: {
rigId: string;
Expand Down
Loading