diff --git a/pkg/evaluation/progress.go b/pkg/evaluation/progress.go index 0f3dca563..95340a0c7 100644 --- a/pkg/evaluation/progress.go +++ b/pkg/evaluation/progress.go @@ -13,19 +13,22 @@ import ( // progressBar provides a live-updating progress display for evaluation runs. type progressBar struct { - ttyOut io.Writer // output for progress bar rendering (TTY only) - resultOut io.Writer // output for results (can be tee'd to log) - fd int // file descriptor for terminal size queries - total int - completed atomic.Int32 - passed atomic.Int32 - failed atomic.Int32 - running sync.Map // map[string]bool for currently running evals - done chan struct{} - stopped chan struct{} // signals that the goroutine has finished - ticker *time.Ticker - isTTY bool - mu sync.Mutex // protects output + ttyOut io.Writer // output for progress bar rendering (TTY only) + resultOut io.Writer // output for results (can be tee'd to log) + fd int // file descriptor for terminal size queries + total int + completed atomic.Int32 + passed atomic.Int32 + failed atomic.Int32 + relevanceFailed atomic.Int32 // count of evals with relevance failures + sizeFailed atomic.Int32 // count of evals with size failures + toolCallsFailed atomic.Int32 // count of evals with tool call failures + running sync.Map // map[string]bool for currently running evals + done chan struct{} + stopped chan struct{} // signals that the goroutine has finished + ticker *time.Ticker + isTTY bool + mu sync.Mutex // protects output } func newProgressBar(ttyOut, resultOut io.Writer, fd, total int, isTTY bool) *progressBar { @@ -89,6 +92,20 @@ func (p *progressBar) printResult(result Result) { successes, failures := result.checkResults() success := len(failures) == 0 + // Track failure categories + if !success { + for _, f := range failures { + switch { + case strings.HasPrefix(f, "relevance"): + p.relevanceFailed.Add(1) + case strings.HasPrefix(f, "size"): + p.sizeFailed.Add(1) + case strings.HasPrefix(f, "tool calls"): + p.toolCallsFailed.Add(1) + } + } + } + // Print session title with icon (to result output, which may be tee'd to log) if success { fmt.Fprintf(p.resultOut, "%s %s ($%.6f)\n", p.green("✓"), result.Title, result.Cost) @@ -138,6 +155,9 @@ func (p *progressBar) render(final bool) { completed := int(p.completed.Load()) passed := int(p.passed.Load()) failed := int(p.failed.Load()) + relevanceFailed := int(p.relevanceFailed.Load()) + sizeFailed := int(p.sizeFailed.Load()) + toolCallsFailed := int(p.toolCallsFailed.Load()) // Get current terminal width for dynamic sizing termWidth := p.getTerminalWidth() @@ -170,6 +190,24 @@ func (p *progressBar) render(final bool) { // Build status line counts := fmt.Sprintf("%s %s", p.green(fmt.Sprintf("✓%d", passed)), p.red(fmt.Sprintf("✗%d", failed))) + + // Add detailed failure breakdown if there are failures (show during run, not just at end) + if failed > 0 { + breakdown := []string{} + if relevanceFailed > 0 { + breakdown = append(breakdown, "relevance "+p.red(fmt.Sprintf("✗%d", relevanceFailed))) + } + if sizeFailed > 0 { + breakdown = append(breakdown, "size "+p.red(fmt.Sprintf("✗%d", sizeFailed))) + } + if toolCallsFailed > 0 { + breakdown = append(breakdown, "tool calls "+p.red(fmt.Sprintf("✗%d", toolCallsFailed))) + } + if len(breakdown) > 0 { + counts += fmt.Sprintf(" (%s)", strings.Join(breakdown, ", ")) + } + } + status := fmt.Sprintf("[%s] %3d%% (%d/%d) %s", bar, percent, completed, p.total, counts) if runningCount > 0 {