Live metrics (--profile): - New metricsTracker instruments OnPTYOut, viewport renderer, stdout writes, libghostty-vt Write/Title CGO calls, sidebar / tabbar / status draws (with cache-hit accounting), snapshot replays, and the chrome ticker (so we can see ticker fires that did nothing). - Writes metrics.jsonl (one snapshot per second) and metrics.json + summary.txt on exit, alongside the existing pprof files. - All record* methods are nil-safe so disabled paths pay only a cheap nil check; counters are atomic so the per-PTY-chunk hot path stays lock-free. Benchmark suite (go test -bench=.): - Three workload fixtures — plain ASCII, SGR-styled lines, and a ratatui-style cursor-shuffling burst — plus a containsOSC microbenchmark. Reports ns/op, MB/s, allocs/op, B/op. - Initial baseline numbers added to TODO under the perf-audit section, alongside two new findings (renderer allocs ~1 per 4 bytes on styled chunks; styled throughput tops out near 90 MB/s) those benchmarks surfaced.
170 lines
5.1 KiB
Go
170 lines
5.1 KiB
Go
package app
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
// Benchmarks for patterm's hot paths. Run with:
|
|
//
|
|
// go test -bench=. -benchmem ./internal/app/
|
|
//
|
|
// or target one:
|
|
//
|
|
// go test -bench=BenchmarkViewportRenderer_PlainASCII -benchmem ./internal/app/
|
|
//
|
|
// The fixtures below model the workloads we care about most:
|
|
//
|
|
// - PlainASCII: long-running text output (claude streaming a code
|
|
// diff, codex outputting a tool result body). Fast-path territory.
|
|
// - StyledLines: SGR-heavy output (claude/codex chat history with
|
|
// coloured tokens). State-machine path.
|
|
// - RatatuiBurst: many short cursor-positioning / SGR transitions in
|
|
// a tight chunk, matching codex/ratatui's incremental diff
|
|
// updates.
|
|
// - SnapshotReplay: full styled-grid replay (focus switch).
|
|
|
|
// buildPlainASCIIChunk produces exactly n bytes of printable ASCII by
// repeating a fixed 55-byte sentence. A '\n' is inserted after a
// repetition whenever the running length modulo 80 is smaller than
// the sentence length, which yields the occasional line break. The
// result is truncated to n bytes, so it may end mid-sentence. This is
// the cheapest workload and is intended to exercise the fast path in
// viewport_renderer.Render.
func buildPlainASCIIChunk(n int) []byte {
	const sentence = "The quick brown fox jumps over the lazy dog 0123456789 "

	var sb strings.Builder
	sb.Grow(n)
	for sb.Len() < n {
		sb.WriteString(sentence)
		if sb.Len()%80 >= len(sentence) {
			continue
		}
		sb.WriteByte('\n')
	}

	// The loop overshoots; trim back to the requested size.
	text := sb.String()
	return []byte(text[:n])
}
|
|
|
|
// buildStyledLinesChunk produces exactly n bytes of SGR-heavy output.
// Each token is a keyword wrapped in a colour-set escape and followed
// by an SGR reset plus a space; colours and keywords cycle
// independently. A '\n' follows every tenth token. The result is
// truncated to n bytes, so the chunk may end partway through an
// escape sequence. The intent is that the renderer leaves its fast
// path at every escape.
func buildStyledLinesChunk(n int) []byte {
	palette := [...]string{"31", "32", "33", "34", "35", "36"}
	keywords := [...]string{"package", "func", "return", "import", "struct", "type", "const", "var"}

	var sb strings.Builder
	sb.Grow(n)
	for tok := 0; sb.Len() < n; tok++ {
		colour := palette[tok%len(palette)]
		word := keywords[tok%len(keywords)]
		fmt.Fprintf(&sb, "\x1b[%sm%s\x1b[0m ", colour, word)
		if tok%10 == 9 {
			sb.WriteByte('\n')
		}
	}

	// The loop overshoots; trim back to the requested size.
	text := sb.String()
	return []byte(text[:n])
}
|
|
|
|
// buildRatatuiBurst builds one ratatui-style diff frame covering the
// given number of cells. For each cell it emits a cursor-position
// (CUP) escape, a foreground-colour SGR escape, and a single letter.
// Cells are laid out on an 80-column grid with 1-based coordinates;
// the colour cycles through 0-7 and the letter through A-Z. A final
// SGR reset terminates the frame, so a non-positive cell count yields
// just the reset sequence.
func buildRatatuiBurst(cells int) []byte {
	const gridWidth = 80

	var frame strings.Builder
	for cell := 0; cell < cells; cell++ {
		y := cell/gridWidth + 1
		x := cell%gridWidth + 1
		colour := cell % 8
		glyph := byte('A' + cell%26)
		fmt.Fprintf(&frame, "\x1b[%d;%dH\x1b[3%dm%c", y, x, colour, glyph)
	}
	frame.WriteString("\x1b[0m")

	return []byte(frame.String())
}
|
|
|
|
// BenchmarkViewportRenderer_PlainASCII drives a 16 KiB plain-text
|
|
// chunk through Render once per iteration. Reports ns/op,
|
|
// allocations, and B/op.
|
|
func BenchmarkViewportRenderer_PlainASCII(b *testing.B) {
|
|
chunk := buildPlainASCIIChunk(16 * 1024)
|
|
b.SetBytes(int64(len(chunk)))
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
vr := newViewportRenderer(newTerminalLayout(120, 40))
|
|
_ = vr.Render(chunk)
|
|
}
|
|
}
|
|
|
|
// BenchmarkViewportRenderer_StyledLines exercises the per-byte CSI
|
|
// path on SGR-heavy output. Most claude/codex chat resume traffic
|
|
// looks like this — coloured prose with frequent style toggles.
|
|
func BenchmarkViewportRenderer_StyledLines(b *testing.B) {
|
|
chunk := buildStyledLinesChunk(16 * 1024)
|
|
b.SetBytes(int64(len(chunk)))
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
vr := newViewportRenderer(newTerminalLayout(120, 40))
|
|
_ = vr.Render(chunk)
|
|
}
|
|
}
|
|
|
|
// BenchmarkViewportRenderer_RatatuiBurst measures the worst-case
|
|
// cursor-shuffling workload: full-frame diff updates dominated by
|
|
// CUP + SGR + single-char writes.
|
|
func BenchmarkViewportRenderer_RatatuiBurst(b *testing.B) {
|
|
chunk := buildRatatuiBurst(80 * 24) // one screenful of cells
|
|
b.SetBytes(int64(len(chunk)))
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
vr := newViewportRenderer(newTerminalLayout(120, 40))
|
|
_ = vr.Render(chunk)
|
|
}
|
|
}
|
|
|
|
// BenchmarkContainsOSC measures the OSC-gate fast path used by
|
|
// pumpChild before deciding whether to fire the per-chunk Title()
|
|
// CGO call. Inputs:
|
|
// - "hot": SGR-styled output without OSC — the common case for
|
|
// codex/ratatui. We want this near zero.
|
|
// - "cold": chunk with an OSC sequence in the middle.
|
|
func BenchmarkContainsOSC_NoOSC(b *testing.B) {
|
|
chunk := buildStyledLinesChunk(8 * 1024)
|
|
b.SetBytes(int64(len(chunk)))
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
_ = containsOSC(chunk)
|
|
}
|
|
}
|
|
|
|
func BenchmarkContainsOSC_WithOSC(b *testing.B) {
|
|
chunk := append(buildStyledLinesChunk(8*1024), []byte("\x1b]0;new title\x07")...)
|
|
b.SetBytes(int64(len(chunk)))
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
_ = containsOSC(chunk)
|
|
}
|
|
}
|
|
|
|
// BenchmarkRendererThroughput_ReuseInstance approximates real
|
|
// session behaviour: a single viewport renderer fed many chunks in
|
|
// sequence, no per-iteration allocation. Reports a throughput
|
|
// closer to the steady-state OnPTYOut path. Chunks are 4 KiB to
|
|
// match typical PTY read sizes; the renderer is reset every
|
|
// benchmark run.
|
|
func BenchmarkRendererThroughput_ReuseInstance(b *testing.B) {
|
|
chunks := make([][]byte, 16)
|
|
for i := range chunks {
|
|
chunks[i] = buildStyledLinesChunk(4 * 1024)
|
|
}
|
|
totalBytes := 0
|
|
for _, c := range chunks {
|
|
totalBytes += len(c)
|
|
}
|
|
b.SetBytes(int64(totalBytes))
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
vr := newViewportRenderer(newTerminalLayout(120, 40))
|
|
for _, c := range chunks {
|
|
_ = vr.Render(c)
|
|
}
|
|
}
|
|
}
|