package app import ( "fmt" "io" "strings" "testing" "github.com/hjbdev/patterm/internal/vt" ) // Benchmarks for patterm's hot paths. Run with: // // go test -bench=. -benchmem ./internal/app/ // // or target one: // // go test -bench=BenchmarkViewportRenderer_PlainASCII -benchmem ./internal/app/ // // The fixtures below model the three workloads we care about most: // // - PlainASCII: long-running text output (claude streaming a code // diff, codex outputting a tool result body). Fast-path territory. // - StyledLines: SGR-heavy output (claude/codex chat history with // coloured tokens). State-machine path. // - RatatuiBurst: many short cursor-positioning / SGR transitions in // a tight chunk, matching codex/ratatui's incremental diff // updates. // - SnapshotReplay: full styled-grid replay (focus switch). // buildPlainASCIIChunk returns a roughly N-byte chunk of pure // printable ASCII text with the occasional newline — the cheapest // workload, exercises the fast path in viewport_renderer.Render. func buildPlainASCIIChunk(n int) []byte { var b strings.Builder b.Grow(n) line := "The quick brown fox jumps over the lazy dog 0123456789 " for b.Len() < n { b.WriteString(line) if b.Len()%80 < len(line) { b.WriteByte('\n') } } return []byte(b.String()[:n]) } // buildStyledLinesChunk simulates SGR-heavy output: every word wears // a colour, so the renderer breaks out of its fast path on every // escape sequence. func buildStyledLinesChunk(n int) []byte { var b strings.Builder b.Grow(n) colours := []string{"31", "32", "33", "34", "35", "36"} words := []string{"package", "func", "return", "import", "struct", "type", "const", "var"} i := 0 for b.Len() < n { fmt.Fprintf(&b, "\x1b[%sm%s\x1b[0m ", colours[i%len(colours)], words[i%len(words)]) if i%10 == 9 { b.WriteByte('\n') } i++ } return []byte(b.String()[:n]) } // buildRatatuiBurst simulates a single ratatui-style diff frame: // CUP, SGR, a few chars, CUP, SGR, a few chars… for a viewport's // worth of cells. 
func buildRatatuiBurst(cells int) []byte { var b strings.Builder for i := 0; i < cells; i++ { row := (i / 80) + 1 col := (i % 80) + 1 fmt.Fprintf(&b, "\x1b[%d;%dH\x1b[3%dm%c", row, col, i%8, byte('A'+(i%26))) } b.WriteString("\x1b[0m") return []byte(b.String()) } // BenchmarkViewportRenderer_PlainASCII drives a 16 KiB plain-text // chunk through Render once per iteration. Reports ns/op, // allocations, and B/op. func BenchmarkViewportRenderer_PlainASCII(b *testing.B) { chunk := buildPlainASCIIChunk(16 * 1024) b.SetBytes(int64(len(chunk))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) _ = vr.Render(chunk) } } // BenchmarkViewportRenderer_StyledLines exercises the per-byte CSI // path on SGR-heavy output. Most claude/codex chat resume traffic // looks like this — coloured prose with frequent style toggles. func BenchmarkViewportRenderer_StyledLines(b *testing.B) { chunk := buildStyledLinesChunk(16 * 1024) b.SetBytes(int64(len(chunk))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) _ = vr.Render(chunk) } } // BenchmarkViewportRenderer_RatatuiBurst measures the worst-case // cursor-shuffling workload: full-frame diff updates dominated by // CUP + SGR + single-char writes. func BenchmarkViewportRenderer_RatatuiBurst(b *testing.B) { chunk := buildRatatuiBurst(80 * 24) // one screenful of cells b.SetBytes(int64(len(chunk))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) _ = vr.Render(chunk) } } // BenchmarkContainsOSC measures the OSC-gate fast path used by // pumpChild before deciding whether to fire the per-chunk Title() // CGO call. Inputs: // - "hot": SGR-styled output without OSC — the common case for // codex/ratatui. We want this near zero. // - "cold": chunk with an OSC sequence in the middle. 
func BenchmarkContainsOSC_NoOSC(b *testing.B) { chunk := buildStyledLinesChunk(8 * 1024) b.SetBytes(int64(len(chunk))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = containsOSC(chunk) } } func BenchmarkContainsOSC_WithOSC(b *testing.B) { chunk := append(buildStyledLinesChunk(8*1024), []byte("\x1b]0;new title\x07")...) b.SetBytes(int64(len(chunk))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = containsOSC(chunk) } } // BenchmarkRendererThroughput_ReuseInstance approximates real // session behaviour: a single viewport renderer fed many chunks in // sequence, no per-iteration allocation. Reports a throughput // closer to the steady-state OnPTYOut path. Chunks are 4 KiB to // match typical PTY read sizes; the renderer is reset every // benchmark run. func BenchmarkRendererThroughput_ReuseInstance(b *testing.B) { chunks := make([][]byte, 16) for i := range chunks { chunks[i] = buildStyledLinesChunk(4 * 1024) } totalBytes := 0 for _, c := range chunks { totalBytes += len(c) } b.SetBytes(int64(totalBytes)) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) for _, c := range chunks { _ = vr.Render(c) } } } // Stress workloads — these model the worst things a real session // can throw at us. The headline target is "ASCII video": every cell // of an 80x40 viewport carries an SGR colour change and a printable // character, rendered as one chunk per frame. Real ASCII-video CLIs // (ascii-image-converter, asciinema-render, towel.blinkenlights, the // Bad Apple meme) hit patterm with exactly this pattern at 24-30 fps // for minutes at a time. // // We synthesise the workload rather than ship a captured corpus so // the benchmarks stay deterministic and the repo doesn't carry tens // of MiB of fixture data. The encoding is faithful to what those // tools actually emit. // buildASCIIVideoFrame builds a single full-viewport frame with // 8-colour SGR per cell (`\x1b[3Nm`). 
// One frame ≈ 19 KiB for an
// 80x40 viewport: a 3-byte cursor-home prefix, 6 bytes per cell
// (5-byte SGR + 1 glyph) and a 6-byte SGR reset + CRLF per row.
func buildASCIIVideoFrame(cols, rows int) []byte {
	var sb strings.Builder
	sb.WriteString("\x1b[H") // home cursor before the frame starts
	for row := 0; row < rows; row++ {
		for col := 0; col < cols; col++ {
			colour := (row + col) % 8
			// Glyphs cycle through printable ASCII starting at space.
			glyph := byte(' ' + (row*col)%(0x7e-' '))
			fmt.Fprintf(&sb, "\x1b[3%dm%c", colour, glyph)
		}
		sb.WriteString("\x1b[0m\r\n")
	}
	return []byte(sb.String())
}

// buildASCIIVideoFrameTrueColor builds the same frame but with
// 24-bit RGB SGR (`\x1b[38;2;R;G;Bm`). Every cell is up to 19 bytes
// of escape + 1 byte glyph, so an 80x40 frame comes to a little
// under 60 KiB. This is what chafa --colors=full and modern terminal
// video players emit, and it's the heaviest SGR variant the
// renderer's CSI path sees.
func buildASCIIVideoFrameTrueColor(cols, rows int) []byte {
	var sb strings.Builder
	sb.WriteString("\x1b[H")
	for row := 0; row < rows; row++ {
		red := (row * 7) % 256 // depends on the row only
		for col := 0; col < cols; col++ {
			green := (col * 11) % 256
			blue := ((row + col) * 13) % 256
			glyph := byte(' ' + (row*col)%(0x7e-' '))
			fmt.Fprintf(&sb, "\x1b[38;2;%d;%d;%dm%c", red, green, blue, glyph)
		}
		sb.WriteString("\x1b[0m\r\n")
	}
	return []byte(sb.String())
}

// buildBadApplePattern builds the simplest possible ASCII video
// frame: a checkerboard of full-block glyphs alternating between
// SGR 37 (white) and SGR 30 (black) — the Bad Apple meme is
// essentially a 1-bit silhouette video. This is the pattern that
// stresses the SGR state-machine without exercising truecolor parse
// — useful for isolating "is the cost in the colour parsing or in
// the cell-by-cell switching?"
func buildBadApplePattern(cols, rows int) []byte {
	const (
		whiteCell = "\x1b[37m█"
		blackCell = "\x1b[30m█"
	)

	var sb strings.Builder
	sb.WriteString("\x1b[H")
	for row := 0; row < rows; row++ {
		for col := 0; col < cols; col++ {
			cell := blackCell
			if (row+col)%2 == 0 {
				cell = whiteCell
			}
			sb.WriteString(cell)
		}
		sb.WriteString("\x1b[0m\r\n")
	}
	return []byte(sb.String())
}

// BenchmarkASCIIVideo_Frame_8Color renders a single full-screen
// frame as one chunk. The headline number is MB/s — at 30 fps a
// frame is one PTY chunk every ~33 ms, so this should comfortably
// stay well under 1 ms.
func BenchmarkASCIIVideo_Frame_8Color(b *testing.B) { frame := buildASCIIVideoFrame(80, 40) b.SetBytes(int64(len(frame))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) _ = vr.Render(frame) } } // BenchmarkASCIIVideo_Frame_TrueColor renders a single truecolor // frame. ~70 KiB per frame. Compare this to the 8-colour number to // see how much extra cost the truecolor SGR parse imposes — the // `\x1b[38;2;R;G;Bm` form is the longest and most parameter-rich // CSI patterm sees in practice. func BenchmarkASCIIVideo_Frame_TrueColor(b *testing.B) { frame := buildASCIIVideoFrameTrueColor(80, 40) b.SetBytes(int64(len(frame))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) _ = vr.Render(frame) } } // BenchmarkASCIIVideo_Frame_BadApple is the 1-bit pattern: simplest // SGR (two colours, alternating). Isolates the renderer's cell-by- // cell SGR cycling cost from the truecolor parse cost. func BenchmarkASCIIVideo_Frame_BadApple(b *testing.B) { frame := buildBadApplePattern(80, 40) b.SetBytes(int64(len(frame))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) _ = vr.Render(frame) } } // runStreamBench is the shared body for the per-fps stream // benchmarks. It feeds a fixed frame N times through a single // renderer instance and reports µs/frame + an achievable-fps // ceiling alongside the standard ns/op + MB/s. The fps value in // the benchmark name is the *target* — the workload itself doesn't // rate-limit; we just decide how many frames make a benchmark op // (3 seconds' worth) so steady-state cost dominates warm-up. 
func runStreamBench(b *testing.B, frame []byte, fps int) { frames := fps * 3 // 3 seconds at the target rate totalBytes := int64(len(frame) * frames) b.SetBytes(totalBytes) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) for f := 0; f < frames; f++ { _ = vr.Render(frame) } } nsPerFrame := float64(b.Elapsed().Nanoseconds()) / float64(b.N*frames) b.ReportMetric(nsPerFrame/1000.0, "µs/frame") b.ReportMetric(1e9/nsPerFrame, "fps_ceiling") // budget_pct = how much of the per-frame budget at the target // rate we burn. Under 100 means we can hit the target; over // means we can't. budgetNs := 1e9 / float64(fps) b.ReportMetric(nsPerFrame/budgetNs*100, "budget_pct") } // BenchmarkASCIIVideo_Stream_8Color_30fps / _60fps / _120fps reuse // one renderer across (3 × fps) frames. The headline numbers are // µs/frame, fps_ceiling (= 1e9 / ns/frame), and budget_pct (= // percent of the per-frame budget at the target rate we consume). // // 30 fps is the typical ASCII-video baseline (towel, chafa, Bad // Apple ports). 60 is the "smooth playback" target. 120 is a // future-proofing stress level matching modern high-refresh // terminals. func BenchmarkASCIIVideo_Stream_8Color_30fps(b *testing.B) { runStreamBench(b, buildASCIIVideoFrame(80, 40), 30) } func BenchmarkASCIIVideo_Stream_8Color_60fps(b *testing.B) { runStreamBench(b, buildASCIIVideoFrame(80, 40), 60) } func BenchmarkASCIIVideo_Stream_8Color_120fps(b *testing.B) { runStreamBench(b, buildASCIIVideoFrame(80, 40), 120) } // BenchmarkASCIIVideo_Stream_TrueColor_* same set but with the // truecolor frames. Compare against the 8-colour numbers to see // what the longer `\x1b[38;2;R;G;Bm` parse costs us. 
func BenchmarkASCIIVideo_Stream_TrueColor_30fps(b *testing.B) { runStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 30) } func BenchmarkASCIIVideo_Stream_TrueColor_60fps(b *testing.B) { runStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 60) } func BenchmarkASCIIVideo_Stream_TrueColor_120fps(b *testing.B) { runStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 120) } // BenchmarkASCIIVideo_Stream_BadApple_* tracks the 1-bit alternating // pattern. Isolates per-cell SGR cycling cost from the truecolor // parse cost above — useful when reading the diff between the two // stream variants. func BenchmarkASCIIVideo_Stream_BadApple_30fps(b *testing.B) { runStreamBench(b, buildBadApplePattern(80, 40), 30) } func BenchmarkASCIIVideo_Stream_BadApple_60fps(b *testing.B) { runStreamBench(b, buildBadApplePattern(80, 40), 60) } func BenchmarkASCIIVideo_Stream_BadApple_120fps(b *testing.B) { runStreamBench(b, buildBadApplePattern(80, 40), 120) } // BenchmarkEmulator_Write_8Color / _TrueColor isolate the // libghostty-vt CGO cost — same frames the Pipeline benchmarks use, // but feeding only the emulator. The delta between this and // BenchmarkASCIIVideo_Stream_… is the renderer's share; the rest // is libghostty-vt. 
func BenchmarkEmulator_Write_8Color_Frame(b *testing.B) { frame := buildASCIIVideoFrame(80, 40) b.SetBytes(int64(len(frame))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { em, err := vt.NewGhosttyEmulator(80, 40) if err != nil { b.Fatalf("emulator: %v", err) } if _, werr := em.Write(frame); werr != nil { b.Fatalf("emulator.Write: %v", werr) } _ = em.Close() } } func BenchmarkEmulator_Write_TrueColor_Frame(b *testing.B) { frame := buildASCIIVideoFrameTrueColor(80, 40) b.SetBytes(int64(len(frame))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { em, err := vt.NewGhosttyEmulator(80, 40) if err != nil { b.Fatalf("emulator: %v", err) } if _, werr := em.Write(frame); werr != nil { b.Fatalf("emulator.Write: %v", werr) } _ = em.Close() } } // BenchmarkEmulator_Write_Stream_120fps reuses one emulator across // 360 frames (3 sec × 120 fps). This is the cleanest measurement // of em.Write steady-state cost. func BenchmarkEmulator_Write_Stream_8Color_120fps(b *testing.B) { frame := buildASCIIVideoFrame(80, 40) const frames = 360 b.SetBytes(int64(len(frame) * frames)) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { em, err := vt.NewGhosttyEmulator(80, 40) if err != nil { b.Fatalf("emulator: %v", err) } for f := 0; f < frames; f++ { if _, werr := em.Write(frame); werr != nil { b.Fatalf("emulator.Write: %v", werr) } } _ = em.Close() } nsPerFrame := float64(b.Elapsed().Nanoseconds()) / float64(b.N*frames) b.ReportMetric(nsPerFrame/1000.0, "µs/frame") b.ReportMetric(1e9/nsPerFrame, "fps_ceiling") } func BenchmarkEmulator_Write_Stream_TrueColor_120fps(b *testing.B) { frame := buildASCIIVideoFrameTrueColor(80, 40) const frames = 360 b.SetBytes(int64(len(frame) * frames)) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { em, err := vt.NewGhosttyEmulator(80, 40) if err != nil { b.Fatalf("emulator: %v", err) } for f := 0; f < frames; f++ { if _, werr := em.Write(frame); werr != nil { b.Fatalf("emulator.Write: %v", werr) } } _ = em.Close() } 
nsPerFrame := float64(b.Elapsed().Nanoseconds()) / float64(b.N*frames) b.ReportMetric(nsPerFrame/1000.0, "µs/frame") b.ReportMetric(1e9/nsPerFrame, "fps_ceiling") } // runPipelineStreamBench includes the libghostty-vt emulator.Write // CGO call and a stdout write to io.Discard alongside the renderer // — i.e. everything OnPTYOut does in production except the host // terminal's own paint time (which patterm doesn't control). This // is the honest "can we hit N fps end-to-end?" measurement. func runPipelineStreamBench(b *testing.B, frame []byte, fps int) { frames := fps * 3 totalBytes := int64(len(frame) * frames) b.SetBytes(totalBytes) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { em, err := vt.NewGhosttyEmulator(80, 40) if err != nil { b.Fatalf("emulator: %v", err) } vr := newViewportRenderer(newTerminalLayout(120, 40)) for f := 0; f < frames; f++ { if _, werr := em.Write(frame); werr != nil { b.Fatalf("emulator.Write: %v", werr) } out := vr.Render(frame) // Match OnPTYOut's autowrap prelude/postlude wrapping so // the byte count is faithful. _, _ = io.Discard.Write([]byte("\x1b[?7l")) _, _ = io.Discard.Write(out) _, _ = io.Discard.Write([]byte("\x1b[?7h")) } _ = em.Close() } nsPerFrame := float64(b.Elapsed().Nanoseconds()) / float64(b.N*frames) b.ReportMetric(nsPerFrame/1000.0, "µs/frame") b.ReportMetric(1e9/nsPerFrame, "fps_ceiling") budgetNs := 1e9 / float64(fps) b.ReportMetric(nsPerFrame/budgetNs*100, "budget_pct") } // BenchmarkPipeline_ASCIIVideo_* — the FULL OnPTYOut path // (emulator.Write CGO + viewport renderer + a stdout write to // io.Discard) running at 30/60/120 fps targets. These are the // numbers to trust when asking "can we sustain N fps?" The // renderer-only Stream benchmarks above isolate one stage and // understate the real cost. // // 120 fps is the explicit baseline: anything under 100% of the // per-frame budget here means we hit 120 fps with margin to spare. 
func BenchmarkPipeline_ASCIIVideo_8Color_30fps(b *testing.B) { runPipelineStreamBench(b, buildASCIIVideoFrame(80, 40), 30) } func BenchmarkPipeline_ASCIIVideo_8Color_60fps(b *testing.B) { runPipelineStreamBench(b, buildASCIIVideoFrame(80, 40), 60) } func BenchmarkPipeline_ASCIIVideo_8Color_120fps(b *testing.B) { runPipelineStreamBench(b, buildASCIIVideoFrame(80, 40), 120) } func BenchmarkPipeline_ASCIIVideo_TrueColor_30fps(b *testing.B) { runPipelineStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 30) } func BenchmarkPipeline_ASCIIVideo_TrueColor_60fps(b *testing.B) { runPipelineStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 60) } func BenchmarkPipeline_ASCIIVideo_TrueColor_120fps(b *testing.B) { runPipelineStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 120) } // BenchmarkSessionResume_5MiBStyled simulates the user's // motivating case: claude resuming a long chat session and dumping // the whole history. 5 MiB of styled output as a single Render // call. Numbers here tell us how long the visible "scrolling // while resume loads" window will be. func BenchmarkSessionResume_5MiBStyled(b *testing.B) { chunk := buildStyledLinesChunk(5 * 1024 * 1024) b.SetBytes(int64(len(chunk))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) _ = vr.Render(chunk) } } // BenchmarkSessionResume_5MiBPlain same as above but pure text. // Lower bound — what we'd hit if the resume content were styling- // free. func BenchmarkSessionResume_5MiBPlain(b *testing.B) { chunk := buildPlainASCIIChunk(5 * 1024 * 1024) b.SetBytes(int64(len(chunk))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { vr := newViewportRenderer(newTerminalLayout(120, 40)) _ = vr.Render(chunk) } }