diff --git a/CHANGELOG.md b/CHANGELOG.md index aef03f1..21b6d52 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,12 +53,29 @@ loosely follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). `$XDG_STATE_HOME/patterm/debug/YYYYMMDD-HHMMSS` is used; pass an explicit path to override. All output goes to files — stdout/stderr are untouched. -- `--profile[=DIR]` flag captures pprof data for performance work: - `cpu.pprof` (running for the lifetime of the session), plus - `heap.pprof` and `goroutine.pprof` snapshots written on shutdown. - Defaults to `$XDG_STATE_HOME/patterm/profile/YYYYMMDD-HHMMSS`. - All diagnostics (startup, errors) are written to `profile.log` +- `--profile[=DIR]` flag captures pprof data plus concrete + performance counters for performance work: `cpu.pprof` (running + for the lifetime of the session), plus `heap.pprof` and + `goroutine.pprof` snapshots written on shutdown; alongside them, + a per-hot-path metrics tracker writes `metrics.jsonl` (one row + per second with chunk/byte rates, per-stage mean and max + latencies, and cache hit rates) plus a final `metrics.json` + aggregate and a human-readable `summary.txt` on exit. + Instrumented hot paths: `OnPTYOut`, viewport `renderer.Render`, + host stdout writes, libghostty-vt `emulator.Write` / `Title`, + sidebar / tab bar / status line draws (with cache-hit + accounting), snapshot replays, and the chrome ticker (so you can + see how often it fires with nothing to do). Defaults to + `$XDG_STATE_HOME/patterm/profile/YYYYMMDD-HHMMSS`. All + diagnostics (startup, errors) are written to `profile.log` inside the dir, never to stdout/stderr. +- Renderer benchmark suite (`internal/app/bench_test.go`). Three + workload fixtures — plain ASCII, SGR-styled lines, and a + ratatui-style cursor-shuffling burst — plus an OSC-gate + micro-benchmark. Run via `go test -bench=. -benchmem + ./internal/app/`. Gives a stable reference for the per-chunk + cost of the viewport renderer so future changes can be compared + apples-to-apples. - "New Terminal" entry in the command palette spawns a bare interactive `$SHELL` pane (kind `terminal`). Unlike "Run process: …" presets, which are session-persistent and reachable via `restart_process`, diff --git a/TODO.md b/TODO.md index c942441..58d2544 100644 --- a/TODO.md +++ b/TODO.md @@ -2,6 +2,44 @@ Findings from a codebase sweep — not user-reported, needs review before action. Each item names the anchor and a sketched fix. +Baseline benchmark numbers (`go test -bench=. ./internal/app/`, AMD +Ryzen 7 7800X3D): + +``` +ViewportRenderer_PlainASCII 229 MB/s 1.3 KB/op 6 allocs/op +ViewportRenderer_StyledLines 89 MB/s 91 KB/op 4325 allocs/op +ViewportRenderer_RatatuiBurst 40 MB/s 365 KB/op 17306 allocs/op +RendererThroughput_ReuseInstance 90 MB/s 316 KB/op 17380 allocs/op +ContainsOSC_NoOSC 3050 MB/s 0 B/op 0 allocs/op +``` + +- [ ] **viewport renderer allocates ~1 alloc per 4 input bytes on SGR/CSI-heavy chunks.** [MEDIUM] + - `internal/app/viewport_renderer.go` — the styled-lines and + ratatui benchmarks show 4-17k allocs per chunk. The hot + contributors are likely (a) `string(vr.buf)` / `string(params)` + conversions in `emitCSI` for every escape sequence, (b) the + `pending strings.Builder` resizing as fragments arrive, and (c) + `vr.shifter.Shift(vr.buf)` returning a fresh slice per CSI. + - Fix direction: switch CSI param parsing to byte-slice + comparison (no string conversion); reuse `vr.buf` and + `vr.pending` backing arrays across `Render` calls by + pre-growing in `newViewportRenderer`; have `cursorShifter.Shift` + return into a caller-owned buffer instead of allocating. + Profile-guided: run the styled-lines bench, point pprof at the + allocs profile, fix the top three call sites. + +- [ ] **viewport renderer throughput (~90 MB/s styled) limits codex steady-state.** [MEDIUM] + - The styled-lines and ratatui benchmarks come in at 89 MB/s and + 40 MB/s respectively. A 100 KB/s codex burst is far under that + limit, but a session-resume dump of a 5 MiB chat history takes + 50-130 ms of pure renderer time at those rates — enough to be + user-visible at the start of a long resume. + - Fix direction: same as the alloc fix above; once the per-call + allocation cost drops, the throughput ceiling rises with it. + Worth re-running the benches after fixing the allocs and only + investing further if the styled-lines bench is still under + ~300 MB/s. + - [ ] **Session.Children() allocates a fresh slice on every call.** [MEDIUM] - `internal/app/session.go:530-541` walks `s.order` under `s.mu` and builds a new `[]*Child` slice every time. Callers on hot paths: diff --git a/cmd/patterm/main.go b/cmd/patterm/main.go index cad1935..3cd2dce 100644 --- a/cmd/patterm/main.go +++ b/cmd/patterm/main.go @@ -53,7 +53,7 @@ func main() { projectDir = flag.String("project", "", "project directory (default $PWD)") showVersion = flag.Bool("version", false, "print version and exit") debugDir = flag.String("debug", "", "write debug logs + per-child raw PTY output to DIR (auto-picks a dated subdir under $XDG_STATE_HOME/patterm/debug when DIR is omitted)") - profileDir = flag.String("profile", "", "write CPU+heap+goroutine pprof files to DIR (auto-picks a dated subdir under $XDG_STATE_HOME/patterm/profile when DIR is omitted)") + profileDir = flag.String("profile", "", "write pprof files (cpu/heap/goroutine) and live perf counters (metrics.jsonl per-second, metrics.json + summary.txt on exit) to DIR (auto-picks a dated subdir under $XDG_STATE_HOME/patterm/profile when DIR is omitted)") ) // Allow bare `--debug` / `--profile` with no value — pflag treats // them as boolean-shaped strings, picking a sensible default dir. @@ -99,6 +99,7 @@ func main() { ProjectDir: cwd, ProjectKey: key, DebugDir: resolvedDebug, + ProfileDir: resolvedProfile, }); err != nil { die("%v", err) } diff --git a/internal/app/app.go b/internal/app/app.go index a4f1034..68dbc06 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -34,6 +34,12 @@ type Options struct { // /.raw. The dir is created if missing. Events // (spawn / exit / state change) land in /events.jsonl. DebugDir string + // ProfileDir, when non-empty, enables in-process performance + // counters. patterm writes a per-second JSONL snapshot stream to + // /metrics.jsonl, a final aggregate to metrics.json, + // and a human-readable summary.txt on shutdown. The pprof files + // written by --profile sit alongside these in the same dir. + ProfileDir string } const keyCtrlK byte = 0x0b @@ -134,6 +140,18 @@ func Run(ctx context.Context, opts Options) error { ctx, cancel := context.WithCancel(ctx) defer cancel() + // Performance tracker — instrumented hot-path timings written to + // . nil when --profile is off, in which case every + // record*() call is a fast nil check. + metrics, err := newMetricsTracker(opts.ProfileDir) + if err != nil { + return fmt.Errorf("app: metrics tracker: %w", err) + } + if metrics != nil { + go metrics.run(ctx) + defer metrics.close() + } + // Per-session idle-detection classifier. One goroutine ticks every // 250ms over every live child and updates IdleState. It stops when // ctx is cancelled. @@ -150,7 +168,9 @@ func Run(ctx context.Context, opts Options) error { hostCols: cols, hostRows: rows, stdinTTY: term.IsTerminal(int(os.Stdin.Fd())), + metrics: metrics, } + sess.SetMetrics(metrics) host.attention = st host.focus = st host.prompter = st @@ -271,7 +291,9 @@ func Run(ctx context.Context, opts Options) error { } chromeChanged := st.chromeDirty.Swap(false) sidebarChanged := st.sidebarDirty.Swap(false) - if !chromeChanged && !sidebarChanged { + didWork := chromeChanged || sidebarChanged + st.metrics.recordTickerFire(didWork) + if !didWork { continue } if chromeChanged { @@ -383,6 +405,11 @@ type uiState struct { hostCols, hostRows uint16 stdinTTY bool + // metrics is the optional performance tracker. nil when --profile + // is off. Hot paths call metrics.recordX which is a fast nil + // check on the disabled path. + metrics *metricsTracker + // chromeCacheMu guards the last-rendered byte cache for each chrome // element. The tab bar, sidebar, and status line all repaint on // many state changes and on every PTY chunk, but their content @@ -787,6 +814,10 @@ func (st *uiState) scheduleAutoRestart(c *Child) { // disabled only around the replay so long styled runs cannot wrap into // the right rail. func (st *uiState) OnPTYOut(childID string, chunk []byte) { + var entry time.Time + if st.metrics != nil { + entry = time.Now() + } layout := st.layoutSnapshot() st.mu.Lock() focus := st.focusedID @@ -803,16 +834,31 @@ func (st *uiState) OnPTYOut(childID string, chunk []byte) { } st.mu.Unlock() if palOpen || focus != childID || renderer == nil { + st.metrics.recordPTYOutDrop() return } var out []byte if forceRepaint { + var snapStart time.Time + if st.metrics != nil { + snapStart = time.Now() + } out = st.renderFocusedSnapshot(childID, renderer, layout) + if st.metrics != nil { + st.metrics.recordSnapshot(time.Since(snapStart)) + } if len(out) == 0 { return } } else { + var rstart time.Time + if st.metrics != nil { + rstart = time.Now() + } out = renderer.Render(chunk) + if st.metrics != nil { + st.metrics.recordRender(time.Since(rstart)) + } } // One write covers the autowrap-disable prelude, the chunk, and the // autowrap-restore postlude — three syscalls collapsed into one @@ -822,9 +868,16 @@ func (st *uiState) OnPTYOut(childID string, chunk []byte) { wrapped = append(wrapped, "\x1b[?7l"...) wrapped = append(wrapped, out...) wrapped = append(wrapped, "\x1b[?7h"...) + var wstart time.Time + if st.metrics != nil { + wstart = time.Now() + } st.outMu.Lock() _, _ = os.Stdout.Write(wrapped) st.outMu.Unlock() + if st.metrics != nil { + st.metrics.recordStdout(time.Since(wstart), len(wrapped)) + } // RI / IND / NEL / SU / SD / IL / DL and bottom-margin LF / VT / FF // scroll content within the host's scroll region, which spans every // column — so any of them drags the right-hand sidebar's session-tree @@ -851,6 +904,9 @@ func (st *uiState) OnPTYOut(childID string, chunk []byte) { // avoiding the string build, FindChild, and locking on every // chunk pulls steady-state CPU off the hot path. st.markChromeDirty() + if st.metrics != nil { + st.metrics.recordPTYOut(time.Since(entry), len(chunk)) + } } func (st *uiState) enterScreen() { @@ -990,6 +1046,10 @@ func (st *uiState) renderPaletteLocked() { // attention ask. Right side: palette hint. The PTY child occupies // host_rows-1 rows so this row is exclusively ours. func (st *uiState) drawStatusLine() { + var entry time.Time + if st.metrics != nil { + entry = time.Now() + } st.mu.Lock() palOpen := st.palette != nil focusID := st.focusedID @@ -1076,10 +1136,16 @@ func (st *uiState) drawStatusLine() { st.chromeCacheMu.Lock() if line == st.statusLineCache { st.chromeCacheMu.Unlock() + if st.metrics != nil { + st.metrics.recordStatus(time.Since(entry), true) + } return } st.statusLineCache = line st.chromeCacheMu.Unlock() + if st.metrics != nil { + defer func() { st.metrics.recordStatus(time.Since(entry), false) }() + } st.outMu.Lock() defer st.outMu.Unlock() diff --git a/internal/app/bench_test.go b/internal/app/bench_test.go new file mode 100644 index 0000000..2c30c9f --- /dev/null +++ b/internal/app/bench_test.go @@ -0,0 +1,169 @@ +package app + +import ( + "fmt" + "strings" + "testing" +) + +// Benchmarks for patterm's hot paths. Run with: +// +// go test -bench=. -benchmem ./internal/app/ +// +// or target one: +// +// go test -bench=BenchmarkViewportRenderer_PlainASCII -benchmem ./internal/app/ +// +// The fixtures below model the three workloads we care about most: +// +// - PlainASCII: long-running text output (claude streaming a code +// diff, codex outputting a tool result body). Fast-path territory. +// - StyledLines: SGR-heavy output (claude/codex chat history with +// coloured tokens). State-machine path. +// - RatatuiBurst: many short cursor-positioning / SGR transitions in +// a tight chunk, matching codex/ratatui's incremental diff +// updates. +// - SnapshotReplay: full styled-grid replay (focus switch). + +// buildPlainASCIIChunk returns a roughly N-byte chunk of pure +// printable ASCII text with the occasional newline — the cheapest +// workload, exercises the fast path in viewport_renderer.Render. +func buildPlainASCIIChunk(n int) []byte { + var b strings.Builder + b.Grow(n) + line := "The quick brown fox jumps over the lazy dog 0123456789 " + for b.Len() < n { + b.WriteString(line) + if b.Len()%80 < len(line) { + b.WriteByte('\n') + } + } + return []byte(b.String()[:n]) +} + +// buildStyledLinesChunk simulates SGR-heavy output: every word wears +// a colour, so the renderer breaks out of its fast path on every +// escape sequence. +func buildStyledLinesChunk(n int) []byte { + var b strings.Builder + b.Grow(n) + colours := []string{"31", "32", "33", "34", "35", "36"} + words := []string{"package", "func", "return", "import", "struct", "type", "const", "var"} + i := 0 + for b.Len() < n { + fmt.Fprintf(&b, "\x1b[%sm%s\x1b[0m ", colours[i%len(colours)], words[i%len(words)]) + if i%10 == 9 { + b.WriteByte('\n') + } + i++ + } + return []byte(b.String()[:n]) +} + +// buildRatatuiBurst simulates a single ratatui-style diff frame: +// CUP, SGR, a few chars, CUP, SGR, a few chars… for a viewport's +// worth of cells. +func buildRatatuiBurst(cells int) []byte { + var b strings.Builder + for i := 0; i < cells; i++ { + row := (i / 80) + 1 + col := (i % 80) + 1 + fmt.Fprintf(&b, "\x1b[%d;%dH\x1b[3%dm%c", row, col, i%8, byte('A'+(i%26))) + } + b.WriteString("\x1b[0m") + return []byte(b.String()) +} + +// BenchmarkViewportRenderer_PlainASCII drives a 16 KiB plain-text +// chunk through Render once per iteration. Reports ns/op, +// allocations, and B/op. +func BenchmarkViewportRenderer_PlainASCII(b *testing.B) { + chunk := buildPlainASCIIChunk(16 * 1024) + b.SetBytes(int64(len(chunk))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + vr := newViewportRenderer(newTerminalLayout(120, 40)) + _ = vr.Render(chunk) + } +} + +// BenchmarkViewportRenderer_StyledLines exercises the per-byte CSI +// path on SGR-heavy output. Most claude/codex chat resume traffic +// looks like this — coloured prose with frequent style toggles. +func BenchmarkViewportRenderer_StyledLines(b *testing.B) { + chunk := buildStyledLinesChunk(16 * 1024) + b.SetBytes(int64(len(chunk))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + vr := newViewportRenderer(newTerminalLayout(120, 40)) + _ = vr.Render(chunk) + } +} + +// BenchmarkViewportRenderer_RatatuiBurst measures the worst-case +// cursor-shuffling workload: full-frame diff updates dominated by +// CUP + SGR + single-char writes. +func BenchmarkViewportRenderer_RatatuiBurst(b *testing.B) { + chunk := buildRatatuiBurst(80 * 24) // one screenful of cells + b.SetBytes(int64(len(chunk))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + vr := newViewportRenderer(newTerminalLayout(120, 40)) + _ = vr.Render(chunk) + } +} + +// BenchmarkContainsOSC measures the OSC-gate fast path used by +// pumpChild before deciding whether to fire the per-chunk Title() +// CGO call. Inputs: +// - "hot": SGR-styled output without OSC — the common case for +// codex/ratatui. We want this near zero. +// - "cold": chunk with an OSC sequence in the middle. +func BenchmarkContainsOSC_NoOSC(b *testing.B) { + chunk := buildStyledLinesChunk(8 * 1024) + b.SetBytes(int64(len(chunk))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = containsOSC(chunk) + } +} + +func BenchmarkContainsOSC_WithOSC(b *testing.B) { + chunk := append(buildStyledLinesChunk(8*1024), []byte("\x1b]0;new title\x07")...) + b.SetBytes(int64(len(chunk))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = containsOSC(chunk) + } +} + +// BenchmarkRendererThroughput_ReuseInstance approximates real +// session behaviour: a single viewport renderer fed many chunks in +// sequence, no per-iteration allocation. Reports a throughput +// closer to the steady-state OnPTYOut path. Chunks are 4 KiB to +// match typical PTY read sizes; the renderer is reset every +// benchmark run. +func BenchmarkRendererThroughput_ReuseInstance(b *testing.B) { + chunks := make([][]byte, 16) + for i := range chunks { + chunks[i] = buildStyledLinesChunk(4 * 1024) + } + totalBytes := 0 + for _, c := range chunks { + totalBytes += len(c) + } + b.SetBytes(int64(totalBytes)) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + vr := newViewportRenderer(newTerminalLayout(120, 40)) + for _, c := range chunks { + _ = vr.Render(c) + } + } +} diff --git a/internal/app/metrics.go b/internal/app/metrics.go new file mode 100644 index 0000000..f4706e8 --- /dev/null +++ b/internal/app/metrics.go @@ -0,0 +1,462 @@ +package app + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "sync/atomic" + "time" +) + +// metricsTracker collects per-hot-path counters and timings. All +// fields are atomic so callers can record from the per-PTY-chunk path +// without taking a lock. Enabled only when --profile is set. +// +// Sampled rates ("X per second", "p99 latency") are not tracked here +// directly — the snapshotter goroutine writes a row to metrics.jsonl +// every second, and analysis tools compute rates from the deltas. +// Aggregate totals are written to metrics.json on shutdown. +type metricsTracker struct { + startedAt time.Time + + // PTY chunk arrival → stdout write pipeline (per OnPTYOut call). + ptyChunks atomic.Int64 + ptyBytes atomic.Int64 + onPTYOutNs atomic.Int64 + onPTYOutMaxNs atomic.Int64 + onPTYOutDrops atomic.Int64 // chunks for non-focused children — fast-path returns + stdoutWrites atomic.Int64 + stdoutBytes atomic.Int64 + stdoutNs atomic.Int64 + stdoutMaxNs atomic.Int64 + + // Viewport renderer (state-machine over child PTY bytes). + renderCalls atomic.Int64 + renderNs atomic.Int64 + renderMaxNs atomic.Int64 + + // CGO into libghostty-vt (counted from pumpChild). + emuWriteCalls atomic.Int64 + emuWriteNs atomic.Int64 + emuWriteMaxNs atomic.Int64 + emuTitleCalls atomic.Int64 + emuTitleNs atomic.Int64 + emuTitleSkips atomic.Int64 // OSC-gate fast path — title poll skipped + + // Chrome paint pipeline. + sidebarDraws atomic.Int64 + sidebarCacheHits atomic.Int64 + sidebarNs atomic.Int64 + sidebarMaxNs atomic.Int64 + + tabbarDraws atomic.Int64 + tabbarCacheHits atomic.Int64 + tabbarNs atomic.Int64 + + statusDraws atomic.Int64 + statusCacheHits atomic.Int64 + statusNs atomic.Int64 + + // Snapshot replay (focus / spawn / nudge). + snapshotReplays atomic.Int64 + snapshotNs atomic.Int64 + snapshotMaxNs atomic.Int64 + + // Chrome ticker — distinguishes useful work from idle wakeups. + tickerFires atomic.Int64 + tickerIdleFires atomic.Int64 // nothing dirty when the ticker fired + + // Output destination (set when enabled). + rowFile *os.File // metrics.jsonl + dir string +} + +// newMetricsTracker creates an enabled tracker writing to /. +// Returns nil + nil err if dir is empty (feature off). Caller must +// call tracker.run(ctx) in a goroutine and tracker.close() at exit. +func newMetricsTracker(dir string) (*metricsTracker, error) { + if dir == "" { + return nil, nil + } + if err := os.MkdirAll(dir, 0o700); err != nil { + return nil, err + } + row, err := os.Create(filepath.Join(dir, "metrics.jsonl")) + if err != nil { + return nil, err + } + return &metricsTracker{ + startedAt: time.Now(), + rowFile: row, + dir: dir, + }, nil +} + +// observeMax updates dst to max(dst, v) using a CAS loop. Atomic max +// isn't a hardware primitive on most CPUs; this is the standard idiom. +// Spurious wakeups can race but the result settles at the true max. +func observeMax(dst *atomic.Int64, v int64) { + for { + old := dst.Load() + if v <= old { + return + } + if dst.CompareAndSwap(old, v) { + return + } + } +} + +// recordPTYOut is called once at the end of each OnPTYOut invocation. +// `dur` is the full per-chunk wall time (renderer + stdout + chrome +// signals); `bytes` is the chunk's byte count. +func (m *metricsTracker) recordPTYOut(dur time.Duration, bytes int) { + if m == nil { + return + } + m.ptyChunks.Add(1) + m.ptyBytes.Add(int64(bytes)) + ns := dur.Nanoseconds() + m.onPTYOutNs.Add(ns) + observeMax(&m.onPTYOutMaxNs, ns) +} + +func (m *metricsTracker) recordPTYOutDrop() { + if m == nil { + return + } + m.onPTYOutDrops.Add(1) +} + +func (m *metricsTracker) recordRender(dur time.Duration) { + if m == nil { + return + } + m.renderCalls.Add(1) + ns := dur.Nanoseconds() + m.renderNs.Add(ns) + observeMax(&m.renderMaxNs, ns) +} + +func (m *metricsTracker) recordStdout(dur time.Duration, bytes int) { + if m == nil { + return + } + m.stdoutWrites.Add(1) + m.stdoutBytes.Add(int64(bytes)) + ns := dur.Nanoseconds() + m.stdoutNs.Add(ns) + observeMax(&m.stdoutMaxNs, ns) +} + +func (m *metricsTracker) recordEmuWrite(dur time.Duration) { + if m == nil { + return + } + m.emuWriteCalls.Add(1) + ns := dur.Nanoseconds() + m.emuWriteNs.Add(ns) + observeMax(&m.emuWriteMaxNs, ns) +} + +func (m *metricsTracker) recordEmuTitle(dur time.Duration, skipped bool) { + if m == nil { + return + } + if skipped { + m.emuTitleSkips.Add(1) + return + } + m.emuTitleCalls.Add(1) + m.emuTitleNs.Add(dur.Nanoseconds()) +} + +func (m *metricsTracker) recordSidebar(dur time.Duration, cacheHit bool) { + if m == nil { + return + } + m.sidebarDraws.Add(1) + if cacheHit { + m.sidebarCacheHits.Add(1) + } + ns := dur.Nanoseconds() + m.sidebarNs.Add(ns) + observeMax(&m.sidebarMaxNs, ns) +} + +func (m *metricsTracker) recordTabbar(dur time.Duration, cacheHit bool) { + if m == nil { + return + } + m.tabbarDraws.Add(1) + if cacheHit { + m.tabbarCacheHits.Add(1) + } + m.tabbarNs.Add(dur.Nanoseconds()) +} + +func (m *metricsTracker) recordStatus(dur time.Duration, cacheHit bool) { + if m == nil { + return + } + m.statusDraws.Add(1) + if cacheHit { + m.statusCacheHits.Add(1) + } + m.statusNs.Add(dur.Nanoseconds()) +} + +func (m *metricsTracker) recordSnapshot(dur time.Duration) { + if m == nil { + return + } + m.snapshotReplays.Add(1) + ns := dur.Nanoseconds() + m.snapshotNs.Add(ns) + observeMax(&m.snapshotMaxNs, ns) +} + +func (m *metricsTracker) recordTickerFire(didWork bool) { + if m == nil { + return + } + m.tickerFires.Add(1) + if !didWork { + m.tickerIdleFires.Add(1) + } +} + +// snapshot captures the tracker's current state as a JSON-serialisable +// map. Suitable for both the per-second JSONL row and the final +// metrics.json aggregate. +type metricsSnapshot struct { + WallSeconds float64 `json:"wall_seconds"` + PTYChunks int64 `json:"pty_chunks"` + PTYBytes int64 `json:"pty_bytes"` + OnPTYOutNs int64 `json:"on_pty_out_ns_total"` + OnPTYOutMaxNs int64 `json:"on_pty_out_ns_max"` + OnPTYOutDrops int64 `json:"on_pty_out_drops"` + StdoutWrites int64 `json:"stdout_writes"` + StdoutBytes int64 `json:"stdout_bytes"` + StdoutNs int64 `json:"stdout_ns_total"` + StdoutMaxNs int64 `json:"stdout_ns_max"` + + RenderCalls int64 `json:"render_calls"` + RenderNs int64 `json:"render_ns_total"` + RenderMaxNs int64 `json:"render_ns_max"` + + EmuWriteCalls int64 `json:"emu_write_calls"` + EmuWriteNs int64 `json:"emu_write_ns_total"` + EmuWriteMaxNs int64 `json:"emu_write_ns_max"` + EmuTitleCalls int64 `json:"emu_title_calls"` + EmuTitleNs int64 `json:"emu_title_ns_total"` + EmuTitleSkips int64 `json:"emu_title_skips"` + + SidebarDraws int64 `json:"sidebar_draws"` + SidebarCacheHits int64 `json:"sidebar_cache_hits"` + SidebarNs int64 `json:"sidebar_ns_total"` + SidebarMaxNs int64 `json:"sidebar_ns_max"` + + TabbarDraws int64 `json:"tabbar_draws"` + TabbarCacheHits int64 `json:"tabbar_cache_hits"` + TabbarNs int64 `json:"tabbar_ns_total"` + + StatusDraws int64 `json:"status_draws"` + StatusCacheHits int64 `json:"status_cache_hits"` + StatusNs int64 `json:"status_ns_total"` + + SnapshotReplays int64 `json:"snapshot_replays"` + SnapshotNs int64 `json:"snapshot_ns_total"` + SnapshotMaxNs int64 `json:"snapshot_ns_max"` + + TickerFires int64 `json:"ticker_fires"` + TickerIdleFires int64 `json:"ticker_idle_fires"` + + // Derived rates (computed at snapshot time so consumers don't have + // to). All "per_second" values are averaged over wall_seconds. + PTYChunksPerSec float64 `json:"pty_chunks_per_sec"` + PTYBytesPerSec float64 `json:"pty_bytes_per_sec"` + OnPTYOutMeanUs float64 `json:"on_pty_out_mean_us"` + StdoutMeanUs float64 `json:"stdout_mean_us"` + EmuWriteMeanUs float64 `json:"emu_write_mean_us"` + SidebarMeanUs float64 `json:"sidebar_mean_us"` + SidebarCacheHitRate float64 `json:"sidebar_cache_hit_rate"` + TabbarCacheHitRate float64 `json:"tabbar_cache_hit_rate"` + StatusCacheHitRate float64 `json:"status_cache_hit_rate"` + EmuTitleSkipRate float64 `json:"emu_title_skip_rate"` + TickerIdleRate float64 `json:"ticker_idle_rate"` + Timestamp string `json:"timestamp"` +} + +func (m *metricsTracker) snapshotNow() metricsSnapshot { + wall := time.Since(m.startedAt).Seconds() + if wall <= 0 { + wall = 1 + } + chunks := m.ptyChunks.Load() + bytes := m.ptyBytes.Load() + onptyTotal := m.onPTYOutNs.Load() + stdW := m.stdoutWrites.Load() + stdNs := m.stdoutNs.Load() + emuW := m.emuWriteCalls.Load() + emuWNs := m.emuWriteNs.Load() + sbDraws := m.sidebarDraws.Load() + sbHits := m.sidebarCacheHits.Load() + sbNs := m.sidebarNs.Load() + tbDraws := m.tabbarDraws.Load() + tbHits := m.tabbarCacheHits.Load() + stDraws := m.statusDraws.Load() + stHits := m.statusCacheHits.Load() + emuTC := m.emuTitleCalls.Load() + emuTS := m.emuTitleSkips.Load() + tickerF := m.tickerFires.Load() + tickerI := m.tickerIdleFires.Load() + + div := func(num, denom int64) float64 { + if denom == 0 { + return 0 + } + return float64(num) / float64(denom) + } + + return metricsSnapshot{ + WallSeconds: wall, + PTYChunks: chunks, + PTYBytes: bytes, + OnPTYOutNs: onptyTotal, + OnPTYOutMaxNs: m.onPTYOutMaxNs.Load(), + OnPTYOutDrops: m.onPTYOutDrops.Load(), + StdoutWrites: stdW, + StdoutBytes: m.stdoutBytes.Load(), + StdoutNs: stdNs, + StdoutMaxNs: m.stdoutMaxNs.Load(), + + RenderCalls: m.renderCalls.Load(), + RenderNs: m.renderNs.Load(), + RenderMaxNs: m.renderMaxNs.Load(), + + EmuWriteCalls: emuW, + EmuWriteNs: emuWNs, + EmuWriteMaxNs: m.emuWriteMaxNs.Load(), + EmuTitleCalls: emuTC, + EmuTitleNs: m.emuTitleNs.Load(), + EmuTitleSkips: emuTS, + + SidebarDraws: sbDraws, + SidebarCacheHits: sbHits, + SidebarNs: sbNs, + SidebarMaxNs: m.sidebarMaxNs.Load(), + + TabbarDraws: tbDraws, + TabbarCacheHits: tbHits, + TabbarNs: m.tabbarNs.Load(), + + StatusDraws: stDraws, + StatusCacheHits: stHits, + StatusNs: m.statusNs.Load(), + + SnapshotReplays: m.snapshotReplays.Load(), + SnapshotNs: m.snapshotNs.Load(), + SnapshotMaxNs: m.snapshotMaxNs.Load(), + + TickerFires: tickerF, + TickerIdleFires: tickerI, + + PTYChunksPerSec: float64(chunks) / wall, + PTYBytesPerSec: float64(bytes) / wall, + OnPTYOutMeanUs: div(onptyTotal/1000, chunks), + StdoutMeanUs: div(stdNs/1000, stdW), + EmuWriteMeanUs: div(emuWNs/1000, emuW), + SidebarMeanUs: div(sbNs/1000, sbDraws), + SidebarCacheHitRate: div(sbHits, sbDraws), + TabbarCacheHitRate: div(tbHits, tbDraws), + StatusCacheHitRate: div(stHits, stDraws), + EmuTitleSkipRate: div(emuTS, emuTC+emuTS), + TickerIdleRate: div(tickerI, tickerF), + Timestamp: time.Now().Format(time.RFC3339Nano), + } +} + +// run is the snapshotter goroutine: write a JSONL row every second +// until ctx is cancelled. Stops cleanly without flushing partial +// rows. +func (m *metricsTracker) run(ctx context.Context) { + if m == nil { + return + } + enc := json.NewEncoder(m.rowFile) + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + snap := m.snapshotNow() + _ = enc.Encode(snap) + } + } +} + +// close writes the final aggregate snapshot to metrics.json + a +// short human-readable summary.txt, then closes the row file. Safe +// to call on a nil receiver. +func (m *metricsTracker) close() { + if m == nil { + return + } + snap := m.snapshotNow() + if f, err := os.Create(filepath.Join(m.dir, "metrics.json")); err == nil { + enc := json.NewEncoder(f) + enc.SetIndent("", " ") + _ = enc.Encode(snap) + _ = f.Close() + } + if f, err := os.Create(filepath.Join(m.dir, "summary.txt")); err == nil { + writeSummary(f, snap) + _ = f.Close() + } + if m.rowFile != nil { + _ = m.rowFile.Close() + m.rowFile = nil + } +} + +// writeSummary renders a brief human-readable digest of a snapshot. +// Designed for `cat summary.txt` after a session — quick orientation +// before diving into metrics.json / pprof. +func writeSummary(w *os.File, s metricsSnapshot) { + fmt.Fprintf(w, "patterm performance summary\n") + fmt.Fprintf(w, "===========================\n\n") + fmt.Fprintf(w, "session length: %.1fs\n", s.WallSeconds) + fmt.Fprintf(w, "pty chunks: %d (%.1f /s)\n", s.PTYChunks, s.PTYChunksPerSec) + fmt.Fprintf(w, "pty bytes: %d (%.0f /s, %.1f KiB/s)\n", + s.PTYBytes, s.PTYBytesPerSec, s.PTYBytesPerSec/1024) + fmt.Fprintf(w, "pty chunks dropped: %d (focus not on caller — fast-path return)\n", s.OnPTYOutDrops) + fmt.Fprintf(w, "\n") + fmt.Fprintf(w, "OnPTYOut mean: %.1fµs max: %.1fms\n", + s.OnPTYOutMeanUs, float64(s.OnPTYOutMaxNs)/1e6) + fmt.Fprintf(w, "viewport.Render calls: %d total %.1fms max %.1fms\n", + s.RenderCalls, float64(s.RenderNs)/1e6, float64(s.RenderMaxNs)/1e6) + fmt.Fprintf(w, "stdout writes: %d mean %.1fµs max %.1fms bytes %d\n", + s.StdoutWrites, s.StdoutMeanUs, float64(s.StdoutMaxNs)/1e6, s.StdoutBytes) + fmt.Fprintf(w, "\n") + fmt.Fprintf(w, "emulator.Write (cgo): %d mean %.1fµs max %.1fms\n", + s.EmuWriteCalls, s.EmuWriteMeanUs, float64(s.EmuWriteMaxNs)/1e6) + fmt.Fprintf(w, "emulator.Title polls: %d real, %d gated skip rate %.1f%%\n", + s.EmuTitleCalls, s.EmuTitleSkips, s.EmuTitleSkipRate*100) + fmt.Fprintf(w, "\n") + fmt.Fprintf(w, "sidebar draws: %d mean %.1fµs max %.1fms cache-hit %.1f%%\n", + s.SidebarDraws, s.SidebarMeanUs, float64(s.SidebarMaxNs)/1e6, s.SidebarCacheHitRate*100) + fmt.Fprintf(w, "tabbar draws: %d cache-hit %.1f%%\n", + s.TabbarDraws, s.TabbarCacheHitRate*100) + fmt.Fprintf(w, "status draws: %d cache-hit %.1f%%\n", + s.StatusDraws, s.StatusCacheHitRate*100) + fmt.Fprintf(w, "snapshot replays: %d total %.1fms max %.1fms\n", + s.SnapshotReplays, float64(s.SnapshotNs)/1e6, float64(s.SnapshotMaxNs)/1e6) + fmt.Fprintf(w, "\n") + fmt.Fprintf(w, "chrome ticker: %d fires, %d idle idle rate %.1f%%\n", + s.TickerFires, s.TickerIdleFires, s.TickerIdleRate*100) +} diff --git a/internal/app/metrics_test.go b/internal/app/metrics_test.go new file mode 100644 index 0000000..0f4345f --- /dev/null +++ b/internal/app/metrics_test.go @@ -0,0 +1,116 @@ +package app + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + "time" +) + +func TestMetricsTrackerDisabledByEmptyDir(t *testing.T) { + m, err := newMetricsTracker("") + if err != nil { + t.Fatalf("newMetricsTracker(\"\") err: %v", err) + } + if m != nil { + t.Fatalf("expected nil tracker for empty dir, got %v", m) + } +} + +func TestMetricsTrackerRecordsAndWrites(t *testing.T) { + dir := t.TempDir() + m, err := newMetricsTracker(dir) + if err != nil { + t.Fatalf("newMetricsTracker: %v", err) + } + if m == nil { + t.Fatal("expected non-nil tracker") + } + + m.recordPTYOut(2*time.Millisecond, 1024) + m.recordPTYOut(5*time.Millisecond, 4096) + m.recordRender(800 * time.Microsecond) + m.recordStdout(300*time.Microsecond, 1100) + m.recordEmuWrite(150 * time.Microsecond) + m.recordEmuTitle(0, true) + m.recordEmuTitle(20*time.Microsecond, false) + m.recordSidebar(100*time.Microsecond, true) + m.recordSidebar(900*time.Microsecond, false) + m.recordTabbar(50*time.Microsecond, true) + m.recordStatus(40*time.Microsecond, true) + m.recordSnapshot(2 * time.Millisecond) + m.recordTickerFire(false) + m.recordTickerFire(true) + m.recordPTYOutDrop() + + m.close() + + // metrics.json should exist and parse, and reflect what we recorded. + raw, err := os.ReadFile(filepath.Join(dir, "metrics.json")) + if err != nil { + t.Fatalf("read metrics.json: %v", err) + } + var snap metricsSnapshot + if err := json.Unmarshal(raw, &snap); err != nil { + t.Fatalf("parse metrics.json: %v", err) + } + if snap.PTYChunks != 2 { + t.Errorf("PTYChunks = %d, want 2", snap.PTYChunks) + } + if snap.PTYBytes != 5120 { + t.Errorf("PTYBytes = %d, want 5120", snap.PTYBytes) + } + if snap.OnPTYOutMaxNs != (5 * time.Millisecond).Nanoseconds() { + t.Errorf("OnPTYOutMaxNs = %d, want %d", + snap.OnPTYOutMaxNs, (5 * time.Millisecond).Nanoseconds()) + } + if snap.SidebarDraws != 2 { + t.Errorf("SidebarDraws = %d, want 2", snap.SidebarDraws) + } + if snap.SidebarCacheHits != 1 { + t.Errorf("SidebarCacheHits = %d, want 1", snap.SidebarCacheHits) + } + if snap.SidebarCacheHitRate != 0.5 { + t.Errorf("SidebarCacheHitRate = %v, want 0.5", snap.SidebarCacheHitRate) + } + if snap.EmuTitleCalls != 1 || snap.EmuTitleSkips != 1 { + t.Errorf("emu title accounting: calls=%d skips=%d, want 1/1", + snap.EmuTitleCalls, snap.EmuTitleSkips) + } + if snap.TickerFires != 2 || snap.TickerIdleFires != 1 { + t.Errorf("ticker accounting: fires=%d idle=%d, want 2/1", + snap.TickerFires, snap.TickerIdleFires) + } + if snap.OnPTYOutDrops != 1 { + t.Errorf("OnPTYOutDrops = %d, want 1", snap.OnPTYOutDrops) + } + + // summary.txt should also be present and non-empty. + info, err := os.Stat(filepath.Join(dir, "summary.txt")) + if err != nil { + t.Fatalf("stat summary.txt: %v", err) + } + if info.Size() == 0 { + t.Fatal("summary.txt is empty") + } +} + +func TestMetricsTrackerNilSafe(t *testing.T) { + // Every record* method must be safe to call on a nil receiver + // because the hot paths use that to avoid an enabled-check. + var m *metricsTracker + m.recordPTYOut(time.Millisecond, 100) + m.recordPTYOutDrop() + m.recordRender(time.Microsecond) + m.recordStdout(time.Microsecond, 50) + m.recordEmuWrite(time.Microsecond) + m.recordEmuTitle(time.Microsecond, false) + m.recordEmuTitle(0, true) + m.recordSidebar(time.Microsecond, true) + m.recordTabbar(time.Microsecond, false) + m.recordStatus(time.Microsecond, true) + m.recordSnapshot(time.Microsecond) + m.recordTickerFire(true) + m.close() +} diff --git a/internal/app/session.go b/internal/app/session.go index 6e1699c..e5f9151 100644 --- a/internal/app/session.go +++ b/internal/app/session.go @@ -50,6 +50,11 @@ type Session struct { // JSON file so they can be re-spawned after patterm restarts. // Optional; nil means "no persistence" (used by unit tests). persistStore *persist.Store + + // metrics is the optional performance tracker. nil when --profile + // is off. The pump goroutine reads it via atomic Load so installing + // metrics post-construction doesn't race with running children. + metrics atomic.Pointer[metricsTracker] } // SetPersistStore attaches a process-persistence store. Future Spawn / @@ -61,6 +66,18 @@ func (s *Session) SetPersistStore(p *persist.Store) { s.mu.Unlock() } +// SetMetrics installs the per-session performance tracker. Safe to +// call with nil to disable (the default). Reads on the hot path go +// through atomic.Pointer.Load() with no lock; SetMetrics swaps the +// pointer once at startup. +func (s *Session) SetMetrics(m *metricsTracker) { + s.metrics.Store(m) +} + +func (s *Session) loadMetrics() *metricsTracker { + return s.metrics.Load() +} + // ChildEventListener is implemented by the TUI to react to lifecycle // events without polling. type ChildEventListener interface { @@ -392,9 +409,17 @@ func (s *Session) pumpChild(c *Child, runID uint64) { } chunk := buf[:n] if em := c.Emulator(); em != nil { + m := s.loadMetrics() + wstart := time.Time{} + if m != nil { + wstart = time.Now() + } if _, werr := em.Write(chunk); werr != nil { logf("emulator.Write(child %s): %v", c.ID, werr) } + if m != nil { + m.recordEmuWrite(time.Since(wstart)) + } // OSC 0/2 title updates ride on the same byte stream as // the rest of the output. Polling the emulator after each // chunk is cheap on its own (one CGO call) but codex/ @@ -403,9 +428,18 @@ func (s *Session) pumpChild(c *Child, runID uint64) { // the chunk doesn't carry an OSC start byte at all; the // title can only change on chunks that include one. if containsOSC(chunk) { + tstart := time.Time{} + if m != nil { + tstart = time.Now() + } if t, terr := em.Title(); terr == nil { c.recordTitle(t) } + if m != nil { + m.recordEmuTitle(time.Since(tstart), false) + } + } else if m != nil { + m.recordEmuTitle(0, true) } } c.recordWrite(chunk) diff --git a/internal/app/sidebar.go b/internal/app/sidebar.go index 490cb03..9f90692 100644 --- a/internal/app/sidebar.go +++ b/internal/app/sidebar.go @@ -38,6 +38,10 @@ func formatShortDuration(d time.Duration) string { // computed main viewport, so the sidebar region is outside the child's // cursor range. We can redraw freely without fighting the child for cells. func (st *uiState) drawSidebar() { + var entry time.Time + if st.metrics != nil { + entry = time.Now() + } st.mu.Lock() palOpen := st.palette != nil focus := st.focusedID @@ -231,10 +235,16 @@ func (st *uiState) drawSidebar() { st.chromeCacheMu.Lock() if frame == st.sidebarCache { st.chromeCacheMu.Unlock() + if st.metrics != nil { + st.metrics.recordSidebar(time.Since(entry), true) + } return } st.sidebarCache = frame st.chromeCacheMu.Unlock() + if st.metrics != nil { + defer func() { st.metrics.recordSidebar(time.Since(entry), false) }() + } st.outMu.Lock() // Save cursor; emit the sidebar; restore. diff --git a/internal/app/tabbar.go b/internal/app/tabbar.go index 946f340..1294254 100644 --- a/internal/app/tabbar.go +++ b/internal/app/tabbar.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "strings" + "time" "unicode/utf8" ) @@ -17,6 +18,10 @@ const tabBarRows = 2 // to the leftmost tabs so the strip fills the screen edge-to-edge. // A trailing "+ new" hint sits in the rightmost reserved slot. func (st *uiState) drawTabBar() { + var entry time.Time + if st.metrics != nil { + entry = time.Now() + } st.mu.Lock() palOpen := st.palette != nil focus := st.focusedID @@ -188,10 +193,16 @@ func (st *uiState) drawTabBar() { st.chromeCacheMu.Lock() if frame == st.tabBarCache { st.chromeCacheMu.Unlock() + if st.metrics != nil { + st.metrics.recordTabbar(time.Since(entry), true) + } return } st.tabBarCache = frame st.chromeCacheMu.Unlock() + if st.metrics != nil { + defer func() { st.metrics.recordTabbar(time.Since(entry), false) }() + } st.outMu.Lock() defer st.outMu.Unlock()