package app

import (
	"fmt"
	"io"
	"strings"
	"testing"

	"github.com/hjbdev/patterm/internal/vt"
)

// Benchmarks for patterm's hot paths. Run with:
//
//	go test -bench=. -benchmem ./internal/app/
//
// or target one:
//
//	go test -bench=BenchmarkViewportRenderer_PlainASCII -benchmem ./internal/app/
//
// The fixtures below model the four workloads we care about most:
//
//   - PlainASCII: long-running text output (claude streaming a code
//     diff, codex outputting a tool result body). Fast-path territory.
//   - StyledLines: SGR-heavy output (claude/codex chat history with
//     coloured tokens). State-machine path.
//   - RatatuiBurst: many short cursor-positioning / SGR transitions in
//     a tight chunk, matching codex/ratatui's incremental diff
//     updates.
//   - SnapshotReplay: full styled-grid replay (focus switch).

// buildPlainASCIIChunk returns exactly n bytes of printable ASCII with
// no escape sequences: a fixed sentence repeated back to back, with a
// '\n' appended after a repetition whenever the running length modulo
// 80 is smaller than the sentence length. The result is truncated to
// n bytes, so the last sentence may be cut short. It is the cheapest
// fixture and is intended for the renderer's plain-text path.
func buildPlainASCIIChunk(n int) []byte {
	const sentence = "The quick brown fox jumps over the lazy dog 0123456789 "

	var sb strings.Builder
	sb.Grow(n)
	for {
		if sb.Len() >= n {
			break
		}
		sb.WriteString(sentence)
		// Deterministic line-break rule keyed off the running length.
		if sb.Len()%80 >= len(sentence) {
			continue
		}
		sb.WriteByte('\n')
	}
	text := sb.String()
	return []byte(text[:n])
}

// buildStyledLinesChunk returns exactly n bytes of SGR-heavy text.
// Each word is wrapped as "\x1b[<colour>m<word>\x1b[0m " with the
// colour cycling through 31..36 and the word cycling through a small
// vocabulary; a '\n' follows every tenth word. The output is cut at n
// bytes, so it can end in the middle of an escape sequence.
func buildStyledLinesChunk(n int) []byte {
	palette := [...]string{"31", "32", "33", "34", "35", "36"}
	vocab := [...]string{"package", "func", "return", "import", "struct", "type", "const", "var"}

	var out strings.Builder
	out.Grow(n)
	for idx := 0; out.Len() < n; idx++ {
		colour := palette[idx%len(palette)]
		word := vocab[idx%len(vocab)]
		fmt.Fprintf(&out, "\x1b[%sm%s\x1b[0m ", colour, word)
		// Break the line after every tenth styled word.
		if idx%10 == 9 {
			out.WriteByte('\n')
		}
	}
	styled := out.String()
	return []byte(styled[:n])
}

// buildRatatuiBurst simulates one ratatui-style diff frame. For each
// of the requested cells it emits a cursor-position sequence (CUP,
// 1-based row/col on an 80-column grid), an SGR foreground colour
// cycling through 30..37, and a single letter cycling through A..Z.
// A trailing "\x1b[0m" resets the style.
func buildRatatuiBurst(cells int) []byte {
	const gridWidth = 80

	var sb strings.Builder
	for cell := 0; cell < cells; cell++ {
		row, col := cell/gridWidth+1, cell%gridWidth+1
		glyph := byte('A' + cell%26)
		fmt.Fprintf(&sb, "\x1b[%d;%dH", row, col)
		fmt.Fprintf(&sb, "\x1b[3%dm%c", cell%8, glyph)
	}
	sb.WriteString("\x1b[0m")
	return []byte(sb.String())
}

// BenchmarkViewportRenderer_PlainASCII renders one 16 KiB plain-text
// chunk per iteration through a freshly constructed viewport renderer
// (120x40 layout). Throughput is reported via SetBytes and
// allocations via ReportAllocs.
func BenchmarkViewportRenderer_PlainASCII(b *testing.B) {
	payload := buildPlainASCIIChunk(16 * 1024)

	b.ReportAllocs()
	b.SetBytes(int64(len(payload)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		// Renderer construction is deliberately inside the timed loop.
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		_ = renderer.Render(payload)
	}
}

// BenchmarkViewportRenderer_StyledLines renders one 16 KiB SGR-heavy
// chunk per iteration through a freshly constructed viewport renderer
// (120x40 layout). The fixture models coloured prose with frequent
// style toggles, such as chat-history resume traffic.
func BenchmarkViewportRenderer_StyledLines(b *testing.B) {
	payload := buildStyledLinesChunk(16 * 1024)

	b.ReportAllocs()
	b.SetBytes(int64(len(payload)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		_ = renderer.Render(payload)
	}
}

// BenchmarkViewportRenderer_RatatuiBurst renders a cursor-shuffling
// frame per iteration: 80*24 cells, each written as CUP + SGR + one
// character, through a freshly constructed viewport renderer
// (120x40 layout).
func BenchmarkViewportRenderer_RatatuiBurst(b *testing.B) {
	// One 80x24 screenful of individually positioned cells.
	burst := buildRatatuiBurst(80 * 24)

	b.ReportAllocs()
	b.SetBytes(int64(len(burst)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		_ = renderer.Render(burst)
	}
}

// BenchmarkContainsOSC_NoOSC measures containsOSC on an 8 KiB
// SGR-styled chunk that contains no OSC sequence. Per the original
// notes, containsOSC gates the per-chunk Title() CGO call in
// pumpChild, and OSC-free styled output is the common case for
// codex/ratatui, so this number should be as close to zero as
// possible. See BenchmarkContainsOSC_WithOSC for the positive case.
func BenchmarkContainsOSC_NoOSC(b *testing.B) {
	styled := buildStyledLinesChunk(8 * 1024)

	b.ReportAllocs()
	b.SetBytes(int64(len(styled)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		_ = containsOSC(styled)
	}
}

// BenchmarkContainsOSC_WithOSC measures containsOSC on an 8 KiB
// SGR-styled chunk with a title-setting OSC sequence
// ("\x1b]0;new title\x07") appended at the very end of the input.
func BenchmarkContainsOSC_WithOSC(b *testing.B) {
	styled := buildStyledLinesChunk(8 * 1024)
	withTitle := append(styled, "\x1b]0;new title\x07"...)

	b.ReportAllocs()
	b.SetBytes(int64(len(withTitle)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		_ = containsOSC(withTitle)
	}
}

// BenchmarkRendererThroughput_ReuseInstance approximates a live
// session: one viewport renderer is fed sixteen 4 KiB styled chunks
// in sequence (4 KiB chosen to resemble typical PTY read sizes). A
// new renderer is constructed at the start of every benchmark
// iteration and reused for all sixteen chunks of that iteration.
// SetBytes covers the combined size of all chunks.
func BenchmarkRendererThroughput_ReuseInstance(b *testing.B) {
	const chunkCount = 16

	chunks := make([][]byte, 0, chunkCount)
	var combined int64
	for len(chunks) < chunkCount {
		piece := buildStyledLinesChunk(4 * 1024)
		chunks = append(chunks, piece)
		combined += int64(len(piece))
	}

	b.ReportAllocs()
	b.SetBytes(combined)
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		for idx := range chunks {
			_ = renderer.Render(chunks[idx])
		}
	}
}

// Stress workloads — these model the worst things a real session
// can throw at us. The headline target is "ASCII video": every cell
// of an 80x40 viewport carries an SGR colour change and a printable
// character, rendered as one chunk per frame. Real ASCII-video CLIs
// (ascii-image-converter, asciinema-render, towel.blinkenlights, the
// Bad Apple meme) hit patterm with exactly this pattern at 24-30 fps
// for minutes at a time.
//
// We synthesise the workload rather than ship a captured corpus so
// the benchmarks stay deterministic and the repo doesn't carry tens
// of MiB of fixture data. The encoding is faithful to what those
// tools actually emit.

// buildASCIIVideoFrame builds one full-viewport frame in which every
// cell carries an 8-colour SGR (`\x1b[3Nm`, N = (row+col) mod 8)
// followed by one printable glyph in the range ' '..'}'. The frame
// starts with a cursor-home sequence and every row ends with an SGR
// reset plus CRLF. Each cell and each row terminator is 6 bytes, so
// the frame is 3 + rows*(6*cols + 6) bytes: 19,443 bytes (≈ 19 KiB)
// for an 80x40 viewport.
func buildASCIIVideoFrame(cols, rows int) []byte {
	// Number of distinct glyphs cycled through, starting at ' '.
	const glyphSpan = 0x7e - ' '

	var sb strings.Builder
	sb.WriteString("\x1b[H") // home cursor before the frame starts
	for row := 0; row < rows; row++ {
		for col := 0; col < cols; col++ {
			colour := (row + col) % 8
			glyph := byte(' ' + (row*col)%glyphSpan)
			fmt.Fprintf(&sb, "\x1b[3%dm%c", colour, glyph)
		}
		sb.WriteString("\x1b[0m\r\n")
	}
	return []byte(sb.String())
}

// buildASCIIVideoFrameTrueColor builds the same frame shape as
// buildASCIIVideoFrame but colours every cell with a 24-bit RGB SGR
// (`\x1b[38;2;R;G;Bm`). The channel values are deterministic
// functions of the cell position, each reduced modulo 256. The escape
// is 13 to 19 bytes depending on how many digits the channels need,
// followed by a 1-byte glyph, which makes this the longest SGR form
// among the fixtures in this file.
func buildASCIIVideoFrameTrueColor(cols, rows int) []byte {
	// Number of distinct glyphs cycled through, starting at ' '.
	const glyphSpan = 0x7e - ' '

	var sb strings.Builder
	sb.WriteString("\x1b[H") // home cursor before the frame starts
	for row := 0; row < rows; row++ {
		red := (row * 7) % 256 // depends on the row only
		for col := 0; col < cols; col++ {
			green := (col * 11) % 256
			blue := ((row + col) * 13) % 256
			glyph := byte(' ' + (row*col)%glyphSpan)
			fmt.Fprintf(&sb, "\x1b[38;2;%d;%d;%dm%c", red, green, blue, glyph)
		}
		sb.WriteString("\x1b[0m\r\n")
	}
	return []byte(sb.String())
}

// buildBadApplePattern builds a two-colour checkerboard frame: every
// cell is a full-block glyph (█) preceded by SGR 37 when row+col is
// even and SGR 30 when it is odd. The frame starts with a cursor-home
// sequence and every row ends with an SGR reset plus CRLF. It uses
// only the shortest SGR form, so comparing it with the truecolor
// fixture separates per-cell style switching cost from colour
// parameter parsing cost.
func buildBadApplePattern(cols, rows int) []byte {
	// Indexed by (row+col) parity: even cells are white, odd are black.
	cellByParity := [2]string{"\x1b[37m█", "\x1b[30m█"}

	var sb strings.Builder
	sb.WriteString("\x1b[H")
	for row := 0; row < rows; row++ {
		for col := 0; col < cols; col++ {
			sb.WriteString(cellByParity[(row+col)%2])
		}
		sb.WriteString("\x1b[0m\r\n")
	}
	return []byte(sb.String())
}

// BenchmarkASCIIVideo_Frame_8Color renders one 80x40 8-colour frame
// as a single chunk per iteration, using a freshly constructed
// renderer each time. At 30 fps a frame arrives roughly every 33 ms,
// so the per-op time is expected to stay well under 1 ms.
func BenchmarkASCIIVideo_Frame_8Color(b *testing.B) {
	videoFrame := buildASCIIVideoFrame(80, 40)

	b.ReportAllocs()
	b.SetBytes(int64(len(videoFrame)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		_ = renderer.Render(videoFrame)
	}
}

// BenchmarkASCIIVideo_Frame_TrueColor renders one 80x40 truecolor
// frame as a single chunk per iteration, using a freshly constructed
// renderer each time. Compare against the 8-colour frame benchmark
// to see what the longer, parameter-rich `\x1b[38;2;R;G;Bm` form
// costs.
func BenchmarkASCIIVideo_Frame_TrueColor(b *testing.B) {
	videoFrame := buildASCIIVideoFrameTrueColor(80, 40)

	b.ReportAllocs()
	b.SetBytes(int64(len(videoFrame)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		_ = renderer.Render(videoFrame)
	}
}

// BenchmarkASCIIVideo_Frame_BadApple renders one 80x40 two-colour
// checkerboard frame per iteration, using a freshly constructed
// renderer each time. The fixture alternates between two short SGR
// codes, which separates per-cell style switching cost from
// truecolor parameter parsing cost.
func BenchmarkASCIIVideo_Frame_BadApple(b *testing.B) {
	videoFrame := buildBadApplePattern(80, 40)

	b.ReportAllocs()
	b.SetBytes(int64(len(videoFrame)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		_ = renderer.Render(videoFrame)
	}
}

// runStreamBench is the shared body of the renderer-only stream
// benchmarks. One benchmark op constructs a renderer and renders the
// same frame 3*fps times (three seconds' worth at the target rate),
// so steady-state cost dominates warm-up. The workload is not
// rate-limited; fps only sizes the op and the budget.
//
// Besides the standard ns/op, MB/s and allocation figures it reports:
//   - µs/frame: mean time per rendered frame
//   - fps_ceiling: 1e9 / (ns per frame)
//   - budget_pct: share of the 1/fps frame budget consumed; below 100
//     means the target rate is reachable for this stage.
func runStreamBench(b *testing.B, frame []byte, fps int) {
	framesPerOp := 3 * fps // 3 seconds at the target rate

	b.SetBytes(int64(len(frame) * framesPerOp))
	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		for rendered := 0; rendered < framesPerOp; rendered++ {
			_ = renderer.Render(frame)
		}
	}

	elapsedNs := float64(b.Elapsed().Nanoseconds())
	nsPerFrame := elapsedNs / float64(b.N*framesPerOp)
	frameBudgetNs := 1e9 / float64(fps)

	b.ReportMetric(nsPerFrame/1000.0, "µs/frame")
	b.ReportMetric(1e9/nsPerFrame, "fps_ceiling")
	b.ReportMetric(nsPerFrame/frameBudgetNs*100, "budget_pct")
}

// BenchmarkASCIIVideo_Stream_8Color_30fps / _60fps / _120fps reuse
// one renderer across (3 × fps) frames. The headline numbers are
// µs/frame, fps_ceiling (= 1e9 / ns/frame), and budget_pct (=
// percent of the per-frame budget at the target rate we consume).
//
// 30 fps is the typical ASCII-video baseline (towel, chafa, Bad
// Apple ports). 60 is the "smooth playback" target. 120 is a
// future-proofing stress level matching modern high-refresh
// terminals.
func BenchmarkASCIIVideo_Stream_8Color_30fps(b *testing.B) {
	runStreamBench(b, buildASCIIVideoFrame(80, 40), 30)
}
func BenchmarkASCIIVideo_Stream_8Color_60fps(b *testing.B) {
	runStreamBench(b, buildASCIIVideoFrame(80, 40), 60)
}
func BenchmarkASCIIVideo_Stream_8Color_120fps(b *testing.B) {
	runStreamBench(b, buildASCIIVideoFrame(80, 40), 120)
}

// BenchmarkASCIIVideo_Stream_TrueColor_* same set but with the
// truecolor frames. Compare against the 8-colour numbers to see
// what the longer `\x1b[38;2;R;G;Bm` parse costs us.
func BenchmarkASCIIVideo_Stream_TrueColor_30fps(b *testing.B) {
	runStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 30)
}
func BenchmarkASCIIVideo_Stream_TrueColor_60fps(b *testing.B) {
	runStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 60)
}
func BenchmarkASCIIVideo_Stream_TrueColor_120fps(b *testing.B) {
	runStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 120)
}

// BenchmarkASCIIVideo_Stream_BadApple_* tracks the 1-bit alternating
// pattern. Isolates per-cell SGR cycling cost from the truecolor
// parse cost above — useful when reading the diff between the two
// stream variants.
func BenchmarkASCIIVideo_Stream_BadApple_30fps(b *testing.B) {
	runStreamBench(b, buildBadApplePattern(80, 40), 30)
}
func BenchmarkASCIIVideo_Stream_BadApple_60fps(b *testing.B) {
	runStreamBench(b, buildBadApplePattern(80, 40), 60)
}
func BenchmarkASCIIVideo_Stream_BadApple_120fps(b *testing.B) {
	runStreamBench(b, buildBadApplePattern(80, 40), 120)
}

// BenchmarkEmulator_Write_8Color_Frame isolates the emulator side of
// the pipeline: each iteration creates an 80x40 Ghostty emulator,
// writes one 8-colour ASCII-video frame into it, and closes it. No
// viewport renderer is involved, so the gap between this family and
// the pipeline benchmarks is the renderer's share.
//
// Emulator construction and Close are inside the timed loop and are
// therefore part of the per-op figure.
func BenchmarkEmulator_Write_8Color_Frame(b *testing.B) {
	frame := buildASCIIVideoFrame(80, 40)
	b.SetBytes(int64(len(frame)))
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		em, err := vt.NewGhosttyEmulator(80, 40)
		if err != nil {
			b.Fatalf("emulator: %v", err)
		}
		_, werr := em.Write(frame)
		// Close before inspecting werr: Fatalf stops this goroutine
		// immediately, so closing afterwards would leak the emulator
		// on a failed write. The Close error is irrelevant to the
		// measurement and is intentionally dropped.
		_ = em.Close()
		if werr != nil {
			b.Fatalf("emulator.Write: %v", werr)
		}
	}
}

// BenchmarkEmulator_Write_TrueColor_Frame is the truecolor variant
// of the single-frame emulator benchmark: each iteration creates an
// 80x40 Ghostty emulator, writes one truecolor ASCII-video frame
// into it, and closes it. Emulator construction and Close are inside
// the timed loop.
func BenchmarkEmulator_Write_TrueColor_Frame(b *testing.B) {
	frame := buildASCIIVideoFrameTrueColor(80, 40)
	b.SetBytes(int64(len(frame)))
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		em, err := vt.NewGhosttyEmulator(80, 40)
		if err != nil {
			b.Fatalf("emulator: %v", err)
		}
		_, werr := em.Write(frame)
		// Close before inspecting werr: Fatalf stops this goroutine
		// immediately, so closing afterwards would leak the emulator
		// on a failed write. The Close error is irrelevant to the
		// measurement and is intentionally dropped.
		_ = em.Close()
		if werr != nil {
			b.Fatalf("emulator.Write: %v", werr)
		}
	}
}

// BenchmarkEmulator_Write_Stream_8Color_120fps reuses one 80x40
// emulator across 360 writes of the same 8-colour frame per
// benchmark op (3 seconds at 120 fps), so emulator setup is
// amortised and the result approximates steady-state em.Write cost.
// It additionally reports µs/frame and fps_ceiling
// (= 1e9 / ns per frame).
func BenchmarkEmulator_Write_Stream_8Color_120fps(b *testing.B) {
	frame := buildASCIIVideoFrame(80, 40)
	const frames = 360
	b.SetBytes(int64(len(frame) * frames))
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		em, err := vt.NewGhosttyEmulator(80, 40)
		if err != nil {
			b.Fatalf("emulator: %v", err)
		}
		for f := 0; f < frames; f++ {
			if _, werr := em.Write(frame); werr != nil {
				// Fatalf stops this goroutine, so release the
				// emulator first instead of leaking it.
				_ = em.Close()
				b.Fatalf("emulator.Write: %v", werr)
			}
		}
		// Close error does not affect the measurement.
		_ = em.Close()
	}
	nsPerFrame := float64(b.Elapsed().Nanoseconds()) / float64(b.N*frames)
	b.ReportMetric(nsPerFrame/1000.0, "µs/frame")
	b.ReportMetric(1e9/nsPerFrame, "fps_ceiling")
}

// BenchmarkEmulator_Write_Stream_TrueColor_120fps reuses one 80x40
// emulator across 360 writes of the same truecolor frame per
// benchmark op (3 seconds at 120 fps). It additionally reports
// µs/frame and fps_ceiling (= 1e9 / ns per frame).
func BenchmarkEmulator_Write_Stream_TrueColor_120fps(b *testing.B) {
	frame := buildASCIIVideoFrameTrueColor(80, 40)
	const frames = 360
	b.SetBytes(int64(len(frame) * frames))
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		em, err := vt.NewGhosttyEmulator(80, 40)
		if err != nil {
			b.Fatalf("emulator: %v", err)
		}
		for f := 0; f < frames; f++ {
			if _, werr := em.Write(frame); werr != nil {
				// Fatalf stops this goroutine, so release the
				// emulator first instead of leaking it.
				_ = em.Close()
				b.Fatalf("emulator.Write: %v", werr)
			}
		}
		// Close error does not affect the measurement.
		_ = em.Close()
	}
	nsPerFrame := float64(b.Elapsed().Nanoseconds()) / float64(b.N*frames)
	b.ReportMetric(nsPerFrame/1000.0, "µs/frame")
	b.ReportMetric(1e9/nsPerFrame, "fps_ceiling")
}

// runPipelineStreamBench is the shared body of the end-to-end
// pipeline benchmarks. For every frame it performs the emulator
// write, the viewport render, and a write of the rendered bytes
// (wrapped in the autowrap-off/on prelude and postlude) to
// io.Discard. Per the original notes this mirrors what OnPTYOut does
// in production, minus the host terminal's own paint time.
//
// One benchmark op creates an 80x40 emulator and a renderer with a
// 120x40 layout, then pushes the same frame through them 3*fps times
// (three seconds' worth at the target rate). The workload is not
// rate-limited; fps only sizes the op and the budget.
//
// Besides the standard figures it reports µs/frame, fps_ceiling
// (= 1e9 / ns per frame) and budget_pct (share of the 1/fps frame
// budget consumed; below 100 means the target is reachable).
func runPipelineStreamBench(b *testing.B, frame []byte, fps int) {
	frames := fps * 3
	totalBytes := int64(len(frame) * frames)

	// Built once, outside the timed region. Converting the literals
	// inside the loop hands a fresh slice to an interface method on
	// every frame, which would add harness-only allocations to the
	// ReportAllocs figures.
	prelude := []byte("\x1b[?7l")
	postlude := []byte("\x1b[?7h")

	b.SetBytes(totalBytes)
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		em, err := vt.NewGhosttyEmulator(80, 40)
		if err != nil {
			b.Fatalf("emulator: %v", err)
		}
		vr := newViewportRenderer(newTerminalLayout(120, 40))
		for f := 0; f < frames; f++ {
			if _, werr := em.Write(frame); werr != nil {
				// Fatalf stops this goroutine, so release the
				// emulator first instead of leaking it.
				_ = em.Close()
				b.Fatalf("emulator.Write: %v", werr)
			}
			out := vr.Render(frame)
			// Match OnPTYOut's autowrap prelude/postlude wrapping so
			// the byte count is faithful. io.Discard never fails, so
			// the write results are dropped.
			_, _ = io.Discard.Write(prelude)
			_, _ = io.Discard.Write(out)
			_, _ = io.Discard.Write(postlude)
		}
		// Close error does not affect the measurement.
		_ = em.Close()
	}
	nsPerFrame := float64(b.Elapsed().Nanoseconds()) / float64(b.N*frames)
	b.ReportMetric(nsPerFrame/1000.0, "µs/frame")
	b.ReportMetric(1e9/nsPerFrame, "fps_ceiling")
	budgetNs := 1e9 / float64(fps)
	b.ReportMetric(nsPerFrame/budgetNs*100, "budget_pct")
}

// BenchmarkPipeline_ASCIIVideo_* — the FULL OnPTYOut path
// (emulator.Write CGO + viewport renderer + a stdout write to
// io.Discard) running at 30/60/120 fps targets. These are the
// numbers to trust when asking "can we sustain N fps?" The
// renderer-only Stream benchmarks above isolate one stage and
// understate the real cost.
//
// 120 fps is the explicit baseline: anything under 100% of the
// per-frame budget here means we hit 120 fps with margin to spare.
func BenchmarkPipeline_ASCIIVideo_8Color_30fps(b *testing.B) {
	runPipelineStreamBench(b, buildASCIIVideoFrame(80, 40), 30)
}
func BenchmarkPipeline_ASCIIVideo_8Color_60fps(b *testing.B) {
	runPipelineStreamBench(b, buildASCIIVideoFrame(80, 40), 60)
}
func BenchmarkPipeline_ASCIIVideo_8Color_120fps(b *testing.B) {
	runPipelineStreamBench(b, buildASCIIVideoFrame(80, 40), 120)
}

func BenchmarkPipeline_ASCIIVideo_TrueColor_30fps(b *testing.B) {
	runPipelineStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 30)
}
func BenchmarkPipeline_ASCIIVideo_TrueColor_60fps(b *testing.B) {
	runPipelineStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 60)
}
func BenchmarkPipeline_ASCIIVideo_TrueColor_120fps(b *testing.B) {
	runPipelineStreamBench(b, buildASCIIVideoFrameTrueColor(80, 40), 120)
}

// BenchmarkSessionResume_5MiBStyled models the motivating case of a
// long chat session being resumed and its whole history dumped at
// once: 5 MiB of SGR-styled output handed to a fresh renderer in a
// single Render call per iteration. The result indicates how long
// the visible "scrolling while resume loads" window will last.
func BenchmarkSessionResume_5MiBStyled(b *testing.B) {
	history := buildStyledLinesChunk(5 * 1024 * 1024)

	b.ReportAllocs()
	b.SetBytes(int64(len(history)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		_ = renderer.Render(history)
	}
}

// BenchmarkSessionResume_5MiBPlain is the styling-free counterpart
// of BenchmarkSessionResume_5MiBStyled: 5 MiB of plain ASCII handed
// to a fresh renderer in a single Render call per iteration. It
// serves as the lower bound for resume cost.
func BenchmarkSessionResume_5MiBPlain(b *testing.B) {
	history := buildPlainASCIIChunk(5 * 1024 * 1024)

	b.ReportAllocs()
	b.SetBytes(int64(len(history)))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		renderer := newViewportRenderer(newTerminalLayout(120, 40))
		_ = renderer.Render(history)
	}
}