From 412b1167a2149f13ccfd6c42dca76ebeff8b72f5 Mon Sep 17 00:00:00 2001 From: Harry Bayliss Date: Mon, 18 May 2026 12:46:50 +0100 Subject: [PATCH] Cancel pending timers when a child is closed (#6) Co-authored-by: Harry Bayliss Co-committed-by: Harry Bayliss --- CHANGELOG.md | 4 + internal/app/app.go | 7 ++ internal/app/debug.go | 7 ++ internal/app/host.go | 12 ++- internal/app/session.go | 18 ++++ internal/app/timers.go | 59 ++++++++++++ internal/app/timers_test.go | 175 ++++++++++++++++++++++++++++++++++++ 7 files changed, 278 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8657129..d9de62c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,10 @@ loosely follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). repaint. - Removed the redundant "Back to Settings" row from the Agents / Auto-summarization settings screen. +- Pending `timer_*` entries are now cancelled when their owning or + watched child is closed via `close_process`, preventing stale + timer bodies from being re-delivered to the orchestrator pane + after the work has already been handled. ## [0.0.6] - 2026-05-15 diff --git a/internal/app/app.go b/internal/app/app.go index 427dcc4..990ef5c 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -837,6 +837,13 @@ func (st *uiState) OnChildStateChanged(string, IdleState) { st.drawSidebar() } +// OnChildClosed is the explicit-removal hook (close_process or the +// terminal-corpse cleanup in reapChild). The UI already reflects +// removals via the OnChildExited path and the children-map view, so +// this is a no-op here — the timerManager is the consumer that +// cares. +func (st *uiState) OnChildClosed(string) {} + // OnChildExited drops focus and shows the empty state if it was the // focused child. func (st *uiState) OnChildExited(c *Child) { diff --git a/internal/app/debug.go b/internal/app/debug.go index fad4b0b..2f6b2a0 100644 --- a/internal/app/debug.go +++ b/internal/app/debug.go @@ -111,6 +111,13 @@ func (d *debugCapture) OnChildStateChanged(id string, state IdleState) { }) } +func (d *debugCapture) OnChildClosed(id string) { + d.writeEvent("child_closed", map[string]any{ + "time": time.Now().Format(time.RFC3339Nano), + "id": id, + }) +} + func (d *debugCapture) OnPTYOut(childID string, chunk []byte) { if len(chunk) == 0 { return diff --git a/internal/app/host.go b/internal/app/host.go index bdb36c7..651ca91 100644 --- a/internal/app/host.go +++ b/internal/app/host.go @@ -86,10 +86,10 @@ func newToolHost(sess *Session, pads *scratchpad.Store, launcher *Launcher, pres return h } -// timerListenerAdapter forwards OnChildStateChanged into the timer -// manager and ignores the other ChildEventListener methods. The -// session's listener API is by-interface, so we wrap the manager -// rather than make it implement the full surface. +// timerListenerAdapter forwards OnChildStateChanged and OnChildClosed +// into the timer manager and ignores the other ChildEventListener +// methods. The session's listener API is by-interface, so we wrap +// the manager rather than make it implement the full surface. type timerListenerAdapter struct{ m *timerManager } func (a timerListenerAdapter) OnChildSpawned(*Child) {} @@ -98,6 +98,9 @@ func (a timerListenerAdapter) OnPTYOut(string, []byte) {} func (a timerListenerAdapter) OnChildStateChanged(id string, st IdleState) { a.m.onChildStateChanged(id, st) } +func (a timerListenerAdapter) OnChildClosed(id string) { + a.m.onChildClosed(id) +} func (h *toolHost) SetSize(cols, rows uint16) { h.sizeMu.Lock() @@ -553,6 +556,7 @@ func (n *chunkNotifier) OnPTYOut(id string, chunk []byte) { } } func (n *chunkNotifier) OnChildStateChanged(string, IdleState) {} +func (n *chunkNotifier) OnChildClosed(string) {} func (h *toolHost) GetProcessPorts(callerID, processID string) ([]mcp.PortSighting, error) { c := h.sess.FindChild(processID) diff --git a/internal/app/session.go b/internal/app/session.go index e5f9151..d4d4b3d 100644 --- a/internal/app/session.go +++ b/internal/app/session.go @@ -91,6 +91,12 @@ type ChildEventListener interface { // updates a child's IdleState. Listeners use this to repaint the // sidebar badge and to evaluate idle-aware timers. OnChildStateChanged(childID string, state IdleState) + // OnChildClosed fires when a child is being removed from the + // session (either via close_process, or — for agent/terminal + // kinds — when the PTY exits and the entry will never be + // restarted). It signals that any pending references to childID + // (e.g. timers owned by or watching it) should be dropped. + OnChildClosed(childID string) } func NewSession(projectDir, projectKey string) *Session { @@ -167,6 +173,12 @@ func (s *Session) emitStateChanged(id string, state IdleState) { } } +func (s *Session) emitClosed(id string) { + for _, l := range s.listenersSnapshot() { + l.OnChildClosed(id) + } +} + func (s *Session) ChildEnv() []string { env := os.Environ() // Mark patterm-owned PTYs so a recursive `patterm` invocation can @@ -374,6 +386,11 @@ func (s *Session) Close(id string, sig syscall.Signal) error { } } s.mu.Unlock() + // Notify listeners outside s.mu so they can take their own locks + // without inversion. Timer manager uses this to drop pending + // timers owned by or watching the closed child — otherwise the + // next classifier tick can deliver a stale fire to the parent. + s.emitClosed(id) s.forgetPersisted(id) return nil } @@ -486,6 +503,7 @@ func (s *Session) reapChild(c *Child, runID uint64) { } } s.mu.Unlock() + s.emitClosed(c.ID) } } diff --git a/internal/app/timers.go b/internal/app/timers.go index f9fc2f1..e1f5570 100644 --- a/internal/app/timers.go +++ b/internal/app/timers.go @@ -296,6 +296,65 @@ func (m *timerManager) onChildStateChanged(childID string, state IdleState) { } } +// onChildClosed drops pending timer references to childID. Called +// from Session.Close (and the terminal-corpse cleanup in reapChild) +// via the session listener bus — a deliberate signal from the host +// that childID is gone and the parent is not waiting on it anymore. +// +// Semantics: +// - timers owned by childID are cancelled and deleted: their owner +// is gone, so even if defaultFireFn's IsLive guard would no-op +// the delivery, the entry has no business surviving a close. +// - timers watching childID have childID pruned from t.watched +// (and t.idleBaseline). If t.watched becomes empty the timer is +// cancelled and deleted; we deliberately do NOT synthesise a +// fire here. The parent already received any legitimate idle +// transition before close_process — see allWatchedIdleLocked's +// "treat as satisfied" comment, which only applies to a +// concurrent re-evaluation, not to this explicit-removal hook. +// +// The natural-exit path (reapChild → emitExit for agent/command +// kinds) is NOT routed through here: the classifier emits a final +// idle transition on exit, which fires and deletes any watching +// timers exactly once. Cancelling on exit would swallow that +// legitimate fire and leave the parent never notified. +func (m *timerManager) onChildClosed(childID string) { + m.mu.Lock() + defer m.mu.Unlock() + for id, t := range m.timers { + if t.ownerID == childID { + if t.rt != nil { + t.rt.Stop() + t.rt = nil + } + t.status = timerStatusCanceled + delete(m.timers, id) + continue + } + if !contains(t.watched, childID) { + continue + } + pruned := t.watched[:0] + for _, w := range t.watched { + if w != childID { + pruned = append(pruned, w) + } + } + t.watched = pruned + if t.idleBaseline != nil { + delete(t.idleBaseline, childID) + } + if len(t.watched) == 0 { + if t.rt != nil { + t.rt.Stop() + t.rt = nil + } + t.status = timerStatusCanceled + delete(m.timers, id) + } + } +} + // allWatchedIdleLocked reports whether every watched child is now // idle. Called with m.mu held — uses live Child.IdleState() under the // child's own atomic, not under m.mu. diff --git a/internal/app/timers_test.go b/internal/app/timers_test.go index efe9f8b..f9f512c 100644 --- a/internal/app/timers_test.go +++ b/internal/app/timers_test.go @@ -411,3 +411,178 @@ func TestTimerRecordsRemovedOnIdleFire(t *testing.T) { t.Fatalf("fired idle timer %s was not removed from registry", resp.ID) } } + +// TestTimerCloseChildPrunesWatched covers the happy partial-prune +// case: an idle_any timer watches two children, one is closed, the +// timer stays pending and the remaining child can still satisfy it. +func TestTimerCloseChildPrunesWatched(t *testing.T) { + sess, mgr, rec := newTestManager(t) + owner := fakeChild("p_owner") + a := fakeChild("p_a") + b := fakeChild("p_b") + addChild(sess, owner) + addChild(sess, a) + addChild(sess, b) + working := StateWorking + a.idleState.Store(&working) + b.idleState.Store(&working) + + resp, err := mgr.TimerFireWhenIdleAny("p_owner", "one done", "", []string{"p_a", "p_b"}, 0) + if err != nil { + t.Fatalf("TimerFireWhenIdleAny: %v", err) + } + + mgr.onChildClosed("p_a") + + mgr.mu.Lock() + t1, ok := mgr.timers[resp.ID] + if !ok { + mgr.mu.Unlock() + t.Fatalf("timer was removed but still has live watched") + } + watched := append([]string(nil), t1.watched...) + mgr.mu.Unlock() + if len(watched) != 1 || watched[0] != "p_b" { + t.Fatalf("watched after close: %v, want [p_b]", watched) + } + if got := rec.snapshot(); len(got) != 0 { + t.Fatalf("close synthesised a fire: %+v", got) + } + + // p_b can still satisfy the timer. + idle := StateIdle + b.idleState.Store(&idle) + mgr.onChildStateChanged("p_b", StateIdle) + if got := rec.snapshot(); len(got) != 1 || got[0].Body != "one done" { + t.Fatalf("post-prune fire: %+v", got) + } +} + +// TestTimerCloseLastWatchedCancels is the regression for the +// reported stale-fire symptom: the only watched child is closed, +// so the timer must be cancelled — no synthetic fire, and the +// registry entry must be gone so a trailing classifier tick for the +// removed child cannot re-deliver later. +func TestTimerCloseLastWatchedCancels(t *testing.T) { + sess, mgr, rec := newTestManager(t) + owner := fakeChild("p_owner") + a := fakeChild("p_a") + addChild(sess, owner) + addChild(sess, a) + working := StateWorking + a.idleState.Store(&working) + + resp, err := mgr.TimerFireWhenIdleAny("p_owner", "stale body", "", []string{"p_a"}, 0) + if err != nil { + t.Fatalf("TimerFireWhenIdleAny: %v", err) + } + + mgr.onChildClosed("p_a") + + mgr.mu.Lock() + _, stillThere := mgr.timers[resp.ID] + mgr.mu.Unlock() + if stillThere { + t.Fatalf("timer with no remaining watched should be removed") + } + if got := rec.snapshot(); len(got) != 0 { + t.Fatalf("close synthesised a fire: %+v", got) + } + + // Simulate the trailing classifier tick for the now-closed child — + // must not fire. + mgr.onChildStateChanged("p_a", StateIdle) + if got := rec.snapshot(); len(got) != 0 { + t.Fatalf("trailing state change re-fired: %+v", got) + } +} + +// TestTimerCloseChildIdleAllPartialPrune mirrors the idle_any +// partial-prune for idle_all: pruning a watched child shrinks the +// list; the remaining child going idle then satisfies the timer. +func TestTimerCloseChildIdleAllPartialPrune(t *testing.T) { + sess, mgr, rec := newTestManager(t) + owner := fakeChild("p_owner") + a := fakeChild("p_a") + b := fakeChild("p_b") + addChild(sess, owner) + addChild(sess, a) + addChild(sess, b) + working := StateWorking + a.idleState.Store(&working) + b.idleState.Store(&working) + + resp, err := mgr.TimerFireWhenIdleAll("p_owner", "all done", "", []string{"p_a", "p_b"}, 0) + if err != nil { + t.Fatalf("TimerFireWhenIdleAll: %v", err) + } + if resp.Status != "pending" { + t.Fatalf("status: got %q want pending", resp.Status) + } + + mgr.onChildClosed("p_a") + + idle := StateIdle + b.idleState.Store(&idle) + mgr.onChildStateChanged("p_b", StateIdle) + if got := rec.snapshot(); len(got) != 1 || got[0].Body != "all done" { + t.Fatalf("idle_all after partial prune: %+v", got) + } +} + +// TestTimerCloseOwnerCancelsDelay ensures a delay timer is dropped +// when its owner is closed: no delivery, registry empty, the +// underlying time.Timer is stopped. +func TestTimerCloseOwnerCancelsDelay(t *testing.T) { + sess, mgr, rec := newTestManager(t) + c := fakeChild("p_owner") + addChild(sess, c) + id, err := mgr.TimerSet("p_owner", "x", "", 0.1) + if err != nil { + t.Fatalf("TimerSet: %v", err) + } + + mgr.onChildClosed("p_owner") + + mgr.mu.Lock() + _, stillThere := mgr.timers[id] + mgr.mu.Unlock() + if stillThere { + t.Fatalf("delay timer was not removed when owner closed") + } + time.Sleep(200 * time.Millisecond) // past the original firesAt + if got := rec.snapshot(); len(got) != 0 { + t.Fatalf("delay timer fired after owner close: %+v", got) + } +} + +// TestTimerCloseWatchedSubAgent is the exact shape of the reported +// stale-fire bug: orchestrator registers a watcher on a sub-agent, +// the sub-agent is closed, and the orchestrator must receive +// nothing (no stale body delivered after close_process). +func TestTimerCloseWatchedSubAgent(t *testing.T) { + sess, mgr, rec := newTestManager(t) + parent := fakeChild("p_owner") + sub := fakeChild("p_sub") + addChild(sess, parent) + addChild(sess, sub) + working := StateWorking + sub.idleState.Store(&working) + + if _, err := mgr.TimerFireWhenIdleAny( + "p_owner", + "codex-review-591 finished. Read your own pane …", + "", []string{"p_sub"}, 0, + ); err != nil { + t.Fatalf("TimerFireWhenIdleAny: %v", err) + } + + mgr.onChildClosed("p_sub") + + // Trailing classifier emission for the closed sub-agent must + // not deliver anything to the parent. + mgr.onChildStateChanged("p_sub", StateIdle) + if got := rec.snapshot(); len(got) != 0 { + t.Fatalf("stale fire delivered to parent after sub-agent close: %+v", got) + } +}