Cancel pending timers when a child is closed

Stale timer bodies were re-delivered to the orchestrator pane after
the parent had already processed the sub-agent's reply and called
close_process. The timer registry held no link to the child
lifecycle, so timers owned by or watching the closed child lingered
until something triggered a fire — e.g. a trailing classifier tick
for the now-removed child.

Add an OnChildClosed hook to ChildEventListener, emit it from
Session.Close (and the terminal-corpse path in reapChild), and have
the timer manager prune the registry: cancel timers owned by the
closed child; remove the closed child from each timer's watched
list (cancel the timer outright when watched empties).

Natural exit deliberately does not route through this hook — the
classifier already emits an idle transition on exit which delivers
any legitimate "fire when sub-agent finishes" semantics exactly
once; cancelling on exit would swallow that.
This commit is contained in:
2026-05-18 12:37:32 +01:00
parent de60b93bc6
commit 34b41be1df
7 changed files with 278 additions and 4 deletions

View File

@@ -33,6 +33,10 @@ loosely follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
repaint.
- Removed the redundant "Back to Settings" row from the
Agents / Auto-summarization settings screen.
- Pending `timer_*` entries are now cancelled when their owning or
watched child is closed via `close_process`, preventing stale
timer bodies from being re-delivered to the orchestrator pane
after the work has already been handled.
## [0.0.6] - 2026-05-15

View File

@@ -837,6 +837,13 @@ func (st *uiState) OnChildStateChanged(string, IdleState) {
st.drawSidebar()
}
// OnChildClosed is the explicit-removal hook (close_process or the
// terminal-corpse cleanup in reapChild). The UI already reflects
// removals via the OnChildExited path and the children-map view, so
// this is a no-op here — the timerManager is the consumer that
// cares.
func (st *uiState) OnChildClosed(string) {}
// OnChildExited drops focus and shows the empty state if it was the
// focused child.
func (st *uiState) OnChildExited(c *Child) {

View File

@@ -111,6 +111,13 @@ func (d *debugCapture) OnChildStateChanged(id string, state IdleState) {
})
}
func (d *debugCapture) OnChildClosed(id string) {
d.writeEvent("child_closed", map[string]any{
"time": time.Now().Format(time.RFC3339Nano),
"id": id,
})
}
func (d *debugCapture) OnPTYOut(childID string, chunk []byte) {
if len(chunk) == 0 {
return

View File

@@ -86,10 +86,10 @@ func newToolHost(sess *Session, pads *scratchpad.Store, launcher *Launcher, pres
return h
}
// timerListenerAdapter forwards OnChildStateChanged into the timer
// manager and ignores the other ChildEventListener methods. The
// session's listener API is by-interface, so we wrap the manager
// rather than make it implement the full surface.
// timerListenerAdapter forwards OnChildStateChanged and OnChildClosed
// into the timer manager and ignores the other ChildEventListener
// methods. The session's listener API is by-interface, so we wrap
// the manager rather than make it implement the full surface.
type timerListenerAdapter struct{ m *timerManager }
func (a timerListenerAdapter) OnChildSpawned(*Child) {}
@@ -98,6 +98,9 @@ func (a timerListenerAdapter) OnPTYOut(string, []byte) {}
func (a timerListenerAdapter) OnChildStateChanged(id string, st IdleState) {
a.m.onChildStateChanged(id, st)
}
func (a timerListenerAdapter) OnChildClosed(id string) {
a.m.onChildClosed(id)
}
func (h *toolHost) SetSize(cols, rows uint16) {
h.sizeMu.Lock()
@@ -553,6 +556,7 @@ func (n *chunkNotifier) OnPTYOut(id string, chunk []byte) {
}
}
func (n *chunkNotifier) OnChildStateChanged(string, IdleState) {}
func (n *chunkNotifier) OnChildClosed(string) {}
func (h *toolHost) GetProcessPorts(callerID, processID string) ([]mcp.PortSighting, error) {
c := h.sess.FindChild(processID)

View File

@@ -91,6 +91,12 @@ type ChildEventListener interface {
// updates a child's IdleState. Listeners use this to repaint the
// sidebar badge and to evaluate idle-aware timers.
OnChildStateChanged(childID string, state IdleState)
// OnChildClosed fires when a child is being removed from the
// session (either via close_process, or — for agent/terminal
// kinds — when the PTY exits and the entry will never be
// restarted). It signals that any pending references to childID
// (e.g. timers owned by or watching it) should be dropped.
OnChildClosed(childID string)
}
func NewSession(projectDir, projectKey string) *Session {
@@ -167,6 +173,12 @@ func (s *Session) emitStateChanged(id string, state IdleState) {
}
}
func (s *Session) emitClosed(id string) {
for _, l := range s.listenersSnapshot() {
l.OnChildClosed(id)
}
}
func (s *Session) ChildEnv() []string {
env := os.Environ()
// Mark patterm-owned PTYs so a recursive `patterm` invocation can
@@ -374,6 +386,11 @@ func (s *Session) Close(id string, sig syscall.Signal) error {
}
}
s.mu.Unlock()
// Notify listeners outside s.mu so they can take their own locks
// without inversion. Timer manager uses this to drop pending
// timers owned by or watching the closed child — otherwise the
// next classifier tick can deliver a stale fire to the parent.
s.emitClosed(id)
s.forgetPersisted(id)
return nil
}
@@ -486,6 +503,7 @@ func (s *Session) reapChild(c *Child, runID uint64) {
}
}
s.mu.Unlock()
s.emitClosed(c.ID)
}
}

View File

@@ -296,6 +296,65 @@ func (m *timerManager) onChildStateChanged(childID string, state IdleState) {
}
}
// onChildClosed drops pending timer references to childID. Called
// from Session.Close (and the terminal-corpse cleanup in reapChild)
// via the session listener bus — a deliberate signal from the host
// that childID is gone and the parent is not waiting on it anymore.
//
// Semantics:
// - timers owned by childID are cancelled and deleted: their owner
// is gone, so even if defaultFireFn's IsLive guard would no-op
// the delivery, the entry has no business surviving a close.
// - timers watching childID have childID pruned from t.watched
// (and t.idleBaseline). If t.watched becomes empty the timer is
// cancelled and deleted; we deliberately do NOT synthesise a
// fire here. The parent already received any legitimate idle
// transition before close_process — see allWatchedIdleLocked's
// "treat as satisfied" comment, which only applies to a
// concurrent re-evaluation, not to this explicit-removal hook.
//
// The natural-exit path (reapChild → emitExit for agent/command
// kinds) is NOT routed through here: the classifier emits a final
// idle transition on exit, which fires and deletes any watching
// timers exactly once. Cancelling on exit would swallow that
// legitimate fire and leave the parent never notified.
func (m *timerManager) onChildClosed(childID string) {
m.mu.Lock()
defer m.mu.Unlock()
for id, t := range m.timers {
if t.ownerID == childID {
if t.rt != nil {
t.rt.Stop()
t.rt = nil
}
t.status = timerStatusCanceled
delete(m.timers, id)
continue
}
if !contains(t.watched, childID) {
continue
}
pruned := t.watched[:0]
for _, w := range t.watched {
if w != childID {
pruned = append(pruned, w)
}
}
t.watched = pruned
if t.idleBaseline != nil {
delete(t.idleBaseline, childID)
}
if len(t.watched) == 0 {
if t.rt != nil {
t.rt.Stop()
t.rt = nil
}
t.status = timerStatusCanceled
delete(m.timers, id)
}
}
}
// allWatchedIdleLocked reports whether every watched child is now
// idle. Called with m.mu held — uses live Child.IdleState() under the
// child's own atomic, not under m.mu.

View File

@@ -411,3 +411,178 @@ func TestTimerRecordsRemovedOnIdleFire(t *testing.T) {
t.Fatalf("fired idle timer %s was not removed from registry", resp.ID)
}
}
// TestTimerCloseChildPrunesWatched covers the happy partial-prune
// case: an idle_any timer watches two children, one is closed, the
// timer stays pending and the remaining child can still satisfy it.
func TestTimerCloseChildPrunesWatched(t *testing.T) {
sess, mgr, rec := newTestManager(t)
owner := fakeChild("p_owner")
a := fakeChild("p_a")
b := fakeChild("p_b")
addChild(sess, owner)
addChild(sess, a)
addChild(sess, b)
working := StateWorking
a.idleState.Store(&working)
b.idleState.Store(&working)
resp, err := mgr.TimerFireWhenIdleAny("p_owner", "one done", "", []string{"p_a", "p_b"}, 0)
if err != nil {
t.Fatalf("TimerFireWhenIdleAny: %v", err)
}
mgr.onChildClosed("p_a")
mgr.mu.Lock()
t1, ok := mgr.timers[resp.ID]
if !ok {
mgr.mu.Unlock()
t.Fatalf("timer was removed but still has live watched")
}
watched := append([]string(nil), t1.watched...)
mgr.mu.Unlock()
if len(watched) != 1 || watched[0] != "p_b" {
t.Fatalf("watched after close: %v, want [p_b]", watched)
}
if got := rec.snapshot(); len(got) != 0 {
t.Fatalf("close synthesised a fire: %+v", got)
}
// p_b can still satisfy the timer.
idle := StateIdle
b.idleState.Store(&idle)
mgr.onChildStateChanged("p_b", StateIdle)
if got := rec.snapshot(); len(got) != 1 || got[0].Body != "one done" {
t.Fatalf("post-prune fire: %+v", got)
}
}
// TestTimerCloseLastWatchedCancels is the regression for the
// reported stale-fire symptom: the only watched child is closed,
// so the timer must be cancelled — no synthetic fire, and the
// registry entry must be gone so a trailing classifier tick for the
// removed child cannot re-deliver later.
func TestTimerCloseLastWatchedCancels(t *testing.T) {
sess, mgr, rec := newTestManager(t)
owner := fakeChild("p_owner")
a := fakeChild("p_a")
addChild(sess, owner)
addChild(sess, a)
working := StateWorking
a.idleState.Store(&working)
resp, err := mgr.TimerFireWhenIdleAny("p_owner", "stale body", "", []string{"p_a"}, 0)
if err != nil {
t.Fatalf("TimerFireWhenIdleAny: %v", err)
}
mgr.onChildClosed("p_a")
mgr.mu.Lock()
_, stillThere := mgr.timers[resp.ID]
mgr.mu.Unlock()
if stillThere {
t.Fatalf("timer with no remaining watched should be removed")
}
if got := rec.snapshot(); len(got) != 0 {
t.Fatalf("close synthesised a fire: %+v", got)
}
// Simulate the trailing classifier tick for the now-closed child —
// must not fire.
mgr.onChildStateChanged("p_a", StateIdle)
if got := rec.snapshot(); len(got) != 0 {
t.Fatalf("trailing state change re-fired: %+v", got)
}
}
// TestTimerCloseChildIdleAllPartialPrune mirrors the idle_any
// partial-prune for idle_all: pruning a watched child shrinks the
// list; the remaining child going idle then satisfies the timer.
func TestTimerCloseChildIdleAllPartialPrune(t *testing.T) {
sess, mgr, rec := newTestManager(t)
owner := fakeChild("p_owner")
a := fakeChild("p_a")
b := fakeChild("p_b")
addChild(sess, owner)
addChild(sess, a)
addChild(sess, b)
working := StateWorking
a.idleState.Store(&working)
b.idleState.Store(&working)
resp, err := mgr.TimerFireWhenIdleAll("p_owner", "all done", "", []string{"p_a", "p_b"}, 0)
if err != nil {
t.Fatalf("TimerFireWhenIdleAll: %v", err)
}
if resp.Status != "pending" {
t.Fatalf("status: got %q want pending", resp.Status)
}
mgr.onChildClosed("p_a")
idle := StateIdle
b.idleState.Store(&idle)
mgr.onChildStateChanged("p_b", StateIdle)
if got := rec.snapshot(); len(got) != 1 || got[0].Body != "all done" {
t.Fatalf("idle_all after partial prune: %+v", got)
}
}
// TestTimerCloseOwnerCancelsDelay ensures a delay timer is dropped
// when its owner is closed: no delivery, registry empty, the
// underlying time.Timer is stopped.
func TestTimerCloseOwnerCancelsDelay(t *testing.T) {
sess, mgr, rec := newTestManager(t)
c := fakeChild("p_owner")
addChild(sess, c)
id, err := mgr.TimerSet("p_owner", "x", "", 0.1)
if err != nil {
t.Fatalf("TimerSet: %v", err)
}
mgr.onChildClosed("p_owner")
mgr.mu.Lock()
_, stillThere := mgr.timers[id]
mgr.mu.Unlock()
if stillThere {
t.Fatalf("delay timer was not removed when owner closed")
}
time.Sleep(200 * time.Millisecond) // past the original firesAt
if got := rec.snapshot(); len(got) != 0 {
t.Fatalf("delay timer fired after owner close: %+v", got)
}
}
// TestTimerCloseWatchedSubAgent is the exact shape of the reported
// stale-fire bug: orchestrator registers a watcher on a sub-agent,
// the sub-agent is closed, and the orchestrator must receive
// nothing (no stale body delivered after close_process).
func TestTimerCloseWatchedSubAgent(t *testing.T) {
sess, mgr, rec := newTestManager(t)
parent := fakeChild("p_owner")
sub := fakeChild("p_sub")
addChild(sess, parent)
addChild(sess, sub)
working := StateWorking
sub.idleState.Store(&working)
if _, err := mgr.TimerFireWhenIdleAny(
"p_owner",
"codex-review-591 finished. Read your own pane …",
"", []string{"p_sub"}, 0,
); err != nil {
t.Fatalf("TimerFireWhenIdleAny: %v", err)
}
mgr.onChildClosed("p_sub")
// Trailing classifier emission for the closed sub-agent must
// not deliver anything to the parent.
mgr.onChildStateChanged("p_sub", StateIdle)
if got := rec.snapshot(); len(got) != 0 {
t.Fatalf("stale fire delivered to parent after sub-agent close: %+v", got)
}
}