From 1a168f47a7a1ad63ca44f39039100ed331908c14 Mon Sep 17 00:00:00 2001 From: sawka Date: Fri, 6 Feb 2026 12:36:06 -0800 Subject: [PATCH 01/17] implement system "resume" tracking --- emain/emain.ts | 10 ++++++++++ frontend/app/store/wshclientapi.ts | 5 +++++ pkg/wshrpc/wshclient/wshclient.go | 6 ++++++ pkg/wshrpc/wshrpctypes.go | 1 + pkg/wshrpc/wshserver/wshserver.go | 5 +++++ 5 files changed, 27 insertions(+) diff --git a/emain/emain.ts b/emain/emain.ts index 58187e5293..960a3d4d2b 100644 --- a/emain/emain.ts +++ b/emain/emain.ts @@ -412,6 +412,16 @@ async function appMain() { fireAndForget(createNewWaveWindow); } }); + electron.powerMonitor.on("resume", () => { + console.log("system resumed from sleep, notifying server"); + fireAndForget(async () => { + try { + await RpcApi.NotifySystemResumeCommand(ElectronWshClient, { noresponse: true }); + } catch (e) { + console.log("error calling NotifySystemResumeCommand", e); + } + }); + }); const rawGlobalHotKey = launchSettings?.["app:globalhotkey"]; if (rawGlobalHotKey) { registerGlobalHotkey(rawGlobalHotKey); diff --git a/frontend/app/store/wshclientapi.ts b/frontend/app/store/wshclientapi.ts index 72ffd8618d..1f90e5f23b 100644 --- a/frontend/app/store/wshclientapi.ts +++ b/frontend/app/store/wshclientapi.ts @@ -512,6 +512,11 @@ class RpcApiType { return client.wshRpcCall("notify", data, opts); } + // command "notifysystemresume" [call] + NotifySystemResumeCommand(client: WshClient, opts?: RpcOpts): Promise { + return client.wshRpcCall("notifysystemresume", null, opts); + } + // command "path" [call] PathCommand(client: WshClient, data: PathCommandData, opts?: RpcOpts): Promise { return client.wshRpcCall("path", data, opts); diff --git a/pkg/wshrpc/wshclient/wshclient.go b/pkg/wshrpc/wshclient/wshclient.go index 533fb01c24..2b6199f957 100644 --- a/pkg/wshrpc/wshclient/wshclient.go +++ b/pkg/wshrpc/wshclient/wshclient.go @@ -620,6 +620,12 @@ func NotifyCommand(w *wshutil.WshRpc, data wshrpc.WaveNotificationOptions, opts return err } +// command "notifysystemresume", wshserver.NotifySystemResumeCommand +func NotifySystemResumeCommand(w *wshutil.WshRpc, opts *wshrpc.RpcOpts) error { + _, err := sendRpcRequestCallHelper[any](w, "notifysystemresume", nil, opts) + return err +} + // command "path", wshserver.PathCommand func PathCommand(w *wshutil.WshRpc, data wshrpc.PathCommandData, opts *wshrpc.RpcOpts) (string, error) { resp, err := sendRpcRequestCallHelper[string](w, "path", data, opts) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 00cad208f0..5eb670bfb8 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -103,6 +103,7 @@ type WshRpcInterface interface { ConnUpdateWshCommand(ctx context.Context, remoteInfo RemoteInfo) (bool, error) FindGitBashCommand(ctx context.Context, rescan bool) (string, error) ConnServerInitCommand(ctx context.Context, data CommandConnServerInitData) error + NotifySystemResumeCommand(ctx context.Context) error // eventrecv is special, it's handled internally by WshRpc with EventListener EventRecvCommand(ctx context.Context, data wps.WaveEvent) error diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 2777cd17f1..3ab4caaccf 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -755,6 +755,11 @@ func (ws *WshServer) DismissWshFailCommand(ctx context.Context, connName string) return nil } +func (ws *WshServer) NotifySystemResumeCommand(ctx context.Context) error { + log.Printf("NotifySystemResumeCommand called\n") + return nil +} + func (ws *WshServer) FindGitBashCommand(ctx context.Context, rescan bool) (string, error) { fullConfig := wconfig.GetWatcher().GetFullConfig() return shellutil.FindGitBash(&fullConfig, rescan), nil From de155cd96afa774e3f6cd21a5ebb80790023bcb7 Mon Sep 17 00:00:00 2001 From: sawka Date: Fri, 6 Feb 2026 16:27:11 -0800 Subject: [PATCH 02/17] working on connmonitor --- cmd/server/main-server.go | 2 +- cmd/wsh/cmd/wshcmd-connserver.go | 4 +- frontend/types/gotypes.d.ts | 1 + pkg/jobmanager/jobmanager.go | 2 +- pkg/remote/conncontroller/conncontroller.go | 53 +++++++++++------ pkg/remote/conncontroller/connmonitor.go | 66 +++++++++++++++++++++ pkg/util/utilfn/streamtolines.go | 7 ++- pkg/wshrpc/wshremote/wshremote_job.go | 2 +- pkg/wshrpc/wshrpctypes.go | 21 +++---- pkg/wshutil/wshrpcio.go | 4 +- pkg/wshutil/wshutil.go | 10 ++-- 11 files changed, 130 insertions(+), 42 deletions(-) create mode 100644 pkg/remote/conncontroller/connmonitor.go diff --git a/cmd/server/main-server.go b/cmd/server/main-server.go index 4c28bb866d..0d49a78c37 100644 --- a/cmd/server/main-server.go +++ b/cmd/server/main-server.go @@ -600,7 +600,7 @@ func main() { // use fmt instead of log here to make sure it goes directly to stderr fmt.Fprintf(os.Stderr, "WAVESRV-ESTART ws:%s web:%s version:%s buildtime:%s\n", wsListener.Addr(), webListener.Addr(), WaveVersion, BuildTime) }() - go wshutil.RunWshRpcOverListener(unixListener) + go wshutil.RunWshRpcOverListener(unixListener, nil) web.RunWebServer(webListener) // blocking runtime.KeepAlive(waveLock) } diff --git a/cmd/wsh/cmd/wshcmd-connserver.go b/cmd/wsh/cmd/wshcmd-connserver.go index 9f63976115..86f6c2a923 100644 --- a/cmd/wsh/cmd/wshcmd-connserver.go +++ b/cmd/wsh/cmd/wshcmd-connserver.go @@ -97,7 +97,7 @@ func handleNewListenerConn(conn net.Conn, router *wshutil.WshRouter) { router.UnregisterLink(baseds.LinkId(linkId)) } }() - wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) + wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh, nil) }() linkId := router.RegisterUntrustedLink(proxy) linkIdContainer.Store(int32(linkId)) @@ -265,7 +265,7 @@ func serverRunRouterDomainSocket(jwtToken string) error { log.Printf("upstream domain socket closed, shutting down") wshutil.DoShutdown("", 0, true) }() - wshutil.AdaptStreamToMsgCh(conn, upstreamProxy.FromRemoteCh) + wshutil.AdaptStreamToMsgCh(conn, upstreamProxy.FromRemoteCh, nil) }() // register the domain socket connection as upstream diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 8a30f5e9be..5c55d55961 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -785,6 +785,7 @@ declare global { // wshrpc.ConnStatus type ConnStatus = { status: string; + connhealthstatus?: string; wshenabled: boolean; connection: string; connected: boolean; diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go index cdd245af7f..7afaec047e 100644 --- a/pkg/jobmanager/jobmanager.go +++ b/pkg/jobmanager/jobmanager.go @@ -440,7 +440,7 @@ func handleJobDomainSocketClient(conn net.Conn) { panichandler.PanicHandler("handleJobDomainSocketClient:AdaptStreamToMsgCh", recover()) }() defer serverImpl.Close() - wshutil.AdaptStreamToMsgCh(conn, inputCh) + wshutil.AdaptStreamToMsgCh(conn, inputCh, nil) }() _ = wshRpc diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index 4e6f0aeefd..92ba1ef5ff 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -61,6 +61,11 @@ const ( NoWshCode_InstallVerifyError = "install-verify-error" ) +const ( + ConnHealthStatus_Good = "good" + ConnHealthStatus_Stalled = "stalled" +) + const DefaultConnectionTimeout = 60 * time.Second var globalLock = &sync.Mutex{} @@ -72,6 +77,7 @@ type SSHConn struct { lifecycleLock *sync.Mutex // this protects the lifecycle from concurrent calls Status string + ConnHealthStatus string WshEnabled *atomic.Bool Opts *remote.SSHOpts Client *ssh.Client @@ -84,6 +90,7 @@ type SSHConn struct { WshVersion string LastConnectTime int64 ActiveConnNum int + Monitor *ConnMonitor } var ConnServerCmdTemplate = strings.TrimSpace( @@ -128,16 +135,17 @@ func (conn *SSHConn) DeriveConnStatus() wshrpc.ConnStatus { conn.lock.Lock() defer conn.lock.Unlock() return wshrpc.ConnStatus{ - Status: conn.Status, - Connected: conn.Status == Status_Connected, - Connection: conn.Opts.String(), - HasConnected: (conn.LastConnectTime > 0), - ActiveConnNum: conn.ActiveConnNum, - Error: conn.Error, - WshEnabled: conn.WshEnabled.Load(), - WshError: conn.WshError, - NoWshReason: conn.NoWshReason, - WshVersion: conn.WshVersion, + Status: conn.Status, + Connected: conn.Status == Status_Connected, + Connection: conn.Opts.String(), + HasConnected: (conn.LastConnectTime > 0), + ActiveConnNum: conn.ActiveConnNum, + Error: conn.Error, + WshEnabled: conn.WshEnabled.Load(), + WshError: conn.WshError, + NoWshReason: conn.NoWshReason, + WshVersion: conn.WshVersion, + ConnHealthStatus: conn.ConnHealthStatus, } } @@ -276,7 +284,7 @@ func (conn *SSHConn) OpenDomainSocketListener(ctx context.Context) error { conn.DomainSockListener = nil conn.DomainSockName = "" }) - wshutil.RunWshRpcOverListener(listener) + wshutil.RunWshRpcOverListener(listener, conn.Monitor.UpdateLastActivityTime) }() return nil } @@ -323,11 +331,11 @@ func (conn *SSHConn) GetEnvironmentMaps(ctx context.Context) (map[string]string, func runSessionWithContext(ctx context.Context, session *ssh.Session, cmd string) error { errCh := make(chan error, 1) - + go func() { errCh <- session.Run(cmd) }() - + select { case <-ctx.Done(): session.Close() @@ -527,6 +535,7 @@ func (conn *SSHConn) StartConnServer(ctx context.Context, afterUpdate bool, useR log.Printf("[conncontroller:%s:output] error: %v\n", conn.GetName(), output.Error) continue } + conn.Monitor.UpdateLastActivityTime() line := output.Line if !strings.HasSuffix(line, "\n") { line += "\n" @@ -986,18 +995,26 @@ func (conn *SSHConn) ClearWshError() { }) } +func (conn *SSHConn) SetConnHealthStatus(status string) { + conn.WithLock(func() { + conn.ConnHealthStatus = status + }) +} + func getConnInternal(opts *remote.SSHOpts, createIfNotExists bool) *SSHConn { globalLock.Lock() defer globalLock.Unlock() rtn := clientControllerMap[*opts] if rtn == nil && createIfNotExists { rtn = &SSHConn{ - lock: &sync.Mutex{}, - lifecycleLock: &sync.Mutex{}, - Status: Status_Init, - WshEnabled: &atomic.Bool{}, - Opts: opts, + lock: &sync.Mutex{}, + lifecycleLock: &sync.Mutex{}, + Status: Status_Init, + ConnHealthStatus: ConnHealthStatus_Good, + WshEnabled: &atomic.Bool{}, + Opts: opts, } + rtn.Monitor = MakeConnMonitor(rtn) clientControllerMap[*opts] = rtn } return rtn diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go new file mode 100644 index 0000000000..e7d94e32df --- /dev/null +++ b/pkg/remote/conncontroller/connmonitor.go @@ -0,0 +1,66 @@ +// Copyright 2026, Command Line Inc. +// SPDX-License-Identifier: Apache-2.0 + +package conncontroller + +import ( + "fmt" + "sync" + "sync/atomic" + "time" +) + +type ConnMonitor struct { + lock *sync.Mutex + Conn *SSHConn + LastActivityTime atomic.Int64 + KeepAliveSentTime int64 + KeepAliveInFlight bool +} + +func MakeConnMonitor(conn *SSHConn) *ConnMonitor { + return &ConnMonitor{ + lock: &sync.Mutex{}, + Conn: conn, + } +} + +func (cm *ConnMonitor) UpdateLastActivityTime() { + cm.LastActivityTime.Store(time.Now().UnixMilli()) +} + +func (cm *ConnMonitor) setKeepAliveInFlight() bool { + cm.lock.Lock() + defer cm.lock.Unlock() + + if cm.KeepAliveInFlight { + return false + } + cm.KeepAliveInFlight = true + cm.KeepAliveSentTime = time.Now().UnixMilli() + return true +} + +func (cm *ConnMonitor) clearKeepAliveInFlight() { + cm.lock.Lock() + defer cm.lock.Unlock() + + cm.KeepAliveInFlight = false +} + +func (cm *ConnMonitor) SendKeepAlive() error { + conn := cm.Conn + if conn == nil || conn.Client == nil { + return fmt.Errorf("no active connection") + } + if !cm.setKeepAliveInFlight() { + return nil + } + _, _, err := conn.Client.SendRequest("keepalive@openssh.com", true, nil) + cm.clearKeepAliveInFlight() + if err != nil { + return err + } + cm.UpdateLastActivityTime() + return nil +} diff --git a/pkg/util/utilfn/streamtolines.go b/pkg/util/utilfn/streamtolines.go index 7cfabf4cf2..51758465da 100644 --- a/pkg/util/utilfn/streamtolines.go +++ b/pkg/util/utilfn/streamtolines.go @@ -56,7 +56,7 @@ func streamToLines_processBuf(lineBuf *lineBuf, readBuf []byte, lineFn func([]by } } -func StreamToLines(input io.Reader, lineFn func([]byte)) error { +func StreamToLines(input io.Reader, lineFn func([]byte), readCallback func()) error { var lineBuf lineBuf readBuf := make([]byte, 64*1024) for { @@ -65,6 +65,9 @@ func StreamToLines(input io.Reader, lineFn func([]byte)) error { if err != nil { return err } + if readCallback != nil { + readCallback() + } } } @@ -76,7 +79,7 @@ func StreamToLinesChan(input io.Reader) chan LineOutput { defer close(ch) err := StreamToLines(input, func(line []byte) { ch <- LineOutput{Line: string(line)} - }) + }, nil) if err != nil && err != io.EOF { ch <- LineOutput{Error: err} } diff --git a/pkg/wshrpc/wshremote/wshremote_job.go b/pkg/wshrpc/wshremote/wshremote_job.go index df6d1d4a0a..09310e09da 100644 --- a/pkg/wshrpc/wshremote/wshremote_job.go +++ b/pkg/wshrpc/wshremote/wshremote_job.go @@ -75,7 +75,7 @@ func (impl *ServerImpl) connectToJobManager(ctx context.Context, jobId string, m close(proxy.FromRemoteCh) cleanup() }() - wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) + wshutil.AdaptStreamToMsgCh(conn, proxy.FromRemoteCh, nil) }() routeId := wshutil.MakeLinkRouteId(linkId) diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 5eb670bfb8..4e667e6ecf 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -442,16 +442,17 @@ type ConnConfigRequest struct { } type ConnStatus struct { - Status string `json:"status"` - WshEnabled bool `json:"wshenabled"` - Connection string `json:"connection"` - Connected bool `json:"connected"` - HasConnected bool `json:"hasconnected"` // true if it has *ever* connected successfully - ActiveConnNum int `json:"activeconnnum"` - Error string `json:"error,omitempty"` - WshError string `json:"wsherror,omitempty"` - NoWshReason string `json:"nowshreason,omitempty"` - WshVersion string `json:"wshversion,omitempty"` + Status string `json:"status"` + ConnHealthStatus string `json:"connhealthstatus,omitempty"` + WshEnabled bool `json:"wshenabled"` + Connection string `json:"connection"` + Connected bool `json:"connected"` + HasConnected bool `json:"hasconnected"` // true if it has *ever* connected successfully + ActiveConnNum int `json:"activeconnnum"` + Error string `json:"error,omitempty"` + WshError string `json:"wsherror,omitempty"` + NoWshReason string `json:"nowshreason,omitempty"` + WshVersion string `json:"wshversion,omitempty"` } type WebSelectorOpts struct { diff --git a/pkg/wshutil/wshrpcio.go b/pkg/wshutil/wshrpcio.go index 3345bdd9d3..2a88e987b4 100644 --- a/pkg/wshutil/wshrpcio.go +++ b/pkg/wshutil/wshrpcio.go @@ -16,10 +16,10 @@ import ( // * stream (json lines) // * websocket (json packets) -func AdaptStreamToMsgCh(input io.Reader, output chan baseds.RpcInputChType) error { +func AdaptStreamToMsgCh(input io.Reader, output chan baseds.RpcInputChType, readCallback func()) error { return utilfn.StreamToLines(input, func(line []byte) { output <- baseds.RpcInputChType{MsgBytes: line} - }) + }, readCallback) } func AdaptOutputChToStream(outputCh chan []byte, output io.Writer) error { diff --git a/pkg/wshutil/wshutil.go b/pkg/wshutil/wshutil.go index d979e7652f..2bb7e2db11 100644 --- a/pkg/wshutil/wshutil.go +++ b/pkg/wshutil/wshutil.go @@ -171,7 +171,7 @@ func SetupConnRpcClient(conn net.Conn, serverImpl ServerImpl, debugStr string) ( }() // when input is closed, close the connection defer conn.Close() - AdaptStreamToMsgCh(conn, inputCh) + AdaptStreamToMsgCh(conn, inputCh, nil) }() rtn := MakeWshRpcWithChannels(inputCh, outputCh, wshrpc.RpcContext{}, serverImpl, debugStr) return rtn, writeErrCh, nil @@ -255,7 +255,7 @@ func ValidateAndExtractRpcContextFromToken(tokenStr string) (*wshrpc.RpcContext, return claimsToRpcCtx(claims), nil } -func RunWshRpcOverListener(listener net.Listener) { +func RunWshRpcOverListener(listener net.Listener, readCallback func()) { defer log.Printf("domain socket listener shutting down\n") for { conn, err := listener.Accept() @@ -267,7 +267,7 @@ func RunWshRpcOverListener(listener net.Listener) { break } log.Print("got domain socket connection\n") - go handleDomainSocketClient(conn) + go handleDomainSocketClient(conn, readCallback) } } @@ -324,7 +324,7 @@ func HandleStdIOClient(logName string, input chan utilfn.LineOutput, output io.W <-doneCh } -func handleDomainSocketClient(conn net.Conn) { +func handleDomainSocketClient(conn net.Conn, readCallback func()) { var linkIdContainer atomic.Int32 proxy := MakeRpcProxy("domain") go func() { @@ -350,7 +350,7 @@ func handleDomainSocketClient(conn net.Conn) { DefaultRouter.UnregisterLink(baseds.LinkId(linkId)) } }() - AdaptStreamToMsgCh(conn, proxy.FromRemoteCh) + AdaptStreamToMsgCh(conn, proxy.FromRemoteCh, readCallback) }() linkId := DefaultRouter.RegisterUntrustedLink(proxy) linkIdContainer.Store(int32(linkId)) From 5a492027320a7ec7dc31f7d4a46d7bfbabc87281 Mon Sep 17 00:00:00 2001 From: sawka Date: Sat, 7 Feb 2026 12:45:58 -0800 Subject: [PATCH 03/17] setconnhealthstatus when no response from keepalive for 10s --- pkg/remote/conncontroller/conncontroller.go | 17 ++++- pkg/remote/conncontroller/connmonitor.go | 75 ++++++++++++++++++--- 2 files changed, 82 insertions(+), 10 deletions(-) diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index 92ba1ef5ff..8dc79b07a9 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -996,9 +996,24 @@ func (conn *SSHConn) ClearWshError() { } func (conn *SSHConn) SetConnHealthStatus(status string) { + changed := false conn.WithLock(func() { - conn.ConnHealthStatus = status + if conn.ConnHealthStatus != status { + conn.ConnHealthStatus = status + changed = true + } + }) + if changed { + conn.FireConnChangeEvent() + } +} + +func (conn *SSHConn) GetConnHealthStatus() string { + var status string + conn.WithLock(func() { + status = conn.ConnHealthStatus }) + return status } func getConnInternal(opts *remote.SSHOpts, createIfNotExists bool) *SSHConn { diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go index e7d94e32df..0cbbeefa8b 100644 --- a/pkg/remote/conncontroller/connmonitor.go +++ b/pkg/remote/conncontroller/connmonitor.go @@ -4,10 +4,13 @@ package conncontroller import ( + "context" "fmt" "sync" "sync/atomic" "time" + + "github.com/wavetermdev/waveterm/pkg/panichandler" ) type ConnMonitor struct { @@ -16,17 +19,25 @@ type ConnMonitor struct { LastActivityTime atomic.Int64 KeepAliveSentTime int64 KeepAliveInFlight bool + ctx context.Context + cancelFunc context.CancelFunc } func MakeConnMonitor(conn *SSHConn) *ConnMonitor { - return &ConnMonitor{ - lock: &sync.Mutex{}, - Conn: conn, + ctx, cancelFunc := context.WithCancel(context.Background()) + cm := &ConnMonitor{ + lock: &sync.Mutex{}, + Conn: conn, + ctx: ctx, + cancelFunc: cancelFunc, } + go cm.keepAliveMonitor() + return cm } func (cm *ConnMonitor) UpdateLastActivityTime() { cm.LastActivityTime.Store(time.Now().UnixMilli()) + cm.Conn.SetConnHealthStatus(ConnHealthStatus_Good) } func (cm *ConnMonitor) setKeepAliveInFlight() bool { @@ -48,6 +59,16 @@ func (cm *ConnMonitor) clearKeepAliveInFlight() { cm.KeepAliveInFlight = false } +func (cm *ConnMonitor) getTimeSinceKeepAlive() int64 { + cm.lock.Lock() + defer cm.lock.Unlock() + + if !cm.KeepAliveInFlight { + return 0 + } + return time.Now().UnixMilli() - cm.KeepAliveSentTime +} + func (cm *ConnMonitor) SendKeepAlive() error { conn := cm.Conn if conn == nil || conn.Client == nil { @@ -56,11 +77,47 @@ func (cm *ConnMonitor) SendKeepAlive() error { if !cm.setKeepAliveInFlight() { return nil } - _, _, err := conn.Client.SendRequest("keepalive@openssh.com", true, nil) - cm.clearKeepAliveInFlight() - if err != nil { - return err - } - cm.UpdateLastActivityTime() + go func() { + defer func() { + panichandler.PanicHandler("conncontroller:SendKeepAlive", recover()) + }() + defer cm.clearKeepAliveInFlight() + _, _, _ = conn.Client.SendRequest("keepalive@openssh.com", true, nil) + cm.UpdateLastActivityTime() + }() return nil } + +func (cm *ConnMonitor) keepAliveMonitor() { + defer func() { + panichandler.PanicHandler("conncontroller:keepAliveMonitor", recover()) + }() + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + lastActivity := cm.LastActivityTime.Load() + if lastActivity == 0 { + continue + } + timeSinceActivity := time.Now().UnixMilli() - lastActivity + if timeSinceActivity > 10000 { + cm.SendKeepAlive() + timeSinceKeepAlive := cm.getTimeSinceKeepAlive() + if timeSinceKeepAlive > 10000 { + cm.Conn.SetConnHealthStatus(ConnHealthStatus_Stalled) + } + } + case <-cm.ctx.Done(): + return + } + } +} + +func (cm *ConnMonitor) Close() { + if cm.cancelFunc != nil { + cm.cancelFunc() + } +} From c3e995b3844258e13d9169065987b27dd67ee1eb Mon Sep 17 00:00:00 2001 From: sawka Date: Sat, 7 Feb 2026 13:17:38 -0800 Subject: [PATCH 04/17] input handling for connmonitor, sets urgency, lower threshold for stall indicator --- pkg/remote/conncontroller/connmonitor.go | 75 +++++++++++++++++++----- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go index 0cbbeefa8b..cfe8e476de 100644 --- a/pkg/remote/conncontroller/connmonitor.go +++ b/pkg/remote/conncontroller/connmonitor.go @@ -17,19 +17,22 @@ type ConnMonitor struct { lock *sync.Mutex Conn *SSHConn LastActivityTime atomic.Int64 + LastInputTime atomic.Int64 KeepAliveSentTime int64 KeepAliveInFlight bool ctx context.Context cancelFunc context.CancelFunc + inputNotifyCh chan int64 } func MakeConnMonitor(conn *SSHConn) *ConnMonitor { ctx, cancelFunc := context.WithCancel(context.Background()) cm := &ConnMonitor{ - lock: &sync.Mutex{}, - Conn: conn, - ctx: ctx, - cancelFunc: cancelFunc, + lock: &sync.Mutex{}, + Conn: conn, + ctx: ctx, + cancelFunc: cancelFunc, + inputNotifyCh: make(chan int64, 1), } go cm.keepAliveMonitor() return cm @@ -40,6 +43,23 @@ func (cm *ConnMonitor) UpdateLastActivityTime() { cm.Conn.SetConnHealthStatus(ConnHealthStatus_Good) } +func (cm *ConnMonitor) NotifyInput() { + inputTime := time.Now().UnixMilli() + cm.LastInputTime.Store(inputTime) + select { + case cm.inputNotifyCh <- inputTime: + default: + } +} + +func (cm *ConnMonitor) isUrgent() bool { + lastInput := cm.LastInputTime.Load() + if lastInput == 0 { + return false + } + return time.Now().UnixMilli()-lastInput < 10000 +} + func (cm *ConnMonitor) setKeepAliveInFlight() bool { cm.lock.Lock() defer cm.lock.Unlock() @@ -88,6 +108,32 @@ func (cm *ConnMonitor) SendKeepAlive() error { return nil } +func (cm *ConnMonitor) checkConnection() { + lastActivity := cm.LastActivityTime.Load() + if lastActivity == 0 { + return + } + urgent := cm.isUrgent() + timeSinceActivity := time.Now().UnixMilli() - lastActivity + + keepAliveThreshold := int64(10000) + if urgent { + keepAliveThreshold = 1000 + } + if timeSinceActivity > keepAliveThreshold { + cm.SendKeepAlive() + } + + stalledThreshold := int64(10000) + if urgent { + stalledThreshold = 5000 + } + timeSinceKeepAlive := cm.getTimeSinceKeepAlive() + if timeSinceKeepAlive > stalledThreshold { + cm.Conn.SetConnHealthStatus(ConnHealthStatus_Stalled) + } +} + func (cm *ConnMonitor) keepAliveMonitor() { defer func() { panichandler.PanicHandler("conncontroller:keepAliveMonitor", recover()) @@ -98,18 +144,19 @@ func (cm *ConnMonitor) keepAliveMonitor() { for { select { case <-ticker.C: - lastActivity := cm.LastActivityTime.Load() - if lastActivity == 0 { - continue - } - timeSinceActivity := time.Now().UnixMilli() - lastActivity - if timeSinceActivity > 10000 { - cm.SendKeepAlive() - timeSinceKeepAlive := cm.getTimeSinceKeepAlive() - if timeSinceKeepAlive > 10000 { - cm.Conn.SetConnHealthStatus(ConnHealthStatus_Stalled) + cm.checkConnection() + + case inputTime := <-cm.inputNotifyCh: + select { + case <-time.After(1 * time.Second): + if cm.LastActivityTime.Load() >= inputTime { + break } + cm.checkConnection() + case <-cm.ctx.Done(): + return } + case <-cm.ctx.Done(): return } From 759e716d7e4d54c6f9da6b5705d06c9de136465e Mon Sep 17 00:00:00 2001 From: sawka Date: Sat, 7 Feb 2026 13:26:38 -0800 Subject: [PATCH 05/17] notify input on connmonitor --- pkg/blockcontroller/blockcontroller.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pkg/blockcontroller/blockcontroller.go b/pkg/blockcontroller/blockcontroller.go index d5b307e92a..9004c500b7 100644 --- a/pkg/blockcontroller/blockcontroller.go +++ b/pkg/blockcontroller/blockcontroller.go @@ -290,11 +290,28 @@ func DestroyBlockController(blockId string) { deleteController(blockId) } +func sendConnMonitorInputNotification(controller Controller) { + connName := controller.GetConnName() + if connName == "" || conncontroller.IsLocalConnName(connName) || conncontroller.IsWslConnName(connName) { + return + } + + connOpts, parseErr := remote.ParseOpts(connName) + if parseErr != nil { + return + } + sshConn := conncontroller.GetConn(connOpts) + if sshConn != nil && sshConn.Monitor != nil { + sshConn.Monitor.NotifyInput() + } +} + func SendInput(blockId string, inputUnion *BlockInputUnion) error { controller := getController(blockId) if controller == nil { return fmt.Errorf("no controller found for block %s", blockId) } + sendConnMonitorInputNotification(controller) return controller.SendInput(inputUnion) } From cd99449dbdfca2cd12048a25fd0bbb401b9fccf3 Mon Sep 17 00:00:00 2001 From: sawka Date: Sun, 8 Feb 2026 19:16:22 -0800 Subject: [PATCH 06/17] get the stalled overlay working --- frontend/app/block/connstatusoverlay.tsx | 106 +++++++++++++++++++- frontend/types/gotypes.d.ts | 2 + pkg/remote/conncontroller/conncontroller.go | 30 ++++-- pkg/remote/conncontroller/connmonitor.go | 6 +- pkg/wshrpc/wshrpctypes.go | 24 +++-- 5 files changed, 142 insertions(+), 26 deletions(-) diff --git a/frontend/app/block/connstatusoverlay.tsx b/frontend/app/block/connstatusoverlay.tsx index 8f0852abac..14ccd5a082 100644 --- a/frontend/app/block/connstatusoverlay.tsx +++ b/frontend/app/block/connstatusoverlay.tsx @@ -14,6 +14,91 @@ import * as jotai from "jotai"; import { OverlayScrollbarsComponent } from "overlayscrollbars-react"; import * as React from "react"; +function formatElapsedTime(elapsedMs: number): string { + const elapsedSeconds = Math.floor(elapsedMs / 1000); + + if (elapsedSeconds < 60) { + return `${elapsedSeconds}s`; + } + + const elapsedMinutes = Math.floor(elapsedSeconds / 60); + if (elapsedMinutes < 60) { + return `${elapsedMinutes}m`; + } + + const elapsedHours = Math.floor(elapsedMinutes / 60); + const remainingMinutes = elapsedMinutes % 60; + + if (elapsedHours < 24) { + if (remainingMinutes === 0) { + return `${elapsedHours}h`; + } + return `${elapsedHours}h${remainingMinutes}m`; + } + + return "more than a day"; +} + +const StalledOverlay = React.memo( + ({ + connName, + connStatus, + reconClassName, + handleDisconnect, + overlayRefCallback, + }: { + connName: string; + connStatus: ConnStatus; + reconClassName: string; + handleDisconnect: () => void; + overlayRefCallback: (el: HTMLDivElement | null) => void; + }) => { + const [elapsedTime, setElapsedTime] = React.useState(""); + + React.useEffect(() => { + if (!connStatus.lastactivitybeforerstalledtime) { + return; + } + + const updateElapsed = () => { + const now = Date.now(); + const lastActivity = connStatus.lastactivitybeforerstalledtime! * 1000; + const elapsed = now - lastActivity; + setElapsedTime(formatElapsedTime(elapsed)); + }; + + updateElapsed(); + const interval = setInterval(updateElapsed, 1000); + + return () => clearInterval(interval); + }, [connStatus.lastactivitybeforerstalledtime]); + + return ( +
+
+ +
+ Connection to "{connName}" is stalled + {elapsedTime && ` (no activity for ${elapsedTime})`} +
+
+ +
+
+ ); + } +); +StalledOverlay.displayName = "StalledOverlay"; + export const ConnStatusOverlay = React.memo( ({ nodeModel, @@ -52,6 +137,11 @@ export const ConnStatusOverlay = React.memo( prtn.catch((e) => console.log("error reconnecting", connName, e)); }, [connName, nodeModel.blockId]); + const handleDisconnect = React.useCallback(() => { + const prtn = RpcApi.ConnDisconnectCommand(TabRpcClient, connName, { timeout: 5000 }); + prtn.catch((e) => console.log("error disconnecting", connName, e)); + }, [connName]); + const handleDisableWsh = React.useCallback(async () => { const metamaptype: unknown = { "conn:wshenabled": false, @@ -121,10 +211,24 @@ export const ConnStatusOverlay = React.memo( [showError, showWshError, connStatus.error, connStatus.wsherror] ); - if (!showWshError && (isLayoutMode || connStatus.status == "connected" || connModalOpen)) { + let showStalled = connStatus.status == "connected" && connStatus.connhealthstatus == "stalled"; + showStalled = true; + if (!showWshError && !showStalled && (isLayoutMode || connStatus.status == "connected" || connModalOpen)) { return null; } + if (showStalled && !showWshError) { + return ( + + ); + } + return (
diff --git a/frontend/types/gotypes.d.ts b/frontend/types/gotypes.d.ts index 5c55d55961..065ce0967c 100644 --- a/frontend/types/gotypes.d.ts +++ b/frontend/types/gotypes.d.ts @@ -795,6 +795,8 @@ declare global { wsherror?: string; nowshreason?: string; wshversion?: string; + lastactivitybeforerstalledtime?: number; + keepalivesenttime?: number; }; // wshrpc.CpuDataRequest diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index 8dc79b07a9..ec3785d89e 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -134,18 +134,26 @@ func GetNumSSHHasConnected() int { func (conn *SSHConn) DeriveConnStatus() wshrpc.ConnStatus { conn.lock.Lock() defer conn.lock.Unlock() + var lastActivityBeforeStalledTime int64 + var keepAliveSentTime int64 + if conn.ConnHealthStatus == ConnHealthStatus_Stalled && conn.Monitor != nil { + lastActivityBeforeStalledTime = conn.Monitor.LastActivityTime.Load() + keepAliveSentTime = conn.Monitor.KeepAliveSentTime.Load() + } return wshrpc.ConnStatus{ - Status: conn.Status, - Connected: conn.Status == Status_Connected, - Connection: conn.Opts.String(), - HasConnected: (conn.LastConnectTime > 0), - ActiveConnNum: conn.ActiveConnNum, - Error: conn.Error, - WshEnabled: conn.WshEnabled.Load(), - WshError: conn.WshError, - NoWshReason: conn.NoWshReason, - WshVersion: conn.WshVersion, - ConnHealthStatus: conn.ConnHealthStatus, + Status: conn.Status, + Connected: conn.Status == Status_Connected, + Connection: conn.Opts.String(), + HasConnected: (conn.LastConnectTime > 0), + ActiveConnNum: conn.ActiveConnNum, + Error: conn.Error, + WshEnabled: conn.WshEnabled.Load(), + WshError: conn.WshError, + NoWshReason: conn.NoWshReason, + WshVersion: conn.WshVersion, + ConnHealthStatus: conn.ConnHealthStatus, + LastActivityBeforeStalledTime: lastActivityBeforeStalledTime, + KeepAliveSentTime: keepAliveSentTime, } } diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go index cfe8e476de..a22c95d04f 100644 --- a/pkg/remote/conncontroller/connmonitor.go +++ b/pkg/remote/conncontroller/connmonitor.go @@ -18,7 +18,7 @@ type ConnMonitor struct { Conn *SSHConn LastActivityTime atomic.Int64 LastInputTime atomic.Int64 - KeepAliveSentTime int64 + KeepAliveSentTime atomic.Int64 KeepAliveInFlight bool ctx context.Context cancelFunc context.CancelFunc @@ -68,7 +68,7 @@ func (cm *ConnMonitor) setKeepAliveInFlight() bool { return false } cm.KeepAliveInFlight = true - cm.KeepAliveSentTime = time.Now().UnixMilli() + cm.KeepAliveSentTime.Store(time.Now().UnixMilli()) return true } @@ -86,7 +86,7 @@ func (cm *ConnMonitor) getTimeSinceKeepAlive() int64 { if !cm.KeepAliveInFlight { return 0 } - return time.Now().UnixMilli() - cm.KeepAliveSentTime + return time.Now().UnixMilli() - cm.KeepAliveSentTime.Load() } func (cm *ConnMonitor) SendKeepAlive() error { diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 4e667e6ecf..7292403d44 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -442,17 +442,19 @@ type ConnConfigRequest struct { } type ConnStatus struct { - Status string `json:"status"` - ConnHealthStatus string `json:"connhealthstatus,omitempty"` - WshEnabled bool `json:"wshenabled"` - Connection string `json:"connection"` - Connected bool `json:"connected"` - HasConnected bool `json:"hasconnected"` // true if it has *ever* connected successfully - ActiveConnNum int `json:"activeconnnum"` - Error string `json:"error,omitempty"` - WshError string `json:"wsherror,omitempty"` - NoWshReason string `json:"nowshreason,omitempty"` - WshVersion string `json:"wshversion,omitempty"` + Status string `json:"status"` + ConnHealthStatus string `json:"connhealthstatus,omitempty"` + WshEnabled bool `json:"wshenabled"` + Connection string `json:"connection"` + Connected bool `json:"connected"` + HasConnected bool `json:"hasconnected"` // true if it has *ever* connected successfully + ActiveConnNum int `json:"activeconnnum"` + Error string `json:"error,omitempty"` + WshError string `json:"wsherror,omitempty"` + NoWshReason string `json:"nowshreason,omitempty"` + WshVersion string `json:"wshversion,omitempty"` + LastActivityBeforeStalledTime int64 `json:"lastactivitybeforerstalledtime,omitempty"` + KeepAliveSentTime int64 `json:"keepalivesenttime,omitempty"` } type WebSelectorOpts struct { From 7f7f70e8224020ab71b22a4c2f9c6a01d7d8b703 Mon Sep 17 00:00:00 2001 From: sawka Date: Sun, 8 Feb 2026 19:45:41 -0800 Subject: [PATCH 07/17] add degraded status --- frontend/app/block/block.scss | 5 +++-- frontend/app/block/connectionbutton.tsx | 8 ++++++++ frontend/app/block/connstatusoverlay.tsx | 9 ++++++--- pkg/remote/conncontroller/conncontroller.go | 5 +++-- pkg/remote/conncontroller/connmonitor.go | 1 + 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/frontend/app/block/block.scss b/frontend/app/block/block.scss index 41c7d9835b..65806ba78d 100644 --- a/frontend/app/block/block.scss +++ b/frontend/app/block/block.scss @@ -284,8 +284,8 @@ .connstatus-overlay { position: absolute; top: calc(var(--header-height) + 6px); - left: 6px; - right: 6px; + left: 8px; + right: 8px; z-index: var(--zindex-block-mask-inner); display: flex; align-items: center; @@ -296,6 +296,7 @@ backdrop-filter: blur(50px); border-radius: 6px; box-shadow: 0px 13px 16px 0px rgb(from var(--block-bg-color) r g b / 40%); + opacity: 0.85; .connstatus-content { display: flex; diff --git a/frontend/app/block/connectionbutton.tsx b/frontend/app/block/connectionbutton.tsx index 1f74f00967..c0a37659cb 100644 --- a/frontend/app/block/connectionbutton.tsx +++ b/frontend/app/block/connectionbutton.tsx @@ -80,6 +80,14 @@ export const ConnectionButton = React.memo( color = "var(--grey-text-color)"; titleText = "Disconnected from " + connection; showDisconnectedSlash = true; + } else if (connStatus?.connhealthstatus === "degraded" || connStatus?.connhealthstatus === "stalled") { + color = "var(--warning-color)"; + iconName = "signal-bars-slash"; + if (connStatus.connhealthstatus === "degraded") { + titleText = "Connection degraded: " + connection; + } else { + titleText = "Connection stalled: " + connection; + } } if (iconSvg != null) { connIconElem = iconSvg; diff --git a/frontend/app/block/connstatusoverlay.tsx b/frontend/app/block/connstatusoverlay.tsx index 14ccd5a082..c0957f2b44 100644 --- a/frontend/app/block/connstatusoverlay.tsx +++ b/frontend/app/block/connstatusoverlay.tsx @@ -15,6 +15,10 @@ import { OverlayScrollbarsComponent } from "overlayscrollbars-react"; import * as React from "react"; function formatElapsedTime(elapsedMs: number): string { + if (elapsedMs <= 0) { + return ""; + } + const elapsedSeconds = Math.floor(elapsedMs / 1000); if (elapsedSeconds < 60) { @@ -62,7 +66,7 @@ const StalledOverlay = React.memo( const updateElapsed = () => { const now = Date.now(); - const lastActivity = connStatus.lastactivitybeforerstalledtime! * 1000; + const lastActivity = connStatus.lastactivitybeforerstalledtime!; const elapsed = now - lastActivity; setElapsedTime(formatElapsedTime(elapsed)); }; @@ -75,7 +79,7 @@ const StalledOverlay = React.memo( return (
@@ -212,7 +216,6 @@ export const ConnStatusOverlay = React.memo( ); let showStalled = connStatus.status == "connected" && connStatus.connhealthstatus == "stalled"; - showStalled = true; if (!showWshError && !showStalled && (isLayoutMode || connStatus.status == "connected" || connModalOpen)) { return null; } diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index ec3785d89e..63cfaa5d84 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -62,8 +62,9 @@ const ( ) const ( - ConnHealthStatus_Good = "good" - ConnHealthStatus_Stalled = "stalled" + ConnHealthStatus_Good = "good" + ConnHealthStatus_Degraded = "degraded" + ConnHealthStatus_Stalled = "stalled" ) const DefaultConnectionTimeout = 60 * time.Second diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go index a22c95d04f..39f7bead41 100644 --- a/pkg/remote/conncontroller/connmonitor.go +++ b/pkg/remote/conncontroller/connmonitor.go @@ -152,6 +152,7 @@ func (cm *ConnMonitor) keepAliveMonitor() { if cm.LastActivityTime.Load() >= inputTime { break } + cm.Conn.SetConnHealthStatus(ConnHealthStatus_Degraded) cm.checkConnection() case <-cm.ctx.Done(): return From 0221ed62b2558bcd6bb2a9872e6682fd862b67d8 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 10:58:57 -0800 Subject: [PATCH 08/17] simplify props to stalledoverlay --- frontend/app/block/connstatusoverlay.tsx | 28 ++++++++++++------------ frontend/tailwindsetup.css | 1 + 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/frontend/app/block/connstatusoverlay.tsx b/frontend/app/block/connstatusoverlay.tsx index c0957f2b44..9650a80f6a 100644 --- a/frontend/app/block/connstatusoverlay.tsx +++ b/frontend/app/block/connstatusoverlay.tsx @@ -47,18 +47,19 @@ const StalledOverlay = React.memo( ({ connName, connStatus, - reconClassName, - handleDisconnect, overlayRefCallback, }: { connName: string; connStatus: ConnStatus; - reconClassName: string; - handleDisconnect: () => void; overlayRefCallback: (el: HTMLDivElement | null) => void; }) => { const [elapsedTime, setElapsedTime] = React.useState(""); + const handleDisconnect = React.useCallback(() => { + const prtn = RpcApi.ConnDisconnectCommand(TabRpcClient, connName, { timeout: 5000 }); + prtn.catch((e) => console.log("error disconnecting", connName, e)); + }, [connName]); + React.useEffect(() => { if (!connStatus.lastactivitybeforerstalledtime) { return; @@ -92,9 +93,13 @@ const StalledOverlay = React.memo( {elapsedTime && ` (no activity for ${elapsedTime})`}
-
@@ -216,19 +221,14 @@ export const ConnStatusOverlay = React.memo( ); let showStalled = connStatus.status == "connected" && connStatus.connhealthstatus == "stalled"; + showStalled = true; if (!showWshError && !showStalled && (isLayoutMode || connStatus.status == "connected" || connModalOpen)) { return null; } if (showStalled && !showWshError) { return ( - + ); } diff --git a/frontend/tailwindsetup.css b/frontend/tailwindsetup.css index a41c476052..b9f1933825 100644 --- a/frontend/tailwindsetup.css +++ b/frontend/tailwindsetup.css @@ -67,6 +67,7 @@ --container-w600: 600px; --container-w450: 450px; + --container-w350: 350px; --container-xs: 300px; --container-xxs: 200px; --container-tiny: 120px; From 063308386f426edf3f1227ca556150c382cf5d1b Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 12:32:37 -0800 Subject: [PATCH 09/17] remove debugging code --- frontend/app/block/connstatusoverlay.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/frontend/app/block/connstatusoverlay.tsx b/frontend/app/block/connstatusoverlay.tsx index 9650a80f6a..71edf2c196 100644 --- a/frontend/app/block/connstatusoverlay.tsx +++ b/frontend/app/block/connstatusoverlay.tsx @@ -221,7 +221,6 @@ export const ConnStatusOverlay = React.memo( ); let showStalled = connStatus.status == "connected" && connStatus.connhealthstatus == "stalled"; - showStalled = true; if (!showWshError && !showStalled && (isLayoutMode || connStatus.status == "connected" || connModalOpen)) { return null; } From a9ab2fda3ad93ab8df3c3ca378cb89bb29d19a4c Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 12:36:08 -0800 Subject: [PATCH 10/17] fix json tag typo --- frontend/app/block/connstatusoverlay.tsx | 6 +++--- frontend/types/gotypes.d.ts | 2 +- pkg/wshrpc/wshrpctypes.go | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/frontend/app/block/connstatusoverlay.tsx b/frontend/app/block/connstatusoverlay.tsx index 71edf2c196..ce8b1cafba 100644 --- a/frontend/app/block/connstatusoverlay.tsx +++ b/frontend/app/block/connstatusoverlay.tsx @@ -61,13 +61,13 @@ const StalledOverlay = React.memo( }, [connName]); React.useEffect(() => { - if (!connStatus.lastactivitybeforerstalledtime) { + if (!connStatus.lastactivitybeforestalledtime) { return; } const updateElapsed = () => { const now = Date.now(); - const lastActivity = connStatus.lastactivitybeforerstalledtime!; + const lastActivity = connStatus.lastactivitybeforestalledtime!; const elapsed = now - lastActivity; setElapsedTime(formatElapsedTime(elapsed)); }; @@ -76,7 +76,7 @@ const StalledOverlay = React.memo( const interval = setInterval(updateElapsed, 1000); return () => clearInterval(interval); - }, [connStatus.lastactivitybeforerstalledtime]); + }, [connStatus.lastactivitybeforestalledtime]); return (
Date: Mon, 9 Feb 2026 12:36:19 -0800 Subject: [PATCH 11/17] use GetClient (sync) --- pkg/remote/conncontroller/connmonitor.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go index 39f7bead41..f85789929b 100644 --- a/pkg/remote/conncontroller/connmonitor.go +++ b/pkg/remote/conncontroller/connmonitor.go @@ -91,7 +91,8 @@ func (cm *ConnMonitor) getTimeSinceKeepAlive() int64 { func (cm *ConnMonitor) SendKeepAlive() error { conn := cm.Conn - if conn == nil || conn.Client == nil { + client := conn.GetClient() + if conn == nil || client == nil { return fmt.Errorf("no active connection") } if !cm.setKeepAliveInFlight() { @@ -102,7 +103,7 @@ func (cm *ConnMonitor) SendKeepAlive() error { panichandler.PanicHandler("conncontroller:SendKeepAlive", recover()) }() defer cm.clearKeepAliveInFlight() - _, _, _ = conn.Client.SendRequest("keepalive@openssh.com", true, nil) + _, _, _ = client.SendRequest("keepalive@openssh.com", true, nil) cm.UpdateLastActivityTime() }() return nil From c9ce4e86a3c84500e1f18c0f6d63d17d1807dad0 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 12:44:05 -0800 Subject: [PATCH 12/17] add a MaybeGetConn func, document that monitor will never be nil --- pkg/blockcontroller/blockcontroller.go | 7 +++++-- pkg/blockcontroller/durableshellcontroller.go | 2 +- pkg/blockcontroller/shellcontroller.go | 2 +- pkg/remote/conncontroller/conncontroller.go | 9 ++++++++- pkg/wshrpc/wshserver/wshserver.go | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/pkg/blockcontroller/blockcontroller.go b/pkg/blockcontroller/blockcontroller.go index 9004c500b7..ee4b601de8 100644 --- a/pkg/blockcontroller/blockcontroller.go +++ b/pkg/blockcontroller/blockcontroller.go @@ -300,7 +300,7 @@ func sendConnMonitorInputNotification(controller Controller) { if parseErr != nil { return } - sshConn := conncontroller.GetConn(connOpts) + sshConn := conncontroller.MaybeGetConn(connOpts) if sshConn != nil && sshConn.Monitor != nil { sshConn.Monitor.NotifyInput() } @@ -430,7 +430,10 @@ func CheckConnStatus(blockId string) error { if err != nil { return fmt.Errorf("error parsing connection name: %w", err) } - conn := conncontroller.GetConn(opts) + conn := conncontroller.MaybeGetConn(opts) + if conn == nil { + return fmt.Errorf("no connection found") + } connStatus := conn.DeriveConnStatus() if connStatus.Status != conncontroller.Status_Connected { return fmt.Errorf("not connected: %s", connStatus.Status) diff --git a/pkg/blockcontroller/durableshellcontroller.go b/pkg/blockcontroller/durableshellcontroller.go index 25ac22c9aa..a208a3df75 100644 --- a/pkg/blockcontroller/durableshellcontroller.go +++ b/pkg/blockcontroller/durableshellcontroller.go @@ -229,7 +229,7 @@ func (dsc *DurableShellController) startNewJob(ctx context.Context, blockMeta wa if err != nil { return "", fmt.Errorf("invalid ssh remote name (%s): %w", connName, err) } - conn := conncontroller.GetConn(opts) + conn := conncontroller.MaybeGetConn(opts) if conn == nil { return "", fmt.Errorf("connection %q not found", connName) } diff --git a/pkg/blockcontroller/shellcontroller.go b/pkg/blockcontroller/shellcontroller.go index b0c7081efc..a410225394 100644 --- a/pkg/blockcontroller/shellcontroller.go +++ b/pkg/blockcontroller/shellcontroller.go @@ -355,7 +355,7 @@ func (bc *ShellController) getConnUnion(logCtx context.Context, remoteName strin if err != nil { return ConnUnion{}, fmt.Errorf("invalid ssh remote name (%s): %w", remoteName, err) } - conn := conncontroller.GetConn(opts) + conn := conncontroller.MaybeGetConn(opts) if conn == nil { return ConnUnion{}, fmt.Errorf("ssh connection not found: %s", remoteName) } diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index 63cfaa5d84..43e25b9e18 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -91,7 +91,7 @@ type SSHConn struct { WshVersion string LastConnectTime int64 ActiveConnNum int - Monitor *ConnMonitor + Monitor *ConnMonitor // will not be nil } var ConnServerCmdTemplate = strings.TrimSpace( @@ -293,6 +293,7 @@ func (conn *SSHConn) OpenDomainSocketListener(ctx context.Context) error { conn.DomainSockListener = nil conn.DomainSockName = "" }) + // monitor will never be nil (set up in Make) wshutil.RunWshRpcOverListener(listener, conn.Monitor.UpdateLastActivityTime) }() return nil @@ -1050,6 +1051,12 @@ func GetConn(opts *remote.SSHOpts) *SSHConn { return conn } +// does NOT connect, can return nil +func MaybeGetConn(opts *remote.SSHOpts) *SSHConn { + conn := getConnInternal(opts, false) + return conn +} + func IsConnected(connName string) (bool, error) { if IsLocalConnName(connName) { return true, nil diff --git a/pkg/wshrpc/wshserver/wshserver.go b/pkg/wshrpc/wshserver/wshserver.go index 3ab4caaccf..04b711fa0b 100644 --- a/pkg/wshrpc/wshserver/wshserver.go +++ b/pkg/wshrpc/wshserver/wshserver.go @@ -595,7 +595,7 @@ func (ws *WshServer) ConnDisconnectCommand(ctx context.Context, connName string) if err != nil { return fmt.Errorf("error parsing connection name: %w", err) } - conn := conncontroller.GetConn(connOpts) + conn := conncontroller.MaybeGetConn(connOpts) if conn == nil { return fmt.Errorf("connection not found: %s", connName) } From b7c7b6f54e4f1e5cb239320cabcb6569f96ec662 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 12:48:14 -0800 Subject: [PATCH 13/17] remove origin mode reset (also resets cursor) --- pkg/util/shellutil/shellutil.go | 35 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/pkg/util/shellutil/shellutil.go b/pkg/util/shellutil/shellutil.go index 3f8ee505be..e6f6c21f38 100644 --- a/pkg/util/shellutil/shellutil.go +++ b/pkg/util/shellutil/shellutil.go @@ -620,24 +620,23 @@ func FixupWaveZshHistory() error { } func GetTerminalResetSeq() string { - resetSeq := "\x1b[0m" // reset attributes - resetSeq += "\x1b[?25h" // show cursor - resetSeq += "\x1b[?1l" // normal cursor keys - resetSeq += "\x1b[?6l" // origin mode off (DECOM) - resetSeq += "\x1b[?7h" // wraparound on - resetSeq += "\x1b[?45l" // reverse wraparound off - resetSeq += "\x1b[?66l" // application keypad off (DECNKM) - resetSeq += "\x1b[4l" // insert mode off (IRM) - resetSeq += "\x1b[?9l" // X10 mouse tracking off - resetSeq += "\x1b[?1000l" // disable Send Mouse X & Y on button press - resetSeq += "\x1b[?1002l" // disable Use Cell Motion Mouse Tracking - resetSeq += "\x1b[?1003l" // disable Use All Motion Mouse Tracking - resetSeq += "\x1b[?1004l" // disable Send FocusIn/FocusOut events - resetSeq += "\x1b[?1006l" // disable Enable SGR Mouse Mode - resetSeq += "\x1b[?1007l" // disable Enable Alternate Scroll Mode - resetSeq += "\x1b[?2004l" // disable bracketed paste mode - resetSeq += "\x1b[?2026l" // synchronized output off - resetSeq += FormatOSC(16162, "R") // disable alternate screen mode + resetSeq := "\x1b[0m" // reset attributes + resetSeq += "\x1b[?25h" // show cursor + resetSeq += "\x1b[?1l" // normal cursor keys + resetSeq += "\x1b[?7h" // wraparound on + resetSeq += "\x1b[?45l" // reverse wraparound off + resetSeq += "\x1b[?66l" // application keypad off (DECNKM) + resetSeq += "\x1b[4l" // insert mode off (IRM) + resetSeq += "\x1b[?9l" // X10 mouse tracking off + resetSeq += "\x1b[?1000l" // disable Send Mouse X & Y on button press + resetSeq += "\x1b[?1002l" // disable Use Cell Motion Mouse Tracking + resetSeq += "\x1b[?1003l" // disable Use All Motion Mouse Tracking + resetSeq += "\x1b[?1004l" // disable Send FocusIn/FocusOut events + resetSeq += "\x1b[?1006l" // disable Enable SGR Mouse Mode + resetSeq += "\x1b[?1007l" // disable Enable Alternate Scroll Mode + resetSeq += "\x1b[?2004l" // disable bracketed paste mode + resetSeq += "\x1b[?2026l" // synchronized output off + resetSeq += FormatOSC(16162, "R") // disable alternate screen mode return resetSeq } From fbd7379d1f1a429cf71c69327f7738124cb7397c Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 15:41:22 -0800 Subject: [PATCH 14/17] better connmonitor integration. create a new connmonitor whenever we connect ssh. --- pkg/blockcontroller/blockcontroller.go | 7 ++- pkg/remote/conncontroller/conncontroller.go | 52 +++++++++++++++++---- pkg/remote/conncontroller/connmonitor.go | 40 +++++++++++----- 3 files changed, 76 insertions(+), 23 deletions(-) diff --git a/pkg/blockcontroller/blockcontroller.go b/pkg/blockcontroller/blockcontroller.go index ee4b601de8..524a66c10b 100644 --- a/pkg/blockcontroller/blockcontroller.go +++ b/pkg/blockcontroller/blockcontroller.go @@ -301,8 +301,11 @@ func sendConnMonitorInputNotification(controller Controller) { return } sshConn := conncontroller.MaybeGetConn(connOpts) - if sshConn != nil && sshConn.Monitor != nil { - sshConn.Monitor.NotifyInput() + if sshConn != nil { + monitor := sshConn.GetMonitor() + if monitor != nil { + monitor.NotifyInput() + } } } diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go index 43e25b9e18..16a41809f2 100644 --- a/pkg/remote/conncontroller/conncontroller.go +++ b/pkg/remote/conncontroller/conncontroller.go @@ -137,9 +137,10 @@ func (conn *SSHConn) DeriveConnStatus() wshrpc.ConnStatus { defer conn.lock.Unlock() var lastActivityBeforeStalledTime int64 var keepAliveSentTime int64 - if conn.ConnHealthStatus == ConnHealthStatus_Stalled && conn.Monitor != nil { - lastActivityBeforeStalledTime = conn.Monitor.LastActivityTime.Load() - keepAliveSentTime = conn.Monitor.KeepAliveSentTime.Load() + monitor := conn.Monitor + if conn.ConnHealthStatus == ConnHealthStatus_Stalled && monitor != nil { + lastActivityBeforeStalledTime = monitor.LastActivityTime.Load() + keepAliveSentTime = monitor.KeepAliveSentTime.Load() } return wshrpc.ConnStatus{ Status: conn.Status, @@ -197,9 +198,14 @@ func (conn *SSHConn) Close() error { func (conn *SSHConn) closeInternal_withlifecyclelock() { // does not set status (that should happen at another level) - client := WithLockRtn(conn, func() *ssh.Client { - return conn.Client + conn.WithLock(func() { + if conn.Monitor != nil { + conn.Monitor.Close() + conn.Monitor = nil + } + conn.Monitor = nil }) + client := conn.GetClient() if client != nil { // this MUST go first to force close the connection. // the DomainSockListener.Close() sends SSH protocol packets which can block on a dead network conn @@ -293,8 +299,12 @@ func (conn *SSHConn) OpenDomainSocketListener(ctx context.Context) error { conn.DomainSockListener = nil conn.DomainSockName = "" }) - // monitor will never be nil (set up in Make) - wshutil.RunWshRpcOverListener(listener, conn.Monitor.UpdateLastActivityTime) + monitor := conn.GetMonitor() + var updateCallback func() + if monitor != nil { + updateCallback = monitor.UpdateLastActivityTime + } + wshutil.RunWshRpcOverListener(listener, updateCallback) }() return nil } @@ -545,7 +555,10 @@ func (conn *SSHConn) StartConnServer(ctx context.Context, afterUpdate bool, useR log.Printf("[conncontroller:%s:output] error: %v\n", conn.GetName(), output.Error) continue } - conn.Monitor.UpdateLastActivityTime() + monitor := conn.GetMonitor() + if monitor != nil { + monitor.UpdateLastActivityTime() + } line := output.Line if !strings.HasSuffix(line, "\n") { line += "\n" @@ -682,6 +695,12 @@ func (conn *SSHConn) GetClient() *ssh.Client { return conn.Client } +func (conn *SSHConn) GetMonitor() *ConnMonitor { + conn.lock.Lock() + defer conn.lock.Unlock() + return conn.Monitor +} + func (conn *SSHConn) WaitForConnect(ctx context.Context) error { for { status := conn.DeriveConnStatus() @@ -935,7 +954,13 @@ func (conn *SSHConn) connectInternal(ctx context.Context, connFlags *wconfig.Con return err } conn.WithLock(func() { + if conn.Monitor != nil { + conn.Monitor.Close() + conn.Monitor = nil + } conn.Client = client + conn.ConnHealthStatus = ConnHealthStatus_Good + conn.Monitor = MakeConnMonitor(conn, client) }) go func() { defer func() { @@ -973,6 +998,11 @@ func (conn *SSHConn) waitForDisconnect() { return } err := client.Wait() + if err != nil { + log.Printf("[conn:%s] client.Wait() returned error: %v", conn.GetName(), err) + } else { + log.Printf("[conn:%s] client.Wait() completed (clean disconnect)", conn.GetName()) + } conn.lifecycleLock.Lock() defer conn.lifecycleLock.Unlock() conn.WithLock(func() { @@ -1005,9 +1035,12 @@ func (conn *SSHConn) ClearWshError() { }) } -func (conn *SSHConn) SetConnHealthStatus(status string) { +func (conn *SSHConn) SetConnHealthStatus(client *ssh.Client, status string) { changed := false conn.WithLock(func() { + if conn.Client != client { + return + } if conn.ConnHealthStatus != status { conn.ConnHealthStatus = status changed = true @@ -1039,7 +1072,6 @@ func getConnInternal(opts *remote.SSHOpts, createIfNotExists bool) *SSHConn { WshEnabled: &atomic.Bool{}, Opts: opts, } - rtn.Monitor = MakeConnMonitor(rtn) clientControllerMap[*opts] = rtn } return rtn diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go index f85789929b..8ec18e2d04 100644 --- a/pkg/remote/conncontroller/connmonitor.go +++ b/pkg/remote/conncontroller/connmonitor.go @@ -5,17 +5,21 @@ package conncontroller import ( "context" - "fmt" "sync" "sync/atomic" "time" "github.com/wavetermdev/waveterm/pkg/panichandler" + "golang.org/x/crypto/ssh" ) +// Lock ordering: conn.lock > cm.lock (conn.lock is outer, cm.lock is inner) +// CRITICAL: Methods that hold cm.lock must NEVER call into SSHConn (deadlock - violates ordering). +// Methods called from SSHConn while conn.lock is held should avoid acquiring cm.lock (keep locking simple). type ConnMonitor struct { lock *sync.Mutex - Conn *SSHConn + Conn *SSHConn // always non-nil, set at creation + Client *ssh.Client // always non-nil, set at creation LastActivityTime atomic.Int64 LastInputTime atomic.Int64 KeepAliveSentTime atomic.Int64 @@ -25,11 +29,18 @@ type ConnMonitor struct { inputNotifyCh chan int64 } -func MakeConnMonitor(conn *SSHConn) *ConnMonitor { +func MakeConnMonitor(conn *SSHConn, client *ssh.Client) *ConnMonitor { + if conn == nil { + panic("conn cannot be nil") + } + if client == nil { + panic("client cannot be nil") + } ctx, cancelFunc := context.WithCancel(context.Background()) cm := &ConnMonitor{ lock: &sync.Mutex{}, Conn: conn, + Client: client, ctx: ctx, cancelFunc: cancelFunc, inputNotifyCh: make(chan int64, 1), @@ -38,9 +49,15 @@ func MakeConnMonitor(conn *SSHConn) *ConnMonitor { return cm } +// setConnHealthStatus calls into SSHConn.SetConnHealthStatus +// CRITICAL: cm.lock must NOT be held when calling this method (violates lock ordering) +func (cm *ConnMonitor) setConnHealthStatus(status string) { + cm.Conn.SetConnHealthStatus(cm.Client, status) +} + func (cm *ConnMonitor) UpdateLastActivityTime() { cm.LastActivityTime.Store(time.Now().UnixMilli()) - cm.Conn.SetConnHealthStatus(ConnHealthStatus_Good) + cm.setConnHealthStatus(ConnHealthStatus_Good) } func (cm *ConnMonitor) NotifyInput() { @@ -90,11 +107,7 @@ func (cm *ConnMonitor) getTimeSinceKeepAlive() int64 { } func (cm *ConnMonitor) SendKeepAlive() error { - conn := cm.Conn - client := conn.GetClient() - if conn == nil || client == nil { - return fmt.Errorf("no active connection") - } + client := cm.Client if !cm.setKeepAliveInFlight() { return nil } @@ -131,7 +144,7 @@ func (cm *ConnMonitor) checkConnection() { } timeSinceKeepAlive := cm.getTimeSinceKeepAlive() if timeSinceKeepAlive > stalledThreshold { - cm.Conn.SetConnHealthStatus(ConnHealthStatus_Stalled) + cm.setConnHealthStatus(ConnHealthStatus_Stalled) } } @@ -143,6 +156,11 @@ func (cm *ConnMonitor) keepAliveMonitor() { defer ticker.Stop() for { + // check if our client is still the active one + if cm.Conn.GetClient() != cm.Client { + return + } + select { case <-ticker.C: cm.checkConnection() @@ -153,7 +171,7 @@ func (cm *ConnMonitor) keepAliveMonitor() { if cm.LastActivityTime.Load() >= inputTime { break } - cm.Conn.SetConnHealthStatus(ConnHealthStatus_Degraded) + cm.setConnHealthStatus(ConnHealthStatus_Degraded) cm.checkConnection() case <-cm.ctx.Done(): return From ea0a69d8a0569f767fb560b4f83675007430e4f1 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 16:04:30 -0800 Subject: [PATCH 15/17] log the keepalive error for information purposes (and the duration) --- pkg/remote/conncontroller/connmonitor.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go index 8ec18e2d04..a9d703032c 100644 --- a/pkg/remote/conncontroller/connmonitor.go +++ b/pkg/remote/conncontroller/connmonitor.go @@ -5,6 +5,7 @@ package conncontroller import ( "context" + "log" "sync" "sync/atomic" "time" @@ -116,7 +117,12 @@ func (cm *ConnMonitor) SendKeepAlive() error { panichandler.PanicHandler("conncontroller:SendKeepAlive", recover()) }() defer cm.clearKeepAliveInFlight() - _, _, _ = client.SendRequest("keepalive@openssh.com", true, nil) + startTime := time.Now() + _, _, err := client.SendRequest("keepalive@openssh.com", true, nil) + duration := time.Since(startTime).Milliseconds() + if err != nil { + log.Printf("[conncontroller] conn:%s keepalive error (duration=%dms): %v", cm.Conn.GetName(), duration, err) + } cm.UpdateLastActivityTime() }() return nil From 386ce7a2c3227e8d381b1eb144868a37347814f5 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 16:07:18 -0800 Subject: [PATCH 16/17] remove dead code --- frontend/app/block/connstatusoverlay.tsx | 5 ----- 1 file changed, 5 deletions(-) diff --git a/frontend/app/block/connstatusoverlay.tsx b/frontend/app/block/connstatusoverlay.tsx index ce8b1cafba..6c90511d7d 100644 --- a/frontend/app/block/connstatusoverlay.tsx +++ b/frontend/app/block/connstatusoverlay.tsx @@ -146,11 +146,6 @@ export const ConnStatusOverlay = React.memo( prtn.catch((e) => console.log("error reconnecting", connName, e)); }, [connName, nodeModel.blockId]); - const handleDisconnect = React.useCallback(() => { - const prtn = RpcApi.ConnDisconnectCommand(TabRpcClient, connName, { timeout: 5000 }); - prtn.catch((e) => console.log("error disconnecting", connName, e)); - }, [connName]); - const handleDisableWsh = React.useCallback(async () => { const metamaptype: unknown = { "conn:wshenabled": false, From 1721308374c8466b514e183757b339c4e599e9b8 Mon Sep 17 00:00:00 2001 From: sawka Date: Mon, 9 Feb 2026 16:50:45 -0800 Subject: [PATCH 17/17] do not update lastactivitytime on sendrequest error --- pkg/remote/conncontroller/connmonitor.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/remote/conncontroller/connmonitor.go b/pkg/remote/conncontroller/connmonitor.go index a9d703032c..dc14561484 100644 --- a/pkg/remote/conncontroller/connmonitor.go +++ b/pkg/remote/conncontroller/connmonitor.go @@ -119,9 +119,11 @@ func (cm *ConnMonitor) SendKeepAlive() error { defer cm.clearKeepAliveInFlight() startTime := time.Now() _, _, err := client.SendRequest("keepalive@openssh.com", true, nil) - duration := time.Since(startTime).Milliseconds() if err != nil { + // errors are only returned for network and I/O issues (likely disconnection). do not update last activity time + duration := time.Since(startTime).Milliseconds() log.Printf("[conncontroller] conn:%s keepalive error (duration=%dms): %v", cm.Conn.GetName(), duration, err) + return } cm.UpdateLastActivityTime() }()