diff --git a/packages/kernel-errors/src/index.test.ts b/packages/kernel-errors/src/index.test.ts index be7911cd7..ff651779e 100644 --- a/packages/kernel-errors/src/index.test.ts +++ b/packages/kernel-errors/src/index.test.ts @@ -20,6 +20,7 @@ describe('index', () => { 'VatAlreadyExistsError', 'VatDeletedError', 'VatNotFoundError', + 'getNetworkErrorCode', 'isMarshaledError', 'isMarshaledOcapError', 'isOcapError', diff --git a/packages/kernel-errors/src/index.ts b/packages/kernel-errors/src/index.ts index 4f27db9eb..73e8d3374 100644 --- a/packages/kernel-errors/src/index.ts +++ b/packages/kernel-errors/src/index.ts @@ -27,4 +27,5 @@ export { unmarshalError } from './marshal/unmarshalError.ts'; export { isMarshaledError } from './marshal/isMarshaledError.ts'; export { isMarshaledOcapError } from './marshal/isMarshaledOcapError.ts'; export { isRetryableNetworkError } from './utils/isRetryableNetworkError.ts'; +export { getNetworkErrorCode } from './utils/getNetworkErrorCode.ts'; export { isResourceLimitError } from './utils/isResourceLimitError.ts'; diff --git a/packages/kernel-errors/src/utils/getNetworkErrorCode.test.ts b/packages/kernel-errors/src/utils/getNetworkErrorCode.test.ts new file mode 100644 index 000000000..1fe562f08 --- /dev/null +++ b/packages/kernel-errors/src/utils/getNetworkErrorCode.test.ts @@ -0,0 +1,88 @@ +import { describe, it, expect } from 'vitest'; + +import { getNetworkErrorCode } from './getNetworkErrorCode.ts'; + +describe('getNetworkErrorCode', () => { + describe('Node.js network error codes', () => { + it.each([ + { code: 'ECONNRESET' }, + { code: 'ETIMEDOUT' }, + { code: 'EPIPE' }, + { code: 'ECONNREFUSED' }, + { code: 'EHOSTUNREACH' }, + { code: 'ENETUNREACH' }, + { code: 'ENOTFOUND' }, + ])('returns $code from error with code property', ({ code }) => { + const error = new Error('Network error') as Error & { code: string }; + error.code = code; + expect(getNetworkErrorCode(error)).toBe(code); + }); + }); + + describe('libp2p and other named errors', () => { + it.each([ + { name: 'DialError' }, + { name: 'TransportError' }, + { name: 'MuxerClosedError' }, + { name: 'WebRTCDialError' }, + ])('returns $name from error with name property', ({ name }) => { + const error = new Error('Connection failed'); + error.name = name; + expect(getNetworkErrorCode(error)).toBe(name); + }); + + it('prefers code over name when both are present', () => { + const error = Object.assign(new Error('Network error'), { + code: 'ECONNREFUSED', + name: 'DialError', + }); + expect(getNetworkErrorCode(error)).toBe('ECONNREFUSED'); + }); + }); + + describe('relay reservation errors', () => { + it('returns name for Error with NO_RESERVATION in message (name takes precedence)', () => { + const error = new Error( + 'failed to connect via relay with status NO_RESERVATION', + ); + // name ('Error') takes precedence over message parsing + expect(getNetworkErrorCode(error)).toBe('Error'); + }); + + it('returns NO_RESERVATION when error has empty name', () => { + const error = Object.assign( + new Error('failed to connect via relay with status NO_RESERVATION'), + { name: '' }, + ); + expect(getNetworkErrorCode(error)).toBe('NO_RESERVATION'); + }); + + it('returns name when both name and NO_RESERVATION message are present', () => { + const error = Object.assign( + new Error('failed to connect via relay with status NO_RESERVATION'), + { name: 'InvalidMessageError' }, + ); + // name takes precedence over message parsing + expect(getNetworkErrorCode(error)).toBe('InvalidMessageError'); + }); + }); + + 
describe('unknown errors', () => { + it.each([ + { name: 'null', value: null }, + { name: 'undefined', value: undefined }, + { name: 'string', value: 'error string' }, + { name: 'number', value: 42 }, + { name: 'plain object', value: { message: 'error' } }, + { name: 'empty error name and code', value: { name: '', code: '' } }, + ])('returns UNKNOWN for $name', ({ value }) => { + expect(getNetworkErrorCode(value)).toBe('UNKNOWN'); + }); + + it('returns the error name for a generic Error with no code', () => { + const error = new Error('Generic error'); + // Generic Error has name 'Error', so it returns 'Error' + expect(getNetworkErrorCode(error)).toBe('Error'); + }); + }); +}); diff --git a/packages/kernel-errors/src/utils/getNetworkErrorCode.ts b/packages/kernel-errors/src/utils/getNetworkErrorCode.ts new file mode 100644 index 000000000..1fe562f08 --- /dev/null +++ b/packages/kernel-errors/src/utils/getNetworkErrorCode.ts @@ -0,0 +1,36 @@ +/** + * Extract a network error code from an error object. + * + * Returns error codes like 'ECONNREFUSED', 'ETIMEDOUT', etc., or + * the error name for libp2p errors, or 'UNKNOWN' for unrecognized errors. + * + * @param error - The error to extract the code from. + * @returns The error code string. + */ +export function getNetworkErrorCode(error: unknown): string { + const anyError = error as { + code?: string; + name?: string; + message?: string; + }; + + // Node.js network error codes (ECONNREFUSED, ETIMEDOUT, etc.) + if (typeof anyError?.code === 'string' && anyError.code.length > 0) { + return anyError.code; + } + + // libp2p errors and other named errors + if (typeof anyError?.name === 'string' && anyError.name.length > 0) { + return anyError.name; + } + + // Check message for relay reservation errors + if ( + typeof anyError?.message === 'string' && + anyError.message.includes('NO_RESERVATION') + ) { + return 'NO_RESERVATION'; + } + + return 'UNKNOWN'; +} diff --git a/packages/kernel-errors/src/utils/isRetryableNetworkError.test.ts b/packages/kernel-errors/src/utils/isRetryableNetworkError.test.ts index 69272f9ee..235ba6d26 100644 --- a/packages/kernel-errors/src/utils/isRetryableNetworkError.test.ts +++ b/packages/kernel-errors/src/utils/isRetryableNetworkError.test.ts @@ -34,6 +34,7 @@ describe('isRetryableNetworkError', () => { { code: 'ECONNREFUSED', description: 'connection refused' }, { code: 'EHOSTUNREACH', description: 'no route to host' }, { code: 'ENETUNREACH', description: 'network unreachable' }, + { code: 'ENOTFOUND', description: 'DNS lookup failed' }, ])('returns true for $code ($description)', ({ code }) => { const error = new Error('Network error') as Error & { code: string }; error.code = code; diff --git a/packages/kernel-errors/src/utils/isRetryableNetworkError.ts b/packages/kernel-errors/src/utils/isRetryableNetworkError.ts index 26926a7a3..336b2bf3c 100644 --- a/packages/kernel-errors/src/utils/isRetryableNetworkError.ts +++ b/packages/kernel-errors/src/utils/isRetryableNetworkError.ts @@ -21,6 +21,9 @@ export function isRetryableNetworkError(error: unknown): boolean { } // Node.js network error codes + // Note: ENOTFOUND (DNS lookup failed) is included to allow permanent failure + // detection to work - after multiple consecutive ENOTFOUND errors, the peer + // will be marked as permanently failed rather than giving up immediately. 
const code = anyError?.code; if ( code === 'ECONNRESET' || @@ -28,7 +31,8 @@ export function isRetryableNetworkError(error: unknown): boolean { code === 'EPIPE' || code === 'ECONNREFUSED' || code === 'EHOSTUNREACH' || - code === 'ENETUNREACH' + code === 'ENETUNREACH' || + code === 'ENOTFOUND' ) { return true; } diff --git a/packages/ocap-kernel/src/remotes/platform/reconnection-lifecycle.test.ts b/packages/ocap-kernel/src/remotes/platform/reconnection-lifecycle.test.ts index 3653992ee..60286a770 100644 --- a/packages/ocap-kernel/src/remotes/platform/reconnection-lifecycle.test.ts +++ b/packages/ocap-kernel/src/remotes/platform/reconnection-lifecycle.test.ts @@ -17,7 +17,7 @@ vi.mock('@metamask/kernel-utils', async () => { }; }); -// Mock kernel-errors for isRetryableNetworkError +// Mock kernel-errors for isRetryableNetworkError and getNetworkErrorCode vi.mock('@metamask/kernel-errors', async () => { const actual = await vi.importActual( '@metamask/kernel-errors', ); return { ...actual, isRetryableNetworkError: vi.fn(), + getNetworkErrorCode: vi.fn().mockReturnValue('ECONNREFUSED'), }; }); @@ -75,9 +76,11 @@ describe('reconnection-lifecycle', () => { incrementAttempt: vi.fn().mockReturnValue(1), decrementAttempt: vi.fn(), calculateBackoff: vi.fn().mockReturnValue(100), - startReconnection: vi.fn(), + startReconnection: vi.fn().mockReturnValue(true), stopReconnection: vi.fn(), resetBackoff: vi.fn(), + isPermanentlyFailed: vi.fn().mockReturnValue(false), + recordError: vi.fn(), }, maxRetryAttempts: 3, onRemoteGiveUp: vi.fn(), @@ -569,5 +572,147 @@ describe('reconnection-lifecycle', () => { // stopReconnection should be called on success expect(deps.reconnectionManager.stopReconnection).toHaveBeenCalled(); }); + + describe('permanent failure detection', () => { + it('gives up when peer is permanently failed at start of loop', async () => { + ( + deps.reconnectionManager.isPermanentlyFailed as ReturnType< + typeof vi.fn + > + ).mockReturnValue(true); + + const lifecycle = makeReconnectionLifecycle(deps); + + await lifecycle.attemptReconnection('peer1'); + + expect(deps.reconnectionManager.stopReconnection).toHaveBeenCalledWith( + 'peer1', + ); + expect(deps.onRemoteGiveUp).toHaveBeenCalledWith('peer1'); + expect(deps.logger.log).toHaveBeenCalledWith( + expect.stringContaining('permanently failed'), + ); + }); + + it('records error after failed dial attempt', async () => { + const error = new Error('Connection refused'); + (error as Error & { code: string }).code = 'ECONNREFUSED'; + (deps.dialPeer as ReturnType<typeof vi.fn>).mockRejectedValueOnce( + error, + ); + ( + kernelErrors.isRetryableNetworkError as ReturnType<typeof vi.fn> + ).mockReturnValue(true); + (deps.reconnectionManager.isReconnecting as ReturnType<typeof vi.fn>) + .mockReturnValueOnce(true) + .mockReturnValueOnce(false); + + const lifecycle = makeReconnectionLifecycle(deps); + + await lifecycle.attemptReconnection('peer1'); + + expect(deps.reconnectionManager.recordError).toHaveBeenCalledWith( + 'peer1', + 'ECONNREFUSED', + ); + }); + + it('gives up when error triggers permanent failure', async () => { + const error = new Error('Connection refused'); + (deps.dialPeer as ReturnType<typeof vi.fn>).mockRejectedValueOnce( + error, + ); + ( + kernelErrors.isRetryableNetworkError as ReturnType<typeof vi.fn> + ).mockReturnValue(true); + ( + deps.reconnectionManager.isPermanentlyFailed as ReturnType< + typeof vi.fn + > + ) + .mockReturnValueOnce(false) // At start of loop + .mockReturnValueOnce(true); // After recording error + + const 
lifecycle = makeReconnectionLifecycle(deps); + + await lifecycle.attemptReconnection('peer1'); + + expect(deps.reconnectionManager.stopReconnection).toHaveBeenCalledWith( + 'peer1', + ); + expect(deps.onRemoteGiveUp).toHaveBeenCalledWith('peer1'); + expect(deps.outputError).toHaveBeenCalledWith( + 'peer1', + expect.stringContaining('permanent failure detected'), + expect.any(Error), + ); + }); + + it('continues retrying when error does not trigger permanent failure', async () => { + (deps.dialPeer as ReturnType<typeof vi.fn>) + .mockRejectedValueOnce(new Error('Temporary failure')) + .mockResolvedValueOnce(mockChannel); + ( + kernelErrors.isRetryableNetworkError as ReturnType<typeof vi.fn> + ).mockReturnValue(true); + ( + deps.reconnectionManager.isPermanentlyFailed as ReturnType< + typeof vi.fn + > + ).mockReturnValue(false); + (deps.reconnectionManager.isReconnecting as ReturnType<typeof vi.fn>) + .mockReturnValueOnce(true) + .mockReturnValueOnce(true) + .mockReturnValueOnce(false); + + const lifecycle = makeReconnectionLifecycle(deps); + + await lifecycle.attemptReconnection('peer1'); + + expect(deps.dialPeer).toHaveBeenCalledTimes(2); + expect(deps.reconnectionManager.recordError).toHaveBeenCalled(); + }); + }); + }); + + describe('handleConnectionLoss with permanent failure', () => { + it('skips reconnection and calls onRemoteGiveUp for permanently failed peer', () => { + ( + deps.reconnectionManager.isReconnecting as ReturnType<typeof vi.fn> + ).mockReturnValue(false); + ( + deps.reconnectionManager.startReconnection as ReturnType<typeof vi.fn> + ).mockReturnValue(false); + + const lifecycle = makeReconnectionLifecycle(deps); + + lifecycle.handleConnectionLoss('peer1'); + + expect(deps.reconnectionManager.startReconnection).toHaveBeenCalledWith( + 'peer1', + ); + expect(deps.onRemoteGiveUp).toHaveBeenCalledWith('peer1'); + expect(deps.logger.log).toHaveBeenCalledWith( + expect.stringContaining('permanently failed'), + ); + }); + + it('proceeds with reconnection when startReconnection returns true', () => { + ( + deps.reconnectionManager.isReconnecting as ReturnType<typeof vi.fn> + ).mockReturnValue(false); + ( + deps.reconnectionManager.startReconnection as ReturnType<typeof vi.fn> + ).mockReturnValue(true); + + const lifecycle = makeReconnectionLifecycle(deps); + + lifecycle.handleConnectionLoss('peer1'); + + expect(deps.reconnectionManager.startReconnection).toHaveBeenCalledWith( + 'peer1', + ); + expect(deps.onRemoteGiveUp).not.toHaveBeenCalled(); + }); }); }); diff --git a/packages/ocap-kernel/src/remotes/platform/reconnection-lifecycle.ts b/packages/ocap-kernel/src/remotes/platform/reconnection-lifecycle.ts index a780c51f9..1940af2ea 100644 --- a/packages/ocap-kernel/src/remotes/platform/reconnection-lifecycle.ts +++ b/packages/ocap-kernel/src/remotes/platform/reconnection-lifecycle.ts @@ -1,5 +1,6 @@ import { isRetryableNetworkError, + getNetworkErrorCode, isResourceLimitError, } from '@metamask/kernel-errors'; import { @@ -86,6 +87,16 @@ export function makeReconnectionLifecycle( const state = peerStateManager.getState(peerId); while (reconnectionManager.isReconnecting(peerId) && !signal.aborted) { + // Check for permanent failure before attempting + if (reconnectionManager.isPermanentlyFailed(peerId)) { + logger.log( + `${peerId}:: permanently failed due to error pattern, giving up`, + ); + reconnectionManager.stopReconnection(peerId); + onRemoteGiveUp?.(peerId); + return; + } + if (!reconnectionManager.shouldRetry(peerId, maxAttempts)) { logger.log( `${peerId}:: max reconnection attempts (${maxAttempts}) reached, giving up`, ); @@ -148,6 +159,23 @@ export function 
makeReconnectionLifecycle( ); continue; } + + // Record the error for pattern analysis + const errorCode = getNetworkErrorCode(problem); + reconnectionManager.recordError(peerId, errorCode); + + // Check if this error triggered permanent failure + if (reconnectionManager.isPermanentlyFailed(peerId)) { + outputError( + peerId, + `permanent failure detected (${errorCode})`, + problem, + ); + reconnectionManager.stopReconnection(peerId); + onRemoteGiveUp?.(peerId); + return; + } + if (!isRetryableNetworkError(problem)) { outputError(peerId, `non-retryable failure`, problem); reconnectionManager.stopReconnection(peerId); @@ -209,7 +237,7 @@ export function makeReconnectionLifecycle( /** * Handle connection loss for a given peer ID. - * Skips reconnection if the peer was intentionally closed. + * Skips reconnection if the peer was intentionally closed or permanently failed. * * @param peerId - The peer ID to handle the connection loss for. */ @@ -226,7 +254,15 @@ export function makeReconnectionLifecycle( state.channel = undefined; if (!reconnectionManager.isReconnecting(peerId)) { - reconnectionManager.startReconnection(peerId); + const started = reconnectionManager.startReconnection(peerId); + if (!started) { + // Peer is permanently failed, don't attempt reconnection + logger.log( + `${peerId}:: peer is permanently failed, skipping reconnection`, + ); + onRemoteGiveUp?.(peerId); + return; + } attemptReconnection(peerId).catch((problem) => { outputError(peerId, 'reconnection error', problem); reconnectionManager.stopReconnection(peerId); diff --git a/packages/ocap-kernel/src/remotes/platform/reconnection.test.ts b/packages/ocap-kernel/src/remotes/platform/reconnection.test.ts index 7ca47e634..678827cde 100644 --- a/packages/ocap-kernel/src/remotes/platform/reconnection.test.ts +++ b/packages/ocap-kernel/src/remotes/platform/reconnection.test.ts @@ -1,7 +1,11 @@ import * as kernelUtils from '@metamask/kernel-utils'; import { describe, it, expect, beforeEach, vi } from 'vitest'; -import { ReconnectionManager } from './reconnection.ts'; +import { + ReconnectionManager, + DEFAULT_CONSECUTIVE_ERROR_THRESHOLD, + PERMANENT_FAILURE_ERROR_CODES, +} from './reconnection.ts'; // Mock the calculateReconnectionBackoff function vi.mock('@metamask/kernel-utils', async () => { @@ -201,6 +205,37 @@ describe('ReconnectionManager', () => { expect(manager.getAttemptCount('peer1')).toBe(0); expect(manager.getAttemptCount('peer2')).toBe(1); }); + + it('clears error history to prevent false permanent failures', () => { + // Accumulate some errors + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + expect(manager.getErrorHistory('peer1')).toHaveLength(3); + + // Successful communication - should clear error history + manager.resetBackoff('peer1'); + + expect(manager.getErrorHistory('peer1')).toStrictEqual([]); + }); + + it('prevents stale errors from triggering permanent failure after success', () => { + // Accumulate 4 errors (one short of threshold) + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + + // Successful communication + manager.resetBackoff('peer1'); + + // One more error should NOT trigger permanent failure + // because the history was cleared + manager.recordError('peer1', 'ECONNREFUSED'); + + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + 
expect(manager.getErrorHistory('peer1')).toHaveLength(1); + }); }); describe('calculateBackoff', () => { @@ -351,6 +386,43 @@ describe('ReconnectionManager', () => { it('handles empty state', () => { expect(() => manager.resetAllBackoffs()).not.toThrow(); }); + + it('clears error history for reconnecting peers after wake from sleep', () => { + // Set up peer with errors during reconnection + manager.startReconnection('peer1'); + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + + // peer2 not reconnecting, should not be affected + manager.recordError('peer2', 'ECONNREFUSED'); + manager.recordError('peer2', 'ECONNREFUSED'); + + // Simulate wake from sleep + manager.resetAllBackoffs(); + + // Reconnecting peer's error history should be cleared + expect(manager.getErrorHistory('peer1')).toStrictEqual([]); + // Non-reconnecting peer's error history should remain + expect(manager.getErrorHistory('peer2')).toHaveLength(2); + }); + + it('prevents stale errors from triggering permanent failure after wake', () => { + manager.startReconnection('peer1'); + // Accumulate 4 errors before sleep + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + + // Wake from sleep + manager.resetAllBackoffs(); + + // One more error should NOT trigger permanent failure + manager.recordError('peer1', 'ECONNREFUSED'); + + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + }); }); describe('clear', () => { @@ -551,4 +623,269 @@ describe('ReconnectionManager', () => { expect(manager.isReconnecting(peerId)).toBe(true); }); }); + + describe('error tracking', () => { + describe('recordError', () => { + it('records errors in history', () => { + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ETIMEDOUT'); + + const history = manager.getErrorHistory('peer1'); + expect(history).toHaveLength(2); + expect(history[0]?.code).toBe('ECONNREFUSED'); + expect(history[1]?.code).toBe('ETIMEDOUT'); + }); + + it('records timestamps for each error', () => { + const beforeTime = Date.now(); + manager.recordError('peer1', 'ECONNREFUSED'); + const afterTime = Date.now(); + + const history = manager.getErrorHistory('peer1'); + expect(history[0]?.timestamp).toBeGreaterThanOrEqual(beforeTime); + expect(history[0]?.timestamp).toBeLessThanOrEqual(afterTime); + }); + + it('tracks errors per peer independently', () => { + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer2', 'ETIMEDOUT'); + + expect(manager.getErrorHistory('peer1')).toHaveLength(1); + expect(manager.getErrorHistory('peer2')).toHaveLength(1); + expect(manager.getErrorHistory('peer1')[0]?.code).toBe('ECONNREFUSED'); + expect(manager.getErrorHistory('peer2')[0]?.code).toBe('ETIMEDOUT'); + }); + }); + + describe('getErrorHistory', () => { + it('returns empty array for new peer', () => { + expect(manager.getErrorHistory('newpeer')).toStrictEqual([]); + }); + + it('returns readonly array', () => { + manager.recordError('peer1', 'ECONNREFUSED'); + const history = manager.getErrorHistory('peer1'); + expect(Array.isArray(history)).toBe(true); + }); + }); + }); + + describe('permanent failure detection', () => { + describe('isPermanentlyFailed', () => { + it('returns false for new peer', () => { + expect(manager.isPermanentlyFailed('newpeer')).toBe(false); + }); + + it('returns false when not enough 
errors recorded', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD - 1; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + } + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + }); + + it('returns true after threshold consecutive identical permanent failure errors', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + } + expect(manager.isPermanentlyFailed('peer1')).toBe(true); + }); + + it.each([...PERMANENT_FAILURE_ERROR_CODES])( + 'detects permanent failure for %s error code', + (errorCode) => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', errorCode); + } + expect(manager.isPermanentlyFailed('peer1')).toBe(true); + }, + ); + + it('does not mark as permanently failed for non-permanent error codes', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ETIMEDOUT'); + } + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + }); + + it('does not mark as permanently failed for mixed error codes', () => { + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ETIMEDOUT'); // breaks the streak + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer1', 'ECONNREFUSED'); + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + }); + + it('detects permanent failure when streak completes at end of history', () => { + // Mix of errors followed by consecutive permanent failure errors + manager.recordError('peer1', 'ETIMEDOUT'); + manager.recordError('peer1', 'EPIPE'); + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + } + expect(manager.isPermanentlyFailed('peer1')).toBe(true); + }); + }); + + describe('clearPermanentFailure', () => { + it('clears permanent failure status', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + } + expect(manager.isPermanentlyFailed('peer1')).toBe(true); + + manager.clearPermanentFailure('peer1'); + + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + }); + + it('clears error history', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + } + expect(manager.getErrorHistory('peer1').length).toBeGreaterThan(0); + + manager.clearPermanentFailure('peer1'); + + expect(manager.getErrorHistory('peer1')).toStrictEqual([]); + }); + + it('allows reconnection after clearing', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + } + expect(manager.startReconnection('peer1')).toBe(false); + + manager.clearPermanentFailure('peer1'); + + expect(manager.startReconnection('peer1')).toBe(true); + }); + + it('works on non-failed peer', () => { + expect(() => manager.clearPermanentFailure('peer1')).not.toThrow(); + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + }); + }); + + describe('startReconnection with permanent failure', () => { + it('returns false for permanently failed peer', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + } + + const result = manager.startReconnection('peer1'); + + expect(result).toBe(false); + expect(manager.isReconnecting('peer1')).toBe(false); + }); + + 
it('returns true for non-failed peer', () => { + const result = manager.startReconnection('peer1'); + expect(result).toBe(true); + }); + + it('resets error history when starting new reconnection session', () => { + manager.recordError('peer1', 'ETIMEDOUT'); + manager.recordError('peer1', 'EPIPE'); + expect(manager.getErrorHistory('peer1')).toHaveLength(2); + + manager.startReconnection('peer1'); + + expect(manager.getErrorHistory('peer1')).toStrictEqual([]); + }); + }); + + describe('custom threshold', () => { + it('respects custom consecutive error threshold', () => { + const customManager = new ReconnectionManager({ + consecutiveErrorThreshold: 3, + }); + + // Not enough errors + customManager.recordError('peer1', 'ECONNREFUSED'); + customManager.recordError('peer1', 'ECONNREFUSED'); + expect(customManager.isPermanentlyFailed('peer1')).toBe(false); + + // Exactly at threshold + customManager.recordError('peer1', 'ECONNREFUSED'); + expect(customManager.isPermanentlyFailed('peer1')).toBe(true); + }); + + it('throws if consecutiveErrorThreshold is less than 1', () => { + expect( + () => new ReconnectionManager({ consecutiveErrorThreshold: 0 }), + ).toThrow('consecutiveErrorThreshold must be at least 1'); + expect( + () => new ReconnectionManager({ consecutiveErrorThreshold: -1 }), + ).toThrow('consecutiveErrorThreshold must be at least 1'); + }); + }); + + describe('error history capping', () => { + it('caps error history to consecutive error threshold', () => { + const customManager = new ReconnectionManager({ + consecutiveErrorThreshold: 3, + }); + + // Record more errors than threshold + customManager.recordError('peer1', 'ERROR1'); + customManager.recordError('peer1', 'ERROR2'); + customManager.recordError('peer1', 'ERROR3'); + customManager.recordError('peer1', 'ERROR4'); + customManager.recordError('peer1', 'ERROR5'); + + const history = customManager.getErrorHistory('peer1'); + expect(history).toHaveLength(3); + expect(history[0]?.code).toBe('ERROR3'); + expect(history[1]?.code).toBe('ERROR4'); + expect(history[2]?.code).toBe('ERROR5'); + }); + + it('maintains correct permanent failure detection with capped history', () => { + const customManager = new ReconnectionManager({ + consecutiveErrorThreshold: 3, + }); + + // Record mixed errors that get capped + customManager.recordError('peer1', 'ETIMEDOUT'); + customManager.recordError('peer1', 'EPIPE'); + customManager.recordError('peer1', 'ECONNREFUSED'); + customManager.recordError('peer1', 'ECONNREFUSED'); + customManager.recordError('peer1', 'ECONNREFUSED'); + + // Last 3 are all ECONNREFUSED, should be permanently failed + expect(customManager.isPermanentlyFailed('peer1')).toBe(true); + }); + }); + + describe('clearPeer with permanent failure', () => { + it('clears permanent failure status when clearing peer', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + } + expect(manager.isPermanentlyFailed('peer1')).toBe(true); + + manager.clearPeer('peer1'); + + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + expect(manager.getErrorHistory('peer1')).toStrictEqual([]); + }); + }); + + describe('clear with permanent failure', () => { + it('clears all permanent failure states', () => { + for (let i = 0; i < DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; i += 1) { + manager.recordError('peer1', 'ECONNREFUSED'); + manager.recordError('peer2', 'EHOSTUNREACH'); + } + expect(manager.isPermanentlyFailed('peer1')).toBe(true); + 
expect(manager.isPermanentlyFailed('peer2')).toBe(true); + + manager.clear(); + + expect(manager.isPermanentlyFailed('peer1')).toBe(false); + expect(manager.isPermanentlyFailed('peer2')).toBe(false); + }); + }); + }); }); diff --git a/packages/ocap-kernel/src/remotes/platform/reconnection.ts b/packages/ocap-kernel/src/remotes/platform/reconnection.ts index 8393cd6ab..4ff364e6d 100644 --- a/packages/ocap-kernel/src/remotes/platform/reconnection.ts +++ b/packages/ocap-kernel/src/remotes/platform/reconnection.ts @@ -1,17 +1,59 @@ import { calculateReconnectionBackoff } from '@metamask/kernel-utils'; +/** + * Default threshold for consecutive identical errors before marking as permanently failed. + */ +export const DEFAULT_CONSECUTIVE_ERROR_THRESHOLD = 5; + +/** + * Error codes that can indicate permanent failure when occurring consecutively. + * These are network errors that suggest the peer is unreachable at the given address. + */ +export const PERMANENT_FAILURE_ERROR_CODES = new Set([ + 'ECONNREFUSED', // Connection refused - peer not listening + 'EHOSTUNREACH', // No route to host - network path unavailable + 'ENOTFOUND', // DNS lookup failed - hostname doesn't resolve + 'ENETUNREACH', // Network unreachable +]); + +export type ErrorRecord = { + code: string; + timestamp: number; +}; + export type ReconnectionState = { isReconnecting: boolean; attemptCount: number; // completed attempts + errorHistory: ErrorRecord[]; + permanentlyFailed: boolean; }; /** * Reconnection state management for remote communications. - * Handles reconnection attempts, backoff calculations, and retry logic. + * Handles reconnection attempts, backoff calculations, retry logic, + * and permanent failure detection based on error patterns. */ export class ReconnectionManager { readonly #states = new Map(); + readonly #consecutiveErrorThreshold: number; + + /** + * Creates a new ReconnectionManager. + * + * @param options - Configuration options. + * @param options.consecutiveErrorThreshold - Number of consecutive identical errors + * before marking a peer as permanently failed. Default is 5. Must be at least 1. + */ + constructor(options?: { consecutiveErrorThreshold?: number }) { + const threshold = + options?.consecutiveErrorThreshold ?? DEFAULT_CONSECUTIVE_ERROR_THRESHOLD; + if (threshold < 1) { + throw new Error('consecutiveErrorThreshold must be at least 1'); + } + this.#consecutiveErrorThreshold = threshold; + } + /** * Get or create reconnection state for a peer. * @@ -21,7 +63,12 @@ export class ReconnectionManager { #getState(peerId: string): ReconnectionState { let state = this.#states.get(peerId); if (!state) { - state = { isReconnecting: false, attemptCount: 0 }; + state = { + isReconnecting: false, + attemptCount: 0, + errorHistory: [], + permanentlyFailed: false, + }; this.#states.set(peerId, state); } return state; @@ -29,18 +76,27 @@ export class ReconnectionManager { /** * Start reconnection for a peer. - * Resets attempt count when starting a new reconnection session. + * Resets attempt count and error history when starting a new reconnection session. * * @param peerId - The peer ID to start reconnection for. + * @returns False if the peer is permanently failed and reconnection should not proceed. 
*/ - startReconnection(peerId: string): void { + startReconnection(peerId: string): boolean { const state = this.#getState(peerId); - // Reset attempt count when starting a new reconnection session + + // Don't start reconnection for permanently failed peers + if (state.permanentlyFailed) { + return false; + } + + // Reset attempt count and error history when starting a new reconnection session // This allows retries after max attempts were previously exhausted if (!state.isReconnecting) { state.attemptCount = 0; + state.errorHistory = []; } state.isReconnecting = true; + return true; } /** @@ -90,12 +146,16 @@ export class ReconnectionManager { } /** - * Reset the backoff counter for a peer. + * Reset the backoff counter and error history for a peer. + * Called on successful communication to indicate the connection is working. + * Clears error history to prevent stale errors from triggering false permanent failures. * * @param peerId - The peer ID to reset backoff for. */ resetBackoff(peerId: string): void { - this.#getState(peerId).attemptCount = 0; + const state = this.#getState(peerId); + state.attemptCount = 0; + state.errorHistory = []; } /** @@ -135,12 +195,15 @@ export class ReconnectionManager { } /** - * Reset all backoffs (e.g., after wake from sleep). + * Reset all backoffs and error histories (e.g., after wake from sleep). + * Clears error histories because network conditions have changed and old errors + * are no longer relevant for permanent failure detection. */ resetAllBackoffs(): void { for (const state of this.#states.values()) { if (state.isReconnecting) { state.attemptCount = 0; + state.errorHistory = []; } } } @@ -160,4 +223,97 @@ export class ReconnectionManager { clearPeer(peerId: string): void { this.#states.delete(peerId); } + + /** + * Record an error that occurred during reconnection. + * This updates the error history and checks for permanent failure patterns. + * Error history is capped at the consecutive error threshold to prevent unbounded growth. + * + * @param peerId - The peer ID that experienced the error. + * @param errorCode - The error code (e.g., 'ECONNREFUSED', 'ETIMEDOUT'). + */ + recordError(peerId: string, errorCode: string): void { + const state = this.#getState(peerId); + state.errorHistory.push({ + code: errorCode, + timestamp: Date.now(), + }); + + // Cap error history to prevent unbounded memory growth + // We only need the last N errors for pattern detection + if (state.errorHistory.length > this.#consecutiveErrorThreshold) { + state.errorHistory = state.errorHistory.slice( + -this.#consecutiveErrorThreshold, + ); + } + + // Check for permanent failure pattern + this.#checkPermanentFailure(peerId); + } + + /** + * Check if a peer has been marked as permanently failed. + * + * @param peerId - The peer ID to check. + * @returns True if the peer is permanently failed. + */ + isPermanentlyFailed(peerId: string): boolean { + return this.#getState(peerId).permanentlyFailed; + } + + /** + * Clear the permanent failure status for a peer. + * Call this when manually requesting reconnection to a previously failed peer. + * + * @param peerId - The peer ID to clear permanent failure for. + */ + clearPermanentFailure(peerId: string): void { + const state = this.#getState(peerId); + state.permanentlyFailed = false; + state.errorHistory = []; + } + + /** + * Get the error history for a peer. + * + * @param peerId - The peer ID to get error history for. + * @returns The error history array. 
+ */ + getErrorHistory(peerId: string): readonly ErrorRecord[] { + return this.#getState(peerId).errorHistory; + } + + /** + * Check if recent errors indicate permanent failure and update state accordingly. + * + * Permanent failure is detected when: + * - The last N errors (where N = consecutiveErrorThreshold) have the same error code + * - AND that error code is in the PERMANENT_FAILURE_ERROR_CODES set + * + * @param peerId - The peer ID to check. + */ + #checkPermanentFailure(peerId: string): void { + const state = this.#getState(peerId); + const { errorHistory } = state; + + if (errorHistory.length < this.#consecutiveErrorThreshold) { + return; + } + + // Get the last N errors + const recentErrors = errorHistory.slice(-this.#consecutiveErrorThreshold); + const firstCode = recentErrors[0]?.code; + + if (!firstCode) { + return; + } + + // Check if all recent errors have the same code + const allSameCode = recentErrors.every((error) => error.code === firstCode); + + // Check if this error code indicates permanent failure + if (allSameCode && PERMANENT_FAILURE_ERROR_CODES.has(firstCode)) { + state.permanentlyFailed = true; + } + } } diff --git a/packages/ocap-kernel/src/remotes/platform/transport.test.ts b/packages/ocap-kernel/src/remotes/platform/transport.test.ts index a96724427..5ceb3bc97 100644 --- a/packages/ocap-kernel/src/remotes/platform/transport.test.ts +++ b/packages/ocap-kernel/src/remotes/platform/transport.test.ts @@ -16,7 +16,7 @@ let initTransport: typeof import('./transport.ts').initTransport; // Mock ReconnectionManager const mockReconnectionManager = { isReconnecting: vi.fn().mockReturnValue(false), - startReconnection: vi.fn(), + startReconnection: vi.fn().mockReturnValue(true), stopReconnection: vi.fn(), shouldRetry: vi.fn().mockReturnValue(true), incrementAttempt: vi.fn().mockReturnValue(1), @@ -26,6 +26,9 @@ const mockReconnectionManager = { resetAllBackoffs: vi.fn(), clear: vi.fn(), clearPeer: vi.fn(), + isPermanentlyFailed: vi.fn().mockReturnValue(false), + recordError: vi.fn(), + clearPermanentFailure: vi.fn(), }; vi.mock('./reconnection.ts', () => { @@ -51,6 +54,12 @@ vi.mock('./reconnection.ts', () => { clear = mockReconnectionManager.clear; clearPeer = mockReconnectionManager.clearPeer; + + isPermanentlyFailed = mockReconnectionManager.isPermanentlyFailed; + + recordError = mockReconnectionManager.recordError; + + clearPermanentFailure = mockReconnectionManager.clearPermanentFailure; } return { ReconnectionManager: MockReconnectionManager, @@ -128,6 +137,10 @@ vi.mock('@metamask/kernel-errors', () => ({ errorWithCode?.code === 'ETIMEDOUT' ); }), + getNetworkErrorCode: vi.fn().mockImplementation((error: unknown) => { + const errorWithCode = error as { code?: string; name?: string }; + return errorWithCode?.code ?? errorWithCode?.name ?? 
'UNKNOWN'; + }), isResourceLimitError: vi.fn().mockReturnValue(false), })); @@ -172,6 +185,9 @@ describe('transport.initTransport', () => { mockReconnectionManager.resetAllBackoffs.mockClear(); mockReconnectionManager.clear.mockClear(); mockReconnectionManager.clearPeer.mockClear(); + mockReconnectionManager.isPermanentlyFailed.mockClear(); + mockReconnectionManager.recordError.mockClear(); + mockReconnectionManager.clearPermanentFailure.mockClear(); mockConnectionFactory.dialIdempotent.mockReset(); mockConnectionFactory.onInboundConnection.mockClear(); @@ -603,6 +619,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; @@ -963,6 +980,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; @@ -1011,6 +1029,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; @@ -1062,6 +1081,53 @@ describe('transport.initTransport', () => { expect(mockReconnectionManager.startReconnection).not.toHaveBeenCalled(); }); + it('does not clear error history when reconnection is already in progress', async () => { + const mockChannel = createMockChannel('peer-1'); + mockConnectionFactory.dialIdempotent.mockResolvedValue(mockChannel); + + const { closeConnection, reconnectPeer } = await initTransport( + '0x1234', + {}, + vi.fn(), + ); + + await closeConnection('peer-1'); + + // Set up reconnection state - already reconnecting + mockReconnectionManager.isReconnecting.mockReturnValue(true); + + await reconnectPeer('peer-1'); + + // Should not clear permanent failure status (which also clears error history) + // when reconnection is already in progress + expect( + mockReconnectionManager.clearPermanentFailure, + ).not.toHaveBeenCalled(); + }); + + it('clears permanent failure status when manually reconnecting', async () => { + const mockChannel = createMockChannel('peer-1'); + mockConnectionFactory.dialIdempotent.mockResolvedValue(mockChannel); + + const { closeConnection, reconnectPeer } = await initTransport( + '0x1234', + {}, + vi.fn(), + ); + + await closeConnection('peer-1'); + await reconnectPeer('peer-1'); + + // Should clear permanent failure status before attempting reconnection + expect( + mockReconnectionManager.clearPermanentFailure, + ).toHaveBeenCalledWith('peer-1'); + // Then start reconnection + expect(mockReconnectionManager.startReconnection).toHaveBeenCalledWith( + 'peer-1', + ); + }); + it('allows sending messages after reconnection', async () => { const mockChannel = createMockChannel('peer-1'); mockConnectionFactory.dialIdempotent.mockResolvedValue(mockChannel); @@ -1169,6 +1235,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; @@ -1260,6 +1327,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); 
mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; @@ -1305,6 +1373,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); // First call should return true (to enter loop), then false when checking after flush failure mockReconnectionManager.shouldRetry .mockReturnValueOnce(true) @@ -1360,6 +1429,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); mockReconnectionManager.shouldRetry .mockReturnValueOnce(true) @@ -1412,6 +1482,7 @@ describe('transport.initTransport', () => { mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; attemptCount = 0; // Reset on start + return true; }); mockReconnectionManager.incrementAttempt.mockImplementation(() => { attemptCount += 1; @@ -1430,6 +1501,8 @@ describe('transport.initTransport', () => { mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; }); + mockReconnectionManager.isPermanentlyFailed.mockReturnValue(false); + const { abortableDelay } = await import('@metamask/kernel-utils'); (abortableDelay as ReturnType<typeof vi.fn>).mockResolvedValue(undefined); @@ -1477,6 +1550,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); mockReconnectionManager.shouldRetry.mockReturnValue(true); mockReconnectionManager.incrementAttempt.mockReturnValue(1); mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; }); + mockReconnectionManager.isPermanentlyFailed.mockReturnValue(false); const { abortableDelay } = await import('@metamask/kernel-utils'); (abortableDelay as ReturnType<typeof vi.fn>).mockResolvedValue(undefined); @@ -1568,6 +1643,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; @@ -1622,6 +1698,7 @@ describe('transport.initTransport', () => { ); mockReconnectionManager.startReconnection.mockImplementation(() => { reconnecting = true; + return true; }); mockReconnectionManager.stopReconnection.mockImplementation(() => { reconnecting = false; diff --git a/packages/ocap-kernel/src/remotes/platform/transport.ts b/packages/ocap-kernel/src/remotes/platform/transport.ts index e57a062ed..4fc72dc69 100644 --- a/packages/ocap-kernel/src/remotes/platform/transport.ts +++ b/packages/ocap-kernel/src/remotes/platform/transport.ts @@ -506,8 +506,9 @@ export async function initTransport( } /** - * Manually reconnect to a peer after intentional close. - * Clears the intentional close flag and initiates reconnection. + * Manually reconnect to a peer after intentional close or permanent failure. + * Clears the intentional close flag and permanent failure status, + * then initiates reconnection. * * @param peerId - The peer ID to reconnect to. * @param hints - The hints to use for the reconnection. 
@@ -516,12 +517,17 @@ export async function initTransport( peerId: string, hints: string[] = [], ): Promise<void> { - logger.log(`${peerId}:: manually reconnecting after intentional close`); + logger.log(`${peerId}:: manually reconnecting`); peerStateManager.clearIntentionallyClosed(peerId); - // If already reconnecting, don't start another attempt + // If already reconnecting, don't start another attempt and don't clear error history + // Clearing error history while reconnection is in progress would reset progress + // toward permanent failure detection if (reconnectionManager.isReconnecting(peerId)) { return; } + // Clear permanent failure status - user explicitly wants to reconnect + // Only clear when not already reconnecting to preserve error tracking during active attempts + reconnectionManager.clearPermanentFailure(peerId); registerLocationHints(peerId, hints); handleConnectionLoss(peerId); }
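
Reviewer note: a minimal usage sketch (not part of the patch) showing how the new pieces fit together. The wrapper `trackDialFailure` and its `dial` parameter are hypothetical; `getNetworkErrorCode`, `ReconnectionManager`, and the methods called on it are the ones added or changed above, and the relative import path assumes code living alongside reconnection.ts.

import { getNetworkErrorCode } from '@metamask/kernel-errors';
import { ReconnectionManager } from './reconnection.ts';

const manager = new ReconnectionManager({ consecutiveErrorThreshold: 5 });

/**
 * Hypothetical helper: dial a peer once and feed the outcome into the
 * permanent-failure tracking added in this change.
 */
async function trackDialFailure(
  peerId: string,
  dial: () => Promise<void>,
): Promise<void> {
  // startReconnection returns false when the peer is already marked permanently
  // failed; a manual retry would call manager.clearPermanentFailure(peerId) first.
  if (!manager.startReconnection(peerId)) {
    return;
  }
  try {
    await dial();
    // Success resets the attempt count and clears the error history.
    manager.resetBackoff(peerId);
  } catch (problem) {
    // Normalize the failure to a code such as 'ECONNREFUSED' or a libp2p error name.
    manager.recordError(peerId, getNetworkErrorCode(problem));
    // Threshold-many consecutive identical codes from PERMANENT_FAILURE_ERROR_CODES
    // flip the peer to permanently failed.
    if (manager.isPermanentlyFailed(peerId)) {
      manager.stopReconnection(peerId);
    }
  }
}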