From 65efd5d34b44eb4e9873a0d4d76c35bb3357751b Mon Sep 17 00:00:00 2001
From: Todd Short
Date: Thu, 29 Jan 2026 11:02:32 -0500
Subject: [PATCH] Add immediate fallback dialer to eliminate Happy Eyeballs delay

Implements ImmediateFallbackDialContext that removes the 300ms delay from
Go's Happy Eyeballs algorithm by trying addresses sequentially in the
order returned by DNS, without racing or artificial delays. It does so by
disabling fast fallback on net.Dialer (negative FallbackDelay) rather than
re-implementing the dial loop, so IPv6 zones, tcp4/tcp6 restrictions,
context cancellation and the dial timeout keep their standard behavior.

This respects DNS server address ordering (which already optimizes for
the local network environment) while eliminating the delay that causes
IPv6 "network is unreachable" failures in dual-stack environments where
IPv6 has internal-only routing.

All network clients (HTTP, Kubernetes REST, image pulls) now use the
immediate fallback dialer.

Signed-off-by: Todd Short
Co-Authored-By: Claude Sonnet 4.5
---
 cmd/catalogd/main.go                       |  12 ++-
 cmd/operator-controller/main.go            |   9 ++
 internal/shared/util/http/httputil.go      |  71 +++++++++++++-
 internal/shared/util/http/httputil_test.go | 106 +++++++++++++++++++++
 4 files changed, 193 insertions(+), 5 deletions(-)
 create mode 100644 internal/shared/util/http/httputil_test.go

diff --git a/cmd/catalogd/main.go b/cmd/catalogd/main.go
index af2463e2c6..e96d9a619f 100644
--- a/cmd/catalogd/main.go
+++ b/cmd/catalogd/main.go
@@ -149,6 +149,13 @@ func init() {
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
 	utilruntime.Must(ocv1.AddToScheme(scheme))
 	ctrl.SetLogger(klog.NewKlogr())
+
+	// Configure global HTTP transport to use custom dialer for all HTTP clients
+	// including the containers/image library used for pulling from registries.
+	// The custom dialer tries addresses in DNS order without Happy Eyeballs' 300ms delay.
+	if err := httputil.ConfigureDefaultTransport(); err != nil {
+		setupLog.Error(err, "Failed to configure custom dialer")
+	}
 }
 
 func main() {
@@ -274,7 +281,10 @@ func run(ctx context.Context) error {
 	}
 
 	// Create manager
-	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
+	restConfig := ctrl.GetConfigOrDie()
+	// Configure REST client to use custom dialer without Happy Eyeballs delay
+	restConfig.Dial = httputil.ImmediateFallbackDialContext
+	mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
 		Scheme:                        scheme,
 		Metrics:                       metricsServerOptions,
 		PprofBindAddress:              cfg.pprofAddr,
diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go
index dd0bc98f6d..9d55883e62 100644
--- a/cmd/operator-controller/main.go
+++ b/cmd/operator-controller/main.go
@@ -198,6 +198,13 @@ func init() {
 	tlsprofiles.AddFlags(flags)
 
 	ctrl.SetLogger(klog.NewKlogr())
+
+	// Configure global HTTP transport to use custom dialer for all HTTP clients
+	// including the containers/image library used for pulling from registries.
+	// The custom dialer tries addresses in DNS order without Happy Eyeballs' 300ms delay.
+	if err := httputil.ConfigureDefaultTransport(); err != nil {
+		setupLog.Error(err, "Failed to configure custom dialer")
+	}
 }
 func validateMetricsFlags() error {
 	if (cfg.certFile != "" && cfg.keyFile == "") || (cfg.certFile == "" && cfg.keyFile != "") {
@@ -325,6 +332,8 @@ func run() error {
 	}
 
 	restConfig := ctrl.GetConfigOrDie()
+	// Configure REST client to use custom dialer without Happy Eyeballs delay
+	restConfig.Dial = httputil.ImmediateFallbackDialContext
 	mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
 		Scheme:                        scheme.Scheme,
 		Metrics:                       metricsServerOptions,
diff --git a/internal/shared/util/http/httputil.go b/internal/shared/util/http/httputil.go
index f5a982d2de..39288ed5c4 100644
--- a/internal/shared/util/http/httputil.go
+++ b/internal/shared/util/http/httputil.go
@@ -1,11 +1,71 @@
 package http
 
 import (
+	"context"
 	"crypto/tls"
+	"fmt"
+	"net"
 	"net/http"
+	"syscall"
 	"time"
+
+	"k8s.io/klog/v2"
 )
 
+// logDialAttempt is a net.Dialer Control hook that traces every address the
+// dialer is about to connect to. It never rejects a connection.
+func logDialAttempt(network, address string, _ syscall.RawConn) error {
+	klog.V(4).InfoS("Attempting connection", "network", network, "address", address)
+	return nil
+}
+
+// ImmediateFallbackDialContext is a DialContext function that tries the resolved
+// addresses of a host one after another, in resolver order, without the 300ms
+// Happy Eyeballs (RFC 6555/8305) fallback delay that Go applies by default.
+//
+// It relies on net.Dialer itself rather than a hand-written loop: a negative
+// FallbackDelay disables Fast Fallback, so the dialer walks the address list
+// sequentially and moves on as soon as an attempt fails. Delegating to the
+// standard dialer also keeps its handling of IPv6 zones, of the "tcp4"/"tcp6"
+// network families, of context cancellation, and of the dial timeout, which the
+// net package documents as possibly being divided between the addresses instead
+// of applying to each one in full.
+//
+// Go's standard Happy Eyeballs implementation is in the net package:
+// https://cs.opensource.google/go/go/+/refs/tags/go1.25.3:src/net/dial.go;l=525 (DialContext)
+// https://cs.opensource.google/go/go/+/refs/tags/go1.25.3:src/net/dial.go;l=585 (dialParallel)
+func ImmediateFallbackDialContext(ctx context.Context, network, address string) (net.Conn, error) {
+	// A negative FallbackDelay disables Happy Eyeballs fast fallback.
+	dialer := &net.Dialer{
+		Timeout:       30 * time.Second,
+		KeepAlive:     30 * time.Second,
+		FallbackDelay: -1,
+		Control:       logDialAttempt,
+	}
+
+	conn, err := dialer.DialContext(ctx, network, address)
+	if err != nil {
+		// Returned unwrapped so callers can still inspect it as a net.Error.
+		klog.V(2).InfoS("Connection failed", "network", network, "address", address, "err", err)
+		return nil, err
+	}
+	klog.V(4).InfoS("Successfully connected", "network", network, "address", address,
+		"remoteAddress", conn.RemoteAddr().String())
+	return conn, nil
+}
+
+// ConfigureDefaultTransport configures http.DefaultTransport to use ImmediateFallbackDialContext.
+// This affects all HTTP clients that use the default transport, including the containers/image
+// library used for pulling from registries. Returns an error if DefaultTransport is not *http.Transport.
+func ConfigureDefaultTransport() error {
+	transport, ok := http.DefaultTransport.(*http.Transport)
+	if !ok {
+		return fmt.Errorf("http.DefaultTransport is not *http.Transport, cannot configure custom dialer")
+	}
+	transport.DialContext = ImmediateFallbackDialContext
+	return nil
+}
+
 func BuildHTTPClient(cpw *CertPoolWatcher) (*http.Client, error) {
 	httpClient := &http.Client{Timeout: 10 * time.Second}
 
@@ -14,13 +74,16 @@ func BuildHTTPClient(cpw *CertPoolWatcher) (*http.Client, error) {
 		return nil, err
 	}
 
-	tlsConfig := &tls.Config{
+	// Clone the default transport to inherit custom dialer and other defaults
+	transport, ok := http.DefaultTransport.(*http.Transport)
+	if !ok {
+		return nil, fmt.Errorf("http.DefaultTransport is not *http.Transport, cannot build HTTP client")
+	}
+	tlsTransport := transport.Clone()
+	tlsTransport.TLSClientConfig = &tls.Config{
 		RootCAs:    pool,
 		MinVersion: tls.VersionTLS12,
 	}
-	tlsTransport := &http.Transport{
-		TLSClientConfig: tlsConfig,
-	}
 	httpClient.Transport = tlsTransport
 
 	return httpClient, nil
diff --git a/internal/shared/util/http/httputil_test.go b/internal/shared/util/http/httputil_test.go
new file mode 100644
index 0000000000..4d3bc8b41b
--- /dev/null
+++ b/internal/shared/util/http/httputil_test.go
@@ -0,0 +1,106 @@
+package http
+
+import (
+	"context"
+	"net"
+	"testing"
+	"time"
+)
+
+// listenLoopback opens a TCP listener on an ephemeral IPv4 loopback port and
+// returns it together with that port. The test is skipped when the environment
+// does not allow listening on 127.0.0.1.
+func listenLoopback(t *testing.T) (net.Listener, string) {
+	t.Helper()
+
+	ln, err := net.Listen("tcp4", "127.0.0.1:0")
+	if err != nil {
+		t.Skipf("cannot listen on IPv4 loopback: %v", err)
+	}
+	_, port, err := net.SplitHostPort(ln.Addr().String())
+	if err != nil {
+		ln.Close()
+		t.Fatalf("splitting listener address %q: %v", ln.Addr(), err)
+	}
+	return ln, port
+}
+
+// resolvesToIPv4Loopback reports whether host resolves to 127.0.0.1.
+func resolvesToIPv4Loopback(ctx context.Context, host string) bool {
+	addrs, err := net.DefaultResolver.LookupIPAddr(ctx, host)
+	if err != nil {
+		return false
+	}
+	for _, addr := range addrs {
+		if addr.IP.Equal(net.IPv4(127, 0, 0, 1)) {
+			return true
+		}
+	}
+	return false
+}
+
+func TestImmediateFallbackDialContext(t *testing.T) {
+	ln, port := listenLoopback(t)
+	defer ln.Close()
+
+	tests := []struct {
+		name    string
+		network string
+		host    string
+	}{
+		{name: "IPv4 literal", network: "tcp", host: "127.0.0.1"},
+		{name: "IPv4 literal restricted to tcp4", network: "tcp4", host: "127.0.0.1"},
+		// The listener is bound to 127.0.0.1 only, so when localhost also
+		// resolves to ::1 the dialer has to fall back to reach it.
+		{name: "hostname reaches the listening address", network: "tcp", host: "localhost"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+			defer cancel()
+
+			if !resolvesToIPv4Loopback(ctx, tt.host) {
+				t.Skipf("%s does not resolve to 127.0.0.1 in this environment", tt.host)
+			}
+
+			conn, err := ImmediateFallbackDialContext(ctx, tt.network, net.JoinHostPort(tt.host, port))
+			if err != nil {
+				t.Fatalf("expected connection to succeed, got error: %v", err)
+			}
+			conn.Close()
+		})
+	}
+}
+
+func TestImmediateFallbackDialContextErrors(t *testing.T) {
+	ln, port := listenLoopback(t)
+	// Close the listener right away so that its port refuses connections.
+	ln.Close()
+
+	tests := []struct {
+		name      string
+		address   string
+		cancelled bool
+	}{
+		{name: "nothing listening", address: net.JoinHostPort("127.0.0.1", port)},
+		{name: "address without port", address: "127.0.0.1"},
+		{name: "context already cancelled", address: net.JoinHostPort("127.0.0.1", port), cancelled: true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+			defer cancel()
+			if tt.cancelled {
+				cancel()
+			}
+
+			conn, err := ImmediateFallbackDialContext(ctx, "tcp", tt.address)
+			if err == nil {
+				conn.Close()
+				t.Fatalf("expected dialing %q to fail, but it succeeded", tt.address)
+			}
+		})
+	}
+}