-
Notifications
You must be signed in to change notification settings - Fork 71
🐛 Add immediate fallback dialer to eliminate Happy Eyeballs delay #2464
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -149,6 +149,13 @@ func init() { | |
| utilruntime.Must(clientgoscheme.AddToScheme(scheme)) | ||
| utilruntime.Must(ocv1.AddToScheme(scheme)) | ||
| ctrl.SetLogger(klog.NewKlogr()) | ||
|
|
||
| // Configure global HTTP transport to use custom dialer for all HTTP clients | ||
| // including the containers/image library used for pulling from registries. | ||
| // The custom dialer tries addresses in DNS order without Happy Eyeballs' 300ms delay. | ||
| if err := httputil.ConfigureDefaultTransport(); err != nil { | ||
| setupLog.Error(err, "Failed to configure custom dialer") | ||
| } | ||
| } | ||
|
|
||
| func main() { | ||
|
|
@@ -274,7 +281,10 @@ func run(ctx context.Context) error { | |
| } | ||
|
|
||
| // Create manager | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add a feature gate for this one ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No. It's not feature-gate "worthy", since it's attempting to fix a bug. The behavior is mimicking existing behavior without any explicit default of IPv4 or IPv6. So, there's really no choice between the two. |
||
| mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ | ||
| restConfig := ctrl.GetConfigOrDie() | ||
| // Configure REST client to use custom dialer without Happy Eyeballs delay | ||
| restConfig.Dial = httputil.ImmediateFallbackDialContext | ||
| mgr, err := ctrl.NewManager(restConfig, ctrl.Options{ | ||
| Scheme: scheme, | ||
| Metrics: metricsServerOptions, | ||
| PprofBindAddress: cfg.pprofAddr, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,107 @@ | ||
| package http | ||
|
|
||
| import ( | ||
| "context" | ||
| "crypto/tls" | ||
| "fmt" | ||
| "net" | ||
| "net/http" | ||
| "time" | ||
|
|
||
| "k8s.io/klog/v2" | ||
| ) | ||
|
|
||
| // ImmediateFallbackDialContext creates a DialContext function that tries connection | ||
| // attempts sequentially in the order returned by DNS, without the 300ms Happy Eyeballs | ||
| // delay. This respects DNS server ordering while eliminating the racing delay. | ||
| // | ||
| // Go's standard Happy Eyeballs implementation (RFC 6555/8305) is in the net package: | ||
| // https://cs.opensource.google/go/go/+/refs/tags/go1.25.3:src/net/dial.go;l=525 (DialContext) | ||
| // https://cs.opensource.google/go/go/+/refs/tags/go1.25.3:src/net/dial.go;l=585 (dialParallel) | ||
| func ImmediateFallbackDialContext(ctx context.Context, network, address string) (net.Conn, error) { | ||
| // Split the address into host and port | ||
| host, port, err := net.SplitHostPort(address) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| klog.InfoS("Resolving DNS for connection", "host", host, "port", port, "network", network) | ||
|
|
||
| // Resolve all IP addresses for the host | ||
| ips, err := net.DefaultResolver.LookupIP(ctx, "ip", host) | ||
| if err != nil { | ||
| klog.ErrorS(err, "DNS resolution failed", "host", host) | ||
| return nil, err | ||
| } | ||
|
|
||
| if len(ips) == 0 { | ||
| err := fmt.Errorf("no IP addresses found for host %s", host) | ||
| klog.ErrorS(err, "DNS resolution returned no addresses", "host", host) | ||
| return nil, err | ||
| } | ||
|
|
||
| // Convert IPs to strings for logging | ||
| ipStrings := make([]string, 0, len(ips)) | ||
| for _, ip := range ips { | ||
| ipStrings = append(ipStrings, ip.String()) | ||
| } | ||
| klog.InfoS("DNS resolution complete", "host", host, "addressCount", len(ips), "addresses", ipStrings) | ||
|
|
||
| dialer := &net.Dialer{ | ||
| Timeout: 30 * time.Second, | ||
| KeepAlive: 30 * time.Second, | ||
| } | ||
|
|
||
| // Try each address sequentially in the order DNS returned them | ||
| var lastErr error | ||
| for i, ip := range ips { | ||
| // Determine address type and dial network | ||
| var addrType, dialNetwork string | ||
| if ip.To4() != nil { | ||
| addrType = "IPv4" | ||
| dialNetwork = network | ||
| if network == "tcp" { | ||
| dialNetwork = "tcp4" | ||
| } | ||
| } else { | ||
| addrType = "IPv6" | ||
| dialNetwork = network | ||
| if network == "tcp" { | ||
| dialNetwork = "tcp6" | ||
| } | ||
| } | ||
|
|
||
| target := net.JoinHostPort(ip.String(), port) | ||
| klog.InfoS("Attempting connection", "host", host, "type", addrType, | ||
| "address", ip.String(), "port", port, "attempt", i+1, "of", len(ips)) | ||
|
|
||
| conn, err := dialer.DialContext(ctx, dialNetwork, target) | ||
| if err == nil { | ||
| klog.InfoS("Successfully connected", "host", host, "type", addrType, | ||
| "address", ip.String(), "port", port) | ||
| return conn, nil | ||
| } | ||
| klog.ErrorS(err, "Connection failed", "host", host, "type", addrType, | ||
| "address", ip.String(), "port", port, "attempt", i+1, "of", len(ips)) | ||
| lastErr = err | ||
| } | ||
|
|
||
| klog.ErrorS(lastErr, "All connection attempts failed", "host", host, "totalAttempts", len(ips)) | ||
| return nil, lastErr | ||
| } | ||
|
|
||
| // ConfigureDefaultTransport configures http.DefaultTransport to use ImmediateFallbackDialContext. | ||
| // This affects all HTTP clients that use the default transport, including the containers/image | ||
| // library used for pulling from registries. Returns an error if DefaultTransport is not *http.Transport. | ||
| func ConfigureDefaultTransport() error { | ||
| transport, ok := http.DefaultTransport.(*http.Transport) | ||
| if !ok { | ||
| return fmt.Errorf("http.DefaultTransport is not *http.Transport, cannot configure custom dialer") | ||
| } | ||
| transport.DialContext = ImmediateFallbackDialContext | ||
| return nil | ||
| } | ||
|
|
||
| func BuildHTTPClient(cpw *CertPoolWatcher) (*http.Client, error) { | ||
| httpClient := &http.Client{Timeout: 10 * time.Second} | ||
|
|
||
|
|
@@ -14,13 +110,16 @@ func BuildHTTPClient(cpw *CertPoolWatcher) (*http.Client, error) { | |
| return nil, err | ||
| } | ||
|
|
||
| tlsConfig := &tls.Config{ | ||
| // Clone the default transport to inherit custom dialer and other defaults | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about something like bellow. By updating BuildHTTPClient to avoid assuming http.DefaultTransport is always *http.Transport. Right now it can panic or fail if something replaces the default transport. A simple fix is: try to clone http.DefaultTransport when it’s a *http.Transport, otherwise fall back to a new http.Transport with basic defaults. Then set TLSClientConfig on that single transport. This also seems to removes the duplicate tlsTransport := &http.Transport{...} block, so we don’t overwrite the cloned transport.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a lot more code, for not much benefit, and I'm concerned with us setting all those http.Transport{} values to something fixed. Better to indicate an error, IMHO, since it would be a coding error on our part if the original code does not work. |
||
| transport, ok := http.DefaultTransport.(*http.Transport) | ||
| if !ok { | ||
| return nil, fmt.Errorf("http.DefaultTransport is not *http.Transport, cannot build HTTP client") | ||
| } | ||
| tlsTransport := transport.Clone() | ||
| tlsTransport.TLSClientConfig = &tls.Config{ | ||
| RootCAs: pool, | ||
| MinVersion: tls.VersionTLS12, | ||
| } | ||
| tlsTransport := &http.Transport{ | ||
| TLSClientConfig: tlsConfig, | ||
| } | ||
| httpClient.Transport = tlsTransport | ||
|
|
||
| return httpClient, nil | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| package http | ||
|
|
||
| import ( | ||
| "context" | ||
| "net" | ||
| "testing" | ||
| ) | ||
|
|
||
| func TestImmediateFallbackDialContext(t *testing.T) { | ||
| tests := []struct { | ||
| name string | ||
| address string | ||
| wantFail bool | ||
| minExpectedAddrs int // minimum addresses we expect to find | ||
|
Comment on lines
+13
to
+14
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lets group expectations from the test input, so that we can read them better: want struct {
fail bool
minExpectedAddrs int
}
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure adding a new structure inside another structure is of much benefit, TBH. |
||
| }{ | ||
| { | ||
| name: "dual-stack hostname tries addresses in DNS order", | ||
| address: "localhost:80", | ||
| wantFail: true, // nothing listening on port 80 | ||
| minExpectedAddrs: 1, // should have at least one address | ||
| }, | ||
| { | ||
| name: "IPv4-only hostname", | ||
| address: "127.0.0.1:80", | ||
| wantFail: true, | ||
| minExpectedAddrs: 1, | ||
| }, | ||
| { | ||
| name: "IPv6-only hostname", | ||
| address: "[::1]:80", | ||
| wantFail: true, | ||
| minExpectedAddrs: 1, | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| ctx := context.Background() | ||
|
|
||
| // Parse the address to extract host for DNS lookup | ||
| host, _, err := net.SplitHostPort(tt.address) | ||
| if err != nil { | ||
| t.Fatalf("Failed to split host:port: %v", err) | ||
| } | ||
|
|
||
| // Look up IPs to verify DNS resolution works | ||
| ips, err := net.DefaultResolver.LookupIP(ctx, "ip", host) | ||
| if err != nil { | ||
| t.Skipf("DNS resolution failed for %s: %v (this is OK for test environments)", host, err) | ||
| } | ||
|
|
||
| if len(ips) < tt.minExpectedAddrs { | ||
| t.Skip("Not enough IP addresses found for hostname") | ||
| } | ||
|
|
||
| t.Logf("DNS returned %d address(es) - will try each in order:", len(ips)) | ||
|
|
||
| // Log all addresses for debugging | ||
| for i, ip := range ips { | ||
| ipType := "IPv6" | ||
| if ip.To4() != nil { | ||
| ipType = "IPv4" | ||
| } | ||
| t.Logf(" [%d] %s (%s)", i, ip.String(), ipType) | ||
| } | ||
|
|
||
| // Actually call the dialer function | ||
| _, err = ImmediateFallbackDialContext(ctx, "tcp", tt.address) | ||
|
|
||
| if tt.wantFail { | ||
| if err == nil { | ||
| t.Errorf("Expected connection to fail, but it succeeded") | ||
| } else { | ||
| t.Logf("Connection failed as expected: %v", err) | ||
| } | ||
| } else { | ||
| if err != nil { | ||
| t.Errorf("Expected connection to succeed, but got error: %v", err) | ||
| } | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in what cases we are failing the tests? |
||
| }) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about we wrapper this one to avoid pacnic?
Something like
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Again, a panic indicates that we did something unexpected (wrong) that caused this to happen, so we want the panic to occur. See my other comment:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I moved it to a common function.