diff --git a/src/context.c b/src/context.c index 47a04829..707016dc 100644 --- a/src/context.c +++ b/src/context.c @@ -737,15 +737,25 @@ void _getdns_upstream_log(getdns_upstream *upstream, uint64_t system, void upstream_backoff(getdns_upstream *upstream) { upstream->conn_state = GETDNS_CONN_BACKOFF; - upstream->conn_retry_time = time(NULL) + upstream->upstreams->tls_backoff_time; + /* Increase the backoff interval incrementally up to the tls_backoff_time*/ + if (upstream->conn_backoff_interval < upstream->upstreams->tls_backoff_time) { + if (upstream->conn_backoff_interval < (UINT16_MAX-1)/2) + upstream->conn_backoff_interval *= 2; + else + upstream->conn_backoff_interval = upstream->upstreams->tls_backoff_time; + } + if (upstream->conn_backoff_interval < upstream->upstreams->tls_backoff_time) + upstream->conn_retry_time = time(NULL) + upstream->conn_backoff_interval; + else + upstream->conn_retry_time = time(NULL) + upstream->upstreams->tls_backoff_time; upstream->total_responses = 0; upstream->total_timeouts = 0; upstream->conn_completed = 0; upstream->conn_setup_failed = 0; upstream->conn_shutdowns = 0; upstream->conn_backoffs++; - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, - "%-40s : !Backing off this upstream - Will retry as new upstream at %s", + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE, + "%-40s : !Backing off this upstream - Will retry again at %s", upstream->addr_str, asctime(gmtime(&upstream->conn_retry_time))); } @@ -774,6 +784,14 @@ _getdns_upstream_reset(getdns_upstream *upstream) upstream_backoff(upstream); } + + /* If we didn't backoff it would be nice to reset the conn_backoff_interval + if the upstream is working well again otherwise it would get stuck at the + tls_backoff_time forever... How about */ + if (upstream->conn_state != GETDNS_CONN_BACKOFF && + upstream->responses_received > 1) + upstream->conn_backoff_interval = 1; + // Reset per connection counters upstream->queries_sent = 0; upstream->responses_received = 0; @@ -818,13 +836,13 @@ _getdns_upstream_shutdown(getdns_upstream *upstream) (upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"), (int)upstream->responses_received, (int)upstream->responses_timeouts, _getdns_auth_str(upstream->tls_auth_state), (int)upstream->keepalive_timeout); - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO, "%-40s : Upstream : %s - Resps=%6d, Timeouts =%6d, Best_auth =%7s\n", upstream->addr_str, (upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"), (int)upstream->total_responses, (int)upstream->total_timeouts, _getdns_auth_str(upstream->best_tls_auth_state)); - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO, "%-40s : Upstream : %s - Conns=%6d, Conn_fails=%6d, Conn_shuts=%7d, Backoffs =%6d\n", upstream->addr_str, (upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"), @@ -958,6 +976,7 @@ upstream_init(getdns_upstream *upstream, upstream->conn_shutdowns = 0; upstream->conn_setup_failed = 0; upstream->conn_retry_time = 0; + upstream->conn_backoff_interval = 1; upstream->conn_backoffs = 0; upstream->total_responses = 0; upstream->total_timeouts = 0; diff --git a/src/context.h b/src/context.h index f94f9347..8690bba7 100644 --- a/src/context.h +++ b/src/context.h @@ -170,6 +170,7 @@ typedef struct getdns_upstream { size_t conn_shutdowns; size_t conn_setup_failed; time_t conn_retry_time; + uint16_t conn_backoff_interval; size_t conn_backoffs; size_t total_responses; size_t total_timeouts; diff --git a/src/getdns/getdns_extra.h.in b/src/getdns/getdns_extra.h.in index dbb8eab4..c32a5c14 100644 --- a/src/getdns/getdns_extra.h.in +++ b/src/getdns/getdns_extra.h.in @@ -520,7 +520,7 @@ typedef enum getdns_loglevel_type { #define GETDNS_LOG_CRIT_TEXT "Critical conditions" #define GETDNS_LOG_ERR_TEXT "Error conditions" #define GETDNS_LOG_WARNING_TEXT "Warning conditions" -#define GETDNS_LOG_NOTICE_TEXT "normal, but significant, condition" +#define GETDNS_LOG_NOTICE_TEXT "Normal, but significant, condition" #define GETDNS_LOG_INFO_TEXT "Informational message" #define GETDNS_LOG_DEBUG_TEXT "Debug-level message" diff --git a/src/stub.c b/src/stub.c index d1299b02..8ca9a6dc 100644 --- a/src/stub.c +++ b/src/stub.c @@ -609,7 +609,7 @@ stub_timeout_cb(void *userarg) netreq->fd = -1; netreq->upstream->udp_timeouts++; if (netreq->upstream->udp_timeouts % 100 == 0) - _getdns_upstream_log(netreq->upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(netreq->upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO, "%-40s : Upstream stats: Transport=UDP - Resp=%d,Timeouts=%d\n", netreq->upstream->addr_str, (int)netreq->upstream->udp_responses, (int)netreq->upstream->udp_timeouts); @@ -910,7 +910,7 @@ tls_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) X509_verify_cert_error_string(err)); #endif if (!preverify_ok && !upstream->tls_fallback_ok) - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR, "%-40s : Verify failed : Transport=TLS - *Failure* - (%d) \"%s\"\n", upstream->addr_str, err, X509_verify_cert_error_string(err)); @@ -946,7 +946,7 @@ tls_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) DEBUG_STUB("%s %-35s: FD: %d, WARNING: Proceeding even though pinset validation failed!\n", STUB_DEBUG_SETUP_TLS, __FUNC__, upstream->fd); else - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR, "%-40s : Conn failed : Transport=TLS - *Failure* - Pinset validation failure\n", upstream->addr_str); } else { @@ -1475,7 +1475,7 @@ stub_udp_read_cb(void *userarg) upstream->udp_responses++; if (upstream->udp_responses == 1 || upstream->udp_responses % 100 == 0) - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO, "%-40s : Upstream stats: Transport=UDP - Resp=%d,Timeouts=%d\n", upstream->addr_str, (int)upstream->udp_responses, (int)upstream->udp_timeouts); @@ -1813,13 +1813,20 @@ upstream_active(getdns_upstream *upstream) } static int -upstream_usable(getdns_upstream *upstream) +upstream_usable(getdns_upstream *upstream, int backoff_ok) { + /* If backoff_ok is not true then only use upstreams that are in a healthy + state. */ if ((upstream->conn_state == GETDNS_CONN_CLOSED || upstream->conn_state == GETDNS_CONN_SETUP || upstream->conn_state == GETDNS_CONN_OPEN) && upstream->keepalive_shutdown == 0) return 1; + /* Otherwise, allow upstreams that are backed off to be used because that + is better that having no upstream at all. */ + if (backoff_ok == 1 && + upstream->conn_state == GETDNS_CONN_BACKOFF) + return 1; return 0; } @@ -1835,15 +1842,22 @@ upstream_stats(getdns_upstream *upstream) { /* [TLS1]TODO: This arbitrary logic at the moment - review and improve!*/ return (upstream->total_responses - upstream->total_timeouts - - upstream->conn_shutdowns*GETDNS_TRANSPORT_FAIL_MULT); + - upstream->conn_shutdowns*GETDNS_TRANSPORT_FAIL_MULT + - upstream->conn_setup_failed); } static int upstream_valid(getdns_upstream *upstream, getdns_transport_list_t transport, - getdns_network_req *netreq) + getdns_network_req *netreq, + int backoff_ok) { - if (!(upstream->transport == transport && upstream_usable(upstream))) + /* Checking upstreams with backoff_ok true will aslo return upstreams + that are in a backoff state. Otherwise only use upstreams that have + a 'good' connection state. backoff_ok is usefull when no upstreams at all + are valid, for example when the network connection is down and need to + keep trying to connect before failing completely. */ + if (!(upstream->transport == transport && upstream_usable(upstream, backoff_ok))) return 0; if (transport == GETDNS_TRANSPORT_TCP) return 1; @@ -1894,7 +1908,8 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra if (upstreams->upstreams[i].conn_state == GETDNS_CONN_BACKOFF && upstreams->upstreams[i].conn_retry_time < now) { upstreams->upstreams[i].conn_state = GETDNS_CONN_CLOSED; - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + upstreams->upstreams[i].conn_backoff_interval = 1; + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE, "%-40s : Re-instating upstream\n", upstreams->upstreams[i].addr_str); } @@ -1909,14 +1924,13 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra } /* OK - Find the next one to use. First check we have at least one valid - upstream because we completely back off failed - upstreams we may have no valid upstream at all (in contrast to UDP). This - will be better communicated to the user when we have better error codes*/ + upstream (not backed-off) because we completely back off failed + upstreams we may have no valid upstream at all (in contrast to UDP).*/ i = upstreams->current_stateful; do { DEBUG_STUB("%s %-35s: Testing upstreams %d %d\n", STUB_DEBUG_SETUP, __FUNC__, (int)i, (int)upstreams->upstreams[i].conn_state); - if (upstream_valid(&upstreams->upstreams[i], transport, netreq)) { + if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 0)) { upstream = &upstreams->upstreams[i]; break; } @@ -1924,14 +1938,48 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra if (i >= upstreams->count) i = 0; } while (i != upstreams->current_stateful); - if (!upstream) - return NULL; + if (!upstream) { + /* Oh, oh. We have no valid upstreams. Try to find one that might work so + allow backed off upstreams to be considered valid. + Don't worry about the policy, just use the one with the least bad + stats that still fits the bill (right transport, right authentication) + to try to avoid total failure due to network outages. */ + do { + if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1)) { + upstream = &upstreams->upstreams[i]; + break; + } + i++; + if (i >= upstreams->count) + i = 0; + } while (i != upstreams->current_stateful); + if (!upstream) { + /* We _really_ have nothing that authenticates well enough right now... + leave to regular backoff logic. */ + return NULL; + } + do { + i++; + if (i >= upstreams->count) + i = 0; + if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1) && + upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream)) + upstream = &upstreams->upstreams[i]; + } while (i != upstreams->current_stateful); + upstream->conn_state = GETDNS_CONN_CLOSED; + upstream->conn_backoff_interval = 1; + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE, + "%-40s : No valid upstreams... promoting backed-off upstream %s for re-try...\n", + upstreams->upstreams[i].addr_str); + return upstream; + } /* Now select the specific upstream */ if (netreq->owner->context->round_robin_upstreams == 0) { - /* Base the decision on the stats, noting we will have started from 0*/ + /* Base the decision on the stats and being not backed-off, + noting we will have started from 0*/ for (i++; i < upstreams->count; i++) { - if (upstream_valid(&upstreams->upstreams[i], transport, netreq) && + if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 0) && upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream)) upstream = &upstreams->upstreams[i]; } @@ -2057,12 +2105,17 @@ upstream_find_for_transport(getdns_network_req *netreq, } else { /* For stateful transport we should keep trying until all our transports - are exhausted/backed-off (no upstream)*/ + are exhausted/backed-off (no upstream) and until we have tried each + upstream at least once for this netreq in a total backoff scenario */ + size_t i = 0; do { upstream = upstream_select_stateful(netreq, transport); if (!upstream) return NULL; *fd = upstream_connect(upstream, transport, netreq->owner); + if (i >= upstream->upstreams->count) + return NULL; + i++; } while (*fd == -1); DEBUG_STUB("%s %-35s: FD: %d Connecting to upstream: %p No: %d\n", STUB_DEBUG_SETUP, __FUNC__, *fd, (void*)upstream, @@ -2100,7 +2153,7 @@ upstream_find_for_netreq(getdns_network_req *netreq) } /* Handle better, will give generic error*/ DEBUG_STUB("%s %-35s: MSG: %p No valid upstream! \n", STUB_DEBUG_SCHEDULE, __FUNC__, (void*)netreq); - _getdns_context_log(netreq->owner->context, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_context_log(netreq->owner->context, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR, "*FAILURE* no valid transports or upstreams available!\n"); return -1; } diff --git a/src/tools/getdns_query.c b/src/tools/getdns_query.c index d8337ffd..582f2c0c 100644 --- a/src/tools/getdns_query.c +++ b/src/tools/getdns_query.c @@ -1769,7 +1769,6 @@ main(int argc, char **argv) loop->vmt->run(loop); } else if (listen_count) { - fprintf(stderr, "Starting Stubby DAEMON....\n"); assert(loop); #ifndef GETDNS_ON_WINDOWS if (i_am_stubby && !run_in_foreground) {