From 42945cfc08bb49775505bf9975184990c72b3ad7 Mon Sep 17 00:00:00 2001 From: Sara Dickinson Date: Fri, 8 Sep 2017 17:28:37 +0100 Subject: [PATCH 1/5] Make the backoff time incrementally increase until the upstream starts working again --- src/context.c | 23 +++++++++++++++++++++-- src/context.h | 1 + 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/context.c b/src/context.c index 47a04829..3e670442 100644 --- a/src/context.c +++ b/src/context.c @@ -737,7 +737,17 @@ void _getdns_upstream_log(getdns_upstream *upstream, uint64_t system, void upstream_backoff(getdns_upstream *upstream) { upstream->conn_state = GETDNS_CONN_BACKOFF; - upstream->conn_retry_time = time(NULL) + upstream->upstreams->tls_backoff_time; + /* Increase the backoff interval incrementally up to the tls_backoff_time*/ + if (upstream->conn_backoff_interval < upstream->upstreams->tls_backoff_time) { + if (upstream->conn_backoff_interval < (UINT16_MAX-1)/2) + upstream->conn_backoff_interval *= 2; + else + upstream->conn_backoff_interval = upstream->upstreams->tls_backoff_time + } + if (upstream->conn_backoff_interval < upstream->upstreams->tls_backoff_time) + upstream->conn_retry_time = time(NULL) + upstream->conn_backoff_interval; + else + upstream->conn_retry_time = time(NULL) + upstream->upstreams->tls_backoff_time; upstream->total_responses = 0; upstream->total_timeouts = 0; upstream->conn_completed = 0; @@ -745,7 +755,7 @@ upstream_backoff(getdns_upstream *upstream) { upstream->conn_shutdowns = 0; upstream->conn_backoffs++; _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, - "%-40s : !Backing off this upstream - Will retry as new upstream at %s", + "%-40s : !Backing off this upstream - Will retry again at %s", upstream->addr_str, asctime(gmtime(&upstream->conn_retry_time))); } @@ -774,6 +784,14 @@ _getdns_upstream_reset(getdns_upstream *upstream) upstream_backoff(upstream); } + + /* If we didn't backoff it would be nice to reset the conn_backoff_interval + if the upstream is working well again otherwise it would get stuck at the + tls_backoff_time forever... How about */ + if (upstream->conn_state != GETDNS_CONN_BACKOFF && + upstream->responses_received > 1) + upstream->conn_backoff_interval = 1; + // Reset per connection counters upstream->queries_sent = 0; upstream->responses_received = 0; @@ -958,6 +976,7 @@ upstream_init(getdns_upstream *upstream, upstream->conn_shutdowns = 0; upstream->conn_setup_failed = 0; upstream->conn_retry_time = 0; + upstream->conn_backoff_interval = 1; upstream->conn_backoffs = 0; upstream->total_responses = 0; upstream->total_timeouts = 0; diff --git a/src/context.h b/src/context.h index f94f9347..8690bba7 100644 --- a/src/context.h +++ b/src/context.h @@ -170,6 +170,7 @@ typedef struct getdns_upstream { size_t conn_shutdowns; size_t conn_setup_failed; time_t conn_retry_time; + uint16_t conn_backoff_interval; size_t conn_backoffs; size_t total_responses; size_t total_timeouts; From 729af1d1590bbba1c3ee17b381829483c59e7cbb Mon Sep 17 00:00:00 2001 From: Sara Dickinson Date: Tue, 12 Sep 2017 13:47:56 +0100 Subject: [PATCH 2/5] Allow backed-off upstreams to be re-instated if all our upstreams are unusable (e.g. if the network is down). But limit re-tries for a given netreq to the total number of upstreams before failing. This should (roughly) allow 2 retries per upstream of the correct transport before bailing out. Otherwise we are stuck in a loop retrying forever! --- src/context.c | 2 +- src/stub.c | 64 ++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/context.c b/src/context.c index 3e670442..f8bd1493 100644 --- a/src/context.c +++ b/src/context.c @@ -742,7 +742,7 @@ upstream_backoff(getdns_upstream *upstream) { if (upstream->conn_backoff_interval < (UINT16_MAX-1)/2) upstream->conn_backoff_interval *= 2; else - upstream->conn_backoff_interval = upstream->upstreams->tls_backoff_time + upstream->conn_backoff_interval = upstream->upstreams->tls_backoff_time; } if (upstream->conn_backoff_interval < upstream->upstreams->tls_backoff_time) upstream->conn_retry_time = time(NULL) + upstream->conn_backoff_interval; diff --git a/src/stub.c b/src/stub.c index d1299b02..6dd58da2 100644 --- a/src/stub.c +++ b/src/stub.c @@ -1813,13 +1813,16 @@ upstream_active(getdns_upstream *upstream) } static int -upstream_usable(getdns_upstream *upstream) +upstream_usable(getdns_upstream *upstream, int backoff_ok) { if ((upstream->conn_state == GETDNS_CONN_CLOSED || upstream->conn_state == GETDNS_CONN_SETUP || upstream->conn_state == GETDNS_CONN_OPEN) && upstream->keepalive_shutdown == 0) return 1; + if (backoff_ok == 1 && + upstream->conn_state == GETDNS_CONN_BACKOFF) + return 1; return 0; } @@ -1835,15 +1838,17 @@ upstream_stats(getdns_upstream *upstream) { /* [TLS1]TODO: This arbitrary logic at the moment - review and improve!*/ return (upstream->total_responses - upstream->total_timeouts - - upstream->conn_shutdowns*GETDNS_TRANSPORT_FAIL_MULT); + - upstream->conn_shutdowns*GETDNS_TRANSPORT_FAIL_MULT + - upstream->conn_setup_failed); } static int upstream_valid(getdns_upstream *upstream, getdns_transport_list_t transport, - getdns_network_req *netreq) + getdns_network_req *netreq, + int backoff_ok) { - if (!(upstream->transport == transport && upstream_usable(upstream))) + if (!(upstream->transport == transport && upstream_usable(upstream, backoff_ok))) return 0; if (transport == GETDNS_TRANSPORT_TCP) return 1; @@ -1894,6 +1899,7 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra if (upstreams->upstreams[i].conn_state == GETDNS_CONN_BACKOFF && upstreams->upstreams[i].conn_retry_time < now) { upstreams->upstreams[i].conn_state = GETDNS_CONN_CLOSED; + upstreams->upstreams[i].conn_backoff_interval = 1; _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, "%-40s : Re-instating upstream\n", upstreams->upstreams[i].addr_str); @@ -1910,13 +1916,12 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra /* OK - Find the next one to use. First check we have at least one valid upstream because we completely back off failed - upstreams we may have no valid upstream at all (in contrast to UDP). This - will be better communicated to the user when we have better error codes*/ + upstreams we may have no valid upstream at all (in contrast to UDP).*/ i = upstreams->current_stateful; do { DEBUG_STUB("%s %-35s: Testing upstreams %d %d\n", STUB_DEBUG_SETUP, __FUNC__, (int)i, (int)upstreams->upstreams[i].conn_state); - if (upstream_valid(&upstreams->upstreams[i], transport, netreq)) { + if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 0)) { upstream = &upstreams->upstreams[i]; break; } @@ -1924,14 +1929,46 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra if (i >= upstreams->count) i = 0; } while (i != upstreams->current_stateful); - if (!upstream) - return NULL; + if (!upstream) { + /* Oh, oh. We have no valid upstreams. Try to find one that might work. + Don't worry about the policy, just use the one with the least bad + stats that still fits the bill (right transport, right authentication) + to try to avoid total failure due to network outages. */ + do { + if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1)) { + upstream = &upstreams->upstreams[i]; + break; + } + i++; + if (i >= upstreams->count) + i = 0; + } while (i != upstreams->current_stateful); + if (!upstream) { + /* We _really_ have nothing that authenticates well enough right now... + leave to regular backoff logic. */ + return NULL; + } + do { + i++; + if (i >= upstreams->count) + i = 0; + if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1) && + upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream)) + upstream = &upstreams->upstreams[i]; + } while (i != upstreams->current_stateful); + upstream->conn_state = GETDNS_CONN_CLOSED; + upstream->conn_backoff_interval = 1; + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + "%-40s : No valid upstreams... promoting backed-off upstream %s for re-try...\n", + upstreams->upstreams[i].addr_str); + return upstream; + } /* Now select the specific upstream */ if (netreq->owner->context->round_robin_upstreams == 0) { /* Base the decision on the stats, noting we will have started from 0*/ for (i++; i < upstreams->count; i++) { - if (upstream_valid(&upstreams->upstreams[i], transport, netreq) && + if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 0) && upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream)) upstream = &upstreams->upstreams[i]; } @@ -2057,12 +2094,17 @@ upstream_find_for_transport(getdns_network_req *netreq, } else { /* For stateful transport we should keep trying until all our transports - are exhausted/backed-off (no upstream)*/ + are exhausted/backed-off (no upstream) and until we have tried each + upstream at least once for this netreq in a total backoff scenario */ + size_t i = 0; do { upstream = upstream_select_stateful(netreq, transport); if (!upstream) return NULL; *fd = upstream_connect(upstream, transport, netreq->owner); + if (i >= upstream->upstreams->count) + return NULL; + i++; } while (*fd == -1); DEBUG_STUB("%s %-35s: FD: %d Connecting to upstream: %p No: %d\n", STUB_DEBUG_SETUP, __FUNC__, *fd, (void*)upstream, From b760a2ced20faede982420bf1c239f8479908f05 Mon Sep 17 00:00:00 2001 From: Sara Dickinson Date: Tue, 12 Sep 2017 15:01:02 +0100 Subject: [PATCH 3/5] Refine the logging levels to match the errors given when backing off, etc. --- src/context.c | 6 +++--- src/getdns/getdns_extra.h.in | 2 +- src/stub.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/context.c b/src/context.c index f8bd1493..707016dc 100644 --- a/src/context.c +++ b/src/context.c @@ -754,7 +754,7 @@ upstream_backoff(getdns_upstream *upstream) { upstream->conn_setup_failed = 0; upstream->conn_shutdowns = 0; upstream->conn_backoffs++; - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE, "%-40s : !Backing off this upstream - Will retry again at %s", upstream->addr_str, asctime(gmtime(&upstream->conn_retry_time))); @@ -836,13 +836,13 @@ _getdns_upstream_shutdown(getdns_upstream *upstream) (upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"), (int)upstream->responses_received, (int)upstream->responses_timeouts, _getdns_auth_str(upstream->tls_auth_state), (int)upstream->keepalive_timeout); - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO, "%-40s : Upstream : %s - Resps=%6d, Timeouts =%6d, Best_auth =%7s\n", upstream->addr_str, (upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"), (int)upstream->total_responses, (int)upstream->total_timeouts, _getdns_auth_str(upstream->best_tls_auth_state)); - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO, "%-40s : Upstream : %s - Conns=%6d, Conn_fails=%6d, Conn_shuts=%7d, Backoffs =%6d\n", upstream->addr_str, (upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"), diff --git a/src/getdns/getdns_extra.h.in b/src/getdns/getdns_extra.h.in index dbb8eab4..c32a5c14 100644 --- a/src/getdns/getdns_extra.h.in +++ b/src/getdns/getdns_extra.h.in @@ -520,7 +520,7 @@ typedef enum getdns_loglevel_type { #define GETDNS_LOG_CRIT_TEXT "Critical conditions" #define GETDNS_LOG_ERR_TEXT "Error conditions" #define GETDNS_LOG_WARNING_TEXT "Warning conditions" -#define GETDNS_LOG_NOTICE_TEXT "normal, but significant, condition" +#define GETDNS_LOG_NOTICE_TEXT "Normal, but significant, condition" #define GETDNS_LOG_INFO_TEXT "Informational message" #define GETDNS_LOG_DEBUG_TEXT "Debug-level message" diff --git a/src/stub.c b/src/stub.c index 6dd58da2..eac26fcf 100644 --- a/src/stub.c +++ b/src/stub.c @@ -609,7 +609,7 @@ stub_timeout_cb(void *userarg) netreq->fd = -1; netreq->upstream->udp_timeouts++; if (netreq->upstream->udp_timeouts % 100 == 0) - _getdns_upstream_log(netreq->upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(netreq->upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO, "%-40s : Upstream stats: Transport=UDP - Resp=%d,Timeouts=%d\n", netreq->upstream->addr_str, (int)netreq->upstream->udp_responses, (int)netreq->upstream->udp_timeouts); @@ -910,7 +910,7 @@ tls_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) X509_verify_cert_error_string(err)); #endif if (!preverify_ok && !upstream->tls_fallback_ok) - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR, "%-40s : Verify failed : Transport=TLS - *Failure* - (%d) \"%s\"\n", upstream->addr_str, err, X509_verify_cert_error_string(err)); @@ -946,7 +946,7 @@ tls_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) DEBUG_STUB("%s %-35s: FD: %d, WARNING: Proceeding even though pinset validation failed!\n", STUB_DEBUG_SETUP_TLS, __FUNC__, upstream->fd); else - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR, "%-40s : Conn failed : Transport=TLS - *Failure* - Pinset validation failure\n", upstream->addr_str); } else { @@ -1475,7 +1475,7 @@ stub_udp_read_cb(void *userarg) upstream->udp_responses++; if (upstream->udp_responses == 1 || upstream->udp_responses % 100 == 0) - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO, "%-40s : Upstream stats: Transport=UDP - Resp=%d,Timeouts=%d\n", upstream->addr_str, (int)upstream->udp_responses, (int)upstream->udp_timeouts); @@ -1900,7 +1900,7 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra upstreams->upstreams[i].conn_retry_time < now) { upstreams->upstreams[i].conn_state = GETDNS_CONN_CLOSED; upstreams->upstreams[i].conn_backoff_interval = 1; - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE, "%-40s : Re-instating upstream\n", upstreams->upstreams[i].addr_str); } @@ -1958,7 +1958,7 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra } while (i != upstreams->current_stateful); upstream->conn_state = GETDNS_CONN_CLOSED; upstream->conn_backoff_interval = 1; - _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE, "%-40s : No valid upstreams... promoting backed-off upstream %s for re-try...\n", upstreams->upstreams[i].addr_str); return upstream; @@ -2142,7 +2142,7 @@ upstream_find_for_netreq(getdns_network_req *netreq) } /* Handle better, will give generic error*/ DEBUG_STUB("%s %-35s: MSG: %p No valid upstream! \n", STUB_DEBUG_SCHEDULE, __FUNC__, (void*)netreq); - _getdns_context_log(netreq->owner->context, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, + _getdns_context_log(netreq->owner->context, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR, "*FAILURE* no valid transports or upstreams available!\n"); return -1; } From f53e5645d9f6bcba9d59eba3372743284b55f282 Mon Sep 17 00:00:00 2001 From: Sara Dickinson Date: Wed, 13 Sep 2017 10:00:56 +0100 Subject: [PATCH 4/5] Improve the comments about the new backoff handling. Remove unnecessary log. --- src/stub.c | 17 ++++++++++++++--- src/tools/getdns_query.c | 1 - 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/stub.c b/src/stub.c index eac26fcf..8ca9a6dc 100644 --- a/src/stub.c +++ b/src/stub.c @@ -1815,11 +1815,15 @@ upstream_active(getdns_upstream *upstream) static int upstream_usable(getdns_upstream *upstream, int backoff_ok) { + /* If backoff_ok is not true then only use upstreams that are in a healthy + state. */ if ((upstream->conn_state == GETDNS_CONN_CLOSED || upstream->conn_state == GETDNS_CONN_SETUP || upstream->conn_state == GETDNS_CONN_OPEN) && upstream->keepalive_shutdown == 0) return 1; + /* Otherwise, allow upstreams that are backed off to be used because that + is better that having no upstream at all. */ if (backoff_ok == 1 && upstream->conn_state == GETDNS_CONN_BACKOFF) return 1; @@ -1848,6 +1852,11 @@ upstream_valid(getdns_upstream *upstream, getdns_network_req *netreq, int backoff_ok) { + /* Checking upstreams with backoff_ok true will aslo return upstreams + that are in a backoff state. Otherwise only use upstreams that have + a 'good' connection state. backoff_ok is usefull when no upstreams at all + are valid, for example when the network connection is down and need to + keep trying to connect before failing completely. */ if (!(upstream->transport == transport && upstream_usable(upstream, backoff_ok))) return 0; if (transport == GETDNS_TRANSPORT_TCP) @@ -1915,7 +1924,7 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra } /* OK - Find the next one to use. First check we have at least one valid - upstream because we completely back off failed + upstream (not backed-off) because we completely back off failed upstreams we may have no valid upstream at all (in contrast to UDP).*/ i = upstreams->current_stateful; do { @@ -1930,7 +1939,8 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra i = 0; } while (i != upstreams->current_stateful); if (!upstream) { - /* Oh, oh. We have no valid upstreams. Try to find one that might work. + /* Oh, oh. We have no valid upstreams. Try to find one that might work so + allow backed off upstreams to be considered valid. Don't worry about the policy, just use the one with the least bad stats that still fits the bill (right transport, right authentication) to try to avoid total failure due to network outages. */ @@ -1966,7 +1976,8 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra /* Now select the specific upstream */ if (netreq->owner->context->round_robin_upstreams == 0) { - /* Base the decision on the stats, noting we will have started from 0*/ + /* Base the decision on the stats and being not backed-off, + noting we will have started from 0*/ for (i++; i < upstreams->count; i++) { if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 0) && upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream)) diff --git a/src/tools/getdns_query.c b/src/tools/getdns_query.c index d8337ffd..582f2c0c 100644 --- a/src/tools/getdns_query.c +++ b/src/tools/getdns_query.c @@ -1769,7 +1769,6 @@ main(int argc, char **argv) loop->vmt->run(loop); } else if (listen_count) { - fprintf(stderr, "Starting Stubby DAEMON....\n"); assert(loop); #ifndef GETDNS_ON_WINDOWS if (i_am_stubby && !run_in_foreground) { From 0c39696b64064d34695cfcfc1ca8acd8ff85bf30 Mon Sep 17 00:00:00 2001 From: Jim Hague Date: Wed, 13 Sep 2017 15:50:20 +0100 Subject: [PATCH 5/5] Add '-f' to tpkg usage message. --- src/test/tpkg/tpkg | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/tpkg/tpkg b/src/test/tpkg/tpkg index 043d2ece..0eddaedb 100755 --- a/src/test/tpkg/tpkg +++ b/src/test/tpkg/tpkg @@ -197,6 +197,7 @@ function usage() { out " -b DIR\tuse DIR is a base directory in stead of ." out " -a ARGS\tpass the string ARGS through to the test scripts" out " -d\t\tUse directories instead of tar.gz for tpkg archive format" + out " -f\t\tForce test to be re-run if already executed" out out " (C) NLnetLabs, Miek Gieben. Licensed under the GPL version 2." }