Allow backed-off upstreams to be re-instated if all our upstreams are unusable (e.g. if the network is down).

But limit re-tries for a given netreq to the total number of upstreams before failing. This should (roughly) allow 2 retries per upstream of the correct transport before bailing out. Otherwise we are stuck in a loop retrying forever!
This commit is contained in:
Sara Dickinson 2017-09-12 13:47:56 +01:00
parent 42945cfc08
commit 729af1d159
2 changed files with 54 additions and 12 deletions

View File

@ -742,7 +742,7 @@ upstream_backoff(getdns_upstream *upstream) {
if (upstream->conn_backoff_interval < (UINT16_MAX-1)/2)
upstream->conn_backoff_interval *= 2;
else
upstream->conn_backoff_interval = upstream->upstreams->tls_backoff_time
upstream->conn_backoff_interval = upstream->upstreams->tls_backoff_time;
}
if (upstream->conn_backoff_interval < upstream->upstreams->tls_backoff_time)
upstream->conn_retry_time = time(NULL) + upstream->conn_backoff_interval;

View File

@ -1813,13 +1813,16 @@ upstream_active(getdns_upstream *upstream)
}
static int
upstream_usable(getdns_upstream *upstream)
upstream_usable(getdns_upstream *upstream, int backoff_ok)
{
if ((upstream->conn_state == GETDNS_CONN_CLOSED ||
upstream->conn_state == GETDNS_CONN_SETUP ||
upstream->conn_state == GETDNS_CONN_OPEN) &&
upstream->keepalive_shutdown == 0)
return 1;
if (backoff_ok == 1 &&
upstream->conn_state == GETDNS_CONN_BACKOFF)
return 1;
return 0;
}
@ -1835,15 +1838,17 @@ upstream_stats(getdns_upstream *upstream)
{
/* [TLS1]TODO: This arbitrary logic at the moment - review and improve!*/
return (upstream->total_responses - upstream->total_timeouts
- upstream->conn_shutdowns*GETDNS_TRANSPORT_FAIL_MULT);
- upstream->conn_shutdowns*GETDNS_TRANSPORT_FAIL_MULT
- upstream->conn_setup_failed);
}
static int
upstream_valid(getdns_upstream *upstream,
getdns_transport_list_t transport,
getdns_network_req *netreq)
getdns_network_req *netreq,
int backoff_ok)
{
if (!(upstream->transport == transport && upstream_usable(upstream)))
if (!(upstream->transport == transport && upstream_usable(upstream, backoff_ok)))
return 0;
if (transport == GETDNS_TRANSPORT_TCP)
return 1;
@ -1894,6 +1899,7 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra
if (upstreams->upstreams[i].conn_state == GETDNS_CONN_BACKOFF &&
upstreams->upstreams[i].conn_retry_time < now) {
upstreams->upstreams[i].conn_state = GETDNS_CONN_CLOSED;
upstreams->upstreams[i].conn_backoff_interval = 1;
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : Re-instating upstream\n",
upstreams->upstreams[i].addr_str);
@ -1910,13 +1916,12 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra
/* OK - Find the next one to use. First check we have at least one valid
upstream because we completely back off failed
upstreams we may have no valid upstream at all (in contrast to UDP). This
will be better communicated to the user when we have better error codes*/
upstreams we may have no valid upstream at all (in contrast to UDP).*/
i = upstreams->current_stateful;
do {
DEBUG_STUB("%s %-35s: Testing upstreams %d %d\n", STUB_DEBUG_SETUP,
__FUNC__, (int)i, (int)upstreams->upstreams[i].conn_state);
if (upstream_valid(&upstreams->upstreams[i], transport, netreq)) {
if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 0)) {
upstream = &upstreams->upstreams[i];
break;
}
@ -1924,14 +1929,46 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra
if (i >= upstreams->count)
i = 0;
} while (i != upstreams->current_stateful);
if (!upstream)
return NULL;
if (!upstream) {
/* Oh, oh. We have no valid upstreams. Try to find one that might work.
Don't worry about the policy, just use the one with the least bad
stats that still fits the bill (right transport, right authentication)
to try to avoid total failure due to network outages. */
do {
if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1)) {
upstream = &upstreams->upstreams[i];
break;
}
i++;
if (i >= upstreams->count)
i = 0;
} while (i != upstreams->current_stateful);
if (!upstream) {
/* We _really_ have nothing that authenticates well enough right now...
leave to regular backoff logic. */
return NULL;
}
do {
i++;
if (i >= upstreams->count)
i = 0;
if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1) &&
upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream))
upstream = &upstreams->upstreams[i];
} while (i != upstreams->current_stateful);
upstream->conn_state = GETDNS_CONN_CLOSED;
upstream->conn_backoff_interval = 1;
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : No valid upstreams... promoting backed-off upstream %s for re-try...\n",
upstreams->upstreams[i].addr_str);
return upstream;
}
/* Now select the specific upstream */
if (netreq->owner->context->round_robin_upstreams == 0) {
/* Base the decision on the stats, noting we will have started from 0*/
for (i++; i < upstreams->count; i++) {
if (upstream_valid(&upstreams->upstreams[i], transport, netreq) &&
if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 0) &&
upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream))
upstream = &upstreams->upstreams[i];
}
@ -2057,12 +2094,17 @@ upstream_find_for_transport(getdns_network_req *netreq,
}
else {
/* For stateful transport we should keep trying until all our transports
are exhausted/backed-off (no upstream)*/
are exhausted/backed-off (no upstream) and until we have tried each
upstream at least once for this netreq in a total backoff scenario */
size_t i = 0;
do {
upstream = upstream_select_stateful(netreq, transport);
if (!upstream)
return NULL;
*fd = upstream_connect(upstream, transport, netreq->owner);
if (i >= upstream->upstreams->count)
return NULL;
i++;
} while (*fd == -1);
DEBUG_STUB("%s %-35s: FD: %d Connecting to upstream: %p No: %d\n",
STUB_DEBUG_SETUP, __FUNC__, *fd, (void*)upstream,