First pass at fixing problems when connections to servers are lost.

Need to reset connection state if connections fail at setup and on read/write if there are no more messages queued.
This means we will back-off servers that fail, so we should think about using a shorter backoff default in stubby
because otherwise temporarily loss of the network connection will mean having to restart stubby.
Also some minor changes to logging.
This commit is contained in:
Sara Dickinson 2017-07-06 07:43:31 +02:00
parent d2c685f3ca
commit 2e4e3873e4
4 changed files with 70 additions and 44 deletions

View File

@ -735,63 +735,44 @@ void _getdns_upstream_log(getdns_upstream *upstream, uint64_t system,
}
void
_getdns_upstream_shutdown(getdns_upstream *upstream)
{
/*Set condition to tear down asap to stop any further scheduling*/
upstream->conn_state = GETDNS_CONN_TEARDOWN;
/* Update total stats for the upstream.*/
upstream->total_responses+=upstream->responses_received;
upstream->total_timeouts+=upstream->responses_timeouts;
/* Need the last auth state when using session resumption*/
upstream->last_tls_auth_state = upstream->tls_auth_state;
/* Keep track of the best auth state this upstream has had*/
if (upstream->tls_auth_state > upstream->best_tls_auth_state)
upstream->best_tls_auth_state = upstream->tls_auth_state;
upstream_backoff(getdns_upstream *upstream) {
upstream->conn_state = GETDNS_CONN_BACKOFF;
upstream->conn_retry_time = time(NULL) + upstream->upstreams->tls_backoff_time;
upstream->total_responses = 0;
upstream->total_timeouts = 0;
upstream->conn_completed = 0;
upstream->conn_setup_failed = 0;
upstream->conn_shutdowns = 0;
upstream->conn_backoffs++;
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : Conn closed : Transport=%s - Resp=%d,Timeouts=%d,Auth=%s,Keepalive(ms)=%d\n",
upstream->addr_str,
(upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"),
(int)upstream->responses_received, (int)upstream->responses_timeouts,
_getdns_auth_str(upstream->tls_auth_state), (int)upstream->keepalive_timeout);
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : Upstream stats: Transport=%s - Resp=%d,Timeouts=%d,Best_auth=%s\n",
upstream->addr_str,
(upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"),
(int)upstream->total_responses, (int)upstream->total_timeouts,
_getdns_auth_str(upstream->best_tls_auth_state));
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : Upstream stats: Transport=%s - Conns=%d,Conn_fails=%d,Conn_shutdowns=%d,Backoffs=%d\n",
upstream->addr_str,
(upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"),
(int)upstream->conn_completed, (int)upstream->conn_setup_failed,
(int)upstream->conn_shutdowns, (int)upstream->conn_backoffs);
"%-40s : !Backing off this upstream - Will retry as new upstream at %s",
upstream->addr_str,
asctime(gmtime(&upstream->conn_retry_time)));
}
void
_getdns_upstream_reset(getdns_upstream *upstream)
{
/* Back off connections that never got up service at all (probably no
TCP service or incompatible TLS version/cipher).
Leave choice between working upstreams to the stub.
This back-off should be time based for TLS according to RFC7858. For now,
use the same basis if we simply can't get TCP service either.*/
uint16_t conn_retries = upstream->upstreams->tls_connection_retries;
/* [TLS1]TODO: This arbitrary logic at the moment - review and improve!*/
/*This is the configured number of retries to attempt*/
uint16_t conn_retries = upstream->upstreams->tls_connection_retries;
if (upstream->conn_setup_failed >= conn_retries
|| ((int)upstream->conn_shutdowns >= conn_retries*GETDNS_TRANSPORT_FAIL_MULT
&& upstream->total_responses == 0)
|| (upstream->conn_completed >= conn_retries &&
upstream->total_responses == 0 &&
upstream->total_timeouts > GETDNS_TRANSPORT_FAIL_MULT)) {
upstream->conn_state = GETDNS_CONN_BACKOFF;
upstream->conn_retry_time = time(NULL) + upstream->upstreams->tls_backoff_time;
upstream->total_responses = 0;
upstream->total_timeouts = 0;
upstream->conn_completed = 0;
upstream->conn_setup_failed = 0;
upstream->conn_shutdowns = 0;
upstream->conn_backoffs++;
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : !Backing off this upstream - Will retry as new upstream at %s",
upstream->addr_str,
asctime(gmtime(&upstream->conn_retry_time)));
upstream_backoff(upstream);
}
// Reset per connection counters
upstream->queries_sent = 0;
@ -820,6 +801,39 @@ _getdns_upstream_shutdown(getdns_upstream *upstream)
upstream->conn_state = GETDNS_CONN_CLOSED;
}
void
_getdns_upstream_shutdown(getdns_upstream *upstream)
{
/* Update total stats for the upstream.*/
upstream->total_responses+=upstream->responses_received;
upstream->total_timeouts+=upstream->responses_timeouts;
/* Need the last auth state when using session resumption*/
upstream->last_tls_auth_state = upstream->tls_auth_state;
/* Keep track of the best auth state this upstream has had*/
if (upstream->tls_auth_state > upstream->best_tls_auth_state)
upstream->best_tls_auth_state = upstream->tls_auth_state;
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : Conn closed: %s - Resps=%6d, Timeouts =%6d, Curr_auth =%7s, Keepalive(ms)=%6d\n",
upstream->addr_str,
(upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"),
(int)upstream->responses_received, (int)upstream->responses_timeouts,
_getdns_auth_str(upstream->tls_auth_state), (int)upstream->keepalive_timeout);
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : Upstream : %s - Resps=%6d, Timeouts =%6d, Best_auth =%7s\n",
upstream->addr_str,
(upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"),
(int)upstream->total_responses, (int)upstream->total_timeouts,
_getdns_auth_str(upstream->best_tls_auth_state));
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : Upstream : %s - Conns=%6d, Conn_fails=%6d, Conn_shuts=%7d, Backoffs =%6d\n",
upstream->addr_str,
(upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"),
(int)upstream->conn_completed, (int)upstream->conn_setup_failed,
(int)upstream->conn_shutdowns, (int)upstream->conn_backoffs);
_getdns_upstream_reset(upstream);
}
static int
tls_is_in_transports_list(getdns_context *context)
{

View File

@ -470,4 +470,6 @@ void _getdns_upstreams_dereference(getdns_upstreams *upstreams);
void _getdns_upstream_shutdown(getdns_upstream *upstream);
void _getdns_upstream_reset(getdns_upstream *upstream);
#endif /* _GETDNS_CONTEXT_H_ */

View File

@ -676,6 +676,7 @@ upstream_setup_timeout_cb(void *userarg)
while (upstream->write_queue)
upstream_write_cb(upstream);
}
_getdns_upstream_reset(upstream);
}
@ -1598,6 +1599,8 @@ upstream_read_cb(void *userarg)
case STUB_SETUP_ERROR: /* Can happen for TLS HS*/
case STUB_TCP_ERROR:
upstream_failed(upstream, (q == STUB_TCP_ERROR ? 0:1) );
if (!upstream->write_queue)
_getdns_upstream_shutdown(upstream);
return;
default:
@ -1700,7 +1703,9 @@ upstream_write_cb(void *userarg)
__FUNC__, (void*)netreq);
/* Health checks on current connection */
if (upstream->conn_state == GETDNS_CONN_TEARDOWN)
if (upstream->conn_state == GETDNS_CONN_TEARDOWN ||
upstream->conn_state == GETDNS_CONN_CLOSED ||
upstream->fd == -1)
q = STUB_CONN_GONE;
else if (!upstream_working_ok(upstream))
q = STUB_TCP_ERROR;
@ -1741,6 +1746,8 @@ upstream_write_cb(void *userarg)
_getdns_netreq_change_state(netreq, NET_REQ_ERRORED);
_getdns_check_dns_req_complete(netreq->owner);
}
if (!upstream->write_queue)
_getdns_upstream_shutdown(upstream);
return;
default:
@ -2001,6 +2008,7 @@ upstream_connect(getdns_upstream *upstream, getdns_transport_list_t transport,
fd = tcp_connect(upstream, transport);
if (fd == -1) {
upstream_failed(upstream, 1);
_getdns_upstream_reset(upstream);
return -1;
}
upstream->loop = dnsreq->loop;
@ -2010,6 +2018,7 @@ upstream_connect(getdns_upstream *upstream, getdns_transport_list_t transport,
upstream->tls_obj = tls_create_object(dnsreq, fd, upstream);
if (upstream->tls_obj == NULL) {
upstream_failed(upstream, 1);
_getdns_upstream_reset(upstream);
#ifdef USE_WINSOCK
closesocket(fd);
#else
@ -2021,7 +2030,7 @@ upstream_connect(getdns_upstream *upstream, getdns_transport_list_t transport,
}
upstream->conn_state = GETDNS_CONN_SETUP;
_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
"%-40s : Conn init : Transport=%s - Profile=%s\n",
"%-40s : Conn opened: %s - %s Profile\n",
upstream->addr_str, transport == GETDNS_TRANSPORT_TLS ? "TLS":"TCP",
dnsreq->context->tls_auth_min == GETDNS_AUTHENTICATION_NONE ? "Opportunistic":"Strict");
break;

View File

@ -1769,6 +1769,7 @@ main(int argc, char **argv)
loop->vmt->run(loop);
}
else if (listen_count) {
fprintf(stderr, "Starting Stubby DAEMON....\n");
assert(loop);
#ifndef GETDNS_ON_WINDOWS
if (i_am_stubby && !run_in_foreground) {