From 7e3439efbc3cedd86982a591e8236b4c60d682af Mon Sep 17 00:00:00 2001
From: Sara Dickinson <sara@sinodun.com>
Date: Wed, 24 Jan 2018 13:13:14 +0000
Subject: [PATCH] =?UTF-8?q?Improve=20handling=20of=20opportunistic=20back-?=
 =?UTF-8?q?off.=20If=20other=20transports=20are=20working,=20don=E2=80=99t?=
 =?UTF-8?q?=20forcibly=20promote=20failed=20upstreams=20just=20wait=20for?=
 =?UTF-8?q?=20the=20re-try=20timer.=20Clean=20up=20logs.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/context.c |   3 +-
 src/stub.c    | 141 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 94 insertions(+), 50 deletions(-)

diff --git a/src/context.c b/src/context.c
index c8f14363..927543f9 100644
--- a/src/context.c
+++ b/src/context.c
@@ -779,8 +779,9 @@ upstream_backoff(getdns_upstream *upstream) {
 	upstream->conn_shutdowns = 0;
 	upstream->conn_backoffs++;
 	_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE,
-	    "%-40s : !Backing off this upstream    - Will retry again in %ds at %s",
+	    "%-40s : Upstream   : !Backing off %s on this upstream    - Will retry again in %ds at %s",
 	            upstream->addr_str,
+	            upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP",
 	            upstream->conn_backoff_interval,
 	            asctime(gmtime(&upstream->conn_retry_time)));
 }
diff --git a/src/stub.c b/src/stub.c
index 07f22fcd..ac3bd0fd 100644
--- a/src/stub.c
+++ b/src/stub.c
@@ -529,11 +529,7 @@ upstream_failed(getdns_upstream *upstream, int during_setup)
 	   the queries if there is only one upstream.*/
 	GETDNS_CLEAR_EVENT(upstream->loop, &upstream->event);
 	if (during_setup) {
-		/* Special case if failure was due to authentication issues since this
-		   upstream could be used oppotunistically with no problem.*/
-		if (!(upstream->transport == GETDNS_TRANSPORT_TLS &&
-		    upstream->tls_auth_state == GETDNS_AUTH_FAILED))
-			upstream->conn_setup_failed++;
+		upstream->conn_setup_failed++;
 	} else {
 		upstream->conn_shutdowns++;
 		/* [TLS1]TODO: Re-try these queries if possible.*/
@@ -580,7 +576,7 @@ stub_timeout_cb(void *userarg)
 		netreq->upstream->udp_timeouts++;
 		if (netreq->upstream->udp_timeouts % 100 == 0)
 			_getdns_upstream_log(netreq->upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO,
-			    "%-40s : Upstream stats: Transport=UDP - Resp=%d,Timeouts=%d\n",
+			    "%-40s : Upstream   : UDP - Resps=%6d, Timeouts  =%6d (logged every 100 responses)\n",
 			             netreq->upstream->addr_str,
 			             (int)netreq->upstream->udp_responses, (int)netreq->upstream->udp_timeouts);
 		stub_next_upstream(netreq);
@@ -844,7 +840,7 @@ tls_verify_callback(int preverify_ok, X509_STORE_CTX *ctx)
 #endif
 	if (!preverify_ok && !upstream->tls_fallback_ok)
 		_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR,
-		    "%-40s : Verify failed : Transport=TLS - *Failure* -  (%d) \"%s\"\n",
+		    "%-40s : Verify failed: TLS - *Failure* -  (%d) \"%s\"\n",
 		    upstream->addr_str, err,
 		    X509_verify_cert_error_string(err));
 
@@ -880,7 +876,7 @@ tls_verify_callback(int preverify_ok, X509_STORE_CTX *ctx)
 			            STUB_DEBUG_SETUP_TLS, __FUNC__, upstream->fd);
 		else
 			_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR,
-			    "%-40s : Conn failed   : Transport=TLS - *Failure* - Pinset validation failure\n",
+			    "%-40s : Conn failed: TLS - *Failure* - Pinset validation failure\n",
 			    upstream->addr_str);
 	} else {
 		/* If we _only_ had a pinset and it is good then force succesful
@@ -893,7 +889,7 @@ tls_verify_callback(int preverify_ok, X509_STORE_CTX *ctx)
 			DEBUG_STUB("%s %-35s: FD:  %d, Allowing self-signed (%d) cert since pins match\n",
 		           STUB_DEBUG_SETUP_TLS, __FUNC__, upstream->fd, err);
 			_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG, 
-			    "%-40s : Verify passed : Transport=TLS - Allowing self-signed cert since pins match\n",
+			    "%-40s : Verify passed : TLS - Allowing self-signed cert since pins match\n",
 			    upstream->addr_str);
 		}
 	}
@@ -956,6 +952,7 @@ tls_create_object(getdns_dns_req *dnsreq, int fd, getdns_upstream *upstream)
 			    "%-40s : ERROR: Hostname Authentication not available from TLS library (check library version)\n",
 			    upstream->addr_str);
 			upstream->tls_hs_state = GETDNS_HS_FAILED;
+			upstream->tls_auth_state = GETDNS_AUTH_FAILED;
 			return NULL;
 		}
 #endif
@@ -970,9 +967,13 @@ tls_create_object(getdns_dns_req *dnsreq, int fd, getdns_upstream *upstream)
 				DEBUG_STUB("%s %-35s: Proceeding with only pubkey pinning authentication\n",
 			           STUB_DEBUG_SETUP_TLS, __FUNC__);
 			} else {
-				DEBUG_STUB("%s %-35s: ERROR: No host name or pubkey pinset provided for TLS authentication\n",
+				DEBUG_STUB("%s %-35s: ERROR:No auth name or pinset provided for this upstream for Strict TLS authentication\n",
 			           STUB_DEBUG_SETUP_TLS, __FUNC__);
+				_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR, 
+				    "%-40s : Verify fail: *CONFIG ERROR* - No auth name or pinset provided for this upstream for Strict TLS authentication\n",
+				    upstream->addr_str);
 				upstream->tls_hs_state = GETDNS_HS_FAILED;
+				upstream->tls_auth_state = GETDNS_AUTH_FAILED;
 				return NULL;
 			}
 		} else {
@@ -1409,7 +1410,7 @@ stub_udp_read_cb(void *userarg)
 	if (upstream->udp_responses == 1 || 
 	    upstream->udp_responses % 100 == 0)
 		_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_INFO,
-		    "%-40s : Upstream stats: Transport=UDP - Resp=%d,Timeouts=%d\n",
+		    "%-40s : Upstream   : UDP - Resps=%6d, Timeouts  =%6d (logged every 100 responses)\n",
 		    upstream->addr_str,
 		    (int)upstream->udp_responses, (int)upstream->udp_timeouts);
 	_getdns_check_dns_req_complete(dnsreq);
@@ -1671,7 +1672,7 @@ upstream_write_cb(void *userarg)
 		/* Cleaning up after connection or auth check failure. Need to fallback. */
 		stub_cleanup(netreq);
 		_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_DEBUG,
-		    "%-40s : Conn closed   : Transport=%s - *Failure*\n",
+		    "%-40s : Conn closed: %s - *Failure*\n",
 		    upstream->addr_str,
 		    (upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP"));
 		if (fallback_on_write(netreq) == STUB_TCP_ERROR) {
@@ -1826,6 +1827,34 @@ upstream_valid_and_open(getdns_upstream *upstream,
 	 return 1;
 }
 
+static int
+other_transports_working(getdns_network_req *netreq,
+                         getdns_upstreams *upstreams,
+                         getdns_transport_list_t transport)
+{
+	size_t i,j;
+	for (i = 0; i< netreq->transport_count;i++) {
+		if (netreq->transports[i] == transport)
+			continue;
+		if (netreq->transports[i] == GETDNS_TRANSPORT_UDP) {
+			for (j = 0; j < upstreams->count; j+=GETDNS_UPSTREAM_TRANSPORTS) {
+				if (upstreams->upstreams[j].back_off == 1)
+					return 1;
+			}
+		}
+		else if (netreq->transports[i] == GETDNS_TRANSPORT_TCP ||
+		         netreq->transports[i] == GETDNS_TRANSPORT_TLS) {
+			for (j = 0; j < upstreams->count; j++) {
+				if (netreq->transports[i] == upstreams->upstreams[j].transport &&
+				    upstream_valid(&upstreams->upstreams[j], netreq->transports[i],
+					netreq, 0))
+					return 1;
+			}
+		}
+	}
+	return 0;
+}
+
 static getdns_upstream *
 upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t transport)
 {
@@ -1843,8 +1872,9 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra
 		    upstreams->upstreams[i].conn_retry_time < now) {
 			upstreams->upstreams[i].conn_state = GETDNS_CONN_CLOSED;
 			_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE,
-			    "%-40s : Re-instating upstream\n",
-		            upstreams->upstreams[i].addr_str);
+			    "%-40s : Upstream   : Re-instating %s for this upstream\n",
+			     upstreams->upstreams[i].addr_str,
+			     upstreams->upstreams[i].transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP");
 		}
 	}
 
@@ -1857,12 +1887,13 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra
 	}
 
 	/* OK - Find the next one to use. First check we have at least one valid
-	   upstream (not backed-off) because we completely back off failed 
+	   upstream (not backed-off). Because we completely back off failed 
 	   upstreams we may have no valid upstream at all (in contrast to UDP).*/
 	i = upstreams->current_stateful;
 	do {
-		DEBUG_STUB("%s %-35s: Testing upstreams  %d %d\n", STUB_DEBUG_SETUP, 
-	           __FUNC__, (int)i, (int)upstreams->upstreams[i].conn_state);
+		DEBUG_STUB("%s %-35s: Testing upstreams  %d %d for transport %d \n",
+	            STUB_DEBUG_SETUP, __FUNC__, (int)i,
+	            (int)upstreams->upstreams[i].conn_state, transport);
 		if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 0)) {
 			upstream = &upstreams->upstreams[i];
 			break;
@@ -1871,40 +1902,52 @@ upstream_select_stateful(getdns_network_req *netreq, getdns_transport_list_t tra
 		if (i >= upstreams->count)
 			i = 0;
 	} while (i != upstreams->current_stateful);
+
 	if (!upstream) {
-		/* Oh, oh. We have no valid upstreams. Try to find one that might work so
-		   allow backed off upstreams to be considered valid.
-		   Don't worry about the policy, just use the one with the least bad
-		   stats that still fits the bill (right transport, right authentication)
-		   to try to avoid total failure due to network outages. */
-		do {
-			if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1)) {
-				upstream = &upstreams->upstreams[i];
-				break;
-			}
-			i++;
-			if (i >= upstreams->count)
-				i = 0;
-		} while (i != upstreams->current_stateful);
-		if (!upstream) {
-			/* We _really_ have nothing that authenticates well enough right now...
-			   leave to regular backoff logic. */
+		/* Oh, oh. We have no valid upstreams for this transport. */
+		/* If there are other fallback transports that are working, we should 
+		use them before forcilby promoting failed upstreams for re-try, since 
+		waiting for the the re-try timer to re-instate them is the right thing 
+		in this case. */
+		if (other_transports_working(netreq, upstreams, transport)) {
 			return NULL;
 		}
-		do {
-			i++;
-			if (i >= upstreams->count)
-				i = 0;
-			if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1) &&
-			    upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream))
-				upstream = &upstreams->upstreams[i];
-		} while (i != upstreams->current_stateful);
-		upstream->conn_state = GETDNS_CONN_CLOSED;
-		upstream->conn_backoff_interval = 1;
-		_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE,
-		    "%-40s : No valid upstreams... promoting this backed-off upstream for re-try...\n",
-		    upstream->addr_str);
-		return upstream;
+		else {
+			/* Try to find one that might work so
+			   allow backed off upstreams to be considered valid.
+			   Don't worry about the policy, just use the one with the least bad
+			   stats that still fits the bill (right transport, right authentication)
+			   to try to avoid total failure due to network outages. */
+			do {
+				if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1)) {
+					upstream = &upstreams->upstreams[i];
+					break;
+				}
+				i++;
+				if (i >= upstreams->count)
+					i = 0;
+			} while (i != upstreams->current_stateful);
+			if (!upstream) {
+				/* We _really_ have nothing that authenticates well enough right now...
+				   leave to regular backoff logic. */
+				return NULL;
+			}
+			do {
+				i++;
+				if (i >= upstreams->count)
+					i = 0;
+				if (upstream_valid(&upstreams->upstreams[i], transport, netreq, 1) &&
+				    upstream_stats(&upstreams->upstreams[i]) > upstream_stats(upstream))
+					upstream = &upstreams->upstreams[i];
+			} while (i != upstreams->current_stateful);
+			upstream->conn_state = GETDNS_CONN_CLOSED;
+			upstream->conn_backoff_interval = 1;
+			_getdns_upstream_log(upstream, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_NOTICE,
+			    "%-40s : Upstream   : No valid upstreams for %s... promoting this backed-off upstream for re-try...\n",
+			    upstream->addr_str,
+			    upstream->transport == GETDNS_TRANSPORT_TLS ? "TLS" : "TCP");
+			return upstream;
+		}
 	}
 
 	/* Now select the specific upstream */
@@ -2081,7 +2124,7 @@ upstream_find_for_netreq(getdns_network_req *netreq)
 	/* Handle better, will give generic error*/
 	DEBUG_STUB("%s %-35s: MSG: %p No valid upstream! \n", STUB_DEBUG_SCHEDULE, __FUNC__, (void*)netreq);
 	_getdns_context_log(netreq->owner->context, GETDNS_LOG_UPSTREAM_STATS, GETDNS_LOG_ERR,
-	    "*FAILURE* no valid transports or upstreams available!\n");
+	    "   *FAILURE* no valid transports or upstreams available!\n");
 	return -1;
 }