ooni-probe-cli/internal/netxlite/integration_test.go
Simone Basso 16f7407b13
feat(netxlite): observe additional DNS-over-UDP responses (#762)
This diff introduces support for observing additional DNS-over-UDP
responses in some censored environments (e.g. China).

After some uncertainty around whether to use connected or unconnected
UDP sockets, I eventually settled for connected.

Here's a recap:

|                         | connected | unconnected |
| ----------------------- | --------- | ----------- |
| see ICMP errors         | ✔️         |           |
| responses from any server |         | ✔️           |

Because most if not all DNS resolvers expect answers from exactly
the same servers to which they sent the query, I would say that
it's more important to have some limited ability of observing the
effect of ICMP errors (e.g., host_unreachable when we set a low
TTL and send out a query to a server).

Therefore, my choice was to modify the existing DNS-over-UDP transport.

Here's an overview of the changes:

1. introduce a new API for performing an async round trip that returns
a channel wrapper where all responses are posted. The channel will not ever
be closed, so the reader needs to use select for safely reading. If the
reader uses the wrapper's Next or TryNextResponses methods, these details
do not matter because they already implement a safe reading pattern.

2. the async round trip API performs the round trip in the background
and stops processing when it sees the first error.

3. the background running code will use an overall deadline derived
from the DNSTransport.IOTimeout field to know when to stop.

4. the background running code will additionally stop running if
no one is reading the channel and there are no empty slots in the
channel's buffer.

5. the RoundTrip method has been rewritten in terms of the async API.

The design I'm using here implements the proposal for async round
trips defined at https://github.com/ooni/probe/issues/2099. I have
chosen not to make all transports async because the DNS transport
seems the only transport that needs to also work in async mode.

While there, I noticed that we were not propagating CloseIdleConnections
to the underlying dialer, which was potentially wrong, so I did it.
2022-05-26 20:09:00 +02:00

551 lines
14 KiB
Go

package netxlite_test
import (
"context"
"crypto/tls"
"fmt"
"net"
"net/http"
"net/http/httptest"
"net/url"
"testing"
"time"
"github.com/apex/log"
"github.com/lucas-clemente/quic-go"
"github.com/ooni/probe-cli/v3/internal/model"
"github.com/ooni/probe-cli/v3/internal/netxlite"
"github.com/ooni/probe-cli/v3/internal/netxlite/filtering"
"github.com/ooni/probe-cli/v3/internal/netxlite/quictesting"
"github.com/ooni/probe-cli/v3/internal/runtimex"
utls "gitlab.com/yawning/utls.git"
)
// This set of integration tests ensures that we continue to
// be able to measure the conditions we care about.

// TestMeasureWithSystemResolver exercises the resolver returned by
// netxlite.NewResolverStdlib under the conditions we care about:
//
// - success
//
// - nxdomain
//
// - timeout
//
// The test is skipped in short mode.
func TestMeasureWithSystemResolver(t *testing.T) {
	if testing.Short() {
		t.Skip("skip test in short mode")
	}

	t.Run("on success", func(t *testing.T) {
		reso := netxlite.NewResolverStdlib(log.Log)
		defer reso.CloseIdleConnections()
		addrs, err := reso.LookupHost(context.Background(), "dns.google.com")
		switch {
		case err != nil:
			t.Fatal(err)
		case addrs == nil:
			t.Fatal("expected non-nil result here")
		}
	})

	t.Run("for nxdomain", func(t *testing.T) {
		reso := netxlite.NewResolverStdlib(log.Log)
		defer reso.CloseIdleConnections()
		// The lookup of this name is expected to fail with NXDOMAIN.
		addrs, err := reso.LookupHost(context.Background(), "antani.ooni.org")
		switch {
		case err == nil || err.Error() != netxlite.FailureDNSNXDOMAINError:
			t.Fatal("not the error we expected", err)
		case addrs != nil:
			t.Fatal("expected nil result here")
		}
	})

	t.Run("for timeout", func(t *testing.T) {
		reso := netxlite.NewResolverStdlib(log.Log)
		defer reso.CloseIdleConnections()
		// The deadline is so short that we expect the lookup to time out.
		ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond)
		defer cancel()
		addrs, err := reso.LookupHost(ctx, "ooni.org")
		switch {
		case err == nil || err.Error() != netxlite.FailureGenericTimeoutError:
			t.Fatal("not the error we expected", err)
		case addrs != nil:
			t.Fatal("expected nil result here")
		}
	})
}
// TestMeasureWithUDPResolver exercises the resolver returned by
// netxlite.NewResolverUDP under the conditions we care about:
//
// - success
//
// - nxdomain
//
// - refused
//
// - timeout
//
// The success case queries 8.8.4.4:53, while each failure case queries a
// local filtering.DNSServer configured to return a specific action.
// The test is skipped in short mode.
func TestMeasureWithUDPResolver(t *testing.T) {
	if testing.Short() {
		t.Skip("skip test in short mode")
	}

	t.Run("on success", func(t *testing.T) {
		dialer := netxlite.NewDialerWithoutResolver(log.Log)
		reso := netxlite.NewResolverUDP(log.Log, dialer, "8.8.4.4:53")
		defer reso.CloseIdleConnections()
		addrs, err := reso.LookupHost(context.Background(), "dns.google.com")
		switch {
		case err != nil:
			t.Fatal(err)
		case addrs == nil:
			t.Fatal("expected non-nil result here")
		}
	})

	// All the failure conditions share the same flow and only differ in
	// the action returned by the local server and in the expected failure.
	failures := []struct {
		name     string
		action   filtering.DNSAction
		expected string
	}{
		{"for nxdomain", filtering.DNSActionNXDOMAIN, netxlite.FailureDNSNXDOMAINError},
		{"for refused", filtering.DNSActionRefused, netxlite.FailureDNSRefusedError},
		{"for timeout", filtering.DNSActionTimeout, netxlite.FailureGenericTimeoutError},
	}
	for _, tc := range failures {
		tc := tc // give each subtest its own copy (needed before Go 1.22)
		t.Run(tc.name, func(t *testing.T) {
			server := &filtering.DNSServer{
				OnQuery: func(domain string) filtering.DNSAction {
					return tc.action
				},
			}
			listener, err := server.Start("127.0.0.1:0")
			if err != nil {
				t.Fatal(err)
			}
			defer listener.Close()
			dialer := netxlite.NewDialerWithoutResolver(log.Log)
			reso := netxlite.NewResolverUDP(log.Log, dialer, listener.LocalAddr().String())
			defer reso.CloseIdleConnections()
			addrs, err := reso.LookupHost(context.Background(), "ooni.org")
			switch {
			case err == nil || err.Error() != tc.expected:
				t.Fatal("not the error we expected", err)
			case addrs != nil:
				t.Fatal("expected nil result here")
			}
		})
	}
}
// TestMeasureWithDialer exercises the dialer returned by
// netxlite.NewDialerWithoutResolver under the conditions we care about:
//
// - success
//
// - connection refused
//
// - timeout
//
// The test is skipped in short mode.
func TestMeasureWithDialer(t *testing.T) {
	if testing.Short() {
		t.Skip("skip test in short mode")
	}

	t.Run("on success", func(t *testing.T) {
		dialer := netxlite.NewDialerWithoutResolver(log.Log)
		defer dialer.CloseIdleConnections()
		conn, err := dialer.DialContext(context.Background(), "tcp", "8.8.4.4:443")
		if err != nil {
			t.Fatal(err)
		}
		if conn == nil {
			t.Fatal("expected non-nil conn here")
		}
		conn.Close()
	})

	t.Run("on connection refused", func(t *testing.T) {
		dialer := netxlite.NewDialerWithoutResolver(log.Log)
		defer dialer.CloseIdleConnections()
		// Here we assume that no-one is listening on 127.0.0.1:1
		conn, err := dialer.DialContext(context.Background(), "tcp", "127.0.0.1:1")
		switch {
		case err == nil || err.Error() != netxlite.FailureConnectionRefused:
			t.Fatal("not the error we expected", err)
		case conn != nil:
			t.Fatal("expected nil conn here")
		}
	})

	t.Run("on timeout", func(t *testing.T) {
		// Note: this test has been flaky at times on macOS. In particular,
		// the following failure was observed on 2021-09-29:
		//
		// ```
		// --- FAIL: TestMeasureWithDialer (8.25s)
		// --- FAIL: TestMeasureWithDialer/on_timeout (8.22s)
		// integration_test.go:233: not the error we expected timed_out
		// ```
		//
		// The explanation given for this failure is that the ETIMEDOUT
		// coming from the kernel races with the timeout we configure. For
		// this reason, the context timeout used below was made smaller.
		dialer := netxlite.NewDialerWithoutResolver(log.Log)
		defer dialer.CloseIdleConnections()
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		// Here we assume 8.8.4.4:1 is filtered
		conn, err := dialer.DialContext(ctx, "tcp", "8.8.4.4:1")
		switch {
		case err == nil || err.Error() != netxlite.FailureGenericTimeoutError:
			t.Fatal("not the error we expected", err)
		case conn != nil:
			t.Fatal("expected nil conn here")
		}
	})
}
// TestMeasureWithTLSHandshaker exercises the stdlib and the utls TLS
// handshakers under the conditions we care about:
//
// - success
//
// - connection reset
//
// - timeout
//
// The success case handshakes with 8.8.4.4:443, while each failure case
// handshakes with a local filtering.TLSProxy configured to return a
// specific action. The test is skipped in short mode.
func TestMeasureWithTLSHandshaker(t *testing.T) {
	if testing.Short() {
		t.Skip("skip test in short mode")
	}

	// dial establishes the TCP connection on top of which we handshake.
	dial := func(ctx context.Context, address string) (net.Conn, error) {
		d := netxlite.NewDialerWithoutResolver(log.Log)
		return d.DialContext(ctx, "tcp", address)
	}

	// newTLSConfig returns a fresh copy of the config shared by all flows.
	newTLSConfig := func() *tls.Config {
		return &tls.Config{
			ServerName: "dns.google",
			NextProtos: []string{"h2", "http/1.1"},
			RootCAs:    netxlite.NewDefaultCertPool(),
		}
	}

	// successFlow returns nil when th can complete the handshake.
	successFlow := func(th model.TLSHandshaker) error {
		ctx := context.Background()
		conn, err := dial(ctx, "8.8.4.4:443")
		if err != nil {
			return fmt.Errorf("dial failed: %w", err)
		}
		defer conn.Close()
		tconn, _, err := th.Handshake(ctx, conn, newTLSConfig())
		if err != nil {
			return fmt.Errorf("tls handshake failed: %w", err)
		}
		tconn.Close()
		return nil
	}

	// failureFlow handshakes with a local proxy that returns action for
	// every incoming SNI. It returns nil only when the handshake fails
	// with the expected failure string and yields a nil connection.
	failureFlow := func(th model.TLSHandshaker, action filtering.TLSAction, expected string) error {
		tlsProxy := &filtering.TLSProxy{
			OnIncomingSNI: func(sni string) filtering.TLSAction {
				return action
			},
		}
		listener, err := tlsProxy.Start("127.0.0.1:0")
		if err != nil {
			return fmt.Errorf("cannot start proxy: %w", err)
		}
		defer listener.Close()
		ctx := context.Background()
		conn, err := dial(ctx, listener.Addr().String())
		if err != nil {
			return fmt.Errorf("dial failed: %w", err)
		}
		defer conn.Close()
		tconn, _, err := th.Handshake(ctx, conn, newTLSConfig())
		if err == nil {
			return fmt.Errorf("tls handshake succeeded unexpectedly")
		}
		if err.Error() != expected {
			return fmt.Errorf("not the error we expected: %w", err)
		}
		if tconn != nil {
			return fmt.Errorf("expected nil tconn here")
		}
		return nil
	}

	// flows lists the conditions we check for every handshaker.
	flows := []struct {
		name string
		run  func(th model.TLSHandshaker) error
	}{{
		name: "on success",
		run:  successFlow,
	}, {
		name: "on connection reset",
		run: func(th model.TLSHandshaker) error {
			return failureFlow(th, filtering.TLSActionReset, netxlite.FailureConnectionReset)
		},
	}, {
		name: "on timeout",
		run: func(th model.TLSHandshaker) error {
			return failureFlow(th, filtering.TLSActionTimeout, netxlite.FailureGenericTimeoutError)
		},
	}}

	// variants lists the handshakers under test. We use factories so
	// that each subtest gets its own handshaker instance.
	variants := []struct {
		name          string
		newHandshaker func() model.TLSHandshaker
	}{{
		name: "for stdlib handshaker",
		newHandshaker: func() model.TLSHandshaker {
			return netxlite.NewTLSHandshakerStdlib(log.Log)
		},
	}, {
		name: "for utls handshaker",
		newHandshaker: func() model.TLSHandshaker {
			return netxlite.NewTLSHandshakerUTLS(log.Log, &utls.HelloFirefox_55)
		},
	}}

	for _, variant := range variants {
		variant := variant // give each subtest its own copy (needed before Go 1.22)
		t.Run(variant.name, func(t *testing.T) {
			for _, flow := range flows {
				flow := flow // see above
				t.Run(flow.name, func(t *testing.T) {
					if err := flow.run(variant.newHandshaker()); err != nil {
						t.Fatal(err)
					}
				})
			}
		})
	}
}
func TestMeasureWithQUICDialer(t *testing.T) {
if testing.Short() {
t.Skip("skip test in short mode")
}
// TODO(bassosimone): here we're not testing the case in which
// the certificate is invalid for the required SNI.
//
// Measurement conditions we care about:
//
// - success
//
// - timeout
//
t.Run("on success", func(t *testing.T) {
ql := netxlite.NewQUICListener()
d := netxlite.NewQUICDialerWithoutResolver(ql, log.Log)
defer d.CloseIdleConnections()
ctx := context.Background()
config := &tls.Config{
ServerName: quictesting.Domain,
NextProtos: []string{"h3"},
RootCAs: netxlite.NewDefaultCertPool(),
}
sess, err := d.DialContext(ctx, "udp", quictesting.Endpoint("443"), config, &quic.Config{})
if err != nil {
t.Fatal(err)
}
if sess == nil {
t.Fatal("expected non-nil sess here")
}
sess.CloseWithError(0, "")
})
t.Run("on timeout", func(t *testing.T) {
ql := netxlite.NewQUICListener()
d := netxlite.NewQUICDialerWithoutResolver(ql, log.Log)
defer d.CloseIdleConnections()
ctx := context.Background()
config := &tls.Config{
ServerName: quictesting.Domain,
NextProtos: []string{"h3"},
RootCAs: netxlite.NewDefaultCertPool(),
}
// Here we assume <target-address>:1 is filtered
sess, err := d.DialContext(ctx, "udp", quictesting.Endpoint("1"), config, &quic.Config{})
if err == nil || err.Error() != netxlite.FailureGenericTimeoutError {
t.Fatal("not the error we expected", err)
}
if sess != nil {
t.Fatal("expected nil sess here")
}
})
}
// TestHTTPTransport checks that the HTTP transports created by netxlite
// can fetch a resource and that we can read the response body when the
// server closes the connection right after sending the headers.
//
// The test is skipped in short mode.
func TestHTTPTransport(t *testing.T) {
	if testing.Short() {
		t.Skip("skip test in short mode")
	}

	t.Run("works as intended", func(t *testing.T) {
		d := netxlite.NewDialerWithResolver(log.Log, netxlite.NewResolverStdlib(log.Log))
		td := netxlite.NewTLSDialer(d, netxlite.NewTLSHandshakerStdlib(log.Log))
		txp := netxlite.NewHTTPTransport(log.Log, d, td)
		client := &http.Client{Transport: txp}
		// Deferred so that the cleanup also runs when t.Fatal stops the subtest.
		defer client.CloseIdleConnections()
		resp, err := client.Get("https://www.google.com/robots.txt")
		if err != nil {
			t.Fatal(err)
		}
		resp.Body.Close()
	})

	t.Run("we can read the body when the connection is closed", func(t *testing.T) {
		// See https://github.com/ooni/probe/issues/1965
		srvr := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			hj := w.(http.Hijacker) // panic if not possible
			conn, bufrw, err := hj.Hijack()
			runtimex.PanicOnError(err, "hj.Hijack failed")
			// Send just the headers of a redirect and then close the
			// connection without sending any body.
			bufrw.WriteString("HTTP/1.0 302 Found\r\n")
			bufrw.WriteString("Location: /text\r\n\r\n")
			bufrw.Flush()
			conn.Close()
		}))
		defer srvr.Close()
		txp := netxlite.NewHTTPTransportStdlib(model.DiscardLogger)
		// Release the transport's connections like the other tests do.
		defer txp.CloseIdleConnections()
		req, err := http.NewRequest("GET", srvr.URL, nil)
		if err != nil {
			t.Fatal(err)
		}
		// We call RoundTrip directly and read whatever body we got.
		resp, err := txp.RoundTrip(req)
		if err != nil {
			t.Fatal(err)
		}
		defer resp.Body.Close()
		data, err := netxlite.ReadAllContext(req.Context(), resp.Body)
		if err != nil {
			t.Fatal(err)
		}
		t.Log(string(data))
	})
}
// TestHTTP3Transport checks that the transport returned by
// netxlite.NewHTTP3Transport can fetch the root path of the
// quictesting.Domain host over HTTPS.
//
// The test is skipped in short mode.
func TestHTTP3Transport(t *testing.T) {
	if testing.Short() {
		t.Skip("skip test in short mode")
	}

	t.Run("works as intended", func(t *testing.T) {
		listener := netxlite.NewQUICListener()
		reso := netxlite.NewResolverStdlib(log.Log)
		dialer := netxlite.NewQUICDialerWithResolver(listener, log.Log, reso)
		txp := netxlite.NewHTTP3Transport(log.Log, dialer, &tls.Config{})
		client := &http.Client{Transport: txp}
		target := url.URL{
			Scheme: "https",
			Host:   quictesting.Domain,
			Path:   "/",
		}
		resp, err := client.Get(target.String())
		if err != nil {
			t.Fatal(err)
		}
		resp.Body.Close()
		txp.CloseIdleConnections()
	})
}