ooni-probe-cli/internal/experiment/webconnectivity/cleartextflow.go
Simone Basso c2ea0b4704
feat(webconnectivity): try all the available THs (#980)
We introduce a fork of internal/httpx, named internal/httpapi, where there is a clear split between the concept of an API endpoint (such as https://0.th.ooni.org/) and of an API descriptor (such as using `GET` to access /api/v1/test-list/url).

Additionally, httpapi allows creating a SequenceCaller that tries to call a given API descriptor using multiple API endpoints. The SequenceCaller will stop once an endpoint works or when all the available endpoints have been tried unsuccessfully.

The definition of "success" is the following: we consider "failure" any error that occurs during the HTTP round trip or when reading the response body. We DO NOT consider "failure" errors (1) when parsing the input URL; (2) when the server returns >= 400; (3) when the server returns a string that does not parse as valid JSON. The idea of this classification of failures is that we ONLY want to retry when we see what looks like a network error that may be caused by (collateral or targeted) censorship.

We take advantage of the availability of this new package and we refactor web_connectivity@v0.4 and web_connectivity@v0.5 to use a SequenceCaller for calling the web connectivity TH API. This means that we will now try all the available THs advertised by the backend rather than just selecting and using the first one provided by the backend.

Because this diff is designed to be backported to the `release/3.16` branch, we have omitted additional changes to always use httpapi where we are currently using httpx. Yet, to remind ourselves about the need to do that, we have deprecated the httpx package. We will rewrite all the code currently using httpx to use httpapi as part of future work.

It is also worth noting that httpapi will allow us to refactor the backend code such that (1) we remove the code that selects a backend URL endpoint at the beginning and (2) we try several endpoints. The design of the code is such that we can add to the mix some endpoints whose `http.Client` is a special client that uses a tunnel. This will allow backend queries to fall back automatically.

Closes https://github.com/ooni/probe/issues/2353.

Related to https://github.com/ooni/probe/issues/1519.
2022-11-21 16:28:53 +01:00

296 lines
7.9 KiB
Go

package webconnectivity
//
// CleartextFlow
//
// Generated by `boilerplate' using the http template.
//
import (
"context"
"io"
"net"
"net/http"
"net/url"
"sync"
"time"
"github.com/ooni/probe-cli/v3/internal/atomicx"
"github.com/ooni/probe-cli/v3/internal/measurexlite"
"github.com/ooni/probe-cli/v3/internal/model"
"github.com/ooni/probe-cli/v3/internal/netxlite"
)
// CleartextFlow measures HTTP endpoints: it connects to a TCP endpoint
// and, when permitted by the priority selector, fetches a webpage using
// cleartext HTTP over the established connection.
//
// The zero value of this structure IS NOT valid and you MUST initialize
// all the fields marked as MANDATORY before using this structure.
type CleartextFlow struct {
// Address is the MANDATORY address to connect to. It must be in a
// form accepted by net.SplitHostPort (i.e., it must include a port).
Address string
// DNSCache is the MANDATORY DNS cache. This flow only forwards it to
// the DNS resolvers task that it starts when following a redirect.
DNSCache *DNSCache
// IDGenerator is the MANDATORY atomic int64 to generate task IDs.
IDGenerator *atomicx.Int64
// Logger is the MANDATORY logger to use.
Logger model.Logger
// NumRedirects is the MANDATORY counter of the number of redirects.
NumRedirects *NumRedirects
// TestKeys is MANDATORY and contains the TestKeys.
TestKeys *TestKeys
// ZeroTime is the MANDATORY measurement's zero time.
ZeroTime time.Time
// WaitGroup is the MANDATORY wait group this task belongs to.
WaitGroup *sync.WaitGroup
// CookieJar contains the OPTIONAL cookie jar, used for redirects.
CookieJar http.CookieJar
// FollowRedirects is OPTIONAL and instructs this flow
// to follow HTTP redirects (if any).
FollowRedirects bool
// HostHeader is the OPTIONAL host header to use. When empty, the
// host part of Address is used to build the request URL.
HostHeader string
// PrioSelector is the OPTIONAL priority selector to use to determine
// whether this flow is allowed to fetch the webpage. When this field
// is nil, the flow stops after the TCP connect.
PrioSelector *prioritySelector
// Referer contains the OPTIONAL referer, used for redirects.
Referer string
// UDPAddress is the OPTIONAL address of the UDP resolver to use. If this
// field is not set we use a default one (e.g., `8.8.8.8:53`).
UDPAddress string
// URLPath is the OPTIONAL URL path.
URLPath string
// URLRawQuery is the OPTIONAL URL raw query.
URLRawQuery string
}
// Start launches this flow in its own goroutine and returns immediately.
//
// The flow registers itself with t.WaitGroup before the goroutine is
// spawned and deregisters when Run returns, so the parent can wait for
// completion. The task ID is drawn from t.IDGenerator synchronously, i.e.,
// before the goroutine starts.
func (t *CleartextFlow) Start(ctx context.Context) {
	t.WaitGroup.Add(1)
	taskID := t.IDGenerator.Add(1)
	go func(id int64) {
		// signal the parent that this task has terminated
		defer t.WaitGroup.Done()
		t.Run(ctx, id)
	}(taskID)
}
// Run executes this flow synchronously in the calling goroutine.
//
// The flow proceeds in stages and stops at the first stage that fails:
//
//  1. TCP connect to t.Address (the connect results are always saved
//     into the test keys, regardless of the outcome);
//
//  2. ask the priority selector for the permission to fetch the webpage
//     (a nil selector means that we stop after the TCP connect);
//
//  3. build and perform the HTTP GET over the established connection;
//
//  4. possibly follow a redirect.
//
// The index argument identifies this task in logs and in the trace.
func (t *CleartextFlow) Run(parentCtx context.Context, index int64) {
	const (
		network      = "tcp"
		noALPN       = "" // no ALPN because we're not using TLS
		dialTimeout  = 10 * time.Second
		fetchTimeout = 10 * time.Second
	)

	// the trace collects the events generated by this flow
	trace := measurexlite.NewTrace(index, t.ZeroTime)

	// the operation logger tells the user what we are doing
	ol := measurexlite.NewOperationLogger(
		t.Logger, "[#%d] GET http://%s using %s", index, t.HostHeader, t.Address,
	)

	// stage 1: TCP connect
	dialCtx, dialCancel := context.WithTimeout(parentCtx, dialTimeout)
	defer dialCancel()
	conn, err := trace.NewDialerWithoutResolver(t.Logger).DialContext(dialCtx, network, t.Address)
	t.TestKeys.AppendTCPConnectResults(trace.TCPConnects()...)
	if err != nil {
		ol.Stop(err)
		return
	}
	defer func() {
		// save the network events just before closing the connection
		t.TestKeys.AppendNetworkEvents(trace.NetworkEvents()...)
		conn.Close()
	}()

	// stage 2: check whether we are the flow allowed to fetch the webpage
	if sel := t.PrioSelector; sel == nil || !sel.permissionToFetch(t.Address) {
		ol.Stop("stop after TCP connect")
		return
	}

	// stage 3: HTTP round trip using the connection we have just established
	txp := netxlite.NewHTTPTransport(
		t.Logger,
		netxlite.NewSingleUseDialer(conn),
		netxlite.NewNullTLSDialer(),
	)
	fetchCtx, fetchCancel := context.WithTimeout(parentCtx, fetchTimeout)
	defer fetchCancel()
	req, err := t.newHTTPRequest(fetchCtx)
	if err != nil {
		// An empty referer means that the URL comes from our backend or from
		// the user, hence failing to create a request is a fundamental failure.
		// Otherwise, the URL was provided by a website through a redirect and
		// we do not flag it, because we want to see the measurement submitted.
		if t.Referer == "" {
			t.TestKeys.SetFundamentalFailure(err)
		}
		ol.Stop(err)
		return
	}
	// The response body is currently unused by this flow; additional
	// processing could be inserted after this call if needed.
	resp, _, err := t.httpTransaction(fetchCtx, network, t.Address, noALPN, txp, req, trace)
	if err != nil {
		ol.Stop(err)
		return
	}

	// stage 4: if enabled, follow possible redirects
	t.maybeFollowRedirects(parentCtx, resp)

	// completed successfully
	ol.Stop(nil)
}
// urlHost computes the host (possibly followed by a port) to include
// into the request URL.
//
// The host name is t.HostHeader or, when the host header is empty, the
// host part of t.Address. The port is the one of t.Address and it is
// omitted when it is the default port for the "http" scheme.
//
// This function takes care of always returning a well-formed URL host:
//
//  1. when the host header already carries an explicit port (e.g.,
//     "example.com:8080"), we return it verbatim rather than appending
//     a second port and wrapping the whole string into brackets;
//
//  2. IPv6 literals are always wrapped into brackets, also when
//     we omit the port (e.g., "[::1]" rather than "::1");
//
//  3. a host header that is an already-bracketed IPv6 literal (e.g.,
//     "[::1]") is not wrapped into brackets a second time.
//
// It returns an error when t.Address cannot be split into host and port,
// which indicates a bug in the code that configured this flow.
func (t *CleartextFlow) urlHost(scheme string) (string, error) {
	addr, port, err := net.SplitHostPort(t.Address)
	if err != nil {
		t.Logger.Warnf("BUG: net.SplitHostPort failed for %s: %s", t.Address, err.Error())
		return "", err
	}
	host := t.HostHeader
	if host == "" {
		host = addr
	} else if name, hdrPort, splitErr := net.SplitHostPort(host); splitErr == nil {
		if hdrPort != "" {
			// the host header is already in the host:port form
			return host, nil
		}
		// drop the dangling colon of a header such as "example.com:"
		host = name
	}
	// strip the brackets of an IPv6 literal such that net.JoinHostPort
	// does not add brackets around an already-bracketed host
	if n := len(host); n >= 2 && host[0] == '[' && host[n-1] == ']' {
		host = host[1 : n-1]
	}
	// net.JoinHostPort adds brackets around IPv6 literals; we rely on
	// it in both cases such that the host is formatted consistently
	hostport := net.JoinHostPort(host, port)
	if port == "80" && scheme == "http" {
		// omit the default port while keeping the brackets (if any)
		return hostport[:len(hostport)-len(":80")], nil
	}
	return hostport, nil
}
// newHTTPRequest builds the GET request issued by this flow.
//
// The URL uses the "http" scheme, the host computed by urlHost, and
// the configured path and raw query. The request carries our standard
// Accept, Accept-Language and User-Agent headers along with the configured
// host header and referer. When a cookie jar is configured, we also add
// the cookies that the jar holds for the request URL.
//
// It fails when we cannot compute the URL host or create the request.
func (t *CleartextFlow) newHTTPRequest(ctx context.Context) (*http.Request, error) {
	const scheme = "http"
	host, err := t.urlHost(scheme)
	if err != nil {
		return nil, err
	}
	target := &url.URL{
		Scheme:   scheme,
		Host:     host,
		Path:     t.URLPath,
		RawQuery: t.URLRawQuery,
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil)
	if err != nil {
		return nil, err
	}
	req.Host = t.HostHeader
	headers := [][2]string{
		{"Host", t.HostHeader},
		{"Accept", model.HTTPHeaderAccept},
		{"Accept-Language", model.HTTPHeaderAcceptLanguage},
		{"Referer", t.Referer},
		{"User-Agent", model.HTTPHeaderUserAgent},
	}
	for _, hdr := range headers {
		req.Header.Set(hdr[0], hdr[1])
	}
	if jar := t.CookieJar; jar != nil {
		for _, c := range jar.Cookies(target) {
			req.AddCookie(c)
		}
	}
	return req, nil
}
// httpTransaction performs the round trip, reads the response body, and
// records what happened into the test keys.
//
// We emit an "http_transaction_start" annotation before the round trip and
// an "http_transaction_done" annotation after we have finished reading the
// body (or after the round trip has failed). We then append an archival
// HTTP request entry describing the whole transaction. The body is read
// through a reader limited to 1<<19 bytes. When a cookie jar is
// configured, the cookies set by the response are stored into it.
//
// The returned error is either the round trip error or the error that
// occurred when reading the body; in the latter case the response
// is not nil.
func (t *CleartextFlow) httpTransaction(ctx context.Context, network, address, alpn string,
	txp model.HTTPTransport, req *http.Request, trace *measurexlite.Trace) (*http.Response, []byte, error) {
	const maxbody = 1 << 19

	t0 := trace.TimeSince(trace.ZeroTime)
	t.TestKeys.AppendNetworkEvents(measurexlite.NewAnnotationArchivalNetworkEvent(
		trace.Index, t0, "http_transaction_start",
	))

	var payload []byte
	httpResp, err := txp.RoundTrip(req)
	if err == nil {
		// the body is closed when this function returns
		defer httpResp.Body.Close()
		if t.CookieJar != nil {
			if cookies := httpResp.Cookies(); len(cookies) > 0 {
				t.CookieJar.SetCookies(req.URL, cookies)
			}
		}
		payload, err = StreamAllContext(ctx, io.LimitReader(httpResp.Body, maxbody))
	}

	t1 := trace.TimeSince(trace.ZeroTime)
	t.TestKeys.AppendNetworkEvents(measurexlite.NewAnnotationArchivalNetworkEvent(
		trace.Index, t1, "http_transaction_done",
	))

	t.TestKeys.AppendRequests(measurexlite.NewArchivalHTTPRequestResult(
		trace.Index,
		t0,
		network,
		address,
		alpn,
		txp.Network(),
		req,
		httpResp,
		maxbody,
		payload,
		err,
		t1,
	))
	return httpResp, payload, err
}
// maybeFollowRedirects starts a new DNS resolvers task for the redirect
// target, provided that following redirects is enabled, that the redirect
// counter allows one more redirect, and that the response is a 301, 302,
// 307, or 308 with a usable Location header.
//
// The new task inherits the cookie jar, the DNS cache, the ID generator,
// the logger, the redirect counter, the test keys, the zero time, the wait
// group and the UDP resolver address from this flow, and uses the URL of
// the request that produced resp as its referer.
func (t *CleartextFlow) maybeFollowRedirects(ctx context.Context, resp *http.Response) {
	if !t.FollowRedirects {
		return // not configured
	}
	if !t.NumRedirects.CanFollowOneMoreRedirect() {
		return // too many redirects
	}

	switch resp.StatusCode {
	case http.StatusMovedPermanently,
		http.StatusFound,
		http.StatusTemporaryRedirect,
		http.StatusPermanentRedirect:
		// this is a redirect that we should follow
	default:
		return // no redirect to follow
	}

	target, err := resp.Location()
	if err != nil {
		return // broken response from server
	}
	t.Logger.Infof("redirect to: %s", target.String())

	next := &DNSResolvers{
		CookieJar:    t.CookieJar,
		DNSCache:     t.DNSCache,
		Domain:       target.Hostname(),
		IDGenerator:  t.IDGenerator,
		Logger:       t.Logger,
		NumRedirects: t.NumRedirects,
		TestKeys:     t.TestKeys,
		URL:          target,
		ZeroTime:     t.ZeroTime,
		WaitGroup:    t.WaitGroup,
		Referer:      resp.Request.URL.String(),
		Session:      nil, // no need to issue another control request
		TestHelpers:  nil, // ditto
		UDPAddress:   t.UDPAddress,
	}
	next.Start(ctx)
}