ooni-probe-cli/internal/engine/netx/archival/archival.go
Simone Basso fc19c9901a
fix(webconnectivity): expose network events (#258)
* fix(webconnectivity): expose network events

By not exposing network events in webconnectivity, we are missing
several interesting, explanatory data points.

This diff fixes the issue by:

1. enriching the definition of network events to include extra
data useful for performing (manual) data analysis;

2. adding a tags field to network events such that we can add
tags to specific events and understand where they come from;

3. exposing all the (tagged) network events that happen when running
a webconnectivity experiment.

See https://github.com/ooni/probe-engine/issues/1157.

* progress

* more work towards landing this diff

* Apply suggestions from code review
2021-03-23 16:46:46 +01:00

585 lines
18 KiB
Go

// Package archival contains data formats used for archival.
//
// See https://github.com/ooni/spec.
package archival
import (
"crypto/x509"
"encoding/base64"
"encoding/json"
"errors"
"net"
"net/http"
"sort"
"strconv"
"strings"
"time"
"unicode/utf8"
"github.com/ooni/probe-cli/v3/internal/engine/geolocate"
"github.com/ooni/probe-cli/v3/internal/engine/model"
"github.com/ooni/probe-cli/v3/internal/engine/netx/errorx"
"github.com/ooni/probe-cli/v3/internal/engine/netx/trace"
)
// ExtSpec describes a data format extension
type ExtSpec struct {
Name string // extension name
V int64 // extension version
}
// AddTo adds the current ExtSpec to the specified measurement
func (spec ExtSpec) AddTo(m *model.Measurement) {
if m.Extensions == nil {
m.Extensions = make(map[string]int64)
}
m.Extensions[spec.Name] = spec.V
}
var (
// ExtDNS is the version of df-002-dnst.md
ExtDNS = ExtSpec{Name: "dnst", V: 0}
// ExtNetevents is the version of df-008-netevents.md
ExtNetevents = ExtSpec{Name: "netevents", V: 0}
// ExtHTTP is the version of df-001-httpt.md
ExtHTTP = ExtSpec{Name: "httpt", V: 0}
// ExtTCPConnect is the version of df-005-tcpconnect.md
ExtTCPConnect = ExtSpec{Name: "tcpconnect", V: 0}
// ExtTLSHandshake is the version of df-006-tlshandshake.md
ExtTLSHandshake = ExtSpec{Name: "tlshandshake", V: 0}
// ExtTunnel is the version of df-009-tunnel.md
ExtTunnel = ExtSpec{Name: "tunnel", V: 0}
)
// TCPConnectStatus contains the TCP connect status.
//
// The Blocked field breaks the separation between measurement and analysis
// we have been enforcing for quite some time now. It is a legacy from the
// Web Connectivity experiment and it should be here because of that.
type TCPConnectStatus struct {
Blocked *bool `json:"blocked,omitempty"` // Web Connectivity only
Failure *string `json:"failure"`
Success bool `json:"success"`
}
// TCPConnectEntry contains one of the entries that are part
// of the "tcp_connect" key of a OONI report.
type TCPConnectEntry struct {
ConnID int64 `json:"conn_id,omitempty"`
DialID int64 `json:"dial_id,omitempty"`
IP string `json:"ip"`
Port int `json:"port"`
Status TCPConnectStatus `json:"status"`
T float64 `json:"t"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
// NewTCPConnectList creates a new TCPConnectList
func NewTCPConnectList(begin time.Time, events []trace.Event) []TCPConnectEntry {
var out []TCPConnectEntry
for _, event := range events {
if event.Name != errorx.ConnectOperation {
continue
}
if event.Proto != "tcp" {
continue
}
// We assume Go is passing us legit data structures
ip, sport, _ := net.SplitHostPort(event.Address)
iport, _ := strconv.Atoi(sport)
out = append(out, TCPConnectEntry{
IP: ip,
Port: iport,
Status: TCPConnectStatus{
Failure: NewFailure(event.Err),
Success: event.Err == nil,
},
T: event.Time.Sub(begin).Seconds(),
})
}
return out
}
// NewFailure creates a failure nullable string from the given error
func NewFailure(err error) *string {
if err == nil {
return nil
}
// The following code guarantees that the error is always wrapped even
// when we could not actually hit our code that does the wrapping. A case
// in which this happen is with context deadline for HTTP.
err = errorx.SafeErrWrapperBuilder{
Error: err,
Operation: errorx.TopLevelOperation,
}.MaybeBuild()
errWrapper := err.(*errorx.ErrWrapper)
s := errWrapper.Failure
if s == "" {
s = "unknown_failure: errWrapper.Failure is empty"
}
return &s
}
// NewFailedOperation creates a failed operation string from the given error.
func NewFailedOperation(err error) *string {
if err == nil {
return nil
}
var (
errWrapper *errorx.ErrWrapper
s = errorx.UnknownOperation
)
if errors.As(err, &errWrapper) && errWrapper.Operation != "" {
s = errWrapper.Operation
}
return &s
}
// HTTPTor contains Tor information
type HTTPTor struct {
ExitIP *string `json:"exit_ip"`
ExitName *string `json:"exit_name"`
IsTor bool `json:"is_tor"`
}
// MaybeBinaryValue is a possibly binary string. We use this helper class
// to define a custom JSON encoder that allows us to choose the proper
// representation depending on whether the Value field is valid UTF-8 or not.
type MaybeBinaryValue struct {
Value string
}
// MarshalJSON marshals a string-like to JSON following the OONI spec that
// says that UTF-8 content is represened as string and non-UTF-8 content is
// instead represented using `{"format":"base64","data":"..."}`.
func (hb MaybeBinaryValue) MarshalJSON() ([]byte, error) {
if utf8.ValidString(hb.Value) {
return json.Marshal(hb.Value)
}
er := make(map[string]string)
er["format"] = "base64"
er["data"] = base64.StdEncoding.EncodeToString([]byte(hb.Value))
return json.Marshal(er)
}
// UnmarshalJSON is the opposite of MarshalJSON.
func (hb *MaybeBinaryValue) UnmarshalJSON(d []byte) error {
if err := json.Unmarshal(d, &hb.Value); err == nil {
return nil
}
er := make(map[string]string)
if err := json.Unmarshal(d, &er); err != nil {
return err
}
if v, ok := er["format"]; !ok || v != "base64" {
return errors.New("missing or invalid format field")
}
if _, ok := er["data"]; !ok {
return errors.New("missing data field")
}
b64, err := base64.StdEncoding.DecodeString(er["data"])
if err != nil {
return err
}
hb.Value = string(b64)
return nil
}
// HTTPBody is an HTTP body. As an implementation note, this type must be
// an alias for the MaybeBinaryValue type, otherwise the specific serialisation
// mechanism implemented by MaybeBinaryValue is not working.
type HTTPBody = MaybeBinaryValue
// HTTPHeader is a single HTTP header.
type HTTPHeader struct {
Key string
Value MaybeBinaryValue
}
// MarshalJSON marshals a single HTTP header to a tuple where the first
// element is a string and the second element is maybe-binary data.
func (hh HTTPHeader) MarshalJSON() ([]byte, error) {
if utf8.ValidString(hh.Value.Value) {
return json.Marshal([]string{hh.Key, hh.Value.Value})
}
value := make(map[string]string)
value["format"] = "base64"
value["data"] = base64.StdEncoding.EncodeToString([]byte(hh.Value.Value))
return json.Marshal([]interface{}{hh.Key, value})
}
// UnmarshalJSON is the opposite of MarshalJSON.
func (hh *HTTPHeader) UnmarshalJSON(d []byte) error {
var pair []interface{}
if err := json.Unmarshal(d, &pair); err != nil {
return err
}
if len(pair) != 2 {
return errors.New("unexpected pair length")
}
key, ok := pair[0].(string)
if !ok {
return errors.New("the key is not a string")
}
value, ok := pair[1].(string)
if !ok {
mapvalue, ok := pair[1].(map[string]interface{})
if !ok {
return errors.New("the value is neither a string nor a map[string]interface{}")
}
if _, ok := mapvalue["format"]; !ok {
return errors.New("missing format")
}
if v, ok := mapvalue["format"].(string); !ok || v != "base64" {
return errors.New("invalid format")
}
if _, ok := mapvalue["data"]; !ok {
return errors.New("missing data field")
}
v, ok := mapvalue["data"].(string)
if !ok {
return errors.New("the data field is not a string")
}
b64, err := base64.StdEncoding.DecodeString(v)
if err != nil {
return err
}
value = string(b64)
}
hh.Key, hh.Value = key, MaybeBinaryValue{Value: value}
return nil
}
// HTTPRequest contains an HTTP request.
//
// Headers are a map in Web Connectivity data format but
// we have added support for a list since January 2020.
type HTTPRequest struct {
Body HTTPBody `json:"body"`
BodyIsTruncated bool `json:"body_is_truncated"`
HeadersList []HTTPHeader `json:"headers_list"`
Headers map[string]MaybeBinaryValue `json:"headers"`
Method string `json:"method"`
Tor HTTPTor `json:"tor"`
Transport string `json:"x_transport"`
URL string `json:"url"`
}
// HTTPResponse contains an HTTP response.
//
// Headers are a map in Web Connectivity data format but
// we have added support for a list since January 2020.
type HTTPResponse struct {
Body HTTPBody `json:"body"`
BodyIsTruncated bool `json:"body_is_truncated"`
Code int64 `json:"code"`
HeadersList []HTTPHeader `json:"headers_list"`
Headers map[string]MaybeBinaryValue `json:"headers"`
// The following fields are not serialised but are useful to simplify
// analysing the measurements in telegram, whatsapp, etc.
Locations []string `json:"-"`
}
// RequestEntry is one of the entries that are part of
// the "requests" key of a OONI report.
type RequestEntry struct {
Failure *string `json:"failure"`
Request HTTPRequest `json:"request"`
Response HTTPResponse `json:"response"`
T float64 `json:"t"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
func addheaders(
source http.Header,
destList *[]HTTPHeader,
destMap *map[string]MaybeBinaryValue,
) {
for key, values := range source {
for index, value := range values {
value := MaybeBinaryValue{Value: value}
// With the map representation we can only represent a single
// value for every key. Hence the list representation.
if index == 0 {
(*destMap)[key] = value
}
*destList = append(*destList, HTTPHeader{
Key: key,
Value: value,
})
}
}
sort.Slice(*destList, func(i, j int) bool {
return (*destList)[i].Key < (*destList)[j].Key
})
}
// NewRequestList returns the list for "requests"
func NewRequestList(begin time.Time, events []trace.Event) []RequestEntry {
// OONI wants the last request to appear first
var out []RequestEntry
tmp := newRequestList(begin, events)
for i := len(tmp) - 1; i >= 0; i-- {
out = append(out, tmp[i])
}
return out
}
func newRequestList(begin time.Time, events []trace.Event) []RequestEntry {
var (
out []RequestEntry
entry RequestEntry
)
for _, ev := range events {
switch ev.Name {
case "http_transaction_start":
entry = RequestEntry{}
entry.T = ev.Time.Sub(begin).Seconds()
case "http_request_body_snapshot":
entry.Request.Body.Value = string(ev.Data)
entry.Request.BodyIsTruncated = ev.DataIsTruncated
case "http_request_metadata":
entry.Request.Headers = make(map[string]MaybeBinaryValue)
addheaders(
ev.HTTPHeaders, &entry.Request.HeadersList, &entry.Request.Headers)
entry.Request.Method = ev.HTTPMethod
entry.Request.URL = ev.HTTPURL
entry.Request.Transport = ev.Transport
case "http_response_metadata":
entry.Response.Headers = make(map[string]MaybeBinaryValue)
addheaders(
ev.HTTPHeaders, &entry.Response.HeadersList, &entry.Response.Headers)
entry.Response.Code = int64(ev.HTTPStatusCode)
entry.Response.Locations = ev.HTTPHeaders.Values("Location")
case "http_response_body_snapshot":
entry.Response.Body.Value = string(ev.Data)
entry.Response.BodyIsTruncated = ev.DataIsTruncated
case "http_transaction_done":
entry.Failure = NewFailure(ev.Err)
out = append(out, entry)
}
}
return out
}
// DNSAnswerEntry is the answer to a DNS query
type DNSAnswerEntry struct {
ASN int64 `json:"asn,omitempty"`
ASOrgName string `json:"as_org_name,omitempty"`
AnswerType string `json:"answer_type"`
Hostname string `json:"hostname,omitempty"`
IPv4 string `json:"ipv4,omitempty"`
IPv6 string `json:"ipv6,omitempty"`
TTL *uint32 `json:"ttl"`
}
// DNSQueryEntry is a DNS query with possibly an answer
type DNSQueryEntry struct {
Answers []DNSAnswerEntry `json:"answers"`
DialID int64 `json:"dial_id,omitempty"`
Engine string `json:"engine"`
Failure *string `json:"failure"`
Hostname string `json:"hostname"`
QueryType string `json:"query_type"`
ResolverHostname *string `json:"resolver_hostname"`
ResolverPort *string `json:"resolver_port"`
ResolverAddress string `json:"resolver_address"`
T float64 `json:"t"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
type dnsQueryType string
// NewDNSQueriesList returns a list of DNS queries.
func NewDNSQueriesList(begin time.Time, events []trace.Event, dbpath string) []DNSQueryEntry {
// TODO(bassosimone): add support for CNAME lookups.
var out []DNSQueryEntry
for _, ev := range events {
if ev.Name != "resolve_done" {
continue
}
for _, qtype := range []dnsQueryType{"A", "AAAA"} {
entry := qtype.makequeryentry(begin, ev)
for _, addr := range ev.Addresses {
if qtype.ipoftype(addr) {
entry.Answers = append(
entry.Answers, qtype.makeanswerentry(addr, dbpath))
}
}
if len(entry.Answers) <= 0 && ev.Err == nil {
// This allows us to skip cases where the server does not have
// an IPv6 address but has an IPv4 address. Instead, when we
// receive an error, we want to track its existence. The main
// issue here is that we are cheating, because we are creating
// entries representing queries, but we don't know what the
// resolver actually did, especially the system resolver. So,
// this output is just our best guess.
continue
}
out = append(out, entry)
}
}
return out
}
func (qtype dnsQueryType) ipoftype(addr string) bool {
switch qtype {
case "A":
return strings.Contains(addr, ":") == false
case "AAAA":
return strings.Contains(addr, ":") == true
}
return false
}
func (qtype dnsQueryType) makeanswerentry(addr string, dbpath string) DNSAnswerEntry {
answer := DNSAnswerEntry{AnswerType: string(qtype)}
asn, org, _ := geolocate.LookupASN(dbpath, addr)
answer.ASN = int64(asn)
answer.ASOrgName = org
switch qtype {
case "A":
answer.IPv4 = addr
case "AAAA":
answer.IPv6 = addr
}
return answer
}
func (qtype dnsQueryType) makequeryentry(begin time.Time, ev trace.Event) DNSQueryEntry {
return DNSQueryEntry{
Engine: ev.Proto,
Failure: NewFailure(ev.Err),
Hostname: ev.Hostname,
QueryType: string(qtype),
ResolverAddress: ev.Address,
T: ev.Time.Sub(begin).Seconds(),
}
}
// NetworkEvent is a network event. It contains all the possible fields
// and most fields are optional. They are only added when it makes sense
// for them to be there _and_ we have data to show.
type NetworkEvent struct {
Address string `json:"address,omitempty"`
ConnID int64 `json:"conn_id,omitempty"`
DialID int64 `json:"dial_id,omitempty"`
Failure *string `json:"failure"`
NumBytes int64 `json:"num_bytes,omitempty"`
Operation string `json:"operation"`
Proto string `json:"proto,omitempty"`
T float64 `json:"t"`
Tags []string `json:"tags,omitempty"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
// NewNetworkEventsList returns a list of DNS queries.
func NewNetworkEventsList(begin time.Time, events []trace.Event) []NetworkEvent {
var out []NetworkEvent
for _, ev := range events {
if ev.Name == errorx.ConnectOperation {
out = append(out, NetworkEvent{
Address: ev.Address,
Failure: NewFailure(ev.Err),
Operation: ev.Name,
Proto: ev.Proto,
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
if ev.Name == errorx.ReadOperation {
out = append(out, NetworkEvent{
Failure: NewFailure(ev.Err),
Operation: ev.Name,
NumBytes: int64(ev.NumBytes),
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
if ev.Name == errorx.WriteOperation {
out = append(out, NetworkEvent{
Failure: NewFailure(ev.Err),
Operation: ev.Name,
NumBytes: int64(ev.NumBytes),
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
if ev.Name == errorx.ReadFromOperation {
out = append(out, NetworkEvent{
Address: ev.Address,
Failure: NewFailure(ev.Err),
Operation: ev.Name,
NumBytes: int64(ev.NumBytes),
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
if ev.Name == errorx.WriteToOperation {
out = append(out, NetworkEvent{
Address: ev.Address,
Failure: NewFailure(ev.Err),
Operation: ev.Name,
NumBytes: int64(ev.NumBytes),
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
out = append(out, NetworkEvent{
Failure: NewFailure(ev.Err),
Operation: ev.Name,
T: ev.Time.Sub(begin).Seconds(),
})
}
return out
}
// TLSHandshake contains TLS handshake data
type TLSHandshake struct {
CipherSuite string `json:"cipher_suite"`
ConnID int64 `json:"conn_id,omitempty"`
Failure *string `json:"failure"`
NegotiatedProtocol string `json:"negotiated_protocol"`
NoTLSVerify bool `json:"no_tls_verify"`
PeerCertificates []MaybeBinaryValue `json:"peer_certificates"`
ServerName string `json:"server_name"`
T float64 `json:"t"`
Tags []string `json:"tags"`
TLSVersion string `json:"tls_version"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
// NewTLSHandshakesList creates a new TLSHandshakesList
func NewTLSHandshakesList(begin time.Time, events []trace.Event) []TLSHandshake {
var out []TLSHandshake
for _, ev := range events {
if !strings.Contains(ev.Name, "_handshake_done") {
continue
}
out = append(out, TLSHandshake{
CipherSuite: ev.TLSCipherSuite,
Failure: NewFailure(ev.Err),
NegotiatedProtocol: ev.TLSNegotiatedProto,
NoTLSVerify: ev.NoTLSVerify,
PeerCertificates: makePeerCerts(ev.TLSPeerCerts),
ServerName: ev.TLSServerName,
T: ev.Time.Sub(begin).Seconds(),
TLSVersion: ev.TLSVersion,
})
}
return out
}
func makePeerCerts(in []*x509.Certificate) (out []MaybeBinaryValue) {
for _, e := range in {
out = append(out, MaybeBinaryValue{Value: string(e.Raw)})
}
return
}