ooni-probe-cli/internal/engine/netx/archival/archival.go
Simone Basso 31e478b04e
refactor: redesign how we import assets (#260)
* fix(pkg.go.dev): import a subpackage containing the assets

We're trying to fix this issue that pkg.go.dev does not build.

Thanks to @hellais for this very neat idea! Let's keep our
fingers crossed and see whether it fixes!

* feat: use embedded geoip databases

Closes https://github.com/ooni/probe/issues/1372.

Work done as part of https://github.com/ooni/probe/issues/1369.

* fix(assetsx): add tests

* feat: simplify and just vendor uncompressed DBs

* remove tests that seems not necessary anymore

* fix: run go mod tidy

* Address https://github.com/ooni/probe-cli/pull/260/files#r605181364

* rewrite a test in a better way

* fix: gently cleanup the legacy assetsdir

Do not remove the whole directory with brute force. Just zap the
files whose name we know. Then attempt to delete the legacy directory
as well. If not empty, just fail. This is fine because it means the
user has stored other files inside the directory.

* fix: create .miniooni if missing
2021-04-01 16:57:31 +02:00

585 lines
18 KiB
Go

// Package archival contains data formats used for archival.
//
// See https://github.com/ooni/spec.
package archival
import (
"crypto/x509"
"encoding/base64"
"encoding/json"
"errors"
"net"
"net/http"
"sort"
"strconv"
"strings"
"time"
"unicode/utf8"
"github.com/ooni/probe-cli/v3/internal/engine/geolocate"
"github.com/ooni/probe-cli/v3/internal/engine/model"
"github.com/ooni/probe-cli/v3/internal/engine/netx/errorx"
"github.com/ooni/probe-cli/v3/internal/engine/netx/trace"
)
// ExtSpec describes a data format extension
type ExtSpec struct {
Name string // extension name
V int64 // extension version
}
// AddTo adds the current ExtSpec to the specified measurement
func (spec ExtSpec) AddTo(m *model.Measurement) {
if m.Extensions == nil {
m.Extensions = make(map[string]int64)
}
m.Extensions[spec.Name] = spec.V
}
var (
// ExtDNS is the version of df-002-dnst.md
ExtDNS = ExtSpec{Name: "dnst", V: 0}
// ExtNetevents is the version of df-008-netevents.md
ExtNetevents = ExtSpec{Name: "netevents", V: 0}
// ExtHTTP is the version of df-001-httpt.md
ExtHTTP = ExtSpec{Name: "httpt", V: 0}
// ExtTCPConnect is the version of df-005-tcpconnect.md
ExtTCPConnect = ExtSpec{Name: "tcpconnect", V: 0}
// ExtTLSHandshake is the version of df-006-tlshandshake.md
ExtTLSHandshake = ExtSpec{Name: "tlshandshake", V: 0}
// ExtTunnel is the version of df-009-tunnel.md
ExtTunnel = ExtSpec{Name: "tunnel", V: 0}
)
// TCPConnectStatus contains the TCP connect status.
//
// The Blocked field breaks the separation between measurement and analysis
// we have been enforcing for quite some time now. It is a legacy from the
// Web Connectivity experiment and it should be here because of that.
type TCPConnectStatus struct {
Blocked *bool `json:"blocked,omitempty"` // Web Connectivity only
Failure *string `json:"failure"`
Success bool `json:"success"`
}
// TCPConnectEntry contains one of the entries that are part
// of the "tcp_connect" key of a OONI report.
type TCPConnectEntry struct {
ConnID int64 `json:"conn_id,omitempty"`
DialID int64 `json:"dial_id,omitempty"`
IP string `json:"ip"`
Port int `json:"port"`
Status TCPConnectStatus `json:"status"`
T float64 `json:"t"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
// NewTCPConnectList creates a new TCPConnectList
func NewTCPConnectList(begin time.Time, events []trace.Event) []TCPConnectEntry {
var out []TCPConnectEntry
for _, event := range events {
if event.Name != errorx.ConnectOperation {
continue
}
if event.Proto != "tcp" {
continue
}
// We assume Go is passing us legit data structures
ip, sport, _ := net.SplitHostPort(event.Address)
iport, _ := strconv.Atoi(sport)
out = append(out, TCPConnectEntry{
IP: ip,
Port: iport,
Status: TCPConnectStatus{
Failure: NewFailure(event.Err),
Success: event.Err == nil,
},
T: event.Time.Sub(begin).Seconds(),
})
}
return out
}
// NewFailure creates a failure nullable string from the given error
func NewFailure(err error) *string {
if err == nil {
return nil
}
// The following code guarantees that the error is always wrapped even
// when we could not actually hit our code that does the wrapping. A case
// in which this happen is with context deadline for HTTP.
err = errorx.SafeErrWrapperBuilder{
Error: err,
Operation: errorx.TopLevelOperation,
}.MaybeBuild()
errWrapper := err.(*errorx.ErrWrapper)
s := errWrapper.Failure
if s == "" {
s = "unknown_failure: errWrapper.Failure is empty"
}
return &s
}
// NewFailedOperation creates a failed operation string from the given error.
func NewFailedOperation(err error) *string {
if err == nil {
return nil
}
var (
errWrapper *errorx.ErrWrapper
s = errorx.UnknownOperation
)
if errors.As(err, &errWrapper) && errWrapper.Operation != "" {
s = errWrapper.Operation
}
return &s
}
// HTTPTor contains Tor information
type HTTPTor struct {
ExitIP *string `json:"exit_ip"`
ExitName *string `json:"exit_name"`
IsTor bool `json:"is_tor"`
}
// MaybeBinaryValue is a possibly binary string. We use this helper class
// to define a custom JSON encoder that allows us to choose the proper
// representation depending on whether the Value field is valid UTF-8 or not.
type MaybeBinaryValue struct {
Value string
}
// MarshalJSON marshals a string-like to JSON following the OONI spec that
// says that UTF-8 content is represened as string and non-UTF-8 content is
// instead represented using `{"format":"base64","data":"..."}`.
func (hb MaybeBinaryValue) MarshalJSON() ([]byte, error) {
if utf8.ValidString(hb.Value) {
return json.Marshal(hb.Value)
}
er := make(map[string]string)
er["format"] = "base64"
er["data"] = base64.StdEncoding.EncodeToString([]byte(hb.Value))
return json.Marshal(er)
}
// UnmarshalJSON is the opposite of MarshalJSON.
func (hb *MaybeBinaryValue) UnmarshalJSON(d []byte) error {
if err := json.Unmarshal(d, &hb.Value); err == nil {
return nil
}
er := make(map[string]string)
if err := json.Unmarshal(d, &er); err != nil {
return err
}
if v, ok := er["format"]; !ok || v != "base64" {
return errors.New("missing or invalid format field")
}
if _, ok := er["data"]; !ok {
return errors.New("missing data field")
}
b64, err := base64.StdEncoding.DecodeString(er["data"])
if err != nil {
return err
}
hb.Value = string(b64)
return nil
}
// HTTPBody is an HTTP body. As an implementation note, this type must be
// an alias for the MaybeBinaryValue type, otherwise the specific serialisation
// mechanism implemented by MaybeBinaryValue is not working.
type HTTPBody = MaybeBinaryValue
// HTTPHeader is a single HTTP header.
type HTTPHeader struct {
Key string
Value MaybeBinaryValue
}
// MarshalJSON marshals a single HTTP header to a tuple where the first
// element is a string and the second element is maybe-binary data.
func (hh HTTPHeader) MarshalJSON() ([]byte, error) {
if utf8.ValidString(hh.Value.Value) {
return json.Marshal([]string{hh.Key, hh.Value.Value})
}
value := make(map[string]string)
value["format"] = "base64"
value["data"] = base64.StdEncoding.EncodeToString([]byte(hh.Value.Value))
return json.Marshal([]interface{}{hh.Key, value})
}
// UnmarshalJSON is the opposite of MarshalJSON.
func (hh *HTTPHeader) UnmarshalJSON(d []byte) error {
var pair []interface{}
if err := json.Unmarshal(d, &pair); err != nil {
return err
}
if len(pair) != 2 {
return errors.New("unexpected pair length")
}
key, ok := pair[0].(string)
if !ok {
return errors.New("the key is not a string")
}
value, ok := pair[1].(string)
if !ok {
mapvalue, ok := pair[1].(map[string]interface{})
if !ok {
return errors.New("the value is neither a string nor a map[string]interface{}")
}
if _, ok := mapvalue["format"]; !ok {
return errors.New("missing format")
}
if v, ok := mapvalue["format"].(string); !ok || v != "base64" {
return errors.New("invalid format")
}
if _, ok := mapvalue["data"]; !ok {
return errors.New("missing data field")
}
v, ok := mapvalue["data"].(string)
if !ok {
return errors.New("the data field is not a string")
}
b64, err := base64.StdEncoding.DecodeString(v)
if err != nil {
return err
}
value = string(b64)
}
hh.Key, hh.Value = key, MaybeBinaryValue{Value: value}
return nil
}
// HTTPRequest contains an HTTP request.
//
// Headers are a map in Web Connectivity data format but
// we have added support for a list since January 2020.
type HTTPRequest struct {
Body HTTPBody `json:"body"`
BodyIsTruncated bool `json:"body_is_truncated"`
HeadersList []HTTPHeader `json:"headers_list"`
Headers map[string]MaybeBinaryValue `json:"headers"`
Method string `json:"method"`
Tor HTTPTor `json:"tor"`
Transport string `json:"x_transport"`
URL string `json:"url"`
}
// HTTPResponse contains an HTTP response.
//
// Headers are a map in Web Connectivity data format but
// we have added support for a list since January 2020.
type HTTPResponse struct {
Body HTTPBody `json:"body"`
BodyIsTruncated bool `json:"body_is_truncated"`
Code int64 `json:"code"`
HeadersList []HTTPHeader `json:"headers_list"`
Headers map[string]MaybeBinaryValue `json:"headers"`
// The following fields are not serialised but are useful to simplify
// analysing the measurements in telegram, whatsapp, etc.
Locations []string `json:"-"`
}
// RequestEntry is one of the entries that are part of
// the "requests" key of a OONI report.
type RequestEntry struct {
Failure *string `json:"failure"`
Request HTTPRequest `json:"request"`
Response HTTPResponse `json:"response"`
T float64 `json:"t"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
func addheaders(
source http.Header,
destList *[]HTTPHeader,
destMap *map[string]MaybeBinaryValue,
) {
for key, values := range source {
for index, value := range values {
value := MaybeBinaryValue{Value: value}
// With the map representation we can only represent a single
// value for every key. Hence the list representation.
if index == 0 {
(*destMap)[key] = value
}
*destList = append(*destList, HTTPHeader{
Key: key,
Value: value,
})
}
}
sort.Slice(*destList, func(i, j int) bool {
return (*destList)[i].Key < (*destList)[j].Key
})
}
// NewRequestList returns the list for "requests"
func NewRequestList(begin time.Time, events []trace.Event) []RequestEntry {
// OONI wants the last request to appear first
var out []RequestEntry
tmp := newRequestList(begin, events)
for i := len(tmp) - 1; i >= 0; i-- {
out = append(out, tmp[i])
}
return out
}
func newRequestList(begin time.Time, events []trace.Event) []RequestEntry {
var (
out []RequestEntry
entry RequestEntry
)
for _, ev := range events {
switch ev.Name {
case "http_transaction_start":
entry = RequestEntry{}
entry.T = ev.Time.Sub(begin).Seconds()
case "http_request_body_snapshot":
entry.Request.Body.Value = string(ev.Data)
entry.Request.BodyIsTruncated = ev.DataIsTruncated
case "http_request_metadata":
entry.Request.Headers = make(map[string]MaybeBinaryValue)
addheaders(
ev.HTTPHeaders, &entry.Request.HeadersList, &entry.Request.Headers)
entry.Request.Method = ev.HTTPMethod
entry.Request.URL = ev.HTTPURL
entry.Request.Transport = ev.Transport
case "http_response_metadata":
entry.Response.Headers = make(map[string]MaybeBinaryValue)
addheaders(
ev.HTTPHeaders, &entry.Response.HeadersList, &entry.Response.Headers)
entry.Response.Code = int64(ev.HTTPStatusCode)
entry.Response.Locations = ev.HTTPHeaders.Values("Location")
case "http_response_body_snapshot":
entry.Response.Body.Value = string(ev.Data)
entry.Response.BodyIsTruncated = ev.DataIsTruncated
case "http_transaction_done":
entry.Failure = NewFailure(ev.Err)
out = append(out, entry)
}
}
return out
}
// DNSAnswerEntry is the answer to a DNS query
type DNSAnswerEntry struct {
ASN int64 `json:"asn,omitempty"`
ASOrgName string `json:"as_org_name,omitempty"`
AnswerType string `json:"answer_type"`
Hostname string `json:"hostname,omitempty"`
IPv4 string `json:"ipv4,omitempty"`
IPv6 string `json:"ipv6,omitempty"`
TTL *uint32 `json:"ttl"`
}
// DNSQueryEntry is a DNS query with possibly an answer
type DNSQueryEntry struct {
Answers []DNSAnswerEntry `json:"answers"`
DialID int64 `json:"dial_id,omitempty"`
Engine string `json:"engine"`
Failure *string `json:"failure"`
Hostname string `json:"hostname"`
QueryType string `json:"query_type"`
ResolverHostname *string `json:"resolver_hostname"`
ResolverPort *string `json:"resolver_port"`
ResolverAddress string `json:"resolver_address"`
T float64 `json:"t"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
type dnsQueryType string
// NewDNSQueriesList returns a list of DNS queries.
func NewDNSQueriesList(begin time.Time, events []trace.Event) []DNSQueryEntry {
// TODO(bassosimone): add support for CNAME lookups.
var out []DNSQueryEntry
for _, ev := range events {
if ev.Name != "resolve_done" {
continue
}
for _, qtype := range []dnsQueryType{"A", "AAAA"} {
entry := qtype.makequeryentry(begin, ev)
for _, addr := range ev.Addresses {
if qtype.ipoftype(addr) {
entry.Answers = append(
entry.Answers, qtype.makeanswerentry(addr))
}
}
if len(entry.Answers) <= 0 && ev.Err == nil {
// This allows us to skip cases where the server does not have
// an IPv6 address but has an IPv4 address. Instead, when we
// receive an error, we want to track its existence. The main
// issue here is that we are cheating, because we are creating
// entries representing queries, but we don't know what the
// resolver actually did, especially the system resolver. So,
// this output is just our best guess.
continue
}
out = append(out, entry)
}
}
return out
}
func (qtype dnsQueryType) ipoftype(addr string) bool {
switch qtype {
case "A":
return !strings.Contains(addr, ":")
case "AAAA":
return strings.Contains(addr, ":")
}
return false
}
func (qtype dnsQueryType) makeanswerentry(addr string) DNSAnswerEntry {
answer := DNSAnswerEntry{AnswerType: string(qtype)}
asn, org, _ := geolocate.LookupASN(addr)
answer.ASN = int64(asn)
answer.ASOrgName = org
switch qtype {
case "A":
answer.IPv4 = addr
case "AAAA":
answer.IPv6 = addr
}
return answer
}
func (qtype dnsQueryType) makequeryentry(begin time.Time, ev trace.Event) DNSQueryEntry {
return DNSQueryEntry{
Engine: ev.Proto,
Failure: NewFailure(ev.Err),
Hostname: ev.Hostname,
QueryType: string(qtype),
ResolverAddress: ev.Address,
T: ev.Time.Sub(begin).Seconds(),
}
}
// NetworkEvent is a network event. It contains all the possible fields
// and most fields are optional. They are only added when it makes sense
// for them to be there _and_ we have data to show.
type NetworkEvent struct {
Address string `json:"address,omitempty"`
ConnID int64 `json:"conn_id,omitempty"`
DialID int64 `json:"dial_id,omitempty"`
Failure *string `json:"failure"`
NumBytes int64 `json:"num_bytes,omitempty"`
Operation string `json:"operation"`
Proto string `json:"proto,omitempty"`
T float64 `json:"t"`
Tags []string `json:"tags,omitempty"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
// NewNetworkEventsList returns a list of DNS queries.
func NewNetworkEventsList(begin time.Time, events []trace.Event) []NetworkEvent {
var out []NetworkEvent
for _, ev := range events {
if ev.Name == errorx.ConnectOperation {
out = append(out, NetworkEvent{
Address: ev.Address,
Failure: NewFailure(ev.Err),
Operation: ev.Name,
Proto: ev.Proto,
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
if ev.Name == errorx.ReadOperation {
out = append(out, NetworkEvent{
Failure: NewFailure(ev.Err),
Operation: ev.Name,
NumBytes: int64(ev.NumBytes),
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
if ev.Name == errorx.WriteOperation {
out = append(out, NetworkEvent{
Failure: NewFailure(ev.Err),
Operation: ev.Name,
NumBytes: int64(ev.NumBytes),
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
if ev.Name == errorx.ReadFromOperation {
out = append(out, NetworkEvent{
Address: ev.Address,
Failure: NewFailure(ev.Err),
Operation: ev.Name,
NumBytes: int64(ev.NumBytes),
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
if ev.Name == errorx.WriteToOperation {
out = append(out, NetworkEvent{
Address: ev.Address,
Failure: NewFailure(ev.Err),
Operation: ev.Name,
NumBytes: int64(ev.NumBytes),
T: ev.Time.Sub(begin).Seconds(),
})
continue
}
out = append(out, NetworkEvent{
Failure: NewFailure(ev.Err),
Operation: ev.Name,
T: ev.Time.Sub(begin).Seconds(),
})
}
return out
}
// TLSHandshake contains TLS handshake data
type TLSHandshake struct {
CipherSuite string `json:"cipher_suite"`
ConnID int64 `json:"conn_id,omitempty"`
Failure *string `json:"failure"`
NegotiatedProtocol string `json:"negotiated_protocol"`
NoTLSVerify bool `json:"no_tls_verify"`
PeerCertificates []MaybeBinaryValue `json:"peer_certificates"`
ServerName string `json:"server_name"`
T float64 `json:"t"`
Tags []string `json:"tags"`
TLSVersion string `json:"tls_version"`
TransactionID int64 `json:"transaction_id,omitempty"`
}
// NewTLSHandshakesList creates a new TLSHandshakesList
func NewTLSHandshakesList(begin time.Time, events []trace.Event) []TLSHandshake {
var out []TLSHandshake
for _, ev := range events {
if !strings.Contains(ev.Name, "_handshake_done") {
continue
}
out = append(out, TLSHandshake{
CipherSuite: ev.TLSCipherSuite,
Failure: NewFailure(ev.Err),
NegotiatedProtocol: ev.TLSNegotiatedProto,
NoTLSVerify: ev.NoTLSVerify,
PeerCertificates: makePeerCerts(ev.TLSPeerCerts),
ServerName: ev.TLSServerName,
T: ev.Time.Sub(begin).Seconds(),
TLSVersion: ev.TLSVersion,
})
}
return out
}
func makePeerCerts(in []*x509.Certificate) (out []MaybeBinaryValue) {
for _, e := range in {
out = append(out, MaybeBinaryValue{Value: string(e.Raw)})
}
return
}