2018-02-12 17:29:03 +01:00
|
|
|
-- +migrate Down
-- +migrate StatementBegin

-- Tear down everything the Up section creates.
--
-- `measurements` holds foreign keys to `results`, `urls` and `networks`, so it
-- is dropped first: no table is ever removed while another table still
-- references it, and the teardown does not rely on the ON DELETE CASCADE rule
-- of `measurements`.`result_id` to succeed.
--
-- IF EXISTS keeps the teardown usable on a partially created schema.
DROP TABLE IF EXISTS `measurements`;
DROP TABLE IF EXISTS `results`;
DROP TABLE IF EXISTS `urls`;
DROP TABLE IF EXISTS `networks`;

-- +migrate StatementEnd
|
|
|
|
|
|
|
|
-- +migrate Up
|
|
|
|
-- +migrate StatementBegin
|
|
|
|
|
2018-09-05 17:58:06 +02:00
|
|
|
-- URLs that a measurement can point at through `measurements`.`url_id`.
CREATE TABLE `urls` (
    `id` INTEGER PRIMARY KEY AUTOINCREMENT,

    -- XXX is this long enough?
    `url` VARCHAR(255) NOT NULL,

    -- The citizenlab category code for the site. We use the string NONE to
    -- denote no known category code.
    `category_code` VARCHAR(5) NOT NULL,

    -- The two letter country code which this URL belongs to.
    `country_code` VARCHAR(2) NOT NULL
);
|
|
|
|
|
|
|
|
-- We create a separate table for networks for 2 reasons:
-- 1. For some of the views where we need the total number of measured
--    networks, it's going to be much more efficient to just look up the count
--    of rows in this table.
-- 2. (most important) We want to avoid duplicating a bunch of information
--    that is going to be common to several networks the user is on.
-- Example:
-- We may wish to add to this table the location of the probe from the GPS,
-- or add support for allowing the user to "correct" a misclassified
-- measurement, or distinguishing between wifi and mobile.
CREATE TABLE `networks` (
    `id` INTEGER PRIMARY KEY AUTOINCREMENT,

    -- String name representing the network, which by default is populated
    -- based on the ASN.
    -- We use a separate key to reference the rows in this table, because we
    -- may wish to "enrich" this with more data in the future.
    `network_name` VARCHAR(255),

    -- One of wifi, mobile
    `network_type` VARCHAR(16),

    -- Stores a string representation of an ipv4 or ipv6 address.
    -- The longest ip is an ipv6 address like:
    -- 0000:0000:0000:0000:0000:0000:0000:0000,
    -- which is 39 chars.
    `ip` VARCHAR(40) NOT NULL,

    `asn` INT(4) NOT NULL,

    -- The two letter country code
    `country_code` VARCHAR(2) NOT NULL
);
|
|
|
|
|
2018-02-12 17:29:03 +01:00
|
|
|
-- A result groups the measurements that reference it through
-- `measurements`.`result_id`.
CREATE TABLE `results` (
    `id` INTEGER PRIMARY KEY AUTOINCREMENT,

    -- This can be one of "websites", "im", "performance", "middlebox".
    `test_group_name` VARCHAR(16) NOT NULL,

    -- We use a different start_time and runtime, because we want to also have
    -- data to measure the overhead of creating a report and other factors that
    -- go into the test.
    -- That is to say: `SUM(runtime) FROM measurements` will always be <=
    -- `runtime FROM results` (most times <)
    `start_time` DATETIME NOT NULL,
    `runtime` REAL NOT NULL,

    -- Used to indicate if the user has seen this result
    `is_viewed` TINYINT(1) NOT NULL,

    -- This is a flag used to indicate if the result is done or is currently
    -- running.
    `is_done` TINYINT(1) NOT NULL,

    `data_usage_up` INTEGER NOT NULL,
    `data_usage_down` INTEGER NOT NULL,

    -- It's probably reasonable to set the maximum length to 260 as this is the
    -- maximum length of file paths on windows.
    `log_file_path` VARCHAR(260) NOT NULL
);
|
|
|
|
|
|
|
|
-- One row per measurement. Every measurement belongs to a row of `results`
-- (`result_id`) and records the network it was run from (`network_id`).
CREATE TABLE `measurements` (
    `id` INTEGER PRIMARY KEY AUTOINCREMENT,

    -- This can be one of:
    -- facebook_messenger
    -- telegram
    -- whatsapp
    -- http_header_field_manipulation
    -- http_invalid_request_line
    -- dash
    -- ndt
    `test_name` VARCHAR(64) NOT NULL,
    `start_time` DATETIME NOT NULL,
    `runtime` REAL NOT NULL,

    -- For the purpose of populating the probe information in the results
    -- views, you should pick the first measurement in the JOIN sorted by
    -- start_time.
    -- You don't have the guarantee that every (ip, asn, country, network_name)
    -- is the same in a "measurement set" associated to a "result".
    `network_id` INTEGER NOT NULL,

    -- Note for golang: we used to have a `state` TEXT column holding one of
    -- `done` and `active`, so this is equivalent to done being true or false.
    `is_done` TINYINT(1) NOT NULL,

    -- The reason to have a dedicated is_uploaded flag, instead of just using
    -- is_upload_failed, is that we may not have uploaded the measurement due
    -- to a setting.
    `is_uploaded` TINYINT(1) NOT NULL,

    -- This indicates that the measurement failed to run and the user should
    -- be offered to re-run it.
    `is_failed` TINYINT(1) NOT NULL,
    `failure_msg` VARCHAR(255),

    `is_upload_failed` TINYINT(1) NOT NULL,
    `upload_failure_msg` VARCHAR(255),

    -- Is used to indicate that this particular measurement has been re-run and
    -- therefore the UI can take this into account to either hide it from the
    -- result view or at the very least disable the ability to re-run it.
    -- XXX do we also want to have a reference to the re-run measurement?
    `is_rerun` TINYINT(1) NOT NULL,

    -- This is the server-side report_id returned by the collector. By using
    -- report_id & input, you can query the api to fetch this measurement.
    -- Ex.
    -- GET https://api.ooni.io/api/v1/measurements?input=$INPUT&report_id=$REPORT_ID
    -- Extract the first item from the `result[]` list and then fetch:
    -- `measurement_url` to get the JSON of this measurement row.
    -- These two values (`report_id`, `input`) are useful to fetch a
    -- measurement that has already been processed by the pipeline, to
    -- implement cleanup of already uploaded measurements.
    --
    -- This can be NULL when no report file has been created.
    `report_id` VARCHAR(255),

    -- Reference to the row of `urls` this measurement was run against.
    -- Nullable on purpose: the test names listed above are not URL tests
    -- (NOTE(review): ndt, dash and the IM tests presumably take no URL input,
    -- please confirm against the code that writes these rows). A NOT NULL
    -- foreign key would force a placeholder `urls` row for each of them.
    `url_id` INTEGER,

    -- This is not yet a feature of the collector, but we are planning to add
    -- this at some point in the near future.
    -- See: https://github.com/ooni/pipeline/blob/master/docs/ooni-uuid.md &
    -- https://github.com/ooni/pipeline/issues/48
    `measurement_id` INT(64),

    -- This indicates in the case of a websites test, that a site is likely
    -- blocked, or for an IM test if the IM tests says the app is likely
    -- blocked, or if a middlebox was detected.
    -- You can `JOIN` a `COUNT()` of this value in the results view to get a
    -- count of blocked sites or blocked IM apps
    `is_anomaly` TINYINT(1),

    -- This is an opaque JSON structure, where we store some of the test_keys
    -- we need for the measurement details views and some result views (ex. the
    -- upload/download speed of NDT, the reason for blocking of a site,
    -- etc.)
    `test_keys` JSON,

    -- The cross table reference to JOIN the two tables together.
    `result_id` INTEGER NOT NULL,

    -- This is a variable used internally to track the path to the on-disk
    -- measurements.json. It may make sense to write one file per entry by
    -- hooking MK and preventing it from writing to a file on disk which may
    -- have many measurements per file.
    `report_file_path` VARCHAR(260) NOT NULL,

    -- If we delete a result we also want all the measurements to be deleted
    -- as well.
    FOREIGN KEY (`result_id`) REFERENCES `results` (`id`)
        ON DELETE CASCADE ON UPDATE CASCADE,
    FOREIGN KEY (`url_id`) REFERENCES `urls` (`id`),
    FOREIGN KEY (`network_id`) REFERENCES `networks` (`id`)
);
|
|
|
|
|
|
|
|
-- +migrate StatementEnd
|