From 8efb71dce835b4f62b6a69a7c2f215c9dc785c20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 7 Aug 2018 17:51:05 +0200 Subject: [PATCH] Update the schema for the measurements and results tables Based on discussions with @xanscale and @lorenzoPrimi in: https://github.com/ooni/probe-cli/issues/14 --- data/migrations/1_create_msmt_results.sql | 173 ++++++++++++++++++---- 1 file changed, 147 insertions(+), 26 deletions(-) diff --git a/data/migrations/1_create_msmt_results.sql b/data/migrations/1_create_msmt_results.sql index ebe2576..4992704 100644 --- a/data/migrations/1_create_msmt_results.sql +++ b/data/migrations/1_create_msmt_results.sql @@ -3,6 +3,8 @@ DROP TABLE `results`; DROP TABLE `measurements`; +DROP TABLE `urls`; +DROP TABLE `networks`; -- +migrate StatementEnd @@ -11,36 +13,155 @@ DROP TABLE `measurements`; CREATE TABLE `results` ( `id` INTEGER PRIMARY KEY AUTOINCREMENT, - `name` VARCHAR(255), - `start_time` DATETIME, - `runtime` REAL, - `summary` JSON, - `done` TINYINT(1), - `country` VARCHAR(2), - `asn` VARCHAR(16), - `network_name` VARCHAR(255), - `data_usage_up` INTEGER, - `data_usage_down` INTEGER + -- This can be one of "websites", "im", "performance", "middlebox". + `test_group_name` VARCHAR(16) NOT NULL, + -- We use a different start_time and runtime, because we want to also have + -- data to measure the overhead of creating a report and other factors that + -- go into the test. + -- That is to say: `SUM(runtime) FROM measurements` will always be <= + -- `runtime FROM results` (most times <) + `start_time` DATETIME NOT NULL, + `runtime` REAL NOT NULL, + + -- Used to indicate if the user has seen this result + `is_viewed` TINYINT(1) NOT NULL, + + -- This is a flag used to indicate if the result is done or is currently running. + `is_done` TINYINT(1) NOT NULL, + `data_usage_up` INTEGER NOT NULL, + `data_usage_down` INTEGER NOT NULL, + -- It's probably reasonable to set the maximum length to 260 as this is the + -- maximum length of file paths on windows. + `log_file_path` VARCHAR(260) NOT NULL ); CREATE TABLE `measurements` ( `id` INTEGER PRIMARY KEY AUTOINCREMENT, - `name` VARCHAR(255), - `start_time` DATETIME, - `runtime` REAL, - `summary` JSON, - `ip` VARCHAR(255), - `asn` VARCHAR(16), - `country` VARCHAR(2), - `network_name` VARCHAR(255), - `state` TEXT, - `failure` VARCHAR(255), - `upload_failure` VARCHAR(255), - `uploaded` TINYINT(1), - `report_file` VARCHAR(255), - `report_id` VARCHAR(255), - `input` VARCHAR(255), - `result_id` INTEGER REFERENCES `results` (`id`) ON DELETE SET NULL ON UPDATE CASCADE + -- This can be one of: + -- facebook_messenger + -- telegram + -- whatsapp + -- http_header_field_manipulation + -- http_invalid_request_line + -- dash + -- ndt + `test_name` VARCHAR(64) NOT NULL, + `start_time` DATETIME NOT NULL, + `runtime` REAL NOT NULL, + + -- For the purpose of populating the probe information in the results + -- views, you should pick the first measurement in the JOIN sorted by + -- start_time. + -- You don't have the guarantee that every (ip, asn, country, network_name) + -- is the same in a "measurement set" associated to a "result". + `network_id` INTEGER NOT NULL, + FOREIGN KEY (`network_id`) REFERENCES `networks`(`id`), + + -- Note for golang: we used to have state be one of `done` and `active`, so + -- this is equivalent to done being true or false. + -- `state` TEXT, + `is_done` TINYINT(1) NOT NULL, + -- The reason to have a dedicated is_uploaded flag, instead of just using + -- is_upload_failed, is that we may not have uploaded the measurement due + -- to a setting. + `is_uploaded` TINYINT(1) NOT NULL, + + -- This is the measurement failed to run and the user should be offerred to + -- re-run it. + `is_failed` TINYINT(1) NOT NULL, + `failure_msg` VARCHAR(255), + + `is_upload_failed` TINYINT(1) NOT NULL, + `upload_failure_msg` VARCHAR(255), + + -- Is used to indicate that this particular measurement has been re-run and + -- therefore the UI can take this into account to either hide it from the + -- result view or at the very least disable the ability to re-run it. + -- XXX do we also want to have a reference to the re-run measurement? + `is_rerun` TINYINT(1) NOT NULL, + + -- This is the server-side report_id returned by the collector. By using + -- report_id & input, you can query the api to fetch this measurement. + -- Ex. + -- GET https://api.ooni.io/api/v1/measurements?input=$INPUT&report_id=$REPORT_ID + -- Extract the first item from the `result[]` list and then fetch: + -- `measurement_url` to get the JSON of this measurement row. + -- These two values (`report_id`, `input`) are useful to fetch a + -- measurement that has already been processed by the pipeline, to + -- implement cleanup of already uploaded measurements. + `report_id` VARCHAR(255), -- This can be NULL when no report file has been + -- created. + + `url_id` INTEGER NOT NULL, + FOREIGN KEY (`url_id`) REFERENCES `urls`(`id`), + + -- This is not yet a feature of the collector, but we are planning to add + -- this at some point in the near future. + -- See: https://github.com/ooni/pipeline/blob/master/docs/ooni-uuid.md & + -- https://github.com/ooni/pipeline/issues/48 + `measurement_id` INT(64), + + -- This indicates in the case of a websites test, that a site is likely + -- blocked, or for an IM test if the IM tests says the app is likely + -- blocked, or if a middlebox was detected. + -- You can `JOIN` a `COUNT()` of this value in the results view to get a count of + -- blocked sites or blocked IM apps + `is_anomaly` TINYINT(1), + + -- This is an opaque JSON structure, where we store some of the test_keys + -- we need for the measurement details views and some result views (ex. the + -- upload/download speed of NDT, the reason for blocking of a site, + -- etc.) + `test_keys` JSON, + + -- The cross table reference to JOIN the two tables together. + `result_id` INTEGER NOT NULL, + FOREIGN KEY (`result_id`) REFERENCES `results`(`id`) + ON DELETE CASCADE ON UPDATE CASCADE, -- If we delete a result we also want + -- all the measurements to be deleted as well. + + -- This is a variable used internally to track the path to the on-disk + -- measurements.json. It may make sense to write one file per entry by + -- hooking MK and preventing it from writing to a file on disk which may + -- have many measurements per file. + `report_file_path` VARCHAR(260) NOT NULL, +); + +CREATE TABLE `urls` ( + `id` INTEGER PRIMARY KEY AUTOINCREMENT, + `input` VARCHAR(255) NOT NULL, + `category_code` VARCHAR(5) NOT NULL, -- The citizenlab category code for the + -- site. We use the string NONE to denote + -- no known category code. + + `country_code` VARCHAR(2) NOT NULL -- The two letter country code which this + -- URL belongs to +) + +-- We create a separate table for networks for 2 reasons: +-- 1. For some of the views where need the total number of measured networks, +-- it's going to be much more efficient to just lookup the count of rows in this +-- table. +-- 2. (most important) We want to avoid duplicating a bunch of information that +-- is going to be common to several networks the user is on. +-- Example: +-- We may wish to add to this table the location from of the probe from the GPS +-- or add support for allowing the user to "correct" a misclassified measurement +-- or distinguishing between wifi and mobile. +CREATE TABLE `networks` ( + `id` INTEGER PRIMARY KEY AUTOINCREMENT, + `network_name` VARCHAR(255), -- String name representing the network_name which by default is populated based + -- on the ASN. + -- We use a separate key to reference the rows in + -- this tables, because we may wish to "enrich" + -- this with more data in the future. + + `ip` VARCHAR(40) NOT NULL, -- Stores a string representation of an ipv4 or ipv6 address. + -- The longest ip is an ipv6 address like: + -- 0000:0000:0000:0000:0000:0000:0000:0000, + -- which is 39 chars. + `asn` INT(4) NOT NULL, + `country` VARCHAR(2) NOT NULL, -- The two letter country code ); -- +migrate StatementEnd