Verified Commit 03d47b1b authored by Torsten Grote

Refactor shared code into super class

parent 432201d3
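In short: argument parsing and report iteration move into an OoniAnalyser base class, and the two analysis scripts become subclasses that only override parse_report() and process_data(). A minimal sketch of how a new analyzer would plug into the base class after this commit, assuming ooni_analyzer.py from this commit is on the import path; the CountingAnalyzer name is hypothetical, for illustration only:

from ooni_analyzer import OoniAnalyser

class CountingAnalyzer(OoniAnalyser):
    # hypothetical subclass: just counts the measurements it is fed
    count = 0

    def parse_report(self, data):
        # called by analyze() once per JSON line in each report file
        self.count += 1

    def process_data(self):
        # called by analyze() once after all report files were read
        print("parsed %d measurements" % self.count)

if __name__ == "__main__":
    CountingAnalyzer('Count OONI measurements.').analyze()

analyze() handles the shared 'path' and '--since' arguments, walks the .json.xz reports, and feeds each decoded measurement to the subclass.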
#!/usr/bin/env python3
import argparse
import collections
import csv
import json
import lzma
import os
import re
import sys
from datetime import datetime
from ooni_analyzer import OoniAnalyser

BRIDGES = {}
COUNTRIES = [
'BY', 'CN', 'EG', 'TR', 'IR', # Tor assumed to be blocked
'US', 'GB', 'DE', 'IT', # Tor should not be blocked (control group)
]
FAILURES = [
'generic_timeout_error',
'tcp_timed_out_error',
'dns_lookup_error'
]
RESULTS = FAILURES + ['success']
DATE_REGEX = re.compile(r"^(\d{8}T\d{6}Z)-")
def main():
parser = argparse.ArgumentParser(
description='Get countries where Tor bridges might be blocked.')
parser.add_argument('path', metavar='path', type=str,
help='directory with OONI reports as supplied by ooni-sync')
parser.add_argument('--since', dest='since', type=date, metavar='YYYY-MM-DD',
help='only consider reports since this date, e.g. 2017-12-24')
args = parser.parse_args()
if not os.path.isdir(args.path):
fail("Could not find directory '%s'" % args.path)
for file in sorted(os.listdir(args.path)):
if not file.endswith('.json.xz'):
continue
# only consider reports since the supplied date
if args.since:
match = DATE_REGEX.match(file)
if match:
file_date = datetime.strptime(match.group(1), '%Y%m%dT%H%M%SZ')
if file_date < args.since:
continue
# open compressed report file
with lzma.open(os.path.join(args.path, file), 'r') as f:
for line in f.readlines():
data = json.loads(line)
# filter out measurements with missing information
if 'test_class' not in data['annotations']:
continue
if data['annotations']['test_class'] != 'tor_bridge_reachability':
continue
if 'connection' not in data['test_keys']:
continue
# initialize bridge data if needed
bridge = data['input']
if bridge not in BRIDGES:
BRIDGES[bridge] = get_new_bridge()
BRIDGES[bridge]['total'] += 1
# add results per country
cc = data['probe_cc']
if cc not in COUNTRIES:
continue
if cc not in BRIDGES[bridge]:
BRIDGES[bridge][cc] = get_new_country()
result = data['test_keys']['connection']
if result == 'success' or result in FAILURES:
BRIDGES[bridge][cc][result] += 1
else:
fail("Unknown connection result: %s" % result)
# re-order data for CSV export
csv_dict = collections.OrderedDict()
csv_field_names = set()
csv_field_names.add('@bridge')
csv_field_names.add('@total')
for bridge, data in BRIDGES.items():
for key, value in sorted(data.items()):
# add bridge name
if bridge not in csv_dict:
csv_dict[bridge] = {'@bridge': bridge}
# get the total and ignore bridges with too few measurements
if key == 'total':
if value is None or value < 500:
del csv_dict[bridge]
break
csv_dict[bridge]['@total'] = value
continue # not a country
# calculate success rate
country = value
for result, num in country.items():
if result != 'total' and result != 'failure':
country['total'] += num
if result in FAILURES:
country['failure'] += num
field_name = key
csv_field_names.add(field_name)
success_rate = round(country['success'] / country['total'] * 100, 2)
csv_dict[bridge][field_name] = success_rate
field_name = "#" + key
csv_field_names.add(field_name)
csv_dict[bridge][field_name] = country['total']
# write CSV file
with open('countries-bridges.csv', 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=sorted(list(csv_field_names)))
writer.writeheader()
for row in csv_dict.values():
writer.writerow(row)
def date(date_str):
# Converts a date string supplied on the command line into a datetime object
return datetime.strptime(date_str, "%Y-%m-%d")
def get_new_bridge():
return {
'total': 0,
}
def get_new_country():
country = {
'total': 0,
'success': 0,
'failure': 0,
}
for failure in FAILURES:
country[failure] = 0
return country
def fail(msg=""):
sys.stderr.write("Error: %s\n" % msg)
sys.exit(1)
analyzer = BridgeAnalyzer('Get countries where Tor bridges might be blocked.')
analyzer.analyze()
class BridgeAnalyzer(OoniAnalyser):
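# counts per-bridge, per-country connection results and exports them to countries-bridges.csv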
bridge_data = {}
def parse_report(self, data):
# filter out measurements with missing information
if 'test_class' not in data['annotations']:
return
if data['annotations']['test_class'] != 'tor_bridge_reachability':
return
if 'connection' not in data['test_keys']:
return
# initialize bridge data if needed
bridge = data['input']
if bridge not in self.bridge_data:
self.bridge_data[bridge] = self.get_new_bridge()
self.bridge_data[bridge]['total'] += 1
# add results per country
cc = data['probe_cc']
if cc not in COUNTRIES:
return
if cc not in self.bridge_data[bridge]:
self.bridge_data[bridge][cc] = self.get_new_country()
result = data['test_keys']['connection']
if result == 'success' or result in FAILURES:
self.bridge_data[bridge][cc][result] += 1
else:
self.fail("Unknown connection result: %s" % result)
def process_data(self):
# re-order data for CSV export
csv_dict = collections.OrderedDict()
csv_field_names = set()
csv_field_names.add('@bridge')
csv_field_names.add('@total')
for bridge, data in self.bridge_data.items():
for key, value in sorted(data.items()):
# add bridge name
if bridge not in csv_dict:
csv_dict[bridge] = {'@bridge': bridge}
# get the total and ignore bridges with too few measurements
if key == 'total':
if value is None or value < 500:
del csv_dict[bridge]
break
csv_dict[bridge]['@total'] = value
continue # not a country
# calculate success rate
country = value
for result, num in country.items():
if result != 'total' and result != 'failure':
country['total'] += num
if result in FAILURES:
country['failure'] += num
field_name = key
csv_field_names.add(field_name)
success_rate = round(country['success'] / country['total'] * 100, 2)
csv_dict[bridge][field_name] = success_rate
field_name = "#" + key
csv_field_names.add(field_name)
csv_dict[bridge][field_name] = country['total']
# write CSV file
with open('countries-bridges.csv', 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=sorted(list(csv_field_names)))
writer.writeheader()
for row in csv_dict.values():
writer.writerow(row)
@staticmethod
def get_new_bridge():
return {
'total': 0,
}
@staticmethod
def get_new_country():
country = {
'total': 0,
'success': 0,
'failure': 0,
}
for failure in FAILURES:
country[failure] = 0
return country
if __name__ == "__main__":
main()
#!/usr/bin/env python3
import argparse
import collections
import json
import lzma
import os
import re
import sys
from datetime import datetime
import emojiflags.lookup
import pycountry
from ooni_analyzer import OoniAnalyser

COUNTRY_DATA = collections.OrderedDict()
DATE_REGEX = re.compile(r"^(\d{8}T\d{6}Z)-")
def main():
parser = argparse.ArgumentParser(description='Get countries where Tor might be blocked.')
parser.add_argument('path', metavar='path', type=str,
help='directory with OONI reports as supplied by ooni-sync')
parser.add_argument('--since', dest='since', type=date, metavar='YYYY-MM-DD',
help='only consider reports since this date, e.g. 2017-12-24')
parser.add_argument('-t', '--success-threshold', dest='threshold', type=float, default=50,
metavar='PERCENT',
help='consider countries as blocking with a success rate less than this' +
'. default: 50')
parser.add_argument('-j', '--json', dest='json', action='store_true',
help='create a \'countries.json\' file with the results')
args = parser.parse_args()
if not os.path.isdir(args.path):
fail("Could not find directory '%s'" % args.path)
# get success and failure counts per country from report data
for file in sorted(os.listdir(args.path)):
if not file.endswith('.json.xz'):
continue
# only consider reports since the supplied date
if args.since:
match = DATE_REGEX.match(file)
if match:
file_date = datetime.strptime(match.group(1), '%Y%m%dT%H%M%SZ')
if file_date < args.since:
continue
# open compressed report file
with lzma.open(os.path.join(args.path, file), 'r') as f:
for line in f.readlines():
data = json.loads(line)
if data['test_keys']['success'] is None:
continue
cc = data['probe_cc']
if cc not in COUNTRY_DATA:
COUNTRY_DATA[cc] = get_new_country(cc)
COUNTRY_DATA[cc]['total_count'] += 1
if data["test_keys"]["success"]:
COUNTRY_DATA[cc]['success_count'] += 1
else:
COUNTRY_DATA[cc]['failure_count'] += 1
# calculate success percentage and identify blocking countries
blocking_countries = []
for country, data in sorted(COUNTRY_DATA.items()):
data['success_percent'] = round(data['success_count'] / data['total_count'] * 100, 2)
if data['total_count'] > 5:
if data['success_percent'] < args.threshold:
blocking_countries.append(country)
print("%s: %d reports, %.2f success rate" % (
country, data['total_count'], data['success_percent']))
# print blocking countries
blocking_countries = sorted(blocking_countries)
print()
print("Blocked Countries: %s" % str(blocking_countries))
print()
for country in blocking_countries:
print("%s: %s" % (country, str(COUNTRY_DATA[country])))
# write a JSON file with all data to be published on the web
if args.json:
with open('countries.json', 'w') as f:
f.write(json.dumps(list(COUNTRY_DATA.values()), indent=4))
def date(date_str):
# Converts a date string supplied on the command line into a datetime object
return datetime.strptime(date_str, "%Y-%m-%d")
def get_new_country(cc):
try:
country = pycountry.countries.get(alpha_2=cc).name
flag = emojiflags.lookup.lookup(cc)
country = "%s %s" % (flag, country)
except KeyError:
country = cc
return {
'country': country,
'total_count': 0,
'success_count': 0,
'failure_count': 0,
'success_percent': None,
}
def fail(msg=""):
sys.stderr.write("Error: %s\n" % msg)
sys.exit(1)
analyzer = TorAnalyzer('Get countries where Tor might be blocked.')
analyzer.analyze()
class TorAnalyzer(OoniAnalyser):
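# aggregates per-country success rates and flags countries below the success threshold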
country_data = collections.OrderedDict()
def init_arg_parser(self):
parser = super().init_arg_parser()
parser.add_argument('-t', '--success-threshold', dest='threshold', type=float, default=50,
metavar='PERCENT',
help='consider countries as blocking with a success rate less than this'
+ '. default: 50')
parser.add_argument('-j', '--json', dest='json', action='store_true',
help='create a \'countries.json\' file with the results')
return parser
def parse_report(self, data):
if data['test_keys']['success'] is None:
return
cc = data['probe_cc']
if cc not in self.country_data:
self.country_data[cc] = self.get_new_country(cc)
self.country_data[cc]['total_count'] += 1
if data["test_keys"]["success"]:
self.country_data[cc]['success_count'] += 1
else:
self.country_data[cc]['failure_count'] += 1
def process_data(self):
# calculate success percentage and identify blocking countries
blocking_countries = []
for country, data in sorted(self.country_data.items()):
data['success_percent'] = round(data['success_count'] / data['total_count'] * 100, 2)
if data['total_count'] > 5:
if data['success_percent'] < self.args.threshold:
blocking_countries.append(country)
print("%s: %d reports, %.2f success rate" % (
country, data['total_count'], data['success_percent']))
# print blocking countries
blocking_countries = sorted(blocking_countries)
print()
print("Blocked Countries: %s" % str(blocking_countries))
print()
for country in blocking_countries:
print("%s: %s" % (country, str(self.country_data[country])))
# write a JSON file with all data to be published on the web
if self.args.json:
with open('countries.json', 'w') as f:
f.write(json.dumps(list(self.country_data.values()), indent=4))
@staticmethod
def get_new_country(cc):
try:
country = pycountry.countries.get(alpha_2=cc).name
flag = emojiflags.lookup.lookup(cc)
country = "%s %s" % (flag, country)
except KeyError:
country = cc
return {
'country': country,
'total_count': 0,
'success_count': 0,
'failure_count': 0,
'success_percent': None,
}
if __name__ == "__main__":
main()
import argparse
import json
import lzma
import os
import re
import sys
from datetime import datetime
DATE_REGEX = re.compile(r"^(\d{8}T\d{6}Z)-")
class OoniAnalyser:
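# shared base class: owns the argument parser and the report-iteration loop;
# subclasses implement parse_report() and process_data()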
def __init__(self, description):
self.args = None
self.description = description
parser = self.init_arg_parser()
self.args = parser.parse_args()
def init_arg_parser(self):
parser = argparse.ArgumentParser(description=self.description)
parser.add_argument('path', metavar='path', type=str,
help='directory with OONI reports as supplied by ooni-sync')
parser.add_argument('--since', dest='since', type=date, metavar='YYYY-MM-DD',
help='only consider reports since this date, e.g. 2017-12-24')
return parser
def analyze(self):
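# template method: validate the path, iterate all report files, feed each
# measurement to parse_report(), then call process_data()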
if not os.path.isdir(self.args.path):
self.fail("Could not find directory '%s'" % self.args.path)
# get success and failure counts per country from report data
for file in sorted(os.listdir(self.args.path)):
if not file.endswith('.json.xz'):
continue
# only consider reports since the supplied date
if self.args.since:
match = DATE_REGEX.match(file)
if match:
file_date = datetime.strptime(match.group(1), '%Y%m%dT%H%M%SZ')
if file_date < self.args.since:
continue
# open compressed report file
with lzma.open(os.path.join(self.args.path, file), 'r') as f:
for line in f.readlines():
data = json.loads(line)
self.parse_report(data)
self.process_data()
def parse_report(self, data):
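# hook for subclasses: handle one decoded measurement from a report file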
raise NotImplementedError()
def process_data(self):
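# hook for subclasses: evaluate the collected data once all reports are read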
raise NotImplementedError()
@staticmethod
def fail(msg=""):
sys.stderr.write("Error: %s\n" % msg)
sys.exit(1)
def date(date_str):
# Converts a date string supplied on the command line into a datetime object
return datetime.strptime(date_str, "%Y-%m-%d")