Skip to content
This repository has been archived by the owner on May 4, 2021. It is now read-only.

Ticket30406 refactor header constants #357

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 92 additions & 73 deletions sbws/lib/v3bwfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
TORFLOW_SCALING, SBWS_SCALING, TORFLOW_BW_MARGIN,
TORFLOW_OBS_LAST, TORFLOW_OBS_MEAN,
PROP276_ROUND_DIG, MIN_REPORT, MAX_BW_DIFF_PERC)

# Using Stem's code header constant to don't maintain 2 versions.
from .stem_bandwidth_file import HEADER_ATTR, _int
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please write CI tests for sbws that use stem 1.7.0 and stem master.

from sbws.lib.resultdump import ResultSuccess, _ResultType
from sbws.util.filelock import DirectoryLock
from sbws.util.timestamp import (now_isodt_str, unixts_to_isodt_str,
Expand All @@ -27,15 +30,14 @@
KEYVALUE_SEP_V1 = '='
KEYVALUE_SEP_V2 = ' '

# NOTE: in a future refactor make make all the KeyValues be a dictionary
# with their type, so that it's more similar to stem parser.

# NOTE: Start using Stem's header constant in 1.1.1-dev0.
#
# Header KeyValues
# =================
# List of the extra KeyValues accepted by the class
EXTRA_ARG_KEYVALUES = ['software', 'software_version', 'file_created',
'earliest_bandwidth', 'generator_started',
'scanner_country', 'destinations_countries']

# XXX: These constants need to be kept even if using Stem's header one.
# Unless Stem's will add a property type to the attributes.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this comment.
Please change the comment so it is clearer.

What does stem need to do?
Is there a stem ticket for this change?


# number_eligible_relays is the number that ends in the bandwidth file
# ie, have not been excluded by one of the filters in 4. below
# They should be call recent_measurement_included_count to be congruent
Expand All @@ -44,74 +46,82 @@
'number_consensus_relays', 'percent_eligible_relays',
'minimum_percent_eligible_relays']

# KeyValues that count the number of relays that are in the bandwidth file,
# but ignored by Tor when voting, because they do not have a
# measured bandwidth.
BW_HEADER_KEYVALUES_RECENT_MEASUREMENTS_EXCLUDED = [
# Number of relays that were measured but all the measurements failed
# because of network failures or it was
# not found a suitable helper relay
'recent_measurements_excluded_error_count',
# Number of relays that have successful measurements but the measurements
# were not away from each other in X time (by default 1 day).
'recent_measurements_excluded_near_count',
# Number of relays that have successful measurements and they are away from
# each other but they are not X time recent.
# By default this is 5 days, which is the same time the older
# the measurements can be by default.
'recent_measurements_excluded_old_count',
# Number of relays that have successful measurements and they are away from
# each other and recent
# but the number of measurements are less than X (by default 2).
'recent_measurements_excluded_few_count',
]
# Added in #29591
# NOTE: recent_consensus_count, recent_priority_list_count,
# recent_measurement_attempt_count and recent_priority_relay_count
# are not reset when the scanner is stop.
# They will accumulate the values since the scanner was ever started.
BW_HEADER_KEYVALUES_MONITOR = [
# 1.1 header: the number of different consensuses, that sbws has seen,
# since the last 5 days
'recent_consensus_count',
# 2.4 Number of times a priority list has been created
'recent_priority_list_count',
# 2.5 Number of relays that there were in a priority list
# [50, number of relays in the network * 0.05]
'recent_priority_relay_count',
# 3.6 header: the number of times that sbws has tried to measure any relay,
# since the last 5 days
# This would be the number of times a relays were in a priority list
'recent_measurement_attempt_count',
# 3.7 header: the number of times that sbws has tried to measure any relay,
# since the last 5 days, but it didn't work
# This should be the number of attempts - number of ResultSuccess -
# something else we don't know yet
# So far is the number of ResultError
'recent_measurement_failure_count',
# The time it took to report about half of the network.
'time_to_report_half_network',
] + BW_HEADER_KEYVALUES_RECENT_MEASUREMENTS_EXCLUDED
BANDWIDTH_HEADER_KEY_VALUES_INIT = \
['earliest_bandwidth', 'generator_started',
'scanner_country', 'destinations_countries']\
+ STATS_KEYVALUES \
+ BW_HEADER_KEYVALUES_MONITOR

KEYVALUES_INT = STATS_KEYVALUES + BW_HEADER_KEYVALUES_MONITOR
# List of all unordered KeyValues currently being used to generate the file
UNORDERED_KEYVALUES = EXTRA_ARG_KEYVALUES + STATS_KEYVALUES + \
['latest_bandwidth'] + \
BW_HEADER_KEYVALUES_MONITOR
# List of all the KeyValues currently being used to generate the file
ALL_KEYVALUES = ['version'] + UNORDERED_KEYVALUES
# # KeyValues that count the number of relays that are in the bandwidth file,
asn-d6 marked this conversation as resolved.
Show resolved Hide resolved
# # but ignored by Tor when voting, because they do not have a
# # measured bandwidth.
# BW_HEADER_KEYVALUES_RECENT_MEASUREMENTS_EXCLUDED = [
# # Number of relays that were measured but all the measurements failed
# # because of network failures or it was
# # not found a suitable helper relay
# 'recent_measurements_excluded_error_count',
# # Number of relays that have successful measurements but the measurements
# # were not away from each other in X time (by default 1 day).
# 'recent_measurements_excluded_near_count',
# # Number of relays that have successful measurements and they are away
# # from each other but they are not X time recent.
# # By default this is 5 days, which is the same time the older
# # the measurements can be by default.
# 'recent_measurements_excluded_old_count',
# # Number of relays that have successful measurements and they are away
# # from each other and recent
# # but the number of measurements are less than X (by default 2).
# 'recent_measurements_excluded_few_count',
# ]

# The stem's attribute name for these Keys starts with this string.
BW_HEADER_KEYVALUES_RECENT_MEASUREMENTS_EXCLUDED = \
[k for n, (k, _) in HEADER_ATTR.items()
if n.startswith("recent_stats.relay_failures.")]

# The commented code is kept so that
# it can be known what each Key means.

# # Added in #29591
# # NOTE: recent_consensus_count, recent_priority_list_count,
# # recent_measurement_attempt_count and recent_priority_relay_count
# # are not reset when the scanner is stop.
# # They will accumulate the values since the scanner was ever started.
# BW_HEADER_KEYVALUES_MONITOR = [
# # 1.1 header: the number of different consensuses, that sbws has seen,
# # since the last 5 days
# 'recent_consensus_count',
# # 2.4 Number of times a priority list has been created
# 'recent_priority_list_count',
# # 2.5 Number of relays that there were in a priority list
# # [50, number of relays in the network * 0.05]
# 'recent_priority_relay_count',
# # 3.6 header: the number of times that sbws has tried to measure any
# # relay,s ince the last 5 days
# # This would be the number of times a relays were in a priority list
# 'recent_measurement_attempt_count',
# # 3.7 header: the number of times that sbws has tried to measure any
# # relay, since the last 5 days, but it didn't work
# # This should be the number of attempts - number of ResultSuccess -
# # something else we don't know yet
# # So far is the number of ResultError
# 'recent_measurement_failure_count',
# # The time it took to report about half of the network.
# 'time_to_report_half_network',
# ] + BW_HEADER_KEYVALUES_RECENT_MEASUREMENTS_EXCLUDED
# BANDWIDTH_HEADER_KEY_VALUES_INIT = \
# ['earliest_bandwidth', 'generator_started',
# 'scanner_country', 'destinations_countries']\
# + STATS_KEYVALUES \
# + BW_HEADER_KEYVALUES_MONITOR

# XXX: The following constants are kept for compatibility with the generator/
# parser here. They should be replaced by Stem's one.
KEYVALUES_INT = [k for k, t in HEADER_ATTR.values() if t is _int]
# # List of all unordered KeyValues currently being used to generate the file
UNORDERED_KEYVALUES = [k for k, _ in HEADER_ATTR.values()]
UNORDERED_KEYVALUES.remove('version')

TERMINATOR = '====='

# Bandwidth Lines KeyValues
# =========================
# Num header lines in v1.X.X using all the KeyValues
NUM_LINES_HEADER_V1 = len(ALL_KEYVALUES) + 2
NUM_LINES_HEADER_V1 = len(HEADER_ATTR) + 2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why 2?
Please make 2 into a named constant.

LINE_TERMINATOR = TERMINATOR + LINE_SEP

# KeyValue separator in Bandwidth Lines
Expand Down Expand Up @@ -250,20 +260,28 @@ class V3BWHeader(object):
when the generator started
"""
def __init__(self, timestamp, **kwargs):
# XXX: do not convert all the arguments to string here, convert them
# only in __str__ and check their actual types.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment explains what the code is doing.
But why are the types important?

assert isinstance(timestamp, str)
for v in kwargs.values():
assert isinstance(v, str)
self.timestamp = timestamp
# KeyValues with default value when not given by kwargs

# Initialize all defined attrs,
# XXX: Since using the stem's name for the attributes will require
# other refactoring here, use the name in the actual file.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is "the name in the actual file"?
Do you mean "the key names from the bandwidth file spec"?

header_keys = [k for k, _ in HEADER_ATTR.values()]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not easy to understand if this code is refactoring or changes the behavior of sbws (adds/removes headers).

Could we have a unittest which makes sure that the old code and the new code will output the exact same thing? Perhaps instead of commenting out the code above, you can move it to the tests directory and make a unittest that checks the output of the old with the new code. Would this make sense?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not easy to understand if this code is refactoring or changes the behavior of sbws (adds/removes headers).

Could we have a unittest which makes sure that the old code and the new code will output the exact same thing? Perhaps instead of commenting out the code above, you can move it to the tests directory and make a unittest that checks the output of the old with the new code. Would this make sense?

Makes sense but then need to keep this old code in the test.
Stem has already tests for all the bandwidth files versions using data generated from sbws.
I checked, and including the stem's tests would require adding many more files.

So i came out with other solution that it's something i wanted to do time ago: include real measurements and bandwidth files generated with the test network for the tests.

So i run the integration tests to generate those files with master branch, then i created a test that generates the bandwidth file from the previous results and compare that the bandwidth file is the same.

I'm not pushing these changes yet until knowing your opinion. If you don't like that solution will open other ticket to include them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other way of ensuring that i didn't accidentally change functionality is the fact that i didn't have to change any test for this PR. It's true though that i don't think there're unit tests checking all the headers, but the previous idea would solve that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pushed the new test

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this new test very much. I think it's a good step.
I added a few comments while I was trying to understand what the test does.

[setattr(self, k, v) for k, v in kwargs.items() if k in header_keys]

# Then initialize the ones with defaults, if they weren't already.
# Ideally these defaults could be defined in the HEADER_ATTR constant.
self.version = kwargs.get('version', SPEC_VERSION)
self.software = kwargs.get('software', 'sbws')
self.software_version = kwargs.get('software_version', __version__)
self.file_created = kwargs.get('file_created', now_isodt_str())
# latest_bandwidth should not be in kwargs, since it MUST be the
# same as timestamp
self.latest_bandwidth = unixts_to_isodt_str(timestamp)
[setattr(self, k, v) for k, v in kwargs.items()
if k in BANDWIDTH_HEADER_KEY_VALUES_INIT]

def __str__(self):
if self.version.startswith('1.'):
Expand Down Expand Up @@ -337,9 +355,10 @@ def from_lines_v1(cls, lines):
log.warn('Terminator is not in lines')
return None
ts = lines[0]
header_keys = [k for k, _ in HEADER_ATTR.values()]
kwargs = dict([l.split(KEYVALUE_SEP_V1)
for l in lines[:index_terminator]
if l.split(KEYVALUE_SEP_V1)[0] in ALL_KEYVALUES])
if l.split(KEYVALUE_SEP_V1)[0] in header_keys])
h = cls(ts, **kwargs)
# last line is new line
return h, lines[index_terminator + 1:-1]
Expand Down