From 66834a52af2ba4f5218d72fe74360d173b90aa6a Mon Sep 17 00:00:00 2001 From: Naji Obeid Date: Thu, 19 Oct 2023 04:31:21 +0000 Subject: [PATCH] update test-data readme and delete old script --- test-data/README.md | 42 ++- test-data/write-test-data.pl | 681 ----------------------------------- 2 files changed, 28 insertions(+), 695 deletions(-) delete mode 100755 test-data/write-test-data.pl diff --git a/test-data/README.md b/test-data/README.md index 596bd76..7931168 100644 --- a/test-data/README.md +++ b/test-data/README.md @@ -1,15 +1,10 @@ -The -[write-test-data.pl](https://github.com/maxmind/MaxMind-DB/blob/main/test-data/write-test-data.pl) -script will create a small set of test databases with a variety of data and -record sizes (24, 28, & 32 bit). +## How to generate test data +Use the [write-test-data](https://github.com/maxmind/MaxMind-DB/blob/main/cmd/write-test-data) +go tool to create a small set of test databases with a variety of data and +record sizes. These test databases are useful for testing code that reads MaxMind DB files. -There is also a `maps-with-pointers.raw` file. This contains the raw output of -the MaxMind::DB::Writer::Serializer module, when given a series of maps which -share some keys and values. It is used to test that decoder code can handle -pointers to map keys and values, as well as to the whole map. - There are several ways to figure out what IP addresses are actually in the test databases. You can take a look at the [source-data directory](https://github.com/maxmind/MaxMind-DB/tree/main/source-data) @@ -21,8 +16,27 @@ You can also use the in the [MaxMind-DB-Reader-perl repository](https://github.com/maxmind/MaxMind-DB-Reader-perl). -Some databases are intentionally broken and cannot be dumped. You can look at -the -[script which generates these databases](https://github.com/maxmind/MaxMind-DB/blob/main/test-data/write-test-data.pl) -to see what IP addresses they include, which will be necessary for those -databases which cannot be dumped because they contain intentional errors. +## Static test data +Some of the test files are remnants of the +[old perl test data writer](https://github.com/maxmind/MaxMind-DB/blob/f0a85c671c5b6e9c5e514bd66162724ee1dedea3/test-data/write-test-data.pl) +and cannot be generated with the go tool. These databases are intentionally broken, +and exploited functionality simply not available in the go mmdbwriter: + +- MaxMind-DB-test-broken-pointers-24.mmdb +- MaxMind-DB-test-broken-search-tree-24.mmdb +- MaxMind-DB-test-pointer-decoder.mmdb +- GeoIP2-City-Test-Broken-Double-Format.mmdb +- GeoIP2-City-Test-Invalid-Node-Count.mmdb +- maps-with-pointers.raw + +## Usage +``` +Usage of ./write-test-data: + -source string + Source data directory + -target string + Destination directory for the generated mmdb files +``` + +Example: +`./write-test-data --source ../../source-data --target ../../test-data` diff --git a/test-data/write-test-data.pl b/test-data/write-test-data.pl deleted file mode 100755 index ee34943..0000000 --- a/test-data/write-test-data.pl +++ /dev/null @@ -1,681 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; -use autodie; -use utf8; - -use Cwd qw( abs_path ); -use File::Basename qw( dirname ); -use File::Slurper qw( read_binary write_binary ); -use Cpanel::JSON::XS 4.16 qw( decode_json ); -use Math::Int128 qw( MAX_UINT128 string_to_uint128 uint128 ); -use MaxMind::DB::Writer::Serializer 0.100004; -use MaxMind::DB::Writer::Tree 0.100004; -use MaxMind::DB::Writer::Util qw( key_for_data ); -use Net::Works::Network (); -use Test::MaxMind::DB::Common::Util qw( standard_test_metadata ); - -my $Dir = dirname( abs_path($0) ); - -sub main { - my @sizes = ( 24, 28, 32 ); - my @ipv4_range = ( '1.1.1.1', '1.1.1.32' ); - - my @ipv4_subnets = Net::Works::Network->range_as_subnets(@ipv4_range); - for my $record_size (@sizes) { - write_test_db( - $record_size, \@ipv4_subnets, { ip_version => 4 }, - 'ipv4', - ); - } - - write_broken_pointers_test_db( - 24, \@ipv4_subnets, { ip_version => 4 }, - 'broken-pointers', - ); - - write_broken_search_tree_db( - 24, \@ipv4_subnets, { ip_version => 4 }, - 'broken-search-tree', - ); - - my @ipv6_subnets = Net::Works::Network->range_as_subnets( - '::1:ffff:ffff', - '::2:0000:0059' - ); - - for my $record_size (@sizes) { - write_test_db( - $record_size, \@ipv6_subnets, { ip_version => 6 }, - 'ipv6', - ); - - write_test_db( - $record_size, - [ - @ipv6_subnets, - Net::Works::Network->range_as_subnets( @ipv4_range, 6 ), - ], - { ip_version => 6 }, - 'mixed', - ); - } - - write_decoder_test_db(); - write_pointer_decoder_test_db(); - write_deeply_nested_structures_db(); - - write_geoip2_dbs(); - write_broken_geoip2_city_db(); - write_invalid_node_count(); - - write_no_ipv4_tree_db(); - - write_no_map_db( \@ipv4_subnets ); - - write_test_serialization_data(); - - write_db_with_metadata_pointers(); -} - -sub write_broken_pointers_test_db { - no warnings 'redefine'; - - my $orig_store_data = MaxMind::DB::Writer::Serializer->can('store_data'); - - # This breaks the value of the record for the 1.1.1.32 network, causing it - # to point outside the database. - local *MaxMind::DB::Writer::Serializer::store_data = sub { - my $data_pointer = shift->$orig_store_data(@_); - my $value = $_[1]; - if ( ref($value) eq 'HASH' - && exists $value->{ip} - && $value->{ip} eq '1.1.1.32' ) { - - $data_pointer += 100_000; - } - return $data_pointer; - }; - - # The next hack will poison the data section for the 1.1.16/28 subnet - # value. It's value will be a pointer that resolves to an offset outside - # the database. - - my $key_to_poison = key_for_data( { ip => '1.1.1.16' } ); - - my $orig_position_for_data - = MaxMind::DB::Writer::Serializer->can('_position_for_data'); - local *MaxMind::DB::Writer::Serializer::_position_for_data = sub { - my $key = $_[1]; - - if ( $key eq $key_to_poison ) { - return 1_000_000; - } - else { - return shift->$orig_position_for_data(@_); - } - }; - - write_test_db(@_); - - return; -} - -sub write_broken_search_tree_db { - my $filename = ( write_test_db(@_) )[1]; - - my $content = read_binary($filename); - - # This causes the right record of the first node to be 0, meaning it - # points back to the top of the tree. This should never happen in a - # database that follows the spec. - substr( $content, 5, 1 ) = "\0"; - write_binary( $filename, $content ); - - return; -} - -sub write_test_db { - my $record_size = shift; - my $subnets = shift; - my $metadata = shift; - my $ip_version_name = shift; - - my $writer = MaxMind::DB::Writer::Tree->new( - ip_version => $subnets->[0]->version(), - record_size => $record_size, - alias_ipv6_to_ipv4 => ( $subnets->[0]->version() == 6 ? 1 : 0 ), - map_key_type_callback => sub { 'utf8_string' }, - standard_test_metadata(), - %{$metadata}, - ); - - for my $subnet ( @{$subnets} ) { - $writer->insert_network( - $subnet, - { ip => $subnet->first()->as_string() } - ); - } - - my $filename = sprintf( - "$Dir/MaxMind-DB-test-%s-%i.mmdb", - $ip_version_name, $record_size, - ); - open my $fh, '>', $filename; - - $writer->write_tree($fh); - - close $fh; - - return ( $writer, $filename ); -} - -{ - # We will store this once for each subnet so we will also be testing - # pointers, since the serializer will generate a pointer to this - # structure. - my %all_types = ( - utf8_string => 'unicode! ☯ - ♫', - double => 42.123456, - bytes => pack( 'N', 42 ), - uint16 => 100, - uint32 => 2**28, - int32 => -1 * ( 2**28 ), - uint64 => uint128(1) << 60, - uint128 => uint128(1) << 120, - array => [ 1, 2, 3, ], - map => { - mapX => { - utf8_stringX => 'hello', - arrayX => [ 7, 8, 9 ], - }, - }, - boolean => 1, - float => 1.1, - ); - - my %all_types_0 = ( - utf8_string => q{}, - double => 0, - bytes => q{}, - uint16 => 0, - uint32 => 0, - int32 => 0, - uint64 => uint128(0), - uint128 => uint128(0), - array => [], - map => {}, - boolean => 0, - float => 0, - ); - - # We limit this to numeric types as the other types would generate - # very large databases - my %numeric_types_max = ( - double => 'Inf', - float => 'Inf', - int32 => 0x7fffffff, - uint16 => 0xffff, - uint32 => string_to_uint128('0xffff_ffff'), - uint64 => string_to_uint128('0xffff_ffff_ffff_ffff'), - uint128 => MAX_UINT128, - ); - - sub write_decoder_test_db { - my $writer = _decoder_writer(); - - my @subnets - = map { Net::Works::Network->new_from_string( string => $_ ) } - qw( - ::1.1.1.0/120 - ::2.2.0.0/112 - ::3.0.0.0/104 - ::4.5.6.7/128 - abcd::/64 - 1000::1234:0000/112 - ); - - for my $subnet (@subnets) { - $writer->insert_network( $subnet, \%all_types, ); - } - - $writer->insert_network( - Net::Works::Network->new_from_string( string => '::0.0.0.0/128' ), - \%all_types_0, - ); - - $writer->insert_network( - Net::Works::Network->new_from_string( - string => '::255.255.255.255/128' - ), - \%numeric_types_max, - ); - - open my $fh, '>', "$Dir/MaxMind-DB-test-decoder.mmdb"; - $writer->write_tree($fh); - close $fh; - - return; - } - - sub write_pointer_decoder_test_db { - - # We want to create a database where most values are pointers - no warnings 'redefine'; - local *MaxMind::DB::Writer::Serializer::_should_cache_value - = sub { 1 }; - my $writer = _decoder_writer(); - - # We add these slightly different records so that we end up with - # pointers for the individual values in the maps, not just pointers - # to the map - $writer->insert_network( - '1.0.0.0/32', - { - %all_types, - booleanX => 0, - arrayX => [ 1, 2, 3, 4, ], - mapXX => { - utf8_stringX => 'hello', - arrayX => [ 7, 8, 9, 10 ], - booleanX => 0, - }, - }, - ); - - $writer->insert_network( - '1.1.1.0/32', - { - %all_types, - - # This has to be 0 rather than 1 as otherwise the buggy - # Perl writer will think it is the same as an uint32 value of - # 1 and make a pointer to a value of a different type. - boolean => 0, - }, - ); - - open my $fh, '>', "$Dir/MaxMind-DB-test-pointer-decoder.mmdb"; - $writer->write_tree($fh); - close $fh; - - return; - } - - sub _decoder_writer { - return MaxMind::DB::Writer::Tree->new( - ip_version => 6, - record_size => 24, - database_type => 'MaxMind DB Decoder Test', - languages => ['en'], - description => { - en => - 'MaxMind DB Decoder Test database - contains every MaxMind DB data type', - }, - alias_ipv6_to_ipv4 => 1, - remove_reserved_networks => 0, - map_key_type_callback => sub { - my $key = $_[0]; - $key =~ s/X*$//; - return $key eq 'array' ? [ 'array', 'uint32' ] : $key; - }, - ); - } -} - -{ - my %nested = ( - map1 => { - map2 => { - array => [ - { - map3 => { a => 1, b => 2, c => 3 }, - }, - ], - }, - }, - ); - - sub write_deeply_nested_structures_db { - my $writer = MaxMind::DB::Writer::Tree->new( - ip_version => 6, - record_size => 24, - ip_version => 6, - database_type => 'MaxMind DB Nested Data Structures', - languages => ['en'], - description => { - en => - 'MaxMind DB Nested Data Structures Test database - contains deeply nested map/array structures', - }, - alias_ipv6_to_ipv4 => 1, - map_key_type_callback => sub { - my $key = shift; - return - $key =~ /^map/ ? 'map' - : $key eq 'array' ? [ 'array', 'map' ] - : 'uint32'; - } - ); - - my @subnets - = map { Net::Works::Network->new_from_string( string => $_ ) } - qw( - ::1.1.1.0/120 - ::2.2.0.0/112 - ::3.0.0.0/104 - ::4.5.6.7/128 - abcd::/64 - 1000::1234:0000/112 - ); - - for my $subnet (@subnets) { - $writer->insert_network( $subnet, \%nested, ); - } - - open my $fh, '>', "$Dir/MaxMind-DB-test-nested.mmdb"; - $writer->write_tree($fh); - close $fh; - - return; - } -} - -sub write_geoip2_dbs { - _write_geoip2_db( @{$_}[ 0, 1 ], 'Test' ) - for ( - [ 'GeoIP2-Anonymous-IP', {} ], - ['GeoIP2-City'], - ['GeoIP2-Connection-Type'], - ['GeoIP2-Country'], - ['GeoIP2-DensityIncome'], - ['GeoIP2-Domain'], - ['GeoIP2-Enterprise'], - ['GeoIP2-ISP'], - ['GeoIP2-Precision-Enterprise'], - ['GeoIP2-Static-IP-Score'], - ['GeoIP2-User-Count'], - ['GeoLite2-ASN'], - ['GeoLite2-City'], - ['GeoLite2-Country'], - ); -} - -sub write_broken_geoip2_city_db { - no warnings 'redefine'; - - # This is how we _used_ to encode doubles. Storing them this way with the - # current reader tools can lead to weird errors. This broken database is a - # good way to test the robustness of reader code in the face of broken - # databases. - local *MaxMind::DB::Writer::Serializer::_encode_double = sub { - my $self = shift; - my $value = shift; - - $self->_simple_encode( double => $value ); - }; - - _write_geoip2_db( 'GeoIP2-City', 0, 'Test Broken Double Format' ); -} - -sub write_invalid_node_count { - no warnings 'redefine'; - local *MaxMind::DB::Writer::Tree::node_count = sub { 100000 }; - - _write_geoip2_db( 'GeoIP2-City', 0, 'Test Invalid Node Count' ); -} - -sub _universal_map_key_type_callback { - my $map = { - - # languages - de => 'utf8_string', - en => 'utf8_string', - es => 'utf8_string', - fr => 'utf8_string', - ja => 'utf8_string', - 'pt-BR' => 'utf8_string', - ru => 'utf8_string', - 'zh-CN' => 'utf8_string', - - # production - accuracy_radius => 'uint16', - autonomous_system_number => 'uint32', - autonomous_system_organization => 'utf8_string', - average_income => 'uint32', - city => 'map', - code => 'utf8_string', - confidence => 'uint16', - connection_type => 'utf8_string', - continent => 'map', - country => 'map', - domain => 'utf8_string', - geoname_id => 'uint32', - ip_risk => 'double', - ipv4_24 => 'uint32', - ipv4_32 => 'uint32', - ipv6_32 => 'uint32', - ipv6_48 => 'uint32', - ipv6_64 => 'uint32', - is_anonymous => 'boolean', - is_anonymous_proxy => 'boolean', - is_anonymous_vpn => 'boolean', - is_hosting_provider => 'boolean', - is_in_european_union => 'boolean', - is_legitimate_proxy => 'boolean', - is_public_proxy => 'boolean', - is_residential_proxy => 'boolean', - is_satellite_provider => 'boolean', - is_tor_exit_node => 'boolean', - iso_code => 'utf8_string', - isp => 'utf8_string', - latitude => 'double', - location => 'map', - longitude => 'double', - metro_code => 'uint16', - mobile_country_code => 'utf8_string', - mobile_network_code => 'utf8_string', - names => 'map', - organization => 'utf8_string', - population_density => 'uint32', - postal => 'map', - registered_country => 'map', - represented_country => 'map', - score => 'double', - static_ip_score => 'double', - subdivisions => [ 'array', 'map' ], - time_zone => 'utf8_string', - traits => 'map', - traits => 'map', - type => 'utf8_string', - user_type => 'utf8_string', - - # for testing only - foo => 'utf8_string', - bar => 'utf8_string', - buzz => 'utf8_string', - our_value => 'utf8_string', - }; - - my $callback = sub { - my $key = shift; - - return $map->{$key} || die <<"ERROR"; -Unknown tree key '$key'. - -The universal_map_key_type_callback doesn't know what type to use for the passed -key. If you are adding a new key that will be used in a frozen tree / mmdb then -you should update the mapping in both our internal code and here. -ERROR - }; - - return $callback; -} - -sub _write_geoip2_db { - my $type = shift; - my $populate_all_networks_with_data = shift; - my $description = shift; - - my $writer = MaxMind::DB::Writer::Tree->new( - ip_version => 6, - record_size => 28, - ip_version => 6, - database_type => $type, - languages => [ 'en', $type eq 'GeoIP2-City' ? ('zh') : () ], - description => { - en => ( $type =~ s/-/ /gr ) - . " $description Database (fake GeoIP2 data, for example purposes only)", - $type eq 'GeoIP2-City' ? ( zh => '小型数据库' ) : (), - }, - alias_ipv6_to_ipv4 => 1, - map_key_type_callback => _universal_map_key_type_callback(), - ); - - _populate_all_networks( $writer, $populate_all_networks_with_data ) - if $populate_all_networks_with_data; - - my $value = shift; - my $nodes - = decode_json( read_binary("$Dir/../source-data/$type-Test.json") ); - - for my $node (@$nodes) { - for my $network ( keys %$node ) { - $writer->insert_network( - Net::Works::Network->new_from_string( string => $network ), - $node->{$network} - ); - } - } - - my $suffix = $description =~ s/ /-/gr; - open my $output_fh, '>', "$Dir/$type-$suffix.mmdb"; - $writer->write_tree($output_fh); - close $output_fh; - - return; -} - -sub _populate_all_networks { - my $writer = shift; - my $data = shift; - - my $max_uint128 = uint128(0) - 1; - my @networks = Net::Works::Network->range_as_subnets( - Net::Works::Address->new_from_integer( - integer => 0, - version => 6, - ), - Net::Works::Address->new_from_integer( - integer => $max_uint128, - version => 6, - ), - ); - - for my $network (@networks) { - $writer->insert_network( $network => $data ); - } -} - -sub write_no_ipv4_tree_db { - my $subnets = shift; - - my $writer = MaxMind::DB::Writer::Tree->new( - ip_version => 6, - record_size => 24, - ip_version => 6, - database_type => 'MaxMind DB No IPv4 Search Tree', - languages => ['en'], - description => { - en => 'MaxMind DB No IPv4 Search Tree', - }, - remove_reserved_networks => 0, - root_data_type => 'utf8_string', - map_key_type_callback => sub { {} }, - ); - - my $subnet = Net::Works::Network->new_from_string( string => '::/64' ); - $writer->insert_network( $subnet, $subnet->as_string() ); - - open my $output_fh, '>', "$Dir/MaxMind-DB-no-ipv4-search-tree.mmdb"; - $writer->write_tree($output_fh); - close $output_fh; - - return; -} - -# The point of this database is to provide something where we can test looking -# up a single value. In other words, each IP address points to a non-compound -# value, a string rather than a map or array. -sub write_no_map_db { - my $subnets = shift; - - my $writer = MaxMind::DB::Writer::Tree->new( - ip_version => 4, - record_size => 24, - database_type => 'MaxMind DB String Value Entries', - languages => ['en'], - description => { - en => - 'MaxMind DB String Value Entries (no maps or arrays as values)', - }, - root_data_type => 'utf8_string', - map_key_type_callback => sub { {} }, - ); - - for my $subnet ( @{$subnets} ) { - $writer->insert_network( $subnet, $subnet->as_string() ); - } - - open my $output_fh, '>', "$Dir/MaxMind-DB-string-value-entries.mmdb"; - $writer->write_tree($output_fh); - close $output_fh; - - return; -} - -sub write_test_serialization_data { - my $serializer = MaxMind::DB::Writer::Serializer->new( - map_key_type_callback => sub { 'utf8_string' } ); - - $serializer->store_data( map => { long_key => 'long_value1' } ); - $serializer->store_data( map => { long_key => 'long_value2' } ); - $serializer->store_data( map => { long_key2 => 'long_value1' } ); - $serializer->store_data( map => { long_key2 => 'long_value2' } ); - $serializer->store_data( map => { long_key => 'long_value1' } ); - $serializer->store_data( map => { long_key2 => 'long_value2' } ); - - open my $fh, '>', "$Dir/maps-with-pointers.raw"; - print {$fh} ${ $serializer->buffer() } - or die "Cannot write to maps-with-pointers.raw: $!"; - close $fh; - - return; -} - -sub write_db_with_metadata_pointers { - my $repeated_string = 'Lots of pointers in metadata'; - my $writer = MaxMind::DB::Writer::Tree->new( - ip_version => 6, - record_size => 24, - map_key_type_callback => sub { 'utf8_string' }, - database_type => $repeated_string, - languages => [ 'en', 'es', 'zh' ], - description => { - en => $repeated_string, - es => $repeated_string, - zh => $repeated_string, - }, - - ); - - _populate_all_networks( $writer, {} ); - - open my $fh, '>', "$Dir/MaxMind-DB-test-metadata-pointers.mmdb"; - - $writer->write_tree($fh); - - close $fh; -} - -main();