diff --git a/lib/Convert/Pheno.pm b/lib/Convert/Pheno.pm index f0f3042..abc46ea 100644 --- a/lib/Convert/Pheno.pm +++ b/lib/Convert/Pheno.pm @@ -434,7 +434,7 @@ sub omop2bff { # We read all tables in memory say $msg if ( $self->{verbose} || $self->{debug} ); $data->{$table_name} = - read_csv( { in => $file, sep => $self->{sep}} ); + read_csv( { in => $file, sep => $self->{sep}, self => $self } ); } # --stream @@ -442,7 +442,7 @@ sub omop2bff { if ( any { $_ eq $table_name } @stream_ram_memory_tables ) { say $msg if ( $self->{verbose} || $self->{debug} ); $data->{$table_name} = - read_csv( { in => $file, sep => $self->{sep} } ); + read_csv( { in => $file, sep => $self->{sep}, self => $self } ); } else { push @filepaths, $file; diff --git a/lib/Convert/Pheno/IO/CSVHandler.pm b/lib/Convert/Pheno/IO/CSVHandler.pm index a21f1ea..4e73f4e 100755 --- a/lib/Convert/Pheno/IO/CSVHandler.pm +++ b/lib/Convert/Pheno/IO/CSVHandler.pm @@ -21,7 +21,7 @@ use Convert::Pheno::Schema; use Convert::Pheno::Mapping; use Exporter 'import'; our @EXPORT = - qw(read_csv read_csv_stream read_redcap_dict_file read_mapping_file read_sqldump read_sqldump_stream sqldump2csv transpose_omop_data_structure write_csv open_filehandle load_exposures get_headers convert_table_aoh_to_hoh to_gb); + qw(read_csv read_csv_stream read_redcap_dict_file read_mapping_file read_sqldump read_sqldump_stream sqldump2csv transpose_omop_data_structure write_csv open_filehandle load_exposures get_headers convert_table_aoh_to_hoh); use constant DEVEL_MODE => 0; @@ -116,10 +116,9 @@ sub read_mapping_file { sub read_sqldump { - my $arg = shift; - my $filepath = $arg->{in}; - my $self = $arg->{self}; - my $print_interval = 1_000; + my $arg = shift; + my $filepath = $arg->{in}; + my $self = $arg->{self}; # Before resorting to writting this subroutine I performed an exhaustive search on CPAN: # - Tested MySQL::Dump::Parser::XS but I could not make it work... @@ -141,6 +140,9 @@ sub read_sqldump { # Start reading the SQL dump my $fh = open_filehandle( $filepath, 'r' ); + # Determine the print interval based on file size + my $print_interval = get_print_interval($filepath); + # We'll store the data in the hashref $data my $data = {}; @@ -255,14 +257,13 @@ sub read_sqldump { sub read_sqldump_stream { - my $arg = shift; - my $filein = $arg->{in}; - my $self = $arg->{self}; - my $person = $arg->{person}; - my $fileout = $self->{out_file}; - my $table_name = $self->{omop_tables}[0]; - my $table_name_lc = lc($table_name); - my $print_interval = 10_000; + my $arg = shift; + my $filein = $arg->{in}; + my $self = $arg->{self}; + my $person = $arg->{person}; + my $fileout = $self->{out_file}; + my $table_name = $self->{omop_tables}[0]; + my $table_name_lc = lc($table_name); # Define variables that modify what we load my $max_lines_sql = $self->{max_lines_sql}; @@ -271,6 +272,9 @@ sub read_sqldump_stream { my $fh_in = open_filehandle( $filein, 'r' ); my $fh_out = open_filehandle( $fileout, 'a' ); + # Determine the print interval based on file size + my $print_interval = get_print_interval($filein); + # Start printing the array #say $fh_out "["; @@ -542,6 +546,7 @@ sub read_csv { my $arg = shift; my $filepath = $arg->{in}; my $sep = $arg->{sep}; + my $self = exists $arg->{self} ? $arg->{self} : { verbose => 0 }; # Define split record separator from file extension my ( $separator, $encoding ) = define_separator( $filepath, $sep ); @@ -569,6 +574,9 @@ sub read_csv { # Open fh my $fh = open_filehandle( $filepath, 'r' ); + # Determine the print interval based on file size + my $print_interval = get_print_interval($filepath); + # Get headers my $headers = $csv->getline($fh); $csv->column_names(@$headers); @@ -578,16 +586,26 @@ sub read_csv { "Are you sure you are using the right --sep <$separator> for your data?\n" if is_separator_incorrect($headers); - # Load data my @aoh; + my $count = 0; while ( my $row = $csv->getline_hr($fh) ) { push @aoh, $row; + $count++; + + say "Rows read: $count" + if ( $self->{verbose} && $count % $print_interval == 0 ); + } # Close fh close $fh; + # Print if verbose + print +"==========================\nRows read (total): $count\n==========================\n\n" + if $self->{verbose}; + # Coercing the data before returning it for my $item (@aoh) { for my $key ( @{$headers} ) { @@ -595,6 +613,10 @@ sub read_csv { } } + # RAM usage + say ram_usage_str( "read_csv($filepath)", \@aoh ) + if ( DEVEL_MODE || $self->{verbose} ); + # Return data return \@aoh; } @@ -929,4 +951,17 @@ sub convert_table_aoh_to_hoh { return $hoh; } +sub get_print_interval { + + my $filepath = shift; + + # Determine file size + my $file_size = -s $filepath; + + # Set print interval based on file size (threshold: 10 MB) + my $print_interval = $file_size > 10 * 1024 * 1024 ? 10_000 : 1_000; + + return $print_interval; +} + 1;