Skip to content

Commit

Permalink
Update verbosity in read_csv
Browse files Browse the repository at this point in the history
  • Loading branch information
mrueda committed Jul 18, 2024
1 parent 9a4e996 commit 5453941
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 16 deletions.
4 changes: 2 additions & 2 deletions lib/Convert/Pheno.pm
Original file line number Diff line number Diff line change
Expand Up @@ -434,15 +434,15 @@ sub omop2bff {
# We read all tables in memory
say $msg if ( $self->{verbose} || $self->{debug} );
$data->{$table_name} =
read_csv( { in => $file, sep => $self->{sep}} );
read_csv( { in => $file, sep => $self->{sep}, self => $self } );
}

# --stream
else {
if ( any { $_ eq $table_name } @stream_ram_memory_tables ) {
say $msg if ( $self->{verbose} || $self->{debug} );
$data->{$table_name} =
read_csv( { in => $file, sep => $self->{sep} } );
read_csv( { in => $file, sep => $self->{sep}, self => $self } );
}
else {
push @filepaths, $file;
Expand Down
63 changes: 49 additions & 14 deletions lib/Convert/Pheno/IO/CSVHandler.pm
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use Convert::Pheno::Schema;
use Convert::Pheno::Mapping;
use Exporter 'import';
our @EXPORT =
qw(read_csv read_csv_stream read_redcap_dict_file read_mapping_file read_sqldump read_sqldump_stream sqldump2csv transpose_omop_data_structure write_csv open_filehandle load_exposures get_headers convert_table_aoh_to_hoh to_gb);
qw(read_csv read_csv_stream read_redcap_dict_file read_mapping_file read_sqldump read_sqldump_stream sqldump2csv transpose_omop_data_structure write_csv open_filehandle load_exposures get_headers convert_table_aoh_to_hoh);

use constant DEVEL_MODE => 0;

Expand Down Expand Up @@ -116,10 +116,9 @@ sub read_mapping_file {

sub read_sqldump {

my $arg = shift;
my $filepath = $arg->{in};
my $self = $arg->{self};
my $print_interval = 1_000;
my $arg = shift;
my $filepath = $arg->{in};
my $self = $arg->{self};

# Before resorting to writing this subroutine I performed an exhaustive search on CPAN:
# - Tested MySQL::Dump::Parser::XS but I could not make it work...
Expand All @@ -141,6 +140,9 @@ sub read_sqldump {
# Start reading the SQL dump
my $fh = open_filehandle( $filepath, 'r' );

# Determine the print interval based on file size
my $print_interval = get_print_interval($filepath);

# We'll store the data in the hashref $data
my $data = {};

Expand Down Expand Up @@ -255,14 +257,13 @@ sub read_sqldump {

sub read_sqldump_stream {

my $arg = shift;
my $filein = $arg->{in};
my $self = $arg->{self};
my $person = $arg->{person};
my $fileout = $self->{out_file};
my $table_name = $self->{omop_tables}[0];
my $table_name_lc = lc($table_name);
my $print_interval = 10_000;
my $arg = shift;
my $filein = $arg->{in};
my $self = $arg->{self};
my $person = $arg->{person};
my $fileout = $self->{out_file};
my $table_name = $self->{omop_tables}[0];
my $table_name_lc = lc($table_name);

# Define variables that modify what we load
my $max_lines_sql = $self->{max_lines_sql};
Expand All @@ -271,6 +272,9 @@ sub read_sqldump_stream {
my $fh_in = open_filehandle( $filein, 'r' );
my $fh_out = open_filehandle( $fileout, 'a' );

# Determine the print interval based on file size
my $print_interval = get_print_interval($filein);

# Start printing the array
#say $fh_out "[";

Expand Down Expand Up @@ -542,6 +546,7 @@ sub read_csv {
my $arg = shift;
my $filepath = $arg->{in};
my $sep = $arg->{sep};
my $self = exists $arg->{self} ? $arg->{self} : { verbose => 0 };

# Define split record separator from file extension
my ( $separator, $encoding ) = define_separator( $filepath, $sep );
Expand Down Expand Up @@ -569,6 +574,9 @@ sub read_csv {
# Open fh
my $fh = open_filehandle( $filepath, 'r' );

# Determine the print interval based on file size
my $print_interval = get_print_interval($filepath);

# Get headers
my $headers = $csv->getline($fh);
$csv->column_names(@$headers);
Expand All @@ -578,23 +586,37 @@ sub read_csv {
"Are you sure you are using the right --sep <$separator> for your data?\n"
if is_separator_incorrect($headers);


# Load data
my @aoh;
my $count = 0;
while ( my $row = $csv->getline_hr($fh) ) {
push @aoh, $row;
$count++;

say "Rows read: $count"
if ( $self->{verbose} && $count % $print_interval == 0 );

}

# Close fh
close $fh;

# Print if verbose
print
"==========================\nRows read (total): $count\n==========================\n\n"
if $self->{verbose};

# Coercing the data before returning it
for my $item (@aoh) {
for my $key ( @{$headers} ) {
$item->{$key} = dotify_and_coerce_number( $item->{$key} );
}
}

# RAM usage
say ram_usage_str( "read_csv($filepath)", \@aoh )
if ( DEVEL_MODE || $self->{verbose} );

# Return data
return \@aoh;
}
Expand Down Expand Up @@ -929,4 +951,17 @@ sub convert_table_aoh_to_hoh {
return $hoh;
}

sub get_print_interval {

    # Decide how often (in rows/lines read) progress should be reported
    # for the file at $filepath, based on its on-disk size.
    #
    # Parameter: $filepath - path to the file about to be read.
    # Returns:   10_000 for files larger than 10 MB, 1_000 otherwise.
    my $filepath = shift;

    # Determine file size. -s yields undef when the file is missing or
    # unreadable; default to 0 so the numeric comparison below does not
    # emit an "uninitialized value" warning under 'use warnings'.
    my $file_size = ( -s $filepath ) // 0;

    # Set print interval based on file size (threshold: 10 MB)
    my $print_interval = $file_size > 10 * 1024 * 1024 ? 10_000 : 1_000;

    return $print_interval;
}

1;

0 comments on commit 5453941

Please sign in to comment.