Skip to content

Simple benchmarking notes

Lalaland edited this page Oct 25, 2022 · 6 revisions

Time, peak memory, and on-disk size for performing the following operation on the 1% extract:

# Histogram: number of events with a non-None value -> number of patients
# that have exactly that many such events.
lengths = collections.defaultdict(int)
for patient in patients:
    num_valued = sum(1 for event in patient.events if event.value is not None)
    lengths[num_valued] += 1
| Approach | Time | Memory | Disk |
| --- | --- | --- | --- |
| C++ in memory | 0.46 seconds | 0.2 GB | 4.6 GB |
| Python in memory | 4.58 seconds | 26 GB | 5.5 GB |
| Python in C++ database | 136 seconds | 3.9 GB | 4.6 GB |
| Python in pickle database | 69 seconds | 5.8 GB | 5.5 GB |

Code for each result

C++ in memory

#include "absl/container/flat_hash_map.h"
#include "database.hh"

// Filesystem location of the extract that main() opens as a PatientDatabase.
boost::filesystem::path extract =
    "/local-scratch/nigam/projects/ethanid/piton/target/";

int main() {
    PatientDatabase database(extract, true);

    absl::flat_hash_map<uint32_t, uint32_t> length_counts;

    auto iter = database.iterator();

    for (uint32_t patient_id = 0; patient_id < database.size(); patient_id++) {
        const Patient& p = iter.get_patient(patient_id);
	int count = 0;
	for (const auto& event : p.events) {
		count += event.value_type != ValueType::NONE;
	}

        length_counts[count] += 1;
    }
}

Python in memory

import collections
import sys
import time

import piton.datasets

source = "/local-scratch/nigam/projects/ethanid/piton/target"

data = piton.datasets.PatientDatabase(source, True)

# Materialize every patient in a dict up front so that the timed section
# below only measures iteration over objects that are already in memory.
patients = {}

try:
    for patient in data:
        patients[patient.patient_id] = patient
except Exception as exc:
    # Loading stays best-effort (whatever was read so far is still
    # benchmarked), but the failure is reported instead of being silently
    # discarded, and KeyboardInterrupt/SystemExit are no longer swallowed.
    print(
        "warning: patient loading stopped early after",
        len(patients),
        "patients:",
        repr(exc),
        file=sys.stderr,
    )

# perf_counter() is the clock intended for measuring short intervals; unlike
# time.time() it is not affected by system clock adjustments.
start = time.perf_counter()

# Histogram: number of events with a non-None value -> number of patients.
lengths = collections.defaultdict(int)

for patient in patients.values():
    num_valued = sum(event.value is not None for event in patient.events)
    lengths[num_valued] += 1

end = time.perf_counter()

# Elapsed seconds for the in-memory pass only (loading is excluded).
print(end - start)

Python in C++ database

import piton.datasets
import collections

# Location of the extract that is opened as a PatientDatabase.
source = "/local-scratch/nigam/projects/ethanid/piton/target"

data = piton.datasets.PatientDatabase(source, True)

# Histogram: number of events with a non-None value -> number of patients.
lengths = collections.defaultdict(int)

# Patients are consumed straight from the database iterator rather than being
# collected into a Python container first.
for patient in data:
    num_valued = sum(1 for event in patient.events if event.value is not None)
    lengths[num_valued] += 1

Python in pickle database

import constdb
import collections
import pickle

# Histogram: number of events with a non-None value -> number of patients.
lengths = collections.defaultdict(int)

with constdb.MmapReader("patient_database") as reader:
    for key in reader.keys():
        # NOTE: pickle.loads can execute arbitrary code while deserializing,
        # so "patient_database" must come from a trusted source.
        patient = pickle.loads(reader.get(key))
        num_valued = sum(
            1 for event in patient.events if event.value is not None
        )
        lengths[num_valued] += 1