Improved CSV file handling for volumes, ensuring accurate information is added to the system table for datasets.
KFilippopolitis committed Oct 11, 2023
1 parent 5010847 commit 9113b54
Showing 8 changed files with 37 additions and 23 deletions.
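For context, the core of the change: the old Table.get_unique_datasets() ran SELECT DISTINCT(dataset) but returned only fetchone(), so a CSV containing several dataset codes was recorded as if it held a single dataset. The new DataBase.get_column_distinct(column, table) returns every distinct value. Below is a minimal, self-contained sketch of that pattern; it uses an in-memory SQLite table purely as an illustrative stand-in for the project's data tables, with made-up rows.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE primary_data (subjectcode TEXT, dataset TEXT)")
conn.executemany(
    "INSERT INTO primary_data VALUES (?, ?)",
    [("1", "dataset1"), ("2", "dataset1"), ("3", "dataset2"), ("4", "dataset")],
)

def get_column_distinct(column, table):
    # Mirrors the new helper in mipdb/database.py: fetch *all* distinct values,
    # not just the first row.
    rows = conn.execute(f"SELECT DISTINCT({column}) FROM {table};").fetchall()
    return [row[0] for row in rows]

print(get_column_distinct("dataset", "primary_data"))
# Prints all three dataset codes (order may vary); fetchone() alone would have
# surfaced only one of them, which is the bug this commit fixes.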
7 changes: 7 additions & 0 deletions mipdb/database.py
@@ -433,6 +433,13 @@ def get_row_count(self, table):
res = self.execute(f"select COUNT(*) from {table}").fetchone()
return res[0]

def get_column_distinct(self, column, table):
datasets = list(self.execute(
f"SELECT DISTINCT({column}) FROM {table};"
))
datasets = [dataset[0] for dataset in datasets]
return datasets

def get_dataset(self, dataset_id, columns):
columns_query = ", ".join(columns) if columns else "*"

15 changes: 7 additions & 8 deletions mipdb/tables.py
@@ -1,14 +1,15 @@
import os
from abc import ABC, abstractmethod
import json
from enum import Enum
from typing import Union, List

import sqlalchemy as sql
from sqlalchemy import ForeignKey, MetaData
from sqlalchemy import ForeignKey, Integer, MetaData
from sqlalchemy.ext.compiler import compiles

from mipdb.database import DataBase, Connection, credentials_from_config
from mipdb.data_frame import DATASET_COLUMN_NAME
from mipdb.database import DataBase, Connection
from mipdb.database import METADATA_SCHEMA
from mipdb.database import METADATA_TABLE
from mipdb.dataelements import CommonDataElement
@@ -70,6 +71,9 @@ def delete(self, db: Union[DataBase, Connection]):
def get_row_count(self, db):
return db.get_row_count(self.table.fullname)

def get_column_distinct(self, column, db):
return db.get_column_distinct(column, self.table.fullname)

def drop(self, db: Union[DataBase, Connection]):
db.drop_table(self._table)

@@ -375,7 +379,7 @@ def validate_csv(self, csv_path, cdes_with_min_max, cdes_with_enumerations, db):
break

validated_datasets = set(validated_datasets) | set(
self.get_unique_datasets(db)
self.get_column_distinct(DATASET_COLUMN_NAME, db)
)
self._validate_enumerations_restriction(cdes_with_enumerations, db)
self._validate_min_max_restriction(cdes_with_min_max, db)
@@ -432,10 +436,5 @@ def _validate_enumerations_restriction(self, cdes_with_enumerations, db):
f"In the column: '{cde}' the following values are invalid: '{cde_invalid_values}'"
)

def get_unique_datasets(self, db):
return db.execute(
f"SELECT DISTINCT(dataset) FROM {self.table.fullname};"
).fetchone()

def set_table(self, table):
self._table = table
5 changes: 2 additions & 3 deletions mipdb/usecases.py
@@ -31,9 +31,8 @@
TemporaryTable,
RECORDS_PER_COPY,
)
from mipdb.data_frame import DataFrame
from mipdb.data_frame import DataFrame, DATASET_COLUMN_NAME

DATASET_COLUMN_NAME = "dataset"
LONGITUDINAL = "longitudinal"


@@ -350,7 +349,7 @@ def insert_csv_to_db(self, csv_path, temporary_table, data_model, db):
break

imported_datasets = set(imported_datasets) | set(
temporary_table.get_unique_datasets(db)
temporary_table.get_column_distinct(DATASET_COLUMN_NAME, db)
)
db.copy_data_table_to_another_table(primary_data_table, temporary_table)
temporary_table.delete(db)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "mipdb"
version = "2.4.4"
version = "2.4.5"
description = ""
authors = ["Your Name <[email protected]>"]

1 change: 1 addition & 0 deletions tests/conftest.py
@@ -16,6 +16,7 @@
FAIL_DATA_FOLDER = DATA_FOLDER + "fail"
ABSOLUTE_PATH_DATA_FOLDER = f"{os.path.dirname(os.path.realpath(__file__))}/data/"
ABSOLUTE_PATH_DATASET_FILE = f"{os.path.dirname(os.path.realpath(__file__))}/data/success/data_model_v_1_0/dataset.csv"
ABSOLUTE_PATH_DATASET_FILE_MULTIPLE_DATASET = f"{os.path.dirname(os.path.realpath(__file__))}/data/success/data_model_v_1_0/dataset123.csv"
ABSOLUTE_PATH_SUCCESS_DATA_FOLDER = ABSOLUTE_PATH_DATA_FOLDER + "success"
ABSOLUTE_PATH_FAIL_DATA_FOLDER = ABSOLUTE_PATH_DATA_FOLDER + "fail"
IP = "127.0.0.1"
2 changes: 1 addition & 1 deletion tests/data/success/data_model_v_1_0/dataset10.csv
@@ -3,4 +3,4 @@ subjectcode,var1,var3,dataset
2,2,22,dataset10
2,1,23,dataset10
5,1,24,dataset10
5,2,25,dataset2
5,2,25,dataset10
22 changes: 15 additions & 7 deletions tests/test_commands.py
@@ -25,7 +25,7 @@
ABSOLUTE_PATH_SUCCESS_DATA_FOLDER,
SUCCESS_DATA_FOLDER,
ABSOLUTE_PATH_FAIL_DATA_FOLDER,
DEFAULT_OPTIONS,
DEFAULT_OPTIONS, ABSOLUTE_PATH_DATASET_FILE_MULTIPLE_DATASET,
)
from tests.conftest import DATA_MODEL_FILE

@@ -1072,13 +1072,13 @@ def test_list_datasets(db):
runner.invoke(
add_dataset,
[
DATASET_FILE,
ABSOLUTE_PATH_DATASET_FILE_MULTIPLE_DATASET,
"--data-model",
"data_model",
"-v",
"1.0",
"--copy_from_file",
False,
True,
]
+ DEFAULT_OPTIONS,
)
@@ -1089,12 +1089,20 @@ def test_list_datasets(db):
assert result.stdout == "There are no datasets.\n"
assert result_with_dataset.exit_code == ExitCode.OK
assert (
"dataset_id data_model_id code label status count"
in result_with_dataset.stdout
"dataset_id data_model_id code label status count".strip(" ")
in result_with_dataset.stdout.strip(" ")
)
assert (
"dataset2 Dataset 2 ENABLED 2".strip(" ")
in result_with_dataset.stdout.strip(" ")
)
assert (
"dataset1 Dataset 1 ENABLED 2".strip(" ")
in result_with_dataset.stdout.strip(" ")
)
assert (
"0 1 1 dataset Dataset ENABLED 5"
in result_with_dataset.stdout
"dataset Dataset ENABLED 1".strip(" ")
in result_with_dataset.stdout.strip(" ")
)


6 changes: 3 additions & 3 deletions tests/test_usecases.py
@@ -360,14 +360,14 @@ def test_add_dataset_with_db_with_multiple_datasets(db, data_model_metadata):

# Test
ImportCSV(db).execute(
csv_path="tests/data/success/data_model_v_1_0/dataset10.csv",
csv_path="tests/data/success/data_model_v_1_0/dataset123.csv",
copy_from_file=False,
data_model_code="data_model",
data_model_version="1.0",
)
datasets = db.get_values(columns=["data_model_id", "code"])
assert len(datasets) == 2
assert all(code in ["dataset2", "dataset10"] for dmi, code in datasets)
assert len(datasets) == 3
assert all(code in ["dataset", "dataset1", "dataset2"] for dmi, code in datasets)


@pytest.mark.database
