Skip to content

Commit

Permalink
Added README.md for the data_quality_tool.
Browse files Browse the repository at this point in the history
  • Loading branch information
KFilippopolitis committed Jun 13, 2024
1 parent f8cbc2f commit a906590
Show file tree
Hide file tree
Showing 37 changed files with 1,955 additions and 182 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: |
poetry config virtualenvs.create false
poetry install
working-directory: converter
working-directory: data_quality_tool

- name: Set PYTHONPATH
run: echo "PYTHONPATH=${{ github.workspace }}" >> $GITHUB_ENV
Expand All @@ -35,7 +35,7 @@ jobs:
run: |
poetry run coverage run -m pytest
poetry run coverage xml
working-directory: converter/tests
working-directory: data_quality_tool/tests

- name: Upload Coverage to Codecov
uses: codecov/[email protected]
Expand Down
28 changes: 28 additions & 0 deletions data_quality_tool/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# The working directory should be set to where the Dockerfile and pyproject.toml are located
WORKDIR /app

# The Dockerfile and pyproject.toml live in the same directory, so copy the whole build context
COPY . .

# Install Poetry
RUN pip install poetry

# Configure Poetry: disable virtual environments so dependencies go into the
# image's system interpreter
RUN poetry config virtualenvs.create false

# Install runtime dependencies only.
# NOTE: `--no-dev` was deprecated in Poetry 1.2 and removed in 2.0; since the
# image installs the latest Poetry via pip, use the supported `--only main`.
RUN poetry install --only main

# Expose the port the app runs on
EXPOSE 8000

# Environment variables for Gunicorn to run the Flask app.
# MODULE_NAME must name the module inside this directory (controller.py) —
# the old "converter.controller" path no longer exists after the rename.
ENV MODULE_NAME=controller
ENV VARIABLE_NAME=app
ENV PORT=8000

# Reference the environment variables in the command so they can be
# overridden at `docker run` time without rebuilding the image
CMD ["sh", "-c", "poetry run gunicorn --bind 0.0.0.0:${PORT} ${MODULE_NAME}:${VARIABLE_NAME}"]
21 changes: 21 additions & 0 deletions data_quality_tool/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## Data quality tool service

### Build docker image

To build a new image you must be in the folder `datacatalogue/data_quality_tool`, then run:

```
docker build -t <USERNAME>/data_quality_tool:<IMAGETAG> .
Example:
docker build -t madgik/data_quality_tool:latest .
```


Then start the container with:

```
docker run -d -p 8000:8000 --name <CONTAINER_NAME> <USERNAME>/data_quality_tool:<IMAGETAG>
Example:
docker run -d -p 8000:8000 --name data_quality_tool madgik/data_quality_tool:latest
```
59 changes: 59 additions & 0 deletions data_quality_tool/common_entities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Mapping of Excel data-dictionary columns to JSON data-model keys.
# NOTE(review): an earlier comment said "values" was excluded because it is
# not a 1-to-1 rename, but it IS present here (mapped to "enumerations");
# the non-1-to-1 handling (enumerations vs. min/max range) is done by the
# converter modules — confirm which behavior is intended.
EXCEL_JSON_FIELDS_MAP = {
    "csvFile": "csvFile",
    "name": "label",
    "code": "code",
    "type": "type",
    "values": "enumerations",
    "unit": "units",
    "description": "description",
    "canBeNull": "canBeNull",
    "comments": "comments",
    "conceptPath": "conceptPath",
    "methodology": "methodology",
}


# Maps an Excel 'type' value to a (sql_type, isCategorical) pair used when
# building the JSON data model.
EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP = {
    "nominal": ("text", True),
    "real": ("real", False),
    "integer": ("int", False),
    "text": ("text", False),
}

# Expected column headers of the Excel data dictionary.
EXCEL_COLUMNS = [
    "csvFile",
    "name",
    "code",
    "type",
    "values",
    "unit",
    "description",
    "canBeNull",
    "comments",
    "conceptPath",
    "methodology",
]
# Columns that every row must provide.
REQUIRED_COLUMNS = ["name", "code", "type", "conceptPath"]


class InvalidDataModelError(Exception):
    """Exception raised for errors in the input data model."""


# Mapping of JSON data-model keys back to Excel columns for the reverse
# (JSON -> Excel) conversion. Note that "enumerations", "minValue" and
# "maxValue" all fold into the single Excel 'values' column.
JSON_EXCEL_FIELDS_MAP = {
    "label": "name",
    "code": "code",
    "type": "type",
    "enumerations": "values",
    "minValue": "values",
    "maxValue": "values",
    "units": "units",
    "description": "description",
}
# Matches a numeric range such as "0-10" or "-1.5-2.5" (min and max may be
# signed integers or decimals).
MIN_MAX_PATTERN = r"^([-+]?\d*\.?\d+)-([-+]?\d*\.?\d+)$"
# Matches an enumeration list such as '{"code1", "label1"}, {"code2", "label2"}'.
ENUMERATION_PATTERN = r'^\{"[^"]+",\s*"[^"]+"\}(,\s*\{"[^"]+",\s*"[^"]+"\})*$'



36 changes: 34 additions & 2 deletions converter/controller.py → data_quality_tool/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from converter.excel_to_json import convert_excel_to_json
from converter.json_to_excel import convert_json_to_excel
from validator import json_validator, excel_validator

app = Flask(__name__)

Expand All @@ -24,18 +25,19 @@ def excel_to_json():
file_stream = BytesIO(file.read())
# Use the file_stream object with pandas
df = pd.read_excel(file_stream, engine="openpyxl")
excel_validator.validate_excel(df)
# Read the Excel file into a Pandas DataFrame
df = df.astype(str).replace('nan', None)
json_data = convert_excel_to_json(df)
return jsonify(json_data)


@app.route("/json-to-excel", methods=["POST"])
def json_to_excel():
if not request.json:
return "", 400
return "Please provide the json", 400
json_data = request.json
df = convert_json_to_excel(json_data)
json_validator.validate_json(json_data)
# Create a BytesIO buffer to save the Excel file
output = BytesIO()
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
Expand All @@ -48,3 +50,33 @@ def json_to_excel():
download_name="output.xlsx", # Use download_name for newer Flask versions if attachment_filename causes issues
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
)


@app.route("/validate-json", methods=["POST"])
def validate_json():
    """Validate a JSON data model posted in the request body.

    Returns 200 with a success message when the model is valid, and 400 with
    either a plain prompt (empty/missing body) or the validation error.
    """
    # get_json(silent=True) returns None instead of raising when the body is
    # missing, malformed, or not sent with a JSON content type — the original
    # read request.json *before* the emptiness check, which could raise a
    # BadRequest outside our error handling.
    json_data = request.get_json(silent=True)
    if not json_data:
        return "Please provide the json", 400
    try:
        json_validator.validate_json(json_data)
        return jsonify({"message": "Data model is valid."})
    except json_validator.InvalidDataModelError as e:
        return jsonify({"error": str(e)}), 400


@app.route("/validate-excel", methods=["POST"])
def validate_excel():
    """Validate an Excel data dictionary uploaded as the multipart field 'file'.

    Returns 200 with a success message when the model is valid, and 400 when
    the file part is missing, no file was selected, or validation fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400
    # The original wrapped the rest in `if file:` with no else branch; a falsy
    # file object would have fallen through returning None (a Flask 500).
    # After the two guards above the upload is present, so proceed directly.
    file_stream = BytesIO(file.read())
    # Parse the in-memory upload with pandas.
    df = pd.read_excel(file_stream, engine="openpyxl")
    try:
        excel_validator.validate_excel(df)
        return jsonify({"message": "Data model is valid."})
    except json_validator.InvalidDataModelError as e:
        # NOTE(review): the error type is referenced via json_validator even
        # for Excel validation — presumably both validators share the same
        # InvalidDataModelError; confirm.
        return jsonify({"error": str(e)}), 400
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pandas as pd
import json

# Mapping of Excel columns to JSON keys, adjust as necessary,
# Did not contain the values because it is not in a 1 to 1 scenario.
EXCEL_JSON_FIELDS_MAP = {
from common_entities import EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP, InvalidDataModelError


EXCEL_JSON_FIELDS_MAP_WITHOUT_VALUES = {
"name": "label",
"code": "code",
"type": "type",
Expand All @@ -15,33 +16,24 @@
"methodology": "methodology",
}

TYPE_2_SQL = {
"nominal": ("text", True),
"real": ("real", False),
"integer": ("int", False),
"text": ("text", False),
}


def process_enumerations(values):
    """
    Parse a custom-formatted enumeration string into a list of
    {'code': ..., 'label': ...} dictionaries.

    Expected input format: '{"code1", "label1"}, {"code2", "label2"}'.

    Raises:
        InvalidDataModelError: when the string does not match the expected
            format and cannot be parsed as JSON after transformation.
    """
    try:
        # Transforming {"key","value"} into [{"key": "value"}]: rewriting the
        # comma between the two quoted items as a colon turns each pair into a
        # one-entry JSON object, and the brackets make the whole thing a list.
        transformed_values = "[" + values.replace('","', '":"').replace('", "', '": "') + "]"
        enumerations = json.loads(transformed_values)
    except json.JSONDecodeError:
        raise InvalidDataModelError(
            'Nominal values format error: \'{"code", "label"}, {"code", "label"}\' expected but got ' + values + "."
        )
    # Each parsed item is a single-entry dict; its key is the code and its
    # value is the label.
    return [
        {"code": list(item.keys())[0], "label": list(item.values())[0]}
        for item in enumerations
    ]


Expand Down Expand Up @@ -78,16 +70,18 @@ def process_values_based_on_type(row, variable):
values = row.get("values")
variable_type = row.get("type")

if variable_type in ["real", "integer"]:
if not values:
return

# Split the range and strip whitespace
if variable_type in ["real", "integer"] and values:
try:
print(values)
min_value, max_value = map(str.strip, values.split("-"))
except ValueError:
raise ValueError(f"Invalid range format for variable {code}: {values}")
# Attempt to split the string into exactly two parts and unpack
min_value, max_value = values.split('-')

# Try to convert both parts into floats
float(min_value)
float(max_value)
except Exception:
raise InvalidDataModelError(
f"Values must match format '<float or integer>-<float or integer>' but got '{values}'."
)

# Convert min and max values to the appropriate type
try:
Expand All @@ -100,23 +94,23 @@ def process_values_based_on_type(row, variable):
max_value
)
except ValueError:
raise ValueError(
raise InvalidDataModelError(
f"Range values for variable {code} must be valid {variable_type} numbers"
)

elif variable_type == "nominal":
if not values:
raise ValueError(
raise InvalidDataModelError(
f"The 'values' should not be empty for variable {code} when type is 'nominal'"
)
variable["enumerations"] = process_enumerations(values)


def validate_variable_type(row):
    """Ensure the row carries a supported 'type' value.

    Raises:
        InvalidDataModelError: if 'type' is missing or is not one of the keys
            of EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP.
    """
    valid_types = set(EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP.keys())
    if "type" not in row or row["type"] not in valid_types:
        valid_types_str = ", ".join(valid_types)
        raise InvalidDataModelError(
            # First fragment has no placeholders, so a plain string literal
            # (implicit concatenation with the f-string below) is clearer.
            "The row must have a 'type' field with a valid value."
            f" Valid values are: {valid_types_str}, got '{row.get('type')}' instead."
        )
Expand All @@ -130,46 +124,60 @@ def process_variable(row):
# Validate variable type first
validate_variable_type(row)

# Initialize the variable dictionary with mappings from EXCEL_JSON_FIELDS_MAP
# Initialize the variable dictionary with mappings from EXCEL_JSON_FIELDS_MAP_WITHOUT_VALUES
variable = {
json_key: row[excel_col]
for excel_col, json_key in EXCEL_JSON_FIELDS_MAP.items()
for excel_col, json_key in EXCEL_JSON_FIELDS_MAP_WITHOUT_VALUES.items()
if excel_col in row and pd.notnull(row[excel_col])
}

# Process 'values' based on variable type, which might modify 'variable' in-place
process_values_based_on_type(row, variable)

variable["sql_type"], variable["isCategorical"] = TYPE_2_SQL[variable["type"]]
variable["sql_type"], variable["isCategorical"] = EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP[variable["type"]]

return variable


def clean_empty_fields(data):
    """Recursively prune empty fields from a parsed data-model structure.

    Inside any dictionary, drops 'variables'/'groups'/'enumerations' keys
    whose value is empty, plus any key whose value is an empty string, then
    descends into the remaining values. Lists are walked element by element.
    Mutates *data* in place and returns None.
    """
    if isinstance(data, dict):
        prunable = {"variables", "groups", "enumerations"}
        doomed = [
            key
            for key, value in data.items()
            if (key in prunable and not value) or value == ""
        ]
        for key in doomed:
            del data[key]
        # Recurse only after deletions so iteration never sees a mutated dict.
        for value in data.values():
            clean_empty_fields(value)
    elif isinstance(data, list):
        for element in data:
            clean_empty_fields(element)


def convert_excel_to_json(df):
    """
    Converts a DataFrame from Excel into a JSON structure, handling enumerations
    specifically, and adds 'isCategorical' and 'sql_type' based on the 'type'.

    Raises:
        InvalidDataModelError: wrapping any per-row validation failure.
    """
    # Normalize cells to strings; pandas renders missing cells as "nan",
    # which is mapped back to None here.
    df = df.astype(str).replace("nan", None)
    root = {"variables": [], "groups": [], "code": "root"}

    for _, row in df.iterrows():
        try:
            variable = process_variable(row.to_dict())
            # "None" (string) appears when the original cell was empty,
            # because of the astype(str) normalization above.
            if "conceptPath" in variable and variable["conceptPath"] and variable["conceptPath"] != "None":
                path = variable["conceptPath"].split("/")
                del variable["conceptPath"]
                insert_variable_into_structure(root, variable, path)
            else:
                raise InvalidDataModelError(
                    f"The variable {variable['code']} is missing the conceptPath"
                )
        except InvalidDataModelError as e:
            # Re-wrap with row context; `from e` preserves the original
            # traceback for debugging.
            raise InvalidDataModelError(f"Error processing variable: {e}") from e

    if root["groups"]:
        # The first top-level group is the data model itself.
        data_model = root["groups"][0]
        data_model["version"] = "to be defined"
        clean_empty_fields(data_model)
        return data_model
    else:
        return {"code": "No groups found", "groups": [], "variables": root["variables"]}
Loading

0 comments on commit a906590

Please sign in to comment.