Skip to content

Commit

Permalink
Added README.md for the data_quality_tool.
Browse files Browse the repository at this point in the history
  • Loading branch information
KFilippopolitis committed Jun 13, 2024
1 parent f8cbc2f commit a906590
Show file tree
Hide file tree
Showing 37 changed files with 1,955 additions and 182 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: |
poetry config virtualenvs.create false
poetry install
working-directory: converter
working-directory: data_quality_tool

- name: Set PYTHONPATH
run: echo "PYTHONPATH=${{ github.workspace }}" >> $GITHUB_ENV
Expand All @@ -35,7 +35,7 @@ jobs:
run: |
poetry run coverage run -m pytest
poetry run coverage xml
working-directory: converter/tests
working-directory: data_quality_tool/tests

- name: Upload Coverage to Codecov
uses: codecov/[email protected]
Expand Down
28 changes: 28 additions & 0 deletions data_quality_tool/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# The working directory should be set to where the Dockerfile and pyproject.toml are located
WORKDIR /app

# The Dockerfile and pyproject.toml live in the same directory, so copy the whole build context
COPY . .

# Install Poetry
RUN pip install poetry

# Configure Poetry: disable virtual environments so dependencies go into the
# image's system interpreter
RUN poetry config virtualenvs.create false

# Install runtime dependencies only.
# NOTE: `--no-dev` was deprecated in Poetry 1.2 and removed in 2.0; since the
# image installs the latest Poetry via pip, use the supported `--only main`.
RUN poetry install --only main

# Expose the port the app runs on
EXPOSE 8000

# Environment variables for Gunicorn to run the Flask app.
# MODULE_NAME must name the module inside this directory (controller.py) —
# the old "converter.controller" path no longer exists after the rename.
ENV MODULE_NAME=controller
ENV VARIABLE_NAME=app
ENV PORT=8000

# Reference the environment variables in the command so they can be
# overridden at `docker run` time without rebuilding the image
CMD ["sh", "-c", "poetry run gunicorn --bind 0.0.0.0:${PORT} ${MODULE_NAME}:${VARIABLE_NAME}"]
21 changes: 21 additions & 0 deletions data_quality_tool/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## Data quality tool service

### Build docker image

To build a new image you must be in the folder `datacatalogue/data_quality_tool`, then run:

```
docker build -t <USERNAME>/data_quality_tool:<IMAGETAG> .
Example:
docker build -t madgik/data_quality_tool:latest .
```


Then start the container with:

```
docker run -d -p 8000:8000 --name <CONTAINER_NAME> <USERNAME>/data_quality_tool:<IMAGETAG>
Example:
docker run -d -p 8000:8000 --name data_quality_tool madgik/data_quality_tool:latest
```
59 changes: 59 additions & 0 deletions data_quality_tool/common_entities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Mapping of Excel data-dictionary columns to JSON data-model keys.
# NOTE(review): an earlier comment said "values" was excluded because it is
# not a 1-to-1 rename, but it IS present here (mapped to "enumerations");
# the non-1-to-1 handling (enumerations vs. min/max range) is done by the
# converter modules — confirm which behavior is intended.
EXCEL_JSON_FIELDS_MAP = {
    "csvFile": "csvFile",
    "name": "label",
    "code": "code",
    "type": "type",
    "values": "enumerations",
    "unit": "units",
    "description": "description",
    "canBeNull": "canBeNull",
    "comments": "comments",
    "conceptPath": "conceptPath",
    "methodology": "methodology",
}


# Maps an Excel 'type' value to a (sql_type, isCategorical) pair used when
# building the JSON data model.
EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP = {
    "nominal": ("text", True),
    "real": ("real", False),
    "integer": ("int", False),
    "text": ("text", False),
}

# Expected column headers of the Excel data dictionary.
EXCEL_COLUMNS = [
    "csvFile",
    "name",
    "code",
    "type",
    "values",
    "unit",
    "description",
    "canBeNull",
    "comments",
    "conceptPath",
    "methodology",
]
# Columns that every row must provide.
REQUIRED_COLUMNS = ["name", "code", "type", "conceptPath"]


class InvalidDataModelError(Exception):
    """Exception raised for errors in the input data model."""


# Mapping of JSON data-model keys back to Excel columns for the reverse
# (JSON -> Excel) conversion. Note that "enumerations", "minValue" and
# "maxValue" all fold into the single Excel 'values' column.
JSON_EXCEL_FIELDS_MAP = {
    "label": "name",
    "code": "code",
    "type": "type",
    "enumerations": "values",
    "minValue": "values",
    "maxValue": "values",
    "units": "units",
    "description": "description",
}
# Matches a numeric range such as "0-10" or "-1.5-2.5" (min and max may be
# signed integers or decimals).
MIN_MAX_PATTERN = r"^([-+]?\d*\.?\d+)-([-+]?\d*\.?\d+)$"
# Matches an enumeration list such as '{"code1", "label1"}, {"code2", "label2"}'.
ENUMERATION_PATTERN = r'^\{"[^"]+",\s*"[^"]+"\}(,\s*\{"[^"]+",\s*"[^"]+"\})*$'



36 changes: 34 additions & 2 deletions converter/controller.py → data_quality_tool/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from converter.excel_to_json import convert_excel_to_json
from converter.json_to_excel import convert_json_to_excel
from validator import json_validator, excel_validator

app = Flask(__name__)

Expand All @@ -24,18 +25,19 @@ def excel_to_json():
file_stream = BytesIO(file.read())
# Use the file_stream object with pandas
df = pd.read_excel(file_stream, engine="openpyxl")
excel_validator.validate_excel(df)
# Read the Excel file into a Pandas DataFrame
df = df.astype(str).replace('nan', None)
json_data = convert_excel_to_json(df)
return jsonify(json_data)


@app.route("/json-to-excel", methods=["POST"])
def json_to_excel():
if not request.json:
return "", 400
return "Please provide the json", 400
json_data = request.json
df = convert_json_to_excel(json_data)
json_validator.validate_json(json_data)
# Create a BytesIO buffer to save the Excel file
output = BytesIO()
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
Expand All @@ -48,3 +50,33 @@ def json_to_excel():
download_name="output.xlsx", # Use download_name for newer Flask versions if attachment_filename causes issues
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
)


@app.route("/validate-json", methods=["POST"])
def validate_json():
    """Validate a JSON data model posted in the request body.

    Returns 200 with a success message when the model is valid, and 400 with
    either a plain prompt (empty/missing body) or the validation error.
    """
    # get_json(silent=True) returns None instead of raising when the body is
    # missing, malformed, or not sent with a JSON content type — the original
    # read request.json *before* the emptiness check, which could raise a
    # BadRequest outside our error handling.
    json_data = request.get_json(silent=True)
    if not json_data:
        return "Please provide the json", 400
    try:
        json_validator.validate_json(json_data)
        return jsonify({"message": "Data model is valid."})
    except json_validator.InvalidDataModelError as e:
        return jsonify({"error": str(e)}), 400


@app.route("/validate-excel", methods=["POST"])
def validate_excel():
    """Validate an Excel data dictionary uploaded as the multipart field 'file'.

    Returns 200 with a success message when the model is valid, and 400 when
    the file part is missing, no file was selected, or validation fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400
    # The original wrapped the rest in `if file:` with no else branch; a falsy
    # file object would have fallen through returning None (a Flask 500).
    # After the two guards above the upload is present, so proceed directly.
    file_stream = BytesIO(file.read())
    # Parse the in-memory upload with pandas.
    df = pd.read_excel(file_stream, engine="openpyxl")
    try:
        excel_validator.validate_excel(df)
        return jsonify({"message": "Data model is valid."})
    except json_validator.InvalidDataModelError as e:
        # NOTE(review): the error type is referenced via json_validator even
        # for Excel validation — presumably both validators share the same
        # InvalidDataModelError; confirm.
        return jsonify({"error": str(e)}), 400
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pandas as pd
import json

# Mapping of Excel columns to JSON keys, adjust as necessary,
# Did not contain the values because it is not in a 1 to 1 scenario.
EXCEL_JSON_FIELDS_MAP = {
from common_entities import EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP, InvalidDataModelError


EXCEL_JSON_FIELDS_MAP_WITHOUT_VALUES = {
"name": "label",
"code": "code",
"type": "type",
Expand All @@ -15,33 +16,24 @@
"methodology": "methodology",
}

TYPE_2_SQL = {
"nominal": ("text", True),
"real": ("real", False),
"integer": ("int", False),
"text": ("text", False),
}


def process_enumerations(values):
    """
    Parse a custom-formatted enumeration string into a list of
    {'code': ..., 'label': ...} dictionaries.

    Expected input format: '{"code1", "label1"}, {"code2", "label2"}'.

    Raises:
        InvalidDataModelError: when the string does not match the expected
            format and cannot be parsed as JSON after transformation.
    """
    try:
        # Transforming {"key","value"} into [{"key": "value"}]: rewriting the
        # comma between the two quoted items as a colon turns each pair into a
        # one-entry JSON object, and the brackets make the whole thing a list.
        transformed_values = "[" + values.replace('","', '":"').replace('", "', '": "') + "]"
        enumerations = json.loads(transformed_values)
    except json.JSONDecodeError:
        raise InvalidDataModelError(
            'Nominal values format error: \'{"code", "label"}, {"code", "label"}\' expected but got ' + values + "."
        )
    # Each parsed item is a single-entry dict; its key is the code and its
    # value is the label.
    return [
        {"code": list(item.keys())[0], "label": list(item.values())[0]}
        for item in enumerations
    ]


Expand Down Expand Up @@ -78,16 +70,18 @@ def process_values_based_on_type(row, variable):
values = row.get("values")
variable_type = row.get("type")

if variable_type in ["real", "integer"]:
if not values:
return

# Split the range and strip whitespace
if variable_type in ["real", "integer"] and values:
try:
print(values)
min_value, max_value = map(str.strip, values.split("-"))
except ValueError:
raise ValueError(f"Invalid range format for variable {code}: {values}")
# Attempt to split the string into exactly two parts and unpack
min_value, max_value = values.split('-')

# Try to convert both parts into floats
float(min_value)
float(max_value)
except Exception:
raise InvalidDataModelError(
f"Values must match format '<float or integer>-<float or integer>' but got '{values}'."
)

# Convert min and max values to the appropriate type
try:
Expand All @@ -100,23 +94,23 @@ def process_values_based_on_type(row, variable):
max_value
)
except ValueError:
raise ValueError(
raise InvalidDataModelError(
f"Range values for variable {code} must be valid {variable_type} numbers"
)

elif variable_type == "nominal":
if not values:
raise ValueError(
raise InvalidDataModelError(
f"The 'values' should not be empty for variable {code} when type is 'nominal'"
)
variable["enumerations"] = process_enumerations(values)


def validate_variable_type(row):
    """Ensure the row carries a supported 'type' value.

    Raises:
        InvalidDataModelError: if 'type' is missing or is not one of the keys
            of EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP.
    """
    valid_types = set(EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP.keys())
    if "type" not in row or row["type"] not in valid_types:
        valid_types_str = ", ".join(valid_types)
        raise InvalidDataModelError(
            # First fragment has no placeholders, so a plain string literal
            # (implicit concatenation with the f-string below) is clearer.
            "The row must have a 'type' field with a valid value."
            f" Valid values are: {valid_types_str}, got '{row.get('type')}' instead."
        )
Expand All @@ -130,46 +124,60 @@ def process_variable(row):
# Validate variable type first
validate_variable_type(row)

# Initialize the variable dictionary with mappings from EXCEL_JSON_FIELDS_MAP
# Initialize the variable dictionary with mappings from EXCEL_JSON_FIELDS_MAP_WITHOUT_VALUES
variable = {
json_key: row[excel_col]
for excel_col, json_key in EXCEL_JSON_FIELDS_MAP.items()
for excel_col, json_key in EXCEL_JSON_FIELDS_MAP_WITHOUT_VALUES.items()
if excel_col in row and pd.notnull(row[excel_col])
}

# Process 'values' based on variable type, which might modify 'variable' in-place
process_values_based_on_type(row, variable)

variable["sql_type"], variable["isCategorical"] = TYPE_2_SQL[variable["type"]]
variable["sql_type"], variable["isCategorical"] = EXCEL_TYPE_2_SQL_TYPE_ISCATEGORICAL_MAP[variable["type"]]

return variable


def clean_empty_fields(data):
    """Recursively prune empty fields from a parsed data-model structure.

    Inside any dictionary, drops 'variables'/'groups'/'enumerations' keys
    whose value is empty, plus any key whose value is an empty string, then
    descends into the remaining values. Lists are walked element by element.
    Mutates *data* in place and returns None.
    """
    if isinstance(data, dict):
        prunable = {"variables", "groups", "enumerations"}
        doomed = [
            key
            for key, value in data.items()
            if (key in prunable and not value) or value == ""
        ]
        for key in doomed:
            del data[key]
        # Recurse only after deletions so iteration never sees a mutated dict.
        for value in data.values():
            clean_empty_fields(value)
    elif isinstance(data, list):
        for element in data:
            clean_empty_fields(element)


def convert_excel_to_json(df):
    """
    Converts a DataFrame from Excel into a JSON structure, handling enumerations
    specifically, and adds 'isCategorical' and 'sql_type' based on the 'type'.

    Raises:
        InvalidDataModelError: wrapping any per-row validation failure.
    """
    # Normalize cells to strings; pandas renders missing cells as "nan",
    # which is mapped back to None here.
    df = df.astype(str).replace("nan", None)
    root = {"variables": [], "groups": [], "code": "root"}

    for _, row in df.iterrows():
        try:
            variable = process_variable(row.to_dict())
            # "None" (string) appears when the original cell was empty,
            # because of the astype(str) normalization above.
            if "conceptPath" in variable and variable["conceptPath"] and variable["conceptPath"] != "None":
                path = variable["conceptPath"].split("/")
                del variable["conceptPath"]
                insert_variable_into_structure(root, variable, path)
            else:
                raise InvalidDataModelError(
                    f"The variable {variable['code']} is missing the conceptPath"
                )
        except InvalidDataModelError as e:
            # Re-wrap with row context; `from e` preserves the original
            # traceback for debugging.
            raise InvalidDataModelError(f"Error processing variable: {e}") from e

    if root["groups"]:
        # The first top-level group is the data model itself.
        data_model = root["groups"][0]
        data_model["version"] = "to be defined"
        clean_empty_fields(data_model)
        return data_model
    else:
        return {"code": "No groups found", "groups": [], "variables": root["variables"]}
Loading

0 comments on commit a906590

Please sign in to comment.