Skip to content

Commit

Permalink
DataType feature (part 1) (#1026)
Browse files Browse the repository at this point in the history
Partly implements #981:

* add models and views for viewing, creating, and updating DataTypes
* add views for managing a project's registered DataTypes
* handle specified DataTypes in the API for file uploads
* apply some default behavior if DataTypes weren't specified during upload
* record DataType edit history
* add public API endpoint for DataTypes
* add public API endpoint for projects

Aspects related to sharing permissions are not part of this commit
(i.e. requesting and authorizing data sharing according to DataType).
Also, the feature at this stage does not enforce DataType in uploaded
data, but does introduce a default we plan to use for legacy support.
  • Loading branch information
madprime committed Apr 15, 2019
1 parent fce5789 commit eb74ce4
Show file tree
Hide file tree
Showing 30 changed files with 1,283 additions and 43 deletions.
1 change: 1 addition & 0 deletions data_import/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@

# Register the data_import models with the admin site using the default
# ModelAdmin; AdminSite.register accepts an iterable of model classes.
admin.site.register(
    [models.DataFile, models.NewDataFileAccessLog, models.DataType]
)
60 changes: 60 additions & 0 deletions data_import/forms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from django import forms

from .models import DataType


class DataTypeForm(forms.ModelForm):
    """
    A form for creating and editing DataTypes.

    The form must be constructed with an ``editor`` keyword argument. During
    ``clean`` that value is attached to the model instance as ``editor`` so
    that it is available when the instance is saved.
    """

    class Meta:  # noqa: D101
        model = DataType
        fields = ["name", "parent", "description"]

    def __init__(self, *args, **kwargs):
        """
        Store the required ``editor`` kwarg and initialise the ModelForm.

        Raises KeyError if ``editor`` is not supplied.
        """
        # "editor" is not understood by ModelForm, so remove it before
        # delegating to the parent constructor.
        self.editor = kwargs.pop("editor")
        super().__init__(*args, **kwargs)

    def clean_parent(self):
        """
        Verify that the parent is not the object itself nor a descendent.
        """
        parent = self.cleaned_data.get("parent")
        if not parent:
            # No parent selected: nothing to validate.
            return parent
        if self.instance.id == parent.id:
            raise forms.ValidationError(
                "A DataType cannot be assigned to be its own parent."
            )
        # If this instance is among the candidate's ancestors, the candidate is
        # one of our descendents and choosing it would create a cycle.
        if self.instance in parent.all_parents:
            raise forms.ValidationError(
                "{0} is not an allowed parent, as it is a descendent of {1}.".format(
                    parent.name, self.instance.name
                )
            )
        return parent

    def clean_name(self):
        """
        Verify that the name is case insensitive unique.

        Any *other* DataType whose name matches case-insensitively is a
        conflict. A filter/exists query is used rather than ``get`` so that
        several pre-existing case variants cannot raise an uncaught
        MultipleObjectsReturned.
        """
        name = self.cleaned_data.get("name")
        conflicts = DataType.objects.filter(name__iexact=name)
        if self.instance.pk is not None:
            # When editing, the instance's own row is not a conflict.
            conflicts = conflicts.exclude(pk=self.instance.pk)
        if conflicts.exists():
            raise forms.ValidationError(
                "Please provide a unique name for this datatype"
            )
        return name

    def clean(self, *args, **kwargs):
        """
        Reject edits to non-editable DataTypes and attach the editor.

        Raises ValidationError when the instance reports it is not editable.
        """
        if self.instance:
            if not self.instance.editable:
                raise forms.ValidationError(
                    "Not editable: in use by one or more approved projects."
                )
            # Hand the editor to the instance so it is present at save time.
            self.instance.editor = self.editor
        return super().clean(*args, **kwargs)
69 changes: 69 additions & 0 deletions data_import/migrations/0019_datatype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Generated by Django 2.2 on 2019-04-12 19:08

import django.contrib.postgres.fields.jsonb
import django.core.validators
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Create the ``DataType`` model."""

    dependencies = [
        ("open_humans", "0014_member_password_reset_redirect"),
        ("data_import", "0018_auto_20190402_1947"),
    ]

    operations = [
        migrations.CreateModel(
            name="DataType",
            fields=[
                # Automatically created primary key.
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Unique name, restricted by a regex validator.
                (
                    "name",
                    models.CharField(
                        max_length=128,
                        unique=True,
                        validators=[
                            django.core.validators.RegexValidator(
                                "^[\\w\\-\\s]+$",
                                "Only alphanumeric characters, space, dash, and underscore are allowed.",
                            )
                        ],
                    ),
                ),
                ("description", models.CharField(max_length=512)),
                # Timestamps maintained automatically on create / on save.
                ("created", models.DateTimeField(auto_now_add=True)),
                ("modified", models.DateTimeField(auto_now=True)),
                # JSON blob that starts out as an empty dict.
                (
                    "history",
                    django.contrib.postgres.fields.jsonb.JSONField(default=dict),
                ),
                # Nulled out if the referenced Member is deleted.
                (
                    "last_editor",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to="open_humans.Member",
                    ),
                ),
                # Optional self-reference; PROTECT blocks deleting a DataType
                # that is still referenced as a parent.
                (
                    "parent",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.PROTECT,
                        related_name="children",
                        to="data_import.DataType",
                    ),
                ),
            ],
        )
    ]
147 changes: 147 additions & 0 deletions data_import/models.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,35 @@
from collections import OrderedDict
import datetime
import logging
import os
import uuid

import arrow
from botocore.exceptions import ClientError

from django.conf import settings
from django.contrib.postgres.fields import JSONField
from django.core.validators import RegexValidator
from django.urls import reverse
from django.db import models
from django.db.models import F
from django.utils import timezone

from ipware.ip import get_ip

from common import fields
from common.utils import full_url
from open_humans.models import Member

from .utils import get_upload_path

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Validator applied to DataType.name. The pattern accepts one or more word
# characters (\w, which includes underscore), dashes, and whitespace (\s).
# NOTE(review): \s admits any whitespace character, not only the plain space
# mentioned in the message -- confirm that is intended.
# The arguments are deliberately left positional: the validator's constructor
# arguments are recorded in migrations, so changing their form is not cosmetic.
charvalidator = RegexValidator(
r"^[\w\-\s]+$",
"Only alphanumeric characters, space, dash, and underscore are allowed.",
)


def is_public(member, source):
"""
Expand Down Expand Up @@ -267,3 +277,140 @@ class TestUserData(models.Model):
related_name="test_user_data",
on_delete=models.CASCADE,
)


class DataType(models.Model):
    """
    Describes the types of data a DataFile can contain.

    DataTypes form a tree through the optional ``parent`` foreign key. Each
    save must be attributed to an editor (set ``instance.editor`` before
    calling ``save``) and appends a snapshot of the instance to ``history``.
    """

    name = models.CharField(
        max_length=128, blank=False, unique=True, validators=[charvalidator]
    )
    parent = models.ForeignKey(
        "self", blank=True, null=True, related_name="children", on_delete=models.PROTECT
    )
    last_editor = models.ForeignKey(Member, on_delete=models.SET_NULL, null=True)
    description = models.CharField(max_length=512, blank=False)
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)
    # Maps ISO-8601 timestamp strings to snapshots written by save().
    history = JSONField(default=dict)

    def __str__(self):
        """
        Return the colon-separated ancestry path, most ancestral name first.
        """
        names = [ancestor.name for ancestor in reversed(self.all_parents)]
        names.append(self.name)
        return ":".join(names)

    def save(self, *args, **kwargs):
        """
        Override save to record edit history and require an associated "editor".

        "editor" is an instance-specific parameter; this avoids accepting an update
        that is merely retaining the existing value for the "last_editor" field.

        Raises ValueError if ``self.editor`` is unset or falsy.
        """
        # "editor" is a plain attribute, not a model field, so it may never
        # have been assigned; treat that the same as an empty value instead of
        # leaking an AttributeError.
        editor = getattr(self, "editor", None)
        if not editor:
            raise ValueError("'self.editor' must be set when saving DataType.")
        self.last_editor = editor
        self.history[arrow.get(timezone.now()).isoformat()] = {
            "name": self.name,
            "parent": self.parent.id if self.parent else None,
            "description": self.description,
            "editor": editor.id,
        }
        return super().save(*args, **kwargs)

    @property
    def history_sorted(self):
        """
        Return the edit history as an OrderedDict, newest entry first.

        Keys are datetimes parsed from the stored timestamps. Values are dicts
        with "name", "parent", "description" and "editor", where the stored
        parent and editor ids are resolved to model instances. An id that no
        longer resolves yields None.
        """
        history_sorted = OrderedDict()
        entries = sorted(
            self.history.items(), key=lambda item: arrow.get(item[0]), reverse=True
        )
        for timestamp, record in entries:
            parent = None
            if record["parent"]:
                # A former parent may have been deleted since this entry was
                # recorded; handle that the same way as a missing editor.
                try:
                    parent = DataType.objects.get(id=record["parent"])
                except DataType.DoesNotExist:
                    parent = None
            try:
                editor = Member.objects.get(id=record["editor"])
            except Member.DoesNotExist:
                editor = None
            history_sorted[arrow.get(timestamp).datetime] = {
                "name": record["name"],
                "parent": parent,
                "description": record["description"],
                "editor": editor,
            }
        return history_sorted

    @property
    def editable(self):
        """
        Return True if no approved projects are registered as using this.
        """
        # Always true for a new instance that hasn't yet been saved:
        if not self.id:
            return True

        # exists() asks the database for a yes/no answer instead of loading
        # every matching project just to test truthiness.
        return not self.datarequestproject_set.filter(approved=True).exists()

    @property
    def all_parents(self):
        """
        Return list of parents, from immediate to most ancestral.
        """
        parents = []
        seen = set()
        parent = self.parent
        # Stop at the root, or if a primary key repeats: a cyclic chain would
        # otherwise loop forever.
        while parent and parent.pk not in seen:
            seen.add(parent.pk)
            parents.append(parent)
            parent = parent.parent
        return parents

    @classmethod
    def all_as_tree(cls):
        """
        Dict tree of all datatypes. Key = parent & value = dict of child subtrees.

        This method is intended to make all ancestry relationships available without
        having to hit the database more than necessary: all rows are loaded with
        one query and grouped by ``parent_id``, so building the tree does not
        dereference the ``parent`` relation of each row.
        """
        children_by_parent = {}
        for datatype in DataType.objects.all():
            children_by_parent.setdefault(datatype.parent_id, []).append(datatype)

        def _subtree(parent_id):
            return {
                child: _subtree(child.id)
                for child in children_by_parent.get(parent_id, [])
            }

        # Root datatypes are those with no parent (parent_id is None).
        return _subtree(None)

    @classmethod
    def sorted_by_ancestors(cls, queryset=None):
        """
        Sort DataTypes by ancestors: list of dicts containing 'datatype' and 'depth'.

        Each parent is immediately followed by its descendants, and siblings
        are ordered by name. Depth is 0 for root datatypes.

        NOTE(review): ``queryset`` is accepted but currently unused; the result
        always covers every DataType.
        """

        def _flatten(node, depth=0):
            flattened = []
            for child in sorted(node, key=lambda obj: obj.name):
                flattened.append({"datatype": child, "depth": depth})
                flattened.extend(_flatten(node[child], depth=depth + 1))
            return flattened

        return _flatten(cls.all_as_tree())
28 changes: 27 additions & 1 deletion data_import/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

from rest_framework import serializers

from .models import AWSDataFileAccessLog, DataFile, NewDataFileAccessLog
from private_sharing.models import DataRequestProject

from .models import AWSDataFileAccessLog, DataFile, DataType, NewDataFileAccessLog


def serialize_datafile_to_dict(datafile):
Expand Down Expand Up @@ -87,3 +89,27 @@ class Meta: # noqa: D101
"host_header",
"datafile",
]


class DataTypeSerializer(serializers.ModelSerializer):
    """
    Serializer for DataType objects.

    Adds a computed ``source_projects`` field to the model's own fields.
    """

    # Populated by get_source_projects() below.
    source_projects = serializers.SerializerMethodField()

    class Meta:  # noqa: D101
        model = DataType
        fields = ["id", "name", "parent", "description", "source_projects"]

    def get_source_projects(self, obj):
        """
        Return the id_label of each approved project that has registered
        this datatype.
        """
        approved = DataRequestProject.objects.filter(approved=True)
        registered = approved.filter(registered_datatypes=obj).distinct()
        labels = []
        for project in registered:
            labels.append(project.id_label)
        return labels
Loading

0 comments on commit eb74ce4

Please sign in to comment.