Skip to content

Commit fab037d

Browse files
committed
Fix chrisdev#63 and add compress argument to read_frame
1 parent 498e355 commit fab037d

File tree

3 files changed

+120
-9
lines changed

3 files changed

+120
-9
lines changed

django_pandas/io.py

+109-8
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1+
from collections.abc import Mapping
2+
13
import pandas as pd
4+
from pandas.core.index import _ensure_index
5+
from pandas.core.frame import _to_arrays, _arrays_to_mgr
26
from .utils import update_with_verbose, get_related_model
37
import django
8+
import numpy as np
49

510

611
def to_fields(qs, fieldnames):
@@ -32,14 +37,82 @@ def is_values_queryset(qs):
3237
return qs._iterable_class == django.db.models.query.ValuesIterable
3338

3439

40+
# Default NumPy dtype for each built-in Django field class. ``_get_dtypes``
# consults this table when ``read_frame`` is called with a true ``compress``.
# Field classes without a fixed-width NumPy representation map to ``object``.
#
# NOTE(review): ``np.str_`` and ``np.bytes_`` carry no length; confirm that
# text columns are not truncated when these are used inside a structured dtype.
_FIELDS_TO_DTYPES = {
    django.db.models.fields.AutoField: np.int32,
    django.db.models.fields.BigAutoField: np.int64,
    django.db.models.fields.BigIntegerField: np.int64,
    django.db.models.fields.BinaryField: np.bytes_,
    django.db.models.fields.BooleanField: np.bool_,
    # ``np.str_`` is the same type the removed ``np.unicode_`` alias named.
    django.db.models.fields.CharField: np.str_,
    django.db.models.fields.DateField: np.datetime64,
    django.db.models.fields.DateTimeField: np.datetime64,
    django.db.models.fields.DecimalField: object,
    django.db.models.fields.DurationField: np.timedelta64,
    django.db.models.fields.EmailField: np.str_,
    django.db.models.fields.FilePathField: np.str_,
    django.db.models.fields.FloatField: np.float64,
    django.db.models.fields.GenericIPAddressField: np.str_,
    django.db.models.fields.IntegerField: np.int32,
    django.db.models.fields.PositiveIntegerField: np.uint32,
    django.db.models.fields.PositiveSmallIntegerField: np.uint16,
    django.db.models.fields.SlugField: np.str_,
    django.db.models.fields.TextField: np.str_,
    # ``datetime.time`` values have no date part and cannot be converted to
    # ``datetime64``, so time-of-day columns stay as Python objects.
    django.db.models.fields.TimeField: object,
    django.db.models.fields.URLField: np.str_,
    django.db.models.fields.UUIDField: object,
    django.db.models.fields.SmallIntegerField: np.int16,
}

# ``NullBooleanField`` no longer exists in recent Django releases; register it
# only when the installed Django still provides it so importing this module
# does not fail.
if hasattr(django.db.models.fields, 'NullBooleanField'):
    # bool(None) is False, so a boolean dtype would hide NULLs.
    _FIELDS_TO_DTYPES[django.db.models.fields.NullBooleanField] = object
66+
67+
68+
def _get_dtypes(fields_to_dtypes, fields):
    """Infer a NumPy dtype for each Django field in ``fields``.

    Returns a list of (field name, NumPy dtype) pairs. Read about NumPy dtypes
    here [#]_ and here [#]_. The returned list can be passed as the ``dtype``
    argument of ``numpy.array`` in ``read_frame``.

    Parameters
    ----------

    fields_to_dtypes : mapping
        A (potentially empty) mapping of Django field classes to NumPy dtypes.
        This mapping overrides the defaults from ``_FIELDS_TO_DTYPES``. The
        back-up default dtype is ``object`` for unfamiliar field classes.

    fields : list of Django field class instances
        They must correspond in order to the columns of the dataframe that
        ``read_frame`` is building.

    Returns
    -------

    list of (str, dtype) tuples
        One pair per entry of ``fields``, in the same order.

    Notes
    -----

    A field whose ``null`` attribute is true and whose inferred dtype is a
    boolean or integer type is given ``object`` instead: ``None`` cannot be
    stored in an integer column and would silently become ``False`` in a
    boolean one.

    .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html
    .. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
    """
    # Caller overrides take precedence over the module defaults; work on a
    # copy so neither mapping is mutated.
    f2d = dict(_FIELDS_TO_DTYPES)
    f2d.update(fields_to_dtypes or {})

    dtypes = []
    for field in fields:
        # Pick the most-derived class among the keys of f2d that the field is
        # an instance of, so a subclass entry beats its parent's entry.
        best_cls, dtype = object, object
        for cls, candidate in f2d.items():
            if isinstance(field, cls) and issubclass(cls, best_cls):
                best_cls, dtype = cls, candidate

        # NULLs have no representation in bool/int/uint arrays.
        if getattr(field, 'null', False) and np.dtype(dtype).kind in 'biu':
            dtype = object

        dtypes.append((field.name, dtype))
    return dtypes
102+
103+
35104
def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
36-
verbose=True):
105+
verbose=True, compress=False):
37106
"""
38107
Returns a dataframe from a QuerySet
39108
40109
Optionally specify the field names/columns to utilize and
41110
a field as the index
42111
112+
This function uses the QuerySet's ``iterator`` method, so it does not
113+
populate the QuerySet's cache. This is more memory efficient in the typical
114+
case where you do not use the QuerySet after ``read_frame``.
115+
43116
Parameters
44117
----------
45118
@@ -58,13 +131,31 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
58131
coerce_float : boolean, default False
59132
Attempt to convert values of non-string, non-numeric data (like
60133
decimal.Decimal) to floating point, useful for SQL result sets
134+
Does not work with ``compress``.
61135
62136
verbose: boolean If this is ``True`` then populate the DataFrame with the
63137
human readable versions of any foreign key fields else use
64138
the primary keys values.
65139
The human readable version of the foreign key field is
66140
defined in the ``__unicode__`` or ``__str__``
67141
methods of the related class definition
142+
143+
compress: boolean or a mapping, default False
144+
If a true value, infer NumPy data types [#]_ for Pandas dataframe
145+
columns from the corresponding Django field types. For example, Django's
146+
built-in ``SmallIntegerField`` is cast to NumPy's ``int16``. If
147+
``compress`` is a mapping (e.g., a ``dict``), it should be a mapping
148+
with Django field subclasses as keys and NumPy dtypes [#]_ as values.
149+
This mapping overrides the defaults for the field classes appearing in
150+
the mapping. However, the inference is based on the field subclass
151+
lowest on a chain of subclasses, that is, in order of inheritance.
152+
To override ``SmallIntegerField`` it is therefore not sufficient to
153+
override ``IntegerField``. Careful of setting ``compress={}`` because
154+
``{}`` is a false value in Python, which would cause ``read_frame``
155+
not to compress columns.
156+
157+
.. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html
158+
.. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
68159
"""
69160

70161
if fieldnames:
@@ -108,13 +199,23 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
108199
fields = qs.model._meta.fields
109200
fieldnames = [f.name for f in fields]
110201

111-
if is_values_queryset(qs):
112-
recs = list(qs)
113-
else:
114-
recs = list(qs.values_list(*fieldnames))
115-
116-
df = pd.DataFrame.from_records(recs, columns=fieldnames,
117-
coerce_float=coerce_float)
202+
if not is_values_queryset(qs):
203+
qs = qs.values_list(*fieldnames)
204+
205+
# Goal is to avoid instantiating the NumPy columns with wider dtypes than
206+
# compress needs. If pandas.DataFrame.from_records accepted a dtype
207+
# argument, we would just call that constructor. The following several lines
208+
# do the same thing.
209+
columns = _ensure_index(fieldnames)
210+
values = list(qs.iterator()) # Potentially the hardest step
211+
if compress:
212+
if not isinstance(compress, Mapping):
213+
compress = {}
214+
values = np.array(values, dtype=_get_dtypes(compress, fields))
215+
df = pd.DataFrame(_arrays_to_mgr(
216+
arrays=_to_arrays(
217+
data=values, columns=columns, coerce_float=coerce_float)[0],
218+
arr_names=columns, index=None, columns=columns))
118219

119220
if verbose:
120221
update_with_verbose(df, fieldnames, fields)

django_pandas/tests/models.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class MyModel(models.Model):
99
col1 = models.IntegerField()
1010
col2 = models.FloatField(null=True)
1111
col3 = models.FloatField(null=True)
12-
col4 = models.IntegerField()
12+
col4 = models.SmallIntegerField()
1313

1414
def __str__(self):
1515
return "{} {} {} {}".format(

django_pandas/tests/test_io.py

+10
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,16 @@ def test_basic(self):
4646
df1 = read_frame(qs, ['col1', 'col2'])
4747
self.assertEqual(df1.shape, (qs.count(), 2))
4848

49+
def test_compress(self):
    """``compress=True`` gives each column the dtype mapped to its field."""
    qs = MyModel.objects.all()
    df = read_frame(qs, compress=True)

    # Test automatic inference of dtypes. Compare by equality rather than
    # identity so the test does not rely on NumPy caching dtype instances,
    # and spell the float width explicitly instead of using the ``float_``
    # alias.
    self.assertEqual(df.col1.dtype, np.dtype('int32'))
    self.assertEqual(df.col2.dtype, np.dtype('float64'))
    self.assertEqual(df.col3.dtype, np.dtype('float64'))
    self.assertEqual(df.col4.dtype, np.dtype('int16'))
58+
4959
def test_values(self):
5060
qs = MyModel.objects.all()
5161
qs = qs.extra(select={"ecol1": "col1+1"})

0 commit comments

Comments
 (0)