1
+ from collections .abc import Mapping
2
+
1
3
import pandas as pd
4
+ from pandas .core .index import _ensure_index
5
+ from pandas .core .frame import _to_arrays , _arrays_to_mgr
2
6
from .utils import update_with_verbose , get_related_model
3
7
import django
8
+ import numpy as np
4
9
5
10
6
11
def to_fields (qs , fieldnames ):
@@ -32,14 +37,82 @@ def is_values_queryset(qs):
32
37
return qs ._iterable_class == django .db .models .query .ValuesIterable
33
38
34
39
40
# Default mapping from Django model field classes to the NumPy dtypes used for
# the corresponding dataframe columns when ``read_frame`` is asked to compress.
# ``_get_dtypes`` falls back to ``object`` for field classes not listed here.
#
# ``np.str_`` is used instead of the ``np.unicode_`` alias: on Python 3 they
# are the same type, but the alias was removed in NumPy 2.0.
#
# NOTE(review): the string dtypes here carry no length; confirm how they
# behave as structured-array fields (values may be truncated).
# NOTE(review): TimeField values are presumably ``datetime.time`` objects;
# confirm that NumPy can convert them to ``datetime64``.
_FIELDS_TO_DTYPES = {
    django.db.models.fields.AutoField: np.int32,
    django.db.models.fields.BigAutoField: np.int64,
    django.db.models.fields.BigIntegerField: np.int64,
    django.db.models.fields.BinaryField: np.bytes_,
    django.db.models.fields.BooleanField: np.bool_,
    django.db.models.fields.CharField: np.str_,
    django.db.models.fields.DateField: np.datetime64,
    django.db.models.fields.DateTimeField: np.datetime64,
    django.db.models.fields.DecimalField: object,
    django.db.models.fields.DurationField: np.timedelta64,
    django.db.models.fields.EmailField: np.str_,
    django.db.models.fields.FilePathField: np.str_,
    django.db.models.fields.FloatField: np.float64,
    django.db.models.fields.GenericIPAddressField: np.str_,
    django.db.models.fields.IntegerField: np.int32,
    django.db.models.fields.PositiveIntegerField: np.uint32,
    django.db.models.fields.PositiveSmallIntegerField: np.uint16,
    django.db.models.fields.SlugField: np.str_,
    django.db.models.fields.TextField: np.str_,
    django.db.models.fields.TimeField: np.datetime64,
    django.db.models.fields.URLField: np.str_,
    django.db.models.fields.UUIDField: object,
    django.db.models.fields.SmallIntegerField: np.int16,
}

# NullBooleanField was removed in Django 4.0, so register it only when the
# installed Django still provides it; referencing it unconditionally would
# make this module fail at import time on newer Django versions.
if hasattr(django.db.models.fields, "NullBooleanField"):
    # Kept as ``object`` because bool(None) is False: a bool column would
    # silently turn NULLs into False.
    _FIELDS_TO_DTYPES[django.db.models.fields.NullBooleanField] = object
66
+
67
+
68
def _get_dtypes(fields_to_dtypes, fields):
    """Pick a NumPy dtype for each field based on the field's class.

    Returns a list of (fieldname, NumPy dtype) pairs. Read about NumPy dtypes
    here [#]_ and here [#]_. The returned list can be passed to ``numpy.array``
    in ``read_frame``.

    Parameters
    ----------

    fields_to_dtypes : mapping
        A (potentially empty) mapping of Django field classes to NumPy dtypes.
        Entries in this mapping override the defaults from
        ``_FIELDS_TO_DTYPES``. Field classes that match no key at all fall
        back to the ``object`` dtype.

    fields : list of Django field class instances
        They must correspond in order to the columns of the dataframe that
        ``read_frame`` is building.

    Returns
    -------

    list of (str, dtype) tuples
        One ``(field.name, dtype)`` pair per entry of ``fields``, in the same
        order.

    .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html
    .. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
    """
    # Work on a private copy so the module-level defaults are never mutated.
    lookup = dict(_FIELDS_TO_DTYPES)
    lookup.update(fields_to_dtypes)

    pairs = []
    for field in fields:
        # Track the most derived matching class seen so far; ``object`` is
        # the root of every class, so the first match always replaces it.
        best_cls = object
        best_dtype = object
        for cls, candidate in lookup.items():
            if not isinstance(field, cls):
                continue
            if not issubclass(cls, best_cls):
                # Not further down the inheritance chain than the current
                # best match, so it must not override it.
                continue
            best_cls, best_dtype = cls, candidate
        pairs.append((field.name, best_dtype))
    return pairs
102
+
103
+
35
104
def read_frame (qs , fieldnames = (), index_col = None , coerce_float = False ,
36
- verbose = True ):
105
+ verbose = True , compress = False ):
37
106
"""
38
107
Returns a dataframe from a QuerySet
39
108
40
109
Optionally specify the field names/columns to utilize and
41
110
a field as the index
42
111
112
+ This function uses the QuerySet's ``iterator`` method, so it does not
113
+ populate the QuerySet's cache. This is more memory efficient in the typical
114
+ case where you do not use the QuerySet after ``read_frame``.
115
+
43
116
Parameters
44
117
----------
45
118
@@ -58,13 +131,31 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
58
131
coerce_float : boolean, default False
59
132
Attempt to convert values of non-string, non-numeric objects (like
60
133
decimal.Decimal) to floating point, useful for SQL result sets
134
+ Does not work with ``compress``.
61
135
62
136
verbose: boolean If this is ``True`` then populate the DataFrame with the
63
137
human readable versions of any foreign key fields else use
64
138
the primary keys values.
65
139
The human readable version of the foreign key field is
66
140
defined in the ``__unicode__`` or ``__str__``
67
141
methods of the related class definition
142
+
143
+ compress: boolean or a mapping, default False
144
+ If a true value, infer NumPy data types [#]_ for Pandas dataframe
145
+ columns from the corresponding Django field types. For example, Django's
146
+ built-in ``SmallIntegerField`` is cast to NumPy's ``int16``. If
147
+ ``compress`` is a mapping (e.g., a ``dict``), it should be a mapping
148
+ with Django field subclasses as keys and NumPy dtypes [#]_ as values.
149
+ This mapping overrides the defaults for the field classes appearing in
150
+ the mapping. However, the inference is based on the field subclass
151
+ lowest on a chain of subclasses, that is, in order of inheritance.
152
+ To override ``SmallIntegerField`` it is therefore not sufficient to
153
+ override ``IntegerField``. Be careful about setting ``compress={}`` because
154
+ ``{}`` is a false value in Python, which would cause ``read_frame``
155
+ not to compress columns.
156
+
157
+ .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html
158
+ .. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
68
159
"""
69
160
70
161
if fieldnames :
@@ -108,13 +199,23 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
108
199
fields = qs .model ._meta .fields
109
200
fieldnames = [f .name for f in fields ]
110
201
111
- if is_values_queryset (qs ):
112
- recs = list (qs )
113
- else :
114
- recs = list (qs .values_list (* fieldnames ))
115
-
116
- df = pd .DataFrame .from_records (recs , columns = fieldnames ,
117
- coerce_float = coerce_float )
202
+ if not is_values_queryset (qs ):
203
+ qs = qs .values_list (* fieldnames )
204
+
205
+ # Goal is to avoid instantiating the NumPy columns with wider dtypes than
206
+ # compress needs. If pandas.DataFrame.from_records accepted a dtype
207
+ # argument, we would just call that constructor. The following several lines
208
+ # do the same thing.
209
+ columns = _ensure_index (fieldnames )
210
+ values = list (qs .iterator ()) # Potentially the hardest step
211
+ if compress :
212
+ if not isinstance (compress , Mapping ):
213
+ compress = {}
214
+ values = np .array (values , dtype = _get_dtypes (compress , fields ))
215
+ df = pd .DataFrame (_arrays_to_mgr (
216
+ arrays = _to_arrays (
217
+ data = values , columns = columns , coerce_float = coerce_float )[0 ],
218
+ arr_names = columns , index = None , columns = columns ))
118
219
119
220
if verbose :
120
221
update_with_verbose (df , fieldnames , fields )
0 commit comments