From fab037de2b9ff742fc0b55193c01735cbd3229fb Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 09:38:25 -0500 Subject: [PATCH 01/20] Fix #63 and add compress argument to read_frame --- django_pandas/io.py | 117 ++++++++++++++++++++++++++++++--- django_pandas/tests/models.py | 2 +- django_pandas/tests/test_io.py | 10 +++ 3 files changed, 120 insertions(+), 9 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 41a75e6..eb84766 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -1,6 +1,11 @@ +from collections.abc import Mapping + import pandas as pd +from pandas.core.index import _ensure_index +from pandas.core.frame import _to_arrays, _arrays_to_mgr from .utils import update_with_verbose, get_related_model import django +import numpy as np def to_fields(qs, fieldnames): @@ -32,14 +37,82 @@ def is_values_queryset(qs): return qs._iterable_class == django.db.models.query.ValuesIterable +_FIELDS_TO_DTYPES = { + django.db.models.fields.AutoField: np.int32, + django.db.models.fields.BigAutoField: np.int64, + django.db.models.fields.BigIntegerField: np.int64, + django.db.models.fields.BinaryField: np.bytes_, + django.db.models.fields.BooleanField: np.bool_, + django.db.models.fields.CharField: np.unicode_, + django.db.models.fields.DateField: np.datetime64, + django.db.models.fields.DateTimeField: np.datetime64, + django.db.models.fields.DecimalField: object, + django.db.models.fields.DurationField: np.timedelta64, + django.db.models.fields.EmailField: np.unicode_, + django.db.models.fields.FilePathField: np.unicode_, + django.db.models.fields.FloatField: np.float64, + django.db.models.fields.GenericIPAddressField: np.unicode_, + django.db.models.fields.IntegerField: np.int32, + django.db.models.fields.NullBooleanField: object, # bool(None) is False + django.db.models.fields.PositiveIntegerField: np.uint32, + django.db.models.fields.PositiveSmallIntegerField: np.uint16, + django.db.models.fields.SlugField: np.unicode_, 
+ django.db.models.fields.TextField: np.unicode_, + django.db.models.fields.TimeField: np.datetime64, + django.db.models.fields.URLField: np.unicode_, + django.db.models.fields.UUIDField: object, + django.db.models.fields.SmallIntegerField: np.int16, +} + + +def _get_dtypes(fields_to_dtypes, fields): + """Infer NumPy dtypes from field types among those named in fieldnames. + + Returns a list of (fieldname, NumPy dtype) pairs. Read about NumPy dtypes + here [#]_ and here [#]_. The returned list can be passed to ``numpy.array`` + in ``read_frame``. + + Parameters + ---------- + + field_to_dtypes : mapping + A (potentially empty) mapping of Django field classes to NumPy dtypes. + This mapping overrides the defualts from ``_FIELDS_TO_DTYPES``. The + back-up default dtype is ``object`` for unfamiliar field classes. + + fields : list of Django field class instances + They must correspond in order to the columns of the dataframe that + ``read_frame`` is building. + + .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html + .. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html + """ + dtypes = [] + f2d = _FIELDS_TO_DTYPES.copy() + f2d.update(fields_to_dtypes) + for field in fields: + # Find the lowest subclass mong the keys of f2d + t, dtype = object, object + for k, v in f2d.items(): + if isinstance(field, k) and issubclass(k, t): + t = k + dtype = v + dtypes.append((field.name, dtype)) + return dtypes + + def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, - verbose=True): + verbose=True, compress=False): """ Returns a dataframe from a QuerySet Optionally specify the field names/columns to utilize and a field as the index + This function uses the QuerySet's ``iterator`` method, so it does not + populate the QuerySet's cache. This is more memory efficient in the typical + case where you do not use the QuerySet after ``read_frame``. 
+ Parameters ---------- @@ -58,6 +131,7 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, coerce_float : boolean, default False Attempt to convert values to non-string, non-numeric data (like decimal.Decimal) to floating point, useful for SQL result sets + Does not work with ``compress``. verbose: boolean If this is ``True`` then populate the DataFrame with the human readable versions of any foreign key fields else use @@ -65,6 +139,23 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, The human readable version of the foreign key field is defined in the ``__unicode__`` or ``__str__`` methods of the related class definition + + compress: boolean or a mapping, default False + If a true value, infer NumPy data types [#]_ for Pandas dataframe + columns from the corresponding Django field types. For example, Django's + built in ``SmallIntgerField`` is cast to NumPy's ``int16``. If + ``compress`` is a mapping (e.g., a ``dict``), it should be a mapping + with Django field subclasses as keys and NumPy dtypes [#]_ as values. + This mapping overrides the defualts for the field classes appearing in + the mapping. However, the inference is based on the field subclass + lowest on a chain of subclasses, that is, in order of inheritence. + To override ``SmallIntegerField`` it is therefore not sufficient to + override ``IntegerField``. Careful of setting ``compress={}`` because + ``{}`` is a false value in Python, which would cause ``read_frame`` + not to compress columns. + + .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html + .. 
[#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html """ if fieldnames: @@ -108,13 +199,23 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, fields = qs.model._meta.fields fieldnames = [f.name for f in fields] - if is_values_queryset(qs): - recs = list(qs) - else: - recs = list(qs.values_list(*fieldnames)) - - df = pd.DataFrame.from_records(recs, columns=fieldnames, - coerce_float=coerce_float) + if not is_values_queryset(qs): + qs = qs.values_list(*fieldnames) + + # Goal is to avoid instantiating the NumPy columns with wider dtypes than + # compress needs. If pandas.DataFrame.from_records accepted a dtype + # argument, we would just call that constructor. The following several lines + # do the same thing. + columns = _ensure_index(fieldnames) + values = list(qs.iterator()) # Potentially the hardest step + if compress: + if not isinstance(compress, Mapping): + compress = {} + values = np.array(values, dtype=_get_dtypes(compress, fields)) + df = pd.DataFrame(_arrays_to_mgr( + arrays=_to_arrays( + data=values, columns=columns, coerce_float=coerce_float)[0], + arr_names=columns, index=None, columns=columns)) if verbose: update_with_verbose(df, fieldnames, fields) diff --git a/django_pandas/tests/models.py b/django_pandas/tests/models.py index 95025df..008d19f 100644 --- a/django_pandas/tests/models.py +++ b/django_pandas/tests/models.py @@ -9,7 +9,7 @@ class MyModel(models.Model): col1 = models.IntegerField() col2 = models.FloatField(null=True) col3 = models.FloatField(null=True) - col4 = models.IntegerField() + col4 = models.SmallIntegerField() def __str__(self): return "{} {} {} {}".format( diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index a5dee0e..4be8ce5 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -46,6 +46,16 @@ def test_basic(self): df1 = read_frame(qs, ['col1', 'col2']) self.assertEqual(df1.shape, (qs.count(), 2)) + def test_compress(self): + qs = 
MyModel.objects.all() + df = read_frame(qs, compress=True) + + # Test automatic inference of dtypes + self.assertIs(df.col1.dtype, np.dtype('int32')) + self.assertIs(df.col2.dtype, np.dtype('float_')) + self.assertIs(df.col3.dtype, np.dtype('float_')) + self.assertIs(df.col4.dtype, np.dtype('int16')) + def test_values(self): qs = MyModel.objects.all() qs = qs.extra(select={"ecol1": "col1+1"}) From fabc47043f001ee927d72d7e2204e4d070cfb365 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 09:44:33 -0500 Subject: [PATCH 02/20] Simplify implementation of compress --- django_pandas/io.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index eb84766..6d208d0 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -1,8 +1,6 @@ from collections.abc import Mapping import pandas as pd -from pandas.core.index import _ensure_index -from pandas.core.frame import _to_arrays, _arrays_to_mgr from .utils import update_with_verbose, get_related_model import django import numpy as np @@ -201,21 +199,14 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, if not is_values_queryset(qs): qs = qs.values_list(*fieldnames) + recs = qs.iterator() - # Goal is to avoid instantiating the NumPy columns with wider dtypes than - # compress needs. If pandas.DataFrame.from_records accepted a dtype - # argument, we would just call that constructor. The following several lines - # do the same thing. 
- columns = _ensure_index(fieldnames) - values = list(qs.iterator()) # Potentially the hardest step if compress: if not isinstance(compress, Mapping): compress = {} - values = np.array(values, dtype=_get_dtypes(compress, fields)) - df = pd.DataFrame(_arrays_to_mgr( - arrays=_to_arrays( - data=values, columns=columns, coerce_float=coerce_float)[0], - arr_names=columns, index=None, columns=columns)) + recs = np.array(list(recs), dtype=_get_dtypes(compress, fields)) + df = pd.DataFrame.from_records(recs, columns=fieldnames, + coerce_float=coerce_float) if verbose: update_with_verbose(df, fieldnames, fields) From d120dc68c13717d4e9aee09204bcddc63116e6e3 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 09:50:06 -0500 Subject: [PATCH 03/20] Add myself to AUTHORS.rst --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index f5503bb..7b91dc0 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -23,5 +23,6 @@ Contributions - `Yousuf Jawwad `_ - `@henhuy `_ - `Hélio Meira Lins `_ +- `William Schwartz `_ - `@utpyngo `_ From 1cad8b64a4cd0ac5e7ad485854e0d8d4065020fa Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 22:12:22 -0500 Subject: [PATCH 04/20] Add tests of read_frame(compress=True) --- django_pandas/tests/models.py | 39 ++++++++++++++++++++++++++++++ django_pandas/tests/test_io.py | 44 ++++++++++++++++++++++++++++++---- runtests.py | 7 +++++- 3 files changed, 84 insertions(+), 6 deletions(-) diff --git a/django_pandas/tests/models.py b/django_pandas/tests/models.py index 008d19f..f52744b 100644 --- a/django_pandas/tests/models.py +++ b/django_pandas/tests/models.py @@ -1,3 +1,7 @@ +import datetime as dt +from decimal import Decimal +from uuid import UUID + from django.db import models from django.utils.encoding import python_2_unicode_compatible from django_pandas.managers import DataFrameManager, PassThroughManager @@ -32,6 +36,41 @@ class MyModelChoice(models.Model): objects = 
DataFrameManager() +class ByteField(models.SmallIntegerField): + pass + +class CompressableModel(models.Model): + # Can only have one auto field per model and id is added automatically + # id = models.AutoField(primary_key=True) + # bigauto = models.BigAutoField() + + bigint = models.BigIntegerField(default=1<<63 - 1) + binary = models.BinaryField(default=b'test bytes') + boolean = models.BooleanField(default=True) + char = models.CharField(max_length=10, default='test chars') + date = models.DateField(default=dt.date(2018, 3, 27)) + datetime = models.DateTimeField(default=dt.datetime(2018, 3, 27, 13, 55, 56)) + decimal = models.DecimalField(decimal_places=1, max_digits=3, default=Decimal(1.5)) + duration = models.DurationField(default=dt.timedelta(minutes=1, seconds=1)) + email = models.EmailField(default="an+email@address.com") + filepath = models.FilePathField(default="/usr/local/bin/python") + floating = models.FloatField(default=1.2) + ip = models.GenericIPAddressField(default="::ffff:192.0.2.1") + integer = models.IntegerField(default=1<<31 - 1) + nullboolean = models.NullBooleanField(default=None) + uint = models.PositiveIntegerField(default=1<<31 - 1) + ushort = models.PositiveSmallIntegerField(default=1<<15 - 1) + slug = models.SlugField(default="test_slug") + short = models.SmallIntegerField(default=-(1<<15 - 1)) + text = models.TextField(default="test text") + time = models.TimeField(default=dt.time(13, 55, 56)) + url = models.URLField(default="https://github.com/chrisdev/django-pandas") + uuid = models.UUIDField(default=UUID(int=1234556789)) + + # Custom field + byte = ByteField(default=127) + + @python_2_unicode_compatible class DataFrame(models.Model): diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index 4be8ce5..8cf9470 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -4,6 +4,7 @@ import pandas as pd import numpy as np from .models import MyModel, Trader, Security, TradeLog, 
TradeLogNote, MyModelChoice, Portfolio +from django_pandas.tests import models from django_pandas.io import read_frame @@ -46,15 +47,48 @@ def test_basic(self): df1 = read_frame(qs, ['col1', 'col2']) self.assertEqual(df1.shape, (qs.count(), 2)) - def test_compress(self): + def test_compress_basic(self): qs = MyModel.objects.all() df = read_frame(qs, compress=True) # Test automatic inference of dtypes - self.assertIs(df.col1.dtype, np.dtype('int32')) - self.assertIs(df.col2.dtype, np.dtype('float_')) - self.assertIs(df.col3.dtype, np.dtype('float_')) - self.assertIs(df.col4.dtype, np.dtype('int16')) + self.assertEqual(df.col1.dtype, np.dtype('int32')) + self.assertEqual(df.col2.dtype, np.dtype('float_')) + self.assertEqual(df.col3.dtype, np.dtype('float_')) + self.assertEqual(df.col4.dtype, np.dtype('int16')) + + def assert_default_compressable(self, df): + for field in models.CompressableModel._meta.get_fields(): + if field.name == 'id': + self.assertEqual(df['id'][0], 1) + self.assertIs(df['id'].dtype, np.dtype('int32')) + elif field.name == 'date': + self.assertEqual(df['date'][0].to_pydatetime().date(), field.default) + elif field.name == 'datetime': + self.assertEqual(df['datetime'][0].to_pydatetime(), field.default) + elif field.name == 'duration': + self.assertEqual(df['duration'][0].to_pytimedelta(), field.default) + elif isinstance(field.default, (str, bytes)): + self.assertEqual(df[field.name].dtype, np.dtype(object)) + else: + msg = ( + f'Expected {field.name} to have value {field.default!r}, but was' + f' {df[field.name][0]!r}') + self.assertEqual(df[field.name][0], field.default, msg) + + def test_compress_custom_field(self): + models.CompressableModel().save() + qs = models.CompressableModel.objects.all() + + # Specify a custom dtype for the custom field + df = read_frame(qs, compress={models.ByteField: np.int8}) + self.assert_default_compressable(df) + self.assertEqual(df.byte.dtype, np.int8) + + # Rely on finding the minimum specified parent class + 
df = read_frame(qs, compress=True) + self.assert_default_compressable(df) + self.assertEqual(df.byte.dtype, np.int16) def test_values(self): qs = MyModel.objects.all() diff --git a/runtests.py b/runtests.py index c8b35c2..b6aaef3 100755 --- a/runtests.py +++ b/runtests.py @@ -24,7 +24,12 @@ "PORT": "", } }, - MIDDLEWARE_CLASSES = () + MIDDLEWARE_CLASSES = (), + # django_pandas.tests.models.CompressableModel contains datetime fields + # with fixed default values instead of auto-now defaults. This makes + # testing easier, but triggers the fields.W161 system check. The + # following setting ignores that system check. + SILENCED_SYSTEM_CHECKS=['fields.W161'], ) settings.configure(**settings_dict) From e87b2a627070106cf9fa06613e32bc8e8225aa43 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 22:14:46 -0500 Subject: [PATCH 05/20] Pandas has no str, bytes, datetime.time data types Also, be specific enough about units for np.dtype('datetime64...'). 'D' is for days and 'us' is for microseconds. 
--- django_pandas/io.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 6d208d0..674ce10 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -39,25 +39,25 @@ def is_values_queryset(qs): django.db.models.fields.AutoField: np.int32, django.db.models.fields.BigAutoField: np.int64, django.db.models.fields.BigIntegerField: np.int64, - django.db.models.fields.BinaryField: np.bytes_, + django.db.models.fields.BinaryField: object, # Pandas has no bytes type django.db.models.fields.BooleanField: np.bool_, - django.db.models.fields.CharField: np.unicode_, - django.db.models.fields.DateField: np.datetime64, - django.db.models.fields.DateTimeField: np.datetime64, + django.db.models.fields.CharField: object, # Pandas has no str type + django.db.models.fields.DateField: np.dtype('datetime64[D]'), + django.db.models.fields.DateTimeField: np.dtype('datetime64[us]'), django.db.models.fields.DecimalField: object, - django.db.models.fields.DurationField: np.timedelta64, - django.db.models.fields.EmailField: np.unicode_, - django.db.models.fields.FilePathField: np.unicode_, + django.db.models.fields.DurationField: np.dtype('timedelta64[us]'), + django.db.models.fields.EmailField: object, + django.db.models.fields.FilePathField: object, django.db.models.fields.FloatField: np.float64, - django.db.models.fields.GenericIPAddressField: np.unicode_, + django.db.models.fields.GenericIPAddressField: object, django.db.models.fields.IntegerField: np.int32, django.db.models.fields.NullBooleanField: object, # bool(None) is False django.db.models.fields.PositiveIntegerField: np.uint32, django.db.models.fields.PositiveSmallIntegerField: np.uint16, - django.db.models.fields.SlugField: np.unicode_, - django.db.models.fields.TextField: np.unicode_, - django.db.models.fields.TimeField: np.datetime64, - django.db.models.fields.URLField: np.unicode_, + django.db.models.fields.SlugField: object, + 
django.db.models.fields.TextField: object, + django.db.models.fields.TimeField: object, + django.db.models.fields.URLField: object, django.db.models.fields.UUIDField: object, django.db.models.fields.SmallIntegerField: np.int16, } From 0907e95fd7e441a78b588b2e4fa582c1280a4c9c Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 22:16:08 -0500 Subject: [PATCH 06/20] Improve readability of io.py --- django_pandas/io.py | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 674ce10..b5101ca 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -3,6 +3,7 @@ import pandas as pd from .utils import update_with_verbose, get_related_model import django +from django.db.models import fields import numpy as np @@ -36,30 +37,30 @@ def is_values_queryset(qs): _FIELDS_TO_DTYPES = { - django.db.models.fields.AutoField: np.int32, - django.db.models.fields.BigAutoField: np.int64, - django.db.models.fields.BigIntegerField: np.int64, - django.db.models.fields.BinaryField: object, # Pandas has no bytes type - django.db.models.fields.BooleanField: np.bool_, - django.db.models.fields.CharField: object, # Pandas has no str type - django.db.models.fields.DateField: np.dtype('datetime64[D]'), - django.db.models.fields.DateTimeField: np.dtype('datetime64[us]'), - django.db.models.fields.DecimalField: object, - django.db.models.fields.DurationField: np.dtype('timedelta64[us]'), - django.db.models.fields.EmailField: object, - django.db.models.fields.FilePathField: object, - django.db.models.fields.FloatField: np.float64, - django.db.models.fields.GenericIPAddressField: object, - django.db.models.fields.IntegerField: np.int32, - django.db.models.fields.NullBooleanField: object, # bool(None) is False - django.db.models.fields.PositiveIntegerField: np.uint32, - django.db.models.fields.PositiveSmallIntegerField: np.uint16, - django.db.models.fields.SlugField: object, 
- django.db.models.fields.TextField: object, - django.db.models.fields.TimeField: object, - django.db.models.fields.URLField: object, - django.db.models.fields.UUIDField: object, - django.db.models.fields.SmallIntegerField: np.int16, + fields.AutoField: np.int32, + fields.BigAutoField: np.int64, + fields.BigIntegerField: np.int64, + fields.BinaryField: object, # Pandas has no bytes type + fields.BooleanField: np.bool_, + fields.CharField: object, # Pandas has no str type + fields.DateField: np.dtype('datetime64[D]'), + fields.DateTimeField: np.dtype('datetime64[us]'), + fields.DecimalField: object, + fields.DurationField: np.dtype('timedelta64[us]'), + fields.EmailField: object, + fields.FilePathField: object, + fields.FloatField: np.float64, + fields.GenericIPAddressField: object, + fields.IntegerField: np.int32, + fields.NullBooleanField: object, # bool(None) is False + fields.PositiveIntegerField: np.uint32, + fields.PositiveSmallIntegerField: np.uint16, + fields.SlugField: object, + fields.SmallIntegerField: np.int16, + fields.TextField: object, + fields.TimeField: object, + fields.URLField: object, + fields.UUIDField: object, } @@ -93,8 +94,7 @@ def _get_dtypes(fields_to_dtypes, fields): t, dtype = object, object for k, v in f2d.items(): if isinstance(field, k) and issubclass(k, t): - t = k - dtype = v + t, dtype = k, v dtypes.append((field.name, dtype)) return dtypes From 7bcc8dc36d6dc94ae31f2c978749927693a9674e Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 22:22:56 -0500 Subject: [PATCH 07/20] Test that compress really does use less memory --- django_pandas/tests/test_io.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index 8cf9470..b2266c1 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -57,6 +57,9 @@ def test_compress_basic(self): self.assertEqual(df.col3.dtype, np.dtype('float_')) 
self.assertEqual(df.col4.dtype, np.dtype('int16')) + # Compress should use less memory + self.assertLess(df.memory_usage().sum(), read_frame(qs).memory_usage().sum()) + def assert_default_compressable(self, df): for field in models.CompressableModel._meta.get_fields(): if field.name == 'id': @@ -81,14 +84,18 @@ def test_compress_custom_field(self): qs = models.CompressableModel.objects.all() # Specify a custom dtype for the custom field - df = read_frame(qs, compress={models.ByteField: np.int8}) - self.assert_default_compressable(df) - self.assertEqual(df.byte.dtype, np.int8) + df1 = read_frame(qs, compress={models.ByteField: np.int8}) + self.assert_default_compressable(df1) + self.assertEqual(df1.byte.dtype, np.int8) # Rely on finding the minimum specified parent class - df = read_frame(qs, compress=True) - self.assert_default_compressable(df) - self.assertEqual(df.byte.dtype, np.int16) + df2 = read_frame(qs, compress=True) + self.assert_default_compressable(df2) + self.assertEqual(df2.byte.dtype, np.int16) + + # Memory usage is ordered as df1 < df2 < read_frame(qs, compress=False) + self.assertLess(df2.memory_usage().sum(), read_frame(qs).memory_usage().sum()) + self.assertLess(df1.memory_usage().sum(), df2.memory_usage().sum()) def test_values(self): qs = MyModel.objects.all() From d5fb7e6af8cc4ba716c85405a6648a0791724350 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 22:42:12 -0500 Subject: [PATCH 08/20] Comment typo --- django_pandas/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index b5101ca..74114cc 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -90,7 +90,7 @@ def _get_dtypes(fields_to_dtypes, fields): f2d = _FIELDS_TO_DTYPES.copy() f2d.update(fields_to_dtypes) for field in fields: - # Find the lowest subclass mong the keys of f2d + # Find the lowest subclass among the keys of f2d t, dtype = object, object for k, v in f2d.items(): if isinstance(field, 
k) and issubclass(k, t): From 909d1f5a0613ab2c658e6b9f14dad955ba934503 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 22:42:56 -0500 Subject: [PATCH 09/20] Test compress argument types and clarify docs --- django_pandas/io.py | 7 ++++++- django_pandas/tests/test_io.py | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 74114cc..8764594 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -89,6 +89,9 @@ def _get_dtypes(fields_to_dtypes, fields): dtypes = [] f2d = _FIELDS_TO_DTYPES.copy() f2d.update(fields_to_dtypes) + for k, v in f2d.items(): + if not issubclass(k, django.db.models.fields.Field): + raise TypeError(f'Expected a type of field, not {k!r}') for field in fields: # Find the lowest subclass among the keys of f2d t, dtype = object, object @@ -138,7 +141,7 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, defined in the ``__unicode__`` or ``__str__`` methods of the related class definition - compress: boolean or a mapping, default False + compress: a false value, ``True``, or a mapping, default False If a true value, infer NumPy data types [#]_ for Pandas dataframe columns from the corresponding Django field types. For example, Django's built in ``SmallIntgerField`` is cast to NumPy's ``int16``. 
If @@ -202,6 +205,8 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, recs = qs.iterator() if compress: + if not isinstance(compress, (bool, Mapping)): + raise TypeError(f'Ambiguous compress argument: {compress!r}') if not isinstance(compress, Mapping): compress = {} recs = np.array(list(recs), dtype=_get_dtypes(compress, fields)) diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index b2266c1..95434a2 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -60,6 +60,13 @@ def test_compress_basic(self): # Compress should use less memory self.assertLess(df.memory_usage().sum(), read_frame(qs).memory_usage().sum()) + def test_compress_bad_argument(self): + qs = MyModel.objects.all() + bads = [(models.ByteField, np.int8), range(3), type, object(), 'a', 1., + {'IntegerField': int}, {int: models.ByteField}] + for bad in bads: + self.assertRaises(TypeError, read_frame, qs, compress=bad) + def assert_default_compressable(self, df): for field in models.CompressableModel._meta.get_fields(): if field.name == 'id': From 4a283421180cb7e667d0007357cae871bf37e1af Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Tue, 27 Mar 2018 22:53:36 -0500 Subject: [PATCH 10/20] Add this PR's changes to the documentation --- CHANGES.rst | 4 ++++ README.rst | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index bed98fd..98b8c8d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -3,6 +3,10 @@ CHANGES 0.5.1 (2018-01-) ----------------- - Address Unicode decode error when installing with pip3 on docker (Thanks @utapyngo) +- Fix `#63 `_: Use memory + efficient iteration in ``read_frame`` (by @wkschwartz) +- Add ``compress`` argument to ``read_frame`` to infer NumPy data types for the + returned data frame's columns from the Django field types (by @wkschwartz) 0.5.0 (2018-01-20) ------------------ diff --git a/README.rst b/README.rst index ecdb18e..83c0877 100644 --- 
a/README.rst +++ b/README.rst @@ -25,6 +25,7 @@ Contributors * `@henhuy `_ * `Hélio Meira Lins `_ * `@utpyngo `_ +* `William Schwartz `_ What's New =========== @@ -119,6 +120,23 @@ read_frame human readable versions of any foreign key or choice fields else use the actual values set in the model. + - compress: a false value, ``True``, or a mapping, default False + If a true value, infer `NumPy data types + <https://docs.scipy.org/doc/numpy/user/basics.types.html>`_ for + Pandas dataframe columns from the corresponding Django field + types. For example, Django's built in ``SmallIntegerField`` is + cast to NumPy's ``int16``. If ``compress`` is a mapping (e.g., a + ``dict``), it should be a mapping with Django field subclasses + as keys and `NumPy dtypes + <https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html>`_ + as values. This mapping overrides the defaults for the field + classes appearing in the mapping. However, the inference is + based on the field subclass lowest on a chain of subclasses, + that is, in order of inheritance. To override + ``SmallIntegerField`` it is therefore not sufficient to override + ``IntegerField``. Careful of setting ``compress={}`` because + ``{}`` is a false value in Python, which would cause + ``read_frame`` not to compress columns. 
Examples ^^^^^^^^^ From cfd45c896d56e589c45ee8657c9969f0aca12cda Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Wed, 28 Mar 2018 09:06:28 -0500 Subject: [PATCH 11/20] Test: read_frame doesn't populate cache, handles values_list correctly Fixed a crash when the query set was the result of qs.values_list() --- django_pandas/io.py | 2 +- django_pandas/tests/test_io.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 8764594..2e3811f 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -200,7 +200,7 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, fields = qs.model._meta.fields fieldnames = [f.name for f in fields] - if not is_values_queryset(qs): + if not issubclass(qs._iterable_class, django.db.models.query.ValuesListIterable): qs = qs.values_list(*fieldnames) recs = qs.iterator() diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index 95434a2..07d3700 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -47,8 +47,7 @@ def test_basic(self): df1 = read_frame(qs, ['col1', 'col2']) self.assertEqual(df1.shape, (qs.count(), 2)) - def test_compress_basic(self): - qs = MyModel.objects.all() + def assert_compress_basic(self, qs): df = read_frame(qs, compress=True) # Test automatic inference of dtypes @@ -59,6 +58,14 @@ def test_compress_basic(self): # Compress should use less memory self.assertLess(df.memory_usage().sum(), read_frame(qs).memory_usage().sum()) + # Uses qs.iterator() rather than for x in qs. 
+ self.assertFalse(qs._result_cache) + + def test_compress_basic(self): + qs = MyModel.objects.all() + self.assert_compress_basic(qs) + self.assert_compress_basic(qs.values()) + self.assert_compress_basic(qs.values_list()) def test_compress_bad_argument(self): qs = MyModel.objects.all() @@ -103,6 +110,8 @@ def test_compress_custom_field(self): # Memory usage is ordered as df1 < df2 < read_frame(qs, compress=False) self.assertLess(df2.memory_usage().sum(), read_frame(qs).memory_usage().sum()) self.assertLess(df1.memory_usage().sum(), df2.memory_usage().sum()) + # Uses qs.iterator() rather than for x in qs. + self.assertFalse(qs._result_cache) def test_values(self): qs = MyModel.objects.all() From bc2f5b2b40437cc5a0a99efdc03ba72b3d5a9858 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Mon, 2 Apr 2018 12:02:21 -0500 Subject: [PATCH 12/20] Handle nullable fields, repeated column names, coerce_float+compress --- django_pandas/io.py | 85 +++++++++++++++++++++++++++------- django_pandas/tests/models.py | 22 +++++++-- django_pandas/tests/test_io.py | 57 ++++++++++++++++++++--- 3 files changed, 135 insertions(+), 29 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 2e3811f..a84ade5 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -4,6 +4,7 @@ from .utils import update_with_verbose, get_related_model import django from django.db.models import fields +from django.contrib.gis.db.models import fields as geo_fields import numpy as np @@ -37,11 +38,11 @@ def is_values_queryset(qs): _FIELDS_TO_DTYPES = { - fields.AutoField: np.int32, - fields.BigAutoField: np.int64, - fields.BigIntegerField: np.int64, + fields.AutoField: np.dtype(np.int32), + fields.BigAutoField: np.dtype(np.int64), + fields.BigIntegerField: np.dtype(np.int64), fields.BinaryField: object, # Pandas has no bytes type - fields.BooleanField: np.bool_, + fields.BooleanField: np.dtype(np.bool_), fields.CharField: object, # Pandas has no str type fields.DateField: 
np.dtype('datetime64[D]'), fields.DateTimeField: np.dtype('datetime64[us]'), @@ -49,22 +50,30 @@ def is_values_queryset(qs): fields.DurationField: np.dtype('timedelta64[us]'), fields.EmailField: object, fields.FilePathField: object, - fields.FloatField: np.float64, + fields.FloatField: np.dtype(np.float64), fields.GenericIPAddressField: object, - fields.IntegerField: np.int32, - fields.NullBooleanField: object, # bool(None) is False - fields.PositiveIntegerField: np.uint32, - fields.PositiveSmallIntegerField: np.uint16, + fields.IntegerField: np.dtype(np.int32), + fields.PositiveIntegerField: np.dtype(np.uint32), + fields.PositiveSmallIntegerField: np.dtype(np.uint16), fields.SlugField: object, - fields.SmallIntegerField: np.int16, + fields.SmallIntegerField: np.dtype(np.int16), fields.TextField: object, fields.TimeField: object, fields.URLField: object, fields.UUIDField: object, -} + # https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data-casting-rules-and-indexing + # Explicitly setting NullBooleanField here can be removed when support for + # Django versions <= 2.0 are dropped. See + # https://github.com/django/django/pull/8467 + fields.NullBooleanField: object, + + # Geometry fields + geo_fields.GeometryField: object, + geo_fields.RasterField: object, +} -def _get_dtypes(fields_to_dtypes, fields): +def _get_dtypes(fields_to_dtypes, fields, fieldnames): """Infer NumPy dtypes from field types among those named in fieldnames. Returns a list of (fieldname, NumPy dtype) pairs. Read about NumPy dtypes @@ -83,6 +92,9 @@ def _get_dtypes(fields_to_dtypes, fields): They must correspond in order to the columns of the dataframe that ``read_frame`` is building. + fieldnames : iterable of names of the fields as they will appear in the data + frame + .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html .. 
[#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html """ @@ -92,13 +104,48 @@ def _get_dtypes(fields_to_dtypes, fields): for k, v in f2d.items(): if not issubclass(k, django.db.models.fields.Field): raise TypeError(f'Expected a type of field, not {k!r}') - for field in fields: + if not isinstance(v, np.dtype): + f2d[k] = np.dtype(v) + for field, name in zip(fields, fieldnames): # Find the lowest subclass among the keys of f2d - t, dtype = object, object + t, dtype = object, np.generic for k, v in f2d.items(): if isinstance(field, k) and issubclass(k, t): t, dtype = k, v - dtypes.append((field.name, dtype)) + + # Handle nulls for integer and boolean types + if field.null and issubclass(dtype.type, (np.bool_, bool)): + # Pandas handles nullable booleans as objects. See + # https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data-casting-rules-and-indexing + # Not needed until Django 2.1. See + # https://github.com/django/django/pull/8467 + dtype = np.object_ + elif field.null and issubclass(dtype.type, (np.integer, int)): + # dtype.itemsize is denominated in bytes. 
Check it against the + # number of mantissa bits since the max exact integer is + # 2**(mantissa bits): + # >>> 2**sys.float_info.mant_dig - 1 == int(float(2**sys.float_info.mant_dig - 1)) + # True + # >>> 2**sys.float_info.mant_dig == int(float(2**sys.float_info.mant_dig)) + # True + # >>> 2**sys.float_info.mant_dig + 1 == int(float(2**sys.float_info.mant_dig + 1)) + # False + # Thus the integer needs to fit into ((mantissa bits) - 1) bits + # https://docs.scipy.org/doc/numpy-dev/user/basics.types.html + def fits(itype, ftype): + return np.iinfo(itype).bits <= (np.finfo(ftype).nmant - 1) + if fits(dtype, np.float16): + dtype = np.float16 + elif fits(dtype, np.float32): + dtype = np.float32 + elif fits(dtype, np.float64): + dtype = np.float64 + elif fits(dtype, np.longdouble): + dtype = np.longdouble + else: + dtype = np.object_ + + dtypes.append((name, dtype)) return dtypes @@ -155,10 +202,13 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, ``{}`` is a false value in Python, which would cause ``read_frame`` not to compress columns. + Does not work with ``coerce_float``. + .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html .. 
[#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html """ - + if coerce_float and compress: + raise ValueError('Cannot use coerce_float and compress at the same time') if fieldnames: fieldnames = pd.unique(fieldnames) if index_col is not None and index_col not in fieldnames: @@ -209,7 +259,8 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, raise TypeError(f'Ambiguous compress argument: {compress!r}') if not isinstance(compress, Mapping): compress = {} - recs = np.array(list(recs), dtype=_get_dtypes(compress, fields)) + dtype = _get_dtypes(compress, fields, fieldnames) + recs = np.array(list(recs), dtype=dtype) df = pd.DataFrame.from_records(recs, columns=fieldnames, coerce_float=coerce_float) diff --git a/django_pandas/tests/models.py b/django_pandas/tests/models.py index f52744b..8e19b9f 100644 --- a/django_pandas/tests/models.py +++ b/django_pandas/tests/models.py @@ -44,7 +44,7 @@ class CompressableModel(models.Model): # id = models.AutoField(primary_key=True) # bigauto = models.BigAutoField() - bigint = models.BigIntegerField(default=1<<63 - 1) + bigint = models.BigIntegerField(default=2**63 - 1) binary = models.BinaryField(default=b'test bytes') boolean = models.BooleanField(default=True) char = models.CharField(max_length=10, default='test chars') @@ -56,12 +56,12 @@ class CompressableModel(models.Model): filepath = models.FilePathField(default="/usr/local/bin/python") floating = models.FloatField(default=1.2) ip = models.GenericIPAddressField(default="::ffff:192.0.2.1") - integer = models.IntegerField(default=1<<31 - 1) + integer = models.IntegerField(default=2**31 - 1) nullboolean = models.NullBooleanField(default=None) - uint = models.PositiveIntegerField(default=1<<31 - 1) - ushort = models.PositiveSmallIntegerField(default=1<<15 - 1) + uint = models.PositiveIntegerField(default=2**31 - 1) + ushort = models.PositiveSmallIntegerField(default=2**15 - 1) slug = models.SlugField(default="test_slug") - short = 
models.SmallIntegerField(default=-(1<<15 - 1)) + short = models.SmallIntegerField(default=-(2**15 - 1)) text = models.TextField(default="test text") time = models.TimeField(default=dt.time(13, 55, 56)) url = models.URLField(default="https://github.com/chrisdev/django-pandas") @@ -71,6 +71,18 @@ class CompressableModel(models.Model): byte = ByteField(default=127) +class CompressableModelWithNulls(models.Model): + bigint = models.BigIntegerField(null=True, default=None) + floating = models.FloatField(null=True, default=None) + integer = models.IntegerField(null=True, default=None) + nullboolean = models.NullBooleanField(null=True, default=None) + uint = models.PositiveIntegerField(null=True, default=None) + ushort = models.PositiveSmallIntegerField(null=True, default=None) + short = models.SmallIntegerField(null=True, default=None) + # Custom field + byte = ByteField(null=True, default=None) + + @python_2_unicode_compatible class DataFrame(models.Model): diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index 07d3700..05c5f97 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -1,3 +1,5 @@ +import sys + from django.test import TestCase import django from django.db.models import Sum @@ -57,7 +59,7 @@ def assert_compress_basic(self, qs): self.assertEqual(df.col4.dtype, np.dtype('int16')) # Compress should use less memory - self.assertLess(df.memory_usage().sum(), read_frame(qs).memory_usage().sum()) + self.assertLess(df.memory_usage(deep=True).sum(), read_frame(qs).memory_usage(deep=True).sum()) # Uses qs.iterator() rather than for x in qs. 
self.assertFalse(qs._result_cache) @@ -70,10 +72,14 @@ def test_compress_basic(self): def test_compress_bad_argument(self): qs = MyModel.objects.all() bads = [(models.ByteField, np.int8), range(3), type, object(), 'a', 1., - {'IntegerField': int}, {int: models.ByteField}] + {'IntegerField': int}, {int: models.ByteField}, + {models.ByteField: 'asdf'}] for bad in bads: self.assertRaises(TypeError, read_frame, qs, compress=bad) + self.assertRaises( + ValueError, read_frame, qs, compress=True, coerce_float=True) + def assert_default_compressable(self, df): for field in models.CompressableModel._meta.get_fields(): if field.name == 'id': @@ -85,12 +91,14 @@ def assert_default_compressable(self, df): self.assertEqual(df['datetime'][0].to_pydatetime(), field.default) elif field.name == 'duration': self.assertEqual(df['duration'][0].to_pytimedelta(), field.default) + elif field.name == 'nullboolean': + self.assertEqual(df['nullboolean'].dtype, np.object_) + self.assertIsNone(df['nullboolean'][0]) elif isinstance(field.default, (str, bytes)): self.assertEqual(df[field.name].dtype, np.dtype(object)) else: - msg = ( - f'Expected {field.name} to have value {field.default!r}, but was' - f' {df[field.name][0]!r}') + msg = 'Expected {} to have value {!r}, but was {!r}'.format( + field.name, field.default, df[field.name][0]) self.assertEqual(df[field.name][0], field.default, msg) def test_compress_custom_field(self): @@ -105,14 +113,49 @@ def test_compress_custom_field(self): # Rely on finding the minimum specified parent class df2 = read_frame(qs, compress=True) self.assert_default_compressable(df2) + self.assertEqual(df2.uint.dtype, np.uint32) self.assertEqual(df2.byte.dtype, np.int16) # Memory usage is ordered as df1 < df2 < read_frame(qs, compress=False) - self.assertLess(df2.memory_usage().sum(), read_frame(qs).memory_usage().sum()) - self.assertLess(df1.memory_usage().sum(), df2.memory_usage().sum()) + self.assertLess(df2.memory_usage(deep=True).sum(), 
read_frame(qs).memory_usage(deep=True).sum()) + self.assertLess(df1.memory_usage(deep=True).sum(), df2.memory_usage(deep=True).sum()) # Uses qs.iterator() rather than for x in qs. self.assertFalse(qs._result_cache) + def test_compress_nulls(self): + maxs = dict(bigint=np.iinfo(np.int64).max, floating=sys.float_info.max, + integer=np.iinfo(np.int32).max, nullboolean=True, + uint=np.iinfo(np.uint32).max, ushort=np.iinfo(np.uint16).max, + short=np.iinfo(np.int16).max, byte=np.iinfo(np.int8).max) + mins = dict(bigint=np.iinfo(np.int64).min, floating=sys.float_info.min, + integer=np.iinfo(np.int32).min, nullboolean=True, + uint=np.iinfo(np.uint32).min, ushort=np.iinfo(np.uint16).min, + short=np.iinfo(np.int16).min, byte=np.iinfo(np.int8).min) + dbmaxs = models.CompressableModelWithNulls(**maxs) + dbmaxs.save() + dbnulls = models.CompressableModelWithNulls() + dbnulls.save() + dbmins = models.CompressableModelWithNulls(**mins) + dbmins.save() + qs = models.CompressableModelWithNulls.objects.all() + df1 = read_frame(qs, compress={models.ByteField: np.int8}) + + self.assertEqual(df1.bigint.dtype, np.object_) + self.assertEqual(df1.floating.dtype, np.float_) + self.assertEqual(df1.integer.dtype, np.float64) + self.assertEqual(df1.nullboolean.dtype, np.object_) + self.assertEqual(df1.uint.dtype, np.float64) + self.assertEqual(df1.ushort.dtype, np.float32) + self.assertEqual(df1.short.dtype, np.float32) + self.assertEqual(df1.byte.dtype, np.float16) + + for col in df1.columns: + if col == 'id': + continue + self.assertEqual(df1[col][0], maxs[col]) + self.assertTrue(df1[col][1] is None or np.isnan(df1[col][1])) + self.assertEqual(df1[col][2], mins[col]) + def test_values(self): qs = MyModel.objects.all() qs = qs.extra(select={"ecol1": "col1+1"}) From 59319fc6d880eb40848503255a4e02e01d13f51e Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Mon, 2 Apr 2018 12:04:35 -0500 Subject: [PATCH 13/20] Drop usage of f-strings for backward compatibility --- django_pandas/io.py | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index a84ade5..116c815 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -103,7 +103,7 @@ def _get_dtypes(fields_to_dtypes, fields, fieldnames): f2d.update(fields_to_dtypes) for k, v in f2d.items(): if not issubclass(k, django.db.models.fields.Field): - raise TypeError(f'Expected a type of field, not {k!r}') + raise TypeError('Expected a type of field, not {!r}'.format(k)) if not isinstance(v, np.dtype): f2d[k] = np.dtype(v) for field, name in zip(fields, fieldnames): @@ -256,7 +256,7 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, if compress: if not isinstance(compress, (bool, Mapping)): - raise TypeError(f'Ambiguous compress argument: {compress!r}') + raise TypeError('Ambiguous compress argument: {!r}'.format(compress)) if not isinstance(compress, Mapping): compress = {} dtype = _get_dtypes(compress, fields, fieldnames) From ea4a056b756a17544f5472d1b30ecfbba8f04fbe Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Thu, 12 Apr 2018 22:23:01 -0500 Subject: [PATCH 14/20] Handle foreign key fields --- django_pandas/io.py | 26 ++++++++++++++++--- django_pandas/tests/test_io.py | 46 ++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 116c815..708dad8 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -3,7 +3,7 @@ import pandas as pd from .utils import update_with_verbose, get_related_model import django -from django.db.models import fields +from django.db.models import fields, ForeignKey from django.contrib.gis.db.models import fields as geo_fields import numpy as np @@ -107,6 +107,13 @@ def _get_dtypes(fields_to_dtypes, fields, fieldnames): if not isinstance(v, np.dtype): f2d[k] = np.dtype(v) for field, name in zip(fields, fieldnames): + # Get field.null before switching to target field since foreign key can + # be 
nullable even while the target isn't, and vice versa. + nullable = field.null + if isinstance(field, ForeignKey): + field = field.target_field + nullable = nullable or field.null + # Find the lowest subclass among the keys of f2d t, dtype = object, np.generic for k, v in f2d.items(): @@ -114,13 +121,13 @@ def _get_dtypes(fields_to_dtypes, fields, fieldnames): t, dtype = k, v # Handle nulls for integer and boolean types - if field.null and issubclass(dtype.type, (np.bool_, bool)): + if nullable and issubclass(dtype.type, (np.bool_, bool)): # Pandas handles nullable booleans as objects. See # https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data-casting-rules-and-indexing # Not needed until Django 2.1. See # https://github.com/django/django/pull/8467 dtype = np.object_ - elif field.null and issubclass(dtype.type, (np.integer, int)): + elif nullable and issubclass(dtype.type, (np.integer, int)): # dtype.itemsize is denominated in bytes. Check it against the # number of mantissa bits since the max exact integer is # 2**(mantissa bits): @@ -204,6 +211,19 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, Does not work with ``coerce_float``. + Known Issues + ------------ + + When using ``compress=True`` with a nullable foreign key field the double- + underscore import name may not work but the single-underscore import name + should. For example, suppose model ``A`` has a nullable foreign key field + ``b`` pointing at model ``B``, both of which models' primary key fields are + called ``id``. Suppose further that ``A``'s table has some entries with + null values of ``b`` and some with non-null values. + ``read_frame(A.objects.all(), ['b', 'b_id'])`` and + ``read_frame(A.objects.filter(b__isnull=False), ['b__id'])`` will work as + expected, but ``read_frame(A.objects.all(), ['b__id'])`` will not. + .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html .. 
[#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html """ diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index 05c5f97..30ba8a5 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -1,4 +1,5 @@ import sys +from unittest import expectedFailure from django.test import TestCase import django @@ -247,6 +248,51 @@ def setUp(self): growth = Portfolio.objects.create(name="Fund 2") growth.securities.add(abc) + def test_compress_fk(self): + qs = TradeLog.objects.all() + # trader and trader_id are both the foreign key id column on TradeLog. + # trader__id is the id column on Trader via a JOIN. + cols = ['trader', 'trader_id', 'trader__id'] + df = read_frame(qs, cols, verbose=False, compress=True) + + self.assertEqual(df.shape, (qs.count(), len(cols))) + self.assertTrue(df.trader.equals(df.trader__id)) + self.assertTrue(df.trader_id.equals(df.trader__id)) + self.assertEqual(df.trader.dtype, np.dtype('int32')) + self.assertEqual(df.trader_id.dtype, np.dtype('int32')) + self.assertEqual(df.trader__id.dtype, np.dtype('int32')) + self.assertCountEqual( + df.trader_id, qs.values_list('trader_id', flat=True)) + + def test_compress_fk_nullable(self): + qs = TradeLog.objects.all() + cols = ['symbol', 'symbol_id'] + df = read_frame(qs, cols, verbose=False, compress=True) + + self.assertEqual(df.shape, (qs.count(), len(cols))) + self.assertTrue(df.symbol.equals(df.symbol_id)) + self.assertEqual(df.symbol.dtype, np.dtype(float)) + self.assertEqual(df.symbol_id.dtype, np.dtype(float)) + self.assertCountEqual( + [None if pd.isna(x) else x for x in df.symbol], + qs.values_list('symbol_id', flat=True)) + + @expectedFailure + def test_compress_fk_nullable_join(self): + qs = TradeLog.objects.all() + # symbol is the foreign key id column on TradeLog. symbol__id is the id + # column on Security via a JOIN. 
+ cols = ['symbol', 'symbol__id'] + df = read_frame(qs, cols, verbose=False, compress=True) + + self.assertEqual(df.shape, (qs.count(), len(cols))) + self.assertTrue(df.symbol.equals(df.symbol__id)) + self.assertEqual(df.symbol.dtype, np.dtype(float)) + self.assertEqual(df.symbol__id.dtype, np.dtype(float)) + self.assertCountEqual( + [None if pd.isna(x) else x for x in df.symbol], + qs.values_list('symbol__id', flat=True)) + def test_verbose(self): qs = TradeLog.objects.all() df = read_frame(qs, verbose=True) From 393b126f76449d414fb2c5a0ec39aa3c3ee6766e Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Thu, 12 Apr 2018 22:23:30 -0500 Subject: [PATCH 15/20] Fix documentation spelling errors --- django_pandas/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 708dad8..b91066f 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -201,9 +201,9 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, built in ``SmallIntgerField`` is cast to NumPy's ``int16``. If ``compress`` is a mapping (e.g., a ``dict``), it should be a mapping with Django field subclasses as keys and NumPy dtypes [#]_ as values. - This mapping overrides the defualts for the field classes appearing in + This mapping overrides the defaults for the field classes appearing in the mapping. However, the inference is based on the field subclass - lowest on a chain of subclasses, that is, in order of inheritence. + lowest on a chain of subclasses, that is, in order of inheritance. To override ``SmallIntegerField`` it is therefore not sufficient to override ``IntegerField``. 
Careful of setting ``compress={}`` because ``{}`` is a false value in Python, which would cause ``read_frame`` From 00bcf399ec765bf293aa0f05876e9ec836d5c272 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Thu, 12 Apr 2018 22:31:37 -0500 Subject: [PATCH 16/20] Don't require GeoDjango to load properly If it doesn't load, then the user doesn't need it. On some versions of Django, trying to import GeoDjango without having GDAL installed is an error. --- django_pandas/io.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index b91066f..eaa773c 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -4,8 +4,11 @@ from .utils import update_with_verbose, get_related_model import django from django.db.models import fields, ForeignKey -from django.contrib.gis.db.models import fields as geo_fields import numpy as np +try: + from django.contrib.gis.db.models import fields as geo_fields +except (ImportError, django.core.exceptions.ImproperlyConfigured): # pragma: no cover + geo_fields = None def to_fields(qs, fieldnames): @@ -67,12 +70,15 @@ def is_values_queryset(qs): # Django versions <= 2.0 are dropped. See # https://github.com/django/django/pull/8467 fields.NullBooleanField: object, - - # Geometry fields - geo_fields.GeometryField: object, - geo_fields.RasterField: object, } +if geo_fields is not None: + _FIELDS_TO_DTYPES.update({ + # Geometry fields + geo_fields.GeometryField: object, + geo_fields.RasterField: object, + }) + def _get_dtypes(fields_to_dtypes, fields, fieldnames): """Infer NumPy dtypes from field types among those named in fieldnames. From 19453d06d132d54123ca17dad8692853ac2806b9 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Fri, 13 Apr 2018 08:57:17 -0500 Subject: [PATCH 17/20] Use np.isnan rather than pd.isna pd.isna was added in version 0.21, but the Travis builds still use Pandas 0.20. 
np.isnan is already working in these tests, so that should fix the build failure. --- django_pandas/tests/test_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index 30ba8a5..352c35f 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -274,7 +274,7 @@ def test_compress_fk_nullable(self): self.assertEqual(df.symbol.dtype, np.dtype(float)) self.assertEqual(df.symbol_id.dtype, np.dtype(float)) self.assertCountEqual( - [None if pd.isna(x) else x for x in df.symbol], + [None if np.isnan(x) else x for x in df.symbol], qs.values_list('symbol_id', flat=True)) @expectedFailure @@ -290,7 +290,7 @@ def test_compress_fk_nullable_join(self): self.assertEqual(df.symbol.dtype, np.dtype(float)) self.assertEqual(df.symbol__id.dtype, np.dtype(float)) self.assertCountEqual( - [None if pd.isna(x) else x for x in df.symbol], + [None if np.isnan(x) else x for x in df.symbol], qs.values_list('symbol__id', flat=True)) def test_verbose(self): From 932852d4768f991be6eaef96857597de7017c82c Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Mon, 16 Apr 2018 15:35:35 -0500 Subject: [PATCH 18/20] Backward compatibility for collections.abc.Mapping Per request of @chrisdev --- django_pandas/io.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index eaa773c..cb6f31b 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -1,4 +1,7 @@ -from collections.abc import Mapping +try: + from collections.abc import Mapping +except ImportError: + Mapping = dict import pandas as pd from .utils import update_with_verbose, get_related_model From 28e9a97b5c4a4f4a674650bbcef71688370c55ed Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Mon, 16 Apr 2018 15:37:18 -0500 Subject: [PATCH 19/20] Do not do coverage analysis of backward compatibility --- django_pandas/io.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index cb6f31b..4ec61ae 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -1,6 +1,6 @@ try: from collections.abc import Mapping -except ImportError: +except ImportError: # pragma: no cover Mapping = dict import pandas as pd From b9bcec6e1f80c10da32747375cd0e403e0f40076 Mon Sep 17 00:00:00 2001 From: William Schwartz Date: Mon, 16 Apr 2018 17:58:59 -0500 Subject: [PATCH 20/20] Avoid allocating intermediate list if possible NumPy's fromiter can allow us to skip allocating an intermediate list to create the NumPy NDArray. However, it only works if none of the dtypes are for Python objects, numpy.dtype('O'). --- django_pandas/io.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/django_pandas/io.py b/django_pandas/io.py index 4ec61ae..3f922c5 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -289,7 +289,14 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, if not isinstance(compress, Mapping): compress = {} dtype = _get_dtypes(compress, fields, fieldnames) - recs = np.array(list(recs), dtype=dtype) + # As long as there are no object dtypes, we can avoid the intermediate + # list, but np.fromiter chokes on dtype('O'). + if np.dtype('O') in [dt[1] for dt in dtype]: # small list, set not needed + recs = np.array(list(recs), dtype=dtype) + else: + # Skip the count argument because qs.count() may take more time than + # just reallocating memory as NumPy consumes the iterator. + recs = np.fromiter(recs, dtype=dtype) df = pd.DataFrame.from_records(recs, columns=fieldnames, coerce_float=coerce_float)