diff --git a/AUTHORS.rst b/AUTHORS.rst index f5503bb..7b91dc0 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -23,5 +23,6 @@ Contributions - `Yousuf Jawwad `_ - `@henhuy `_ - `Hélio Meira Lins `_ +- `William Schwartz `_ - `@utpyngo `_ diff --git a/CHANGES.rst b/CHANGES.rst index bed98fd..98b8c8d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -3,6 +3,10 @@ CHANGES 0.5.1 (2018-01-) ----------------- - Address Unicode decode error when installing with pip3 on docker (Thanks @utapyngo) +- Fix `#63 `_: Use memory + efficient iteration in ``read_frame`` (by @wkschwartz) +- Add ``compress`` argument to ``read_frame`` to infer NumPy data types for the + returned data frame's columns from the Django field types (by @wkschwartz) 0.5.0 (2018-01-20) ------------------ diff --git a/README.rst b/README.rst index ecdb18e..83c0877 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,7 @@ Contributors * `@henhuy `_ * `Hélio Meira Lins `_ * `@utpyngo `_ +* `William Schwartz `_ What's New =========== @@ -119,6 +120,23 @@ read_frame human readable versions of any foreign key or choice fields else use the actual values set in the model. + - compress: a false value, ``True``, or a mapping, default False + If a true value, infer `NumPy data types + `_ for + Pandas dataframe columns from the corresponding Django field + types. For example, Django's built in ``SmallIntegerField`` is + cast to NumPy's ``int16``. If ``compress`` is a mapping (e.g., a + ``dict``), it should be a mapping with Django field subclasses + as keys and `NumPy dtypes + `_ + as values. This mapping overrides the defaults for the field + classes appearing in the mapping. However, the inference is + based on the field subclass lowest on a chain of subclasses, + that is, in order of inheritance. To override + ``SmallIntegerField`` it is therefore not sufficient to override + ``IntegerField``. 
Careful of setting ``compress={}`` because + ``{}`` is a false value in Python, which would cause + ``read_frame`` not to compress columns. Examples ^^^^^^^^^ diff --git a/django_pandas/io.py b/django_pandas/io.py index 41a75e6..3f922c5 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -1,6 +1,17 @@ +try: + from collections.abc import Mapping +except ImportError: # pragma: no cover + Mapping = dict + import pandas as pd from .utils import update_with_verbose, get_related_model import django +from django.db.models import fields, ForeignKey +import numpy as np +try: + from django.contrib.gis.db.models import fields as geo_fields +except (ImportError, django.core.exceptions.ImproperlyConfigured): # pragma: no cover + geo_fields = None def to_fields(qs, fieldnames): @@ -32,14 +43,140 @@ def is_values_queryset(qs): return qs._iterable_class == django.db.models.query.ValuesIterable +_FIELDS_TO_DTYPES = { + fields.AutoField: np.dtype(np.int32), + fields.BigAutoField: np.dtype(np.int64), + fields.BigIntegerField: np.dtype(np.int64), + fields.BinaryField: object, # Pandas has no bytes type + fields.BooleanField: np.dtype(np.bool_), + fields.CharField: object, # Pandas has no str type + fields.DateField: np.dtype('datetime64[D]'), + fields.DateTimeField: np.dtype('datetime64[us]'), + fields.DecimalField: object, + fields.DurationField: np.dtype('timedelta64[us]'), + fields.EmailField: object, + fields.FilePathField: object, + fields.FloatField: np.dtype(np.float64), + fields.GenericIPAddressField: object, + fields.IntegerField: np.dtype(np.int32), + fields.PositiveIntegerField: np.dtype(np.uint32), + fields.PositiveSmallIntegerField: np.dtype(np.uint16), + fields.SlugField: object, + fields.SmallIntegerField: np.dtype(np.int16), + fields.TextField: object, + fields.TimeField: object, + fields.URLField: object, + fields.UUIDField: object, + + # https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data-casting-rules-and-indexing + # Explicitly 
setting NullBooleanField here can be removed when support for + # Django versions <= 2.0 is dropped. See + # https://github.com/django/django/pull/8467 + fields.NullBooleanField: object, +} + +if geo_fields is not None: + _FIELDS_TO_DTYPES.update({ + # Geometry fields + geo_fields.GeometryField: object, + geo_fields.RasterField: object, + }) + +def _get_dtypes(fields_to_dtypes, fields, fieldnames): + """Infer NumPy dtypes from field types among those named in fieldnames. + + Returns a list of (fieldname, NumPy dtype) pairs. Read about NumPy dtypes + here [#]_ and here [#]_. The returned list can be passed to ``numpy.array`` + in ``read_frame``. + + Parameters + ---------- + + fields_to_dtypes : mapping + A (potentially empty) mapping of Django field classes to NumPy dtypes. + This mapping overrides the defaults from ``_FIELDS_TO_DTYPES``. The + back-up default dtype is ``object`` for unfamiliar field classes. + + fields : list of Django field class instances + They must correspond in order to the columns of the dataframe that + ``read_frame`` is building. + + fieldnames : iterable of names of the fields as they will appear in the data + frame + + .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html + .. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html + """ + dtypes = [] + f2d = _FIELDS_TO_DTYPES.copy() + f2d.update(fields_to_dtypes) + for k, v in f2d.items(): + if not issubclass(k, django.db.models.fields.Field): + raise TypeError('Expected a type of field, not {!r}'.format(k)) + if not isinstance(v, np.dtype): + f2d[k] = np.dtype(v) + for field, name in zip(fields, fieldnames): + # Get field.null before switching to target field since foreign key can + # be nullable even while the target isn't, and vice versa. 
+ nullable = field.null + if isinstance(field, ForeignKey): + field = field.target_field + nullable = nullable or field.null + + # Find the lowest subclass among the keys of f2d + t, dtype = object, np.generic + for k, v in f2d.items(): + if isinstance(field, k) and issubclass(k, t): + t, dtype = k, v + + # Handle nulls for integer and boolean types + if nullable and issubclass(dtype.type, (np.bool_, bool)): + # Pandas handles nullable booleans as objects. See + # https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data-casting-rules-and-indexing + # Not needed until Django 2.1. See + # https://github.com/django/django/pull/8467 + dtype = np.object_ + elif nullable and issubclass(dtype.type, (np.integer, int)): + # dtype.itemsize is denominated in bytes. Check it against the + # number of mantissa bits since the max exact integer is + # 2**(mantissa bits): + # >>> 2**sys.float_info.mant_dig - 1 == int(float(2**sys.float_info.mant_dig - 1)) + # True + # >>> 2**sys.float_info.mant_dig == int(float(2**sys.float_info.mant_dig)) + # True + # >>> 2**sys.float_info.mant_dig + 1 == int(float(2**sys.float_info.mant_dig + 1)) + # False + # Thus the integer needs to fit into ((mantissa bits) - 1) bits + # https://docs.scipy.org/doc/numpy-dev/user/basics.types.html + def fits(itype, ftype): + return np.iinfo(itype).bits <= (np.finfo(ftype).nmant - 1) + if fits(dtype, np.float16): + dtype = np.float16 + elif fits(dtype, np.float32): + dtype = np.float32 + elif fits(dtype, np.float64): + dtype = np.float64 + elif fits(dtype, np.longdouble): + dtype = np.longdouble + else: + dtype = np.object_ + + dtypes.append((name, dtype)) + return dtypes + + def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, - verbose=True): + verbose=True, compress=False): """ Returns a dataframe from a QuerySet Optionally specify the field names/columns to utilize and a field as the index + This function uses the QuerySet's ``iterator`` method, so it does not + populate 
the QuerySet's cache. This is more memory efficient in the typical + case where you do not use the QuerySet after ``read_frame``. + Parameters ---------- @@ -58,6 +195,7 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, coerce_float : boolean, default False Attempt to convert values to non-string, non-numeric data (like decimal.Decimal) to floating point, useful for SQL result sets + Does not work with ``compress``. verbose: boolean If this is ``True`` then populate the DataFrame with the human readable versions of any foreign key fields else use @@ -65,8 +203,41 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, The human readable version of the foreign key field is defined in the ``__unicode__`` or ``__str__`` methods of the related class definition - """ + compress: a false value, ``True``, or a mapping, default False + If a true value, infer NumPy data types [#]_ for Pandas dataframe + columns from the corresponding Django field types. For example, Django's + built in ``SmallIntgerField`` is cast to NumPy's ``int16``. If + ``compress`` is a mapping (e.g., a ``dict``), it should be a mapping + with Django field subclasses as keys and NumPy dtypes [#]_ as values. + This mapping overrides the defaults for the field classes appearing in + the mapping. However, the inference is based on the field subclass + lowest on a chain of subclasses, that is, in order of inheritance. + To override ``SmallIntegerField`` it is therefore not sufficient to + override ``IntegerField``. Careful of setting ``compress={}`` because + ``{}`` is a false value in Python, which would cause ``read_frame`` + not to compress columns. + + Does not work with ``coerce_float``. + + Known Issues + ------------ + + When using ``compress=True`` with a nullable foreign key field the double- + underscore import name may not work but the single-underscore import name + should. 
For example, suppose model ``A`` has a nullable foreign key field + ``b`` pointing at model ``B``, both of which models' primary key fields are + called ``id``. Suppose further that ``A``'s table has some entries with + null values of ``b`` and some with non-null values. + ``read_frame(A.objects.all(), ['b', 'b_id'])`` and + ``read_frame(A.objects.filter(b__isnull=False), ['b__id'])`` will work as + expected, but ``read_frame(A.objects.all(), ['b__id'])`` will not. + + .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html + .. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html + """ + if coerce_float and compress: + raise ValueError('Cannot use coerce_float and compress at the same time') if fieldnames: fieldnames = pd.unique(fieldnames) if index_col is not None and index_col not in fieldnames: @@ -108,11 +279,24 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False, fields = qs.model._meta.fields fieldnames = [f.name for f in fields] - if is_values_queryset(qs): - recs = list(qs) - else: - recs = list(qs.values_list(*fieldnames)) + if not issubclass(qs._iterable_class, django.db.models.query.ValuesListIterable): + qs = qs.values_list(*fieldnames) + recs = qs.iterator() + if compress: + if not isinstance(compress, (bool, Mapping)): + raise TypeError('Ambiguous compress argument: {!r}'.format(compress)) + if not isinstance(compress, Mapping): + compress = {} + dtype = _get_dtypes(compress, fields, fieldnames) + # As long as there are no object dtypes, we can avoid the intermediate + # list, but np.fromiter chokes on dtype('O'). + if np.dtype('O') in [dt[1] for dt in dtype]: # small list, set not needed + recs = np.array(list(recs), dtype=dtype) + else: + # Skip the count argument because qs.count() may take more time than + # just reallocating memory as NumPy consumes the iterator. 
+ recs = np.fromiter(recs, dtype=dtype) df = pd.DataFrame.from_records(recs, columns=fieldnames, coerce_float=coerce_float) diff --git a/django_pandas/tests/models.py b/django_pandas/tests/models.py index 95025df..8e19b9f 100644 --- a/django_pandas/tests/models.py +++ b/django_pandas/tests/models.py @@ -1,3 +1,7 @@ +import datetime as dt +from decimal import Decimal +from uuid import UUID + from django.db import models from django.utils.encoding import python_2_unicode_compatible from django_pandas.managers import DataFrameManager, PassThroughManager @@ -9,7 +13,7 @@ class MyModel(models.Model): col1 = models.IntegerField() col2 = models.FloatField(null=True) col3 = models.FloatField(null=True) - col4 = models.IntegerField() + col4 = models.SmallIntegerField() def __str__(self): return "{} {} {} {}".format( @@ -32,6 +36,53 @@ class MyModelChoice(models.Model): objects = DataFrameManager() +class ByteField(models.SmallIntegerField): + pass + +class CompressableModel(models.Model): + # Can only have one auto field per model and id is added automatically + # id = models.AutoField(primary_key=True) + # bigauto = models.BigAutoField() + + bigint = models.BigIntegerField(default=2**63 - 1) + binary = models.BinaryField(default=b'test bytes') + boolean = models.BooleanField(default=True) + char = models.CharField(max_length=10, default='test chars') + date = models.DateField(default=dt.date(2018, 3, 27)) + datetime = models.DateTimeField(default=dt.datetime(2018, 3, 27, 13, 55, 56)) + decimal = models.DecimalField(decimal_places=1, max_digits=3, default=Decimal(1.5)) + duration = models.DurationField(default=dt.timedelta(minutes=1, seconds=1)) + email = models.EmailField(default="an+email@address.com") + filepath = models.FilePathField(default="/usr/local/bin/python") + floating = models.FloatField(default=1.2) + ip = models.GenericIPAddressField(default="::ffff:192.0.2.1") + integer = models.IntegerField(default=2**31 - 1) + nullboolean = 
models.NullBooleanField(default=None) + uint = models.PositiveIntegerField(default=2**31 - 1) + ushort = models.PositiveSmallIntegerField(default=2**15 - 1) + slug = models.SlugField(default="test_slug") + short = models.SmallIntegerField(default=-(2**15 - 1)) + text = models.TextField(default="test text") + time = models.TimeField(default=dt.time(13, 55, 56)) + url = models.URLField(default="https://github.com/chrisdev/django-pandas") + uuid = models.UUIDField(default=UUID(int=1234556789)) + + # Custom field + byte = ByteField(default=127) + + +class CompressableModelWithNulls(models.Model): + bigint = models.BigIntegerField(null=True, default=None) + floating = models.FloatField(null=True, default=None) + integer = models.IntegerField(null=True, default=None) + nullboolean = models.NullBooleanField(null=True, default=None) + uint = models.PositiveIntegerField(null=True, default=None) + ushort = models.PositiveSmallIntegerField(null=True, default=None) + short = models.SmallIntegerField(null=True, default=None) + # Custom field + byte = ByteField(null=True, default=None) + + @python_2_unicode_compatible class DataFrame(models.Model): diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index a5dee0e..352c35f 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -1,9 +1,13 @@ +import sys +from unittest import expectedFailure + from django.test import TestCase import django from django.db.models import Sum import pandas as pd import numpy as np from .models import MyModel, Trader, Security, TradeLog, TradeLogNote, MyModelChoice, Portfolio +from django_pandas.tests import models from django_pandas.io import read_frame @@ -46,6 +50,113 @@ def test_basic(self): df1 = read_frame(qs, ['col1', 'col2']) self.assertEqual(df1.shape, (qs.count(), 2)) + def assert_compress_basic(self, qs): + df = read_frame(qs, compress=True) + + # Test automatic inference of dtypes + self.assertEqual(df.col1.dtype, np.dtype('int32')) + 
self.assertEqual(df.col2.dtype, np.dtype('float_')) + self.assertEqual(df.col3.dtype, np.dtype('float_')) + self.assertEqual(df.col4.dtype, np.dtype('int16')) + + # Compress should use less memory + self.assertLess(df.memory_usage(deep=True).sum(), read_frame(qs).memory_usage(deep=True).sum()) + # Uses qs.iterator() rather than for x in qs. + self.assertFalse(qs._result_cache) + + def test_compress_basic(self): + qs = MyModel.objects.all() + self.assert_compress_basic(qs) + self.assert_compress_basic(qs.values()) + self.assert_compress_basic(qs.values_list()) + + def test_compress_bad_argument(self): + qs = MyModel.objects.all() + bads = [(models.ByteField, np.int8), range(3), type, object(), 'a', 1., + {'IntegerField': int}, {int: models.ByteField}, + {models.ByteField: 'asdf'}] + for bad in bads: + self.assertRaises(TypeError, read_frame, qs, compress=bad) + + self.assertRaises( + ValueError, read_frame, qs, compress=True, coerce_float=True) + + def assert_default_compressable(self, df): + for field in models.CompressableModel._meta.get_fields(): + if field.name == 'id': + self.assertEqual(df['id'][0], 1) + self.assertIs(df['id'].dtype, np.dtype('int32')) + elif field.name == 'date': + self.assertEqual(df['date'][0].to_pydatetime().date(), field.default) + elif field.name == 'datetime': + self.assertEqual(df['datetime'][0].to_pydatetime(), field.default) + elif field.name == 'duration': + self.assertEqual(df['duration'][0].to_pytimedelta(), field.default) + elif field.name == 'nullboolean': + self.assertEqual(df['nullboolean'].dtype, np.object_) + self.assertIsNone(df['nullboolean'][0]) + elif isinstance(field.default, (str, bytes)): + self.assertEqual(df[field.name].dtype, np.dtype(object)) + else: + msg = 'Expected {} to have value {!r}, but was {!r}'.format( + field.name, field.default, df[field.name][0]) + self.assertEqual(df[field.name][0], field.default, msg) + + def test_compress_custom_field(self): + models.CompressableModel().save() + qs = 
models.CompressableModel.objects.all() + + # Specify a custom dtype for the custom field + df1 = read_frame(qs, compress={models.ByteField: np.int8}) + self.assert_default_compressable(df1) + self.assertEqual(df1.byte.dtype, np.int8) + + # Rely on finding the minimum specified parent class + df2 = read_frame(qs, compress=True) + self.assert_default_compressable(df2) + self.assertEqual(df2.uint.dtype, np.uint32) + self.assertEqual(df2.byte.dtype, np.int16) + + # Memory usage is ordered as df1 < df2 < read_frame(qs, compress=False) + self.assertLess(df2.memory_usage(deep=True).sum(), read_frame(qs).memory_usage(deep=True).sum()) + self.assertLess(df1.memory_usage(deep=True).sum(), df2.memory_usage(deep=True).sum()) + # Uses qs.iterator() rather than for x in qs. + self.assertFalse(qs._result_cache) + + def test_compress_nulls(self): + maxs = dict(bigint=np.iinfo(np.int64).max, floating=sys.float_info.max, + integer=np.iinfo(np.int32).max, nullboolean=True, + uint=np.iinfo(np.uint32).max, ushort=np.iinfo(np.uint16).max, + short=np.iinfo(np.int16).max, byte=np.iinfo(np.int8).max) + mins = dict(bigint=np.iinfo(np.int64).min, floating=sys.float_info.min, + integer=np.iinfo(np.int32).min, nullboolean=True, + uint=np.iinfo(np.uint32).min, ushort=np.iinfo(np.uint16).min, + short=np.iinfo(np.int16).min, byte=np.iinfo(np.int8).min) + dbmaxs = models.CompressableModelWithNulls(**maxs) + dbmaxs.save() + dbnulls = models.CompressableModelWithNulls() + dbnulls.save() + dbmins = models.CompressableModelWithNulls(**mins) + dbmins.save() + qs = models.CompressableModelWithNulls.objects.all() + df1 = read_frame(qs, compress={models.ByteField: np.int8}) + + self.assertEqual(df1.bigint.dtype, np.object_) + self.assertEqual(df1.floating.dtype, np.float_) + self.assertEqual(df1.integer.dtype, np.float64) + self.assertEqual(df1.nullboolean.dtype, np.object_) + self.assertEqual(df1.uint.dtype, np.float64) + self.assertEqual(df1.ushort.dtype, np.float32) + self.assertEqual(df1.short.dtype, 
np.float32) + self.assertEqual(df1.byte.dtype, np.float16) + + for col in df1.columns: + if col == 'id': + continue + self.assertEqual(df1[col][0], maxs[col]) + self.assertTrue(df1[col][1] is None or np.isnan(df1[col][1])) + self.assertEqual(df1[col][2], mins[col]) + def test_values(self): qs = MyModel.objects.all() qs = qs.extra(select={"ecol1": "col1+1"}) @@ -137,6 +248,51 @@ def setUp(self): growth = Portfolio.objects.create(name="Fund 2") growth.securities.add(abc) + def test_compress_fk(self): + qs = TradeLog.objects.all() + # trader and trader_id are both the foreign key id column on TradeLog. + # trader__id is the id column on Trader via a JOIN. + cols = ['trader', 'trader_id', 'trader__id'] + df = read_frame(qs, cols, verbose=False, compress=True) + + self.assertEqual(df.shape, (qs.count(), len(cols))) + self.assertTrue(df.trader.equals(df.trader__id)) + self.assertTrue(df.trader_id.equals(df.trader__id)) + self.assertEqual(df.trader.dtype, np.dtype('int32')) + self.assertEqual(df.trader_id.dtype, np.dtype('int32')) + self.assertEqual(df.trader__id.dtype, np.dtype('int32')) + self.assertCountEqual( + df.trader_id, qs.values_list('trader_id', flat=True)) + + def test_compress_fk_nullable(self): + qs = TradeLog.objects.all() + cols = ['symbol', 'symbol_id'] + df = read_frame(qs, cols, verbose=False, compress=True) + + self.assertEqual(df.shape, (qs.count(), len(cols))) + self.assertTrue(df.symbol.equals(df.symbol_id)) + self.assertEqual(df.symbol.dtype, np.dtype(float)) + self.assertEqual(df.symbol_id.dtype, np.dtype(float)) + self.assertCountEqual( + [None if np.isnan(x) else x for x in df.symbol], + qs.values_list('symbol_id', flat=True)) + + @expectedFailure + def test_compress_fk_nullable_join(self): + qs = TradeLog.objects.all() + # symbol is the foreign key id column on TradeLog. symbol__id is the id + # column on Security via a JOIN. 
+ cols = ['symbol', 'symbol__id'] + df = read_frame(qs, cols, verbose=False, compress=True) + + self.assertEqual(df.shape, (qs.count(), len(cols))) + self.assertTrue(df.symbol.equals(df.symbol__id)) + self.assertEqual(df.symbol.dtype, np.dtype(float)) + self.assertEqual(df.symbol__id.dtype, np.dtype(float)) + self.assertCountEqual( + [None if np.isnan(x) else x for x in df.symbol], + qs.values_list('symbol__id', flat=True)) + def test_verbose(self): qs = TradeLog.objects.all() df = read_frame(qs, verbose=True) diff --git a/runtests.py b/runtests.py index c8b35c2..b6aaef3 100755 --- a/runtests.py +++ b/runtests.py @@ -24,7 +24,12 @@ "PORT": "", } }, - MIDDLEWARE_CLASSES = () + MIDDLEWARE_CLASSES = (), + # django_pandas.tests.models.CompressableModel contains datetime fields + # with fixed default values instead of auto-now defaults. This makes + # testing easier, but triggers the fields.W161 system check. The + # following setting ignores that system check. + SILENCED_SYSTEM_CHECKS=['fields.W161'], ) settings.configure(**settings_dict)