Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 35 additions & 11 deletions json_tricks/decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,16 @@ def json_numpy_obj_hook(dct):
"""
if not isinstance(dct, dict):
return dct
if not '__ndarray__' in dct:
if '__ndarray__' not in dct:
return dct
if 'shape' not in dct or (dct['shape'] == [] and not dct.get('0dim', False)):
# New style scalar encoding
return _decode_numpy_scalar(dct)
else:
return _decode_ndarray(dct)


def _decode_ndarray(dct):
try:
import numpy
except ImportError:
Expand All @@ -297,7 +305,32 @@ def json_numpy_obj_hook(dct):
else:
return _lists_of_numbers_to_ndarray(data_json, order, shape, nptype)
else:
return _scalar_to_numpy(data_json, nptype)
# This code path is mostly for 0-dimensional arrays
# numpy scalars are separately decoded
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about scalars that were serialized with encode_scalars_inplace before these changes?

return numpy.asarray(
data_json,
dtype=nptype
).reshape(dct['shape'])


def _decode_numpy_scalar(dct):
try:
import numpy
except ImportError:
raise NoNumpyException('Trying to decode a map which appears to represent a numpy '
'scalar, but numpy appears not to be installed.')

# numpy.asarray will handle dtypes with units well (such as datetime64)
arr = numpy.asarray(dct['__ndarray__'], dtype=dct['dtype'])

# https://numpy.org/doc/stable/reference/arrays.scalars.html#indexing
# https://numpy.org/doc/stable/user/basics.indexing.html#detailed-notes
# > An empty (tuple) index is a full scalar index into a zero-dimensional
# array. x[()] returns a scalar if x is zero-dimensional and a view
# otherwise. On the other hand, x[...] always returns a view.

scalar = arr[()]
return scalar


def _bin_str_to_ndarray(data, order, shape, np_type_name, data_endianness):
Expand Down Expand Up @@ -354,15 +387,6 @@ def _lists_of_obj_to_ndarray(data, order, shape, dtype):
return arr


def _scalar_to_numpy(data, dtype):
"""
From scalar value to numpy type.
"""
import numpy as nptypes
dtype = getattr(nptypes, dtype)
return dtype(data)


def json_nonumpy_obj_hook(dct):
"""
This hook has no effect except to check if you're trying to decode numpy arrays without support, and give you a useful message.
Expand Down
258 changes: 252 additions & 6 deletions json_tricks/encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from fractions import Fraction
from functools import wraps
from json import JSONEncoder
from json.encoder import encode_basestring_ascii, encode_basestring, INFINITY
import sys

from .utils import hashodict, get_module_name_from_object, NoEnumException, NoPandasException, \
Expand Down Expand Up @@ -81,6 +82,54 @@ def default(self, obj, *args, **kwargs):
type(obj), self.__class__.__name__, ', '.join(str(encoder) for encoder in self.obj_encoders)))
return obj

def iterencode(self, o, _one_shot=False):
    """Serialize `o` lazily, yielding the JSON text piece by piece.

    Useful for streaming a large object without building the full
    string first, for example::

        for chunk in JSONEncoder().iterencode(bigobject):
            mysocket.write(chunk)

    """
    # A dict of seen object ids is only needed when circular checks are on.
    markers = {} if self.check_circular else None
    _encoder = encode_basestring_ascii if self.ensure_ascii else encode_basestring

    def floatstr(o, allow_nan=self.allow_nan,
            _repr=float.__repr__, _inf=INFINITY, _neginf=-INFINITY):
        # Detect the special values with plain comparisons only, so the
        # check does not rely on processor- or platform-specific internals.
        special = None
        if o != o:
            special = 'NaN'
        elif o == _inf:
            special = 'Infinity'
        elif o == _neginf:
            special = '-Infinity'

        if special is None:
            return _repr(o)
        if allow_nan:
            return special
        raise ValueError(
            "Out of range float values are not JSON compliant: " +
            repr(o))

    encode_chunks = _make_iterencode(
        markers, self.default, _encoder, self.indent, floatstr,
        self.key_separator, self.item_separator, self.sort_keys,
        self.skipkeys, _one_shot)
    return encode_chunks(o, 0)


def json_date_time_encode(obj, primitives=False):
"""
Expand Down Expand Up @@ -375,7 +424,9 @@ def numpy_encode(obj, primitives=False, properties=None):

:param primitives: If True, arrays are serialized as (nested) lists without meta info.
"""
from numpy import ndarray, generic
from numpy import ndarray, generic, datetime64

scalar_types = (generic, datetime64)

if isinstance(obj, ndarray):
if primitives:
Expand Down Expand Up @@ -407,17 +458,19 @@ def numpy_encode(obj, primitives=False, properties=None):
('__ndarray__', data_json),
('dtype', str(obj.dtype)),
('shape', obj.shape),
('0dim', obj.ndim == 0),
))
if len(obj.shape) > 1:
dct['Corder'] = obj.flags['C_CONTIGUOUS']
if use_compact and store_endianness != 'suppress':
dct['endian'] = store_endianness or sys.byteorder
return dct
elif isinstance(obj, generic):
if NumpyEncoder.SHOW_SCALAR_WARNING:
NumpyEncoder.SHOW_SCALAR_WARNING = False
warnings.warn('json-tricks: numpy scalar serialization is experimental and may work differently in future versions')
return obj.item()
elif isinstance(obj, scalar_types):
return hashodict((
('__ndarray__', obj.item()),
('dtype', str(obj.dtype)),
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This new approach seems great and indeed in an ideal world it would work perfectly.

...

However, it was found in issue #18 that Python sees some numpy scalars as primitives, and refuses to call encoders for them (presumably for performance).

Which ones depends on the Python version, for extra confusion, although I guess Python 2 is less important now.

In any case, this function in Python 3 works for a lot of types, but not for float64, which is an important one. There are two concerns with this:

  • While keeping the numpy scalar type is better in general, doing it half the time seems like it adds more confusion than it's worth.
  • We've been doing it this way, so it'd be a (slightly) breaking change to start doing it differently.

I think we'll need to think a bit more about this, maybe make it opt-in, or skip the scalars and do 0dimensional arrays only, if that's possible.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

omg, i feel so ashamed to not have tested float64....

I literally tested uint8 through uint64, int8 through int64, datetime64s, and float32.... but not float64....

for reference, this is the current output.

{                                                                                   
    "uint32": {                                                                     
        "__numpy_scalar__": 1,                                                      
        "dtype": "uint32"                                                           
    },                                                                              
    "int32": {                                                                      
        "__numpy_scalar__": 1,                                                      
        "dtype": "int32"                                                            
    },                                                                              
    "float32": {                                                                    
        "__numpy_scalar__": 1.0,                                                    
        "dtype": "float32"                                                          
    },                                                                                                                                                                                                                
    "float64": 1.0,                                                                 
    "datetime64[ns]": {                                                         
        "__numpy_scalar__": 1704235669639528000,                                    
        "dtype": "datetime64[ns]"                                                   
    },                                                                              
    "datetime64[us]": {                                                             
        "__numpy_scalar__": {                                                       
            "__datetime__": null,                                                   
            "year": 2024,                                                           
            "month": 1,                                                             
            "day": 2,                                                               
            "hour": 22,                                                             
            "minute": 47,                                                           
            "second": 49,                                                           
            "microsecond": 639546                                                   
        },                                                                          
        "dtype": "datetime64[us]"                                                   
    }                                                                               
}

('0dim', False),
))
return obj


Expand Down Expand Up @@ -476,3 +529,196 @@ def default(self, obj, *args, **kwargs):
warnings.warn('`NoNumpyEncoder` is deprecated, use `nonumpy_encode`', JsonTricksDeprecation)
obj = nonumpy_encode(obj)
return super(NoNumpyEncoder, self).default(obj, *args, **kwargs)

# Build and return the recursive `_iterencode(o, _current_indent_level)` generator
# that yields JSON text chunks for an object.
# NOTE(review): this looks adapted from CPython's json.encoder._make_iterencode, with
# the float check replaced by `isfloatinstance` -- confirm against the stdlib source.
#
# Parameters as used below:
#   markers          dict keyed by id() for circular-reference detection, or None to disable it
#   _default         called on objects that are not str/None/bool/int/float/list/tuple/dict
#   _encoder         turns a str into its JSON string literal
#   _indent          None, a str, or a non-str value multiplied into that many spaces
#   _floatstr        turns a float into its JSON text
#   _key_separator, _item_separator   separators emitted between key/value and between items
#   _sort_keys       if true, dict items are emitted in sorted order
#   _skipkeys        if true, unsupported dict keys are skipped instead of raising TypeError
#   _one_shot        accepted but not referenced in this function body
def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
_key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
## HACK: hand-optimized bytecode; turn globals into locals
ValueError=ValueError,
dict=dict,
float=float,
id=id,
int=int,
isinstance=isinstance,
list=list,
str=str,
tuple=tuple,
_intstr=int.__repr__,
):

# `isfloatinstance` decides which values are written directly through `_floatstr`.
# When numpy can be imported, float instances that are also `numpy.number` are
# excluded, so they fall through to the generic branches and end up in `_default`.
# Without numpy it is a plain `isinstance(obj, float)` check.
try:
import numpy
def isfloatinstance(obj):
return isinstance(obj, float) and not isinstance(obj, numpy.number)
except ImportError:
def isfloatinstance(obj):
return isinstance(obj, float)

# A non-str indent is turned into that many spaces.
if _indent is not None and not isinstance(_indent, str):
_indent = ' ' * _indent

# Yield the chunks for a list or tuple.
def _iterencode_list(lst, _current_indent_level):
if not lst:
yield '[]'
return
# Remember this container by id so that meeting it again while it is still
# being encoded is reported as a circular reference.
if markers is not None:
markerid = id(lst)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = lst
# `buf` holds the text that must precede the next item: the opening bracket
# (plus indent) for the first item, the separator for every later one.
buf = '['
if _indent is not None:
_current_indent_level += 1
newline_indent = '\n' + _indent * _current_indent_level
separator = _item_separator + newline_indent
buf += newline_indent
else:
newline_indent = None
separator = _item_separator
first = True
for value in lst:
if first:
first = False
else:
buf = separator
if isinstance(value, str):
yield buf + _encoder(value)
elif value is None:
yield buf + 'null'
elif value is True:
yield buf + 'true'
elif value is False:
yield buf + 'false'
elif isinstance(value, int):
# Subclasses of int/float may override __repr__, but we still
# want to encode them as integers/floats in JSON. One example
# within the standard library is IntEnum.
yield buf + _intstr(value)
elif isfloatinstance(value):
# see comment above for int
yield buf + _floatstr(value)
else:
# Not a primitive: emit the pending prefix, then delegate to the
# matching generator for the nested value.
yield buf
if isinstance(value, (list, tuple)):
chunks = _iterencode_list(value, _current_indent_level)
elif isinstance(value, dict):
chunks = _iterencode_dict(value, _current_indent_level)
else:
chunks = _iterencode(value, _current_indent_level)
yield from chunks
if newline_indent is not None:
_current_indent_level -= 1
yield '\n' + _indent * _current_indent_level
yield ']'
# Encoding of this container is finished; it may legitimately appear again.
if markers is not None:
del markers[markerid]

# Yield the chunks for a dict.
def _iterencode_dict(dct, _current_indent_level):
if not dct:
yield '{}'
return
if markers is not None:
markerid = id(dct)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = dct
yield '{'
if _indent is not None:
_current_indent_level += 1
newline_indent = '\n' + _indent * _current_indent_level
item_separator = _item_separator + newline_indent
yield newline_indent
else:
newline_indent = None
item_separator = _item_separator
first = True
if _sort_keys:
items = sorted(dct.items())
else:
items = dct.items()
for key, value in items:
# Convert the key to a str. Note that keys are tested with a plain
# isinstance(key, float), not with `isfloatinstance`.
if isinstance(key, str):
pass
# JavaScript is weakly typed for these, so it makes sense to
# also allow them. Many encoders seem to do something like this.
elif isinstance(key, float):
# see comment for int/float in _make_iterencode
key = _floatstr(key)
elif key is True:
key = 'true'
elif key is False:
key = 'false'
elif key is None:
key = 'null'
elif isinstance(key, int):
# see comment for int/float in _make_iterencode
key = _intstr(key)
elif _skipkeys:
continue
else:
raise TypeError(f'keys must be str, int, float, bool or None, '
f'not {key.__class__.__name__}')
if first:
first = False
else:
yield item_separator
yield _encoder(key)
yield _key_separator
if isinstance(value, str):
yield _encoder(value)
elif value is None:
yield 'null'
elif value is True:
yield 'true'
elif value is False:
yield 'false'
elif isinstance(value, int):
# see comment for int/float in _make_iterencode
yield _intstr(value)
elif isfloatinstance(value):
# see comment for int/float in _make_iterencode
yield _floatstr(value)
else:
if isinstance(value, (list, tuple)):
chunks = _iterencode_list(value, _current_indent_level)
elif isinstance(value, dict):
chunks = _iterencode_dict(value, _current_indent_level)
else:
chunks = _iterencode(value, _current_indent_level)
yield from chunks
if newline_indent is not None:
_current_indent_level -= 1
yield '\n' + _indent * _current_indent_level
yield '}'
if markers is not None:
del markers[markerid]

# Entry point: yield the chunks for any object, dispatching on its type.
def _iterencode(o, _current_indent_level):
if isinstance(o, str):
yield _encoder(o)
elif o is None:
yield 'null'
elif o is True:
yield 'true'
elif o is False:
yield 'false'
elif isinstance(o, int):
# see comment for int/float in _make_iterencode
yield _intstr(o)
elif isfloatinstance(o):
# see comment for int/float in _make_iterencode
yield _floatstr(o)
elif isinstance(o, (list, tuple)):
yield from _iterencode_list(o, _current_indent_level)
elif isinstance(o, dict):
yield from _iterencode_dict(o, _current_indent_level)
else:
# Unknown type: let `_default` convert it, then encode whatever it returned.
# The marker is keyed on the id of the object before conversion.
if markers is not None:
markerid = id(o)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = o
o = _default(o)
yield from _iterencode(o, _current_indent_level)
if markers is not None:
del markers[markerid]
return _iterencode
1 change: 1 addition & 0 deletions json_tricks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def get_scalar_repr(npscalar):
('__ndarray__', npscalar.item()),
('dtype', str(npscalar.dtype)),
('shape', ()),
('0dim', False),
))


Expand Down
Loading