-
Notifications
You must be signed in to change notification settings - Fork 25
Serialize scalars and 0-dimensional arrays #99
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ | |
| from fractions import Fraction | ||
| from functools import wraps | ||
| from json import JSONEncoder | ||
| from json.encoder import encode_basestring_ascii, encode_basestring, INFINITY | ||
| import sys | ||
|
|
||
| from .utils import hashodict, get_module_name_from_object, NoEnumException, NoPandasException, \ | ||
|
|
@@ -81,6 +82,54 @@ def default(self, obj, *args, **kwargs): | |
| type(obj), self.__class__.__name__, ', '.join(str(encoder) for encoder in self.obj_encoders))) | ||
| return obj | ||
|
|
||
| def iterencode(self, o, _one_shot=False): | ||
| """Encode the given object and yield each string | ||
| representation as available. | ||
|
|
||
| For example:: | ||
|
|
||
| for chunk in JSONEncoder().iterencode(bigobject): | ||
| mysocket.write(chunk) | ||
|
|
||
| """ | ||
| if self.check_circular: | ||
| markers = {} | ||
| else: | ||
| markers = None | ||
| if self.ensure_ascii: | ||
| _encoder = encode_basestring_ascii | ||
| else: | ||
| _encoder = encode_basestring | ||
|
|
||
| def floatstr(o, allow_nan=self.allow_nan, | ||
| _repr=float.__repr__, _inf=INFINITY, _neginf=-INFINITY): | ||
| # Check for specials. Note that this type of test is processor | ||
| # and/or platform-specific, so do tests which don't depend on the | ||
| # internals. | ||
|
|
||
| if o != o: | ||
| text = 'NaN' | ||
| elif o == _inf: | ||
| text = 'Infinity' | ||
| elif o == _neginf: | ||
| text = '-Infinity' | ||
| else: | ||
| return _repr(o) | ||
|
|
||
| if not allow_nan: | ||
| raise ValueError( | ||
| "Out of range float values are not JSON compliant: " + | ||
| repr(o)) | ||
|
|
||
| return text | ||
|
|
||
|
|
||
| _iterencode = _make_iterencode( | ||
| markers, self.default, _encoder, self.indent, floatstr, | ||
| self.key_separator, self.item_separator, self.sort_keys, | ||
| self.skipkeys, _one_shot) | ||
| return _iterencode(o, 0) | ||
|
|
||
|
|
||
| def json_date_time_encode(obj, primitives=False): | ||
| """ | ||
|
|
@@ -375,7 +424,9 @@ def numpy_encode(obj, primitives=False, properties=None): | |
|
|
||
| :param primitives: If True, arrays are serialized as (nested) lists without meta info. | ||
| """ | ||
| from numpy import ndarray, generic | ||
| from numpy import ndarray, generic, datetime64 | ||
|
|
||
| scalar_types = (generic, datetime64) | ||
|
|
||
| if isinstance(obj, ndarray): | ||
| if primitives: | ||
|
|
@@ -407,17 +458,19 @@ def numpy_encode(obj, primitives=False, properties=None): | |
| ('__ndarray__', data_json), | ||
| ('dtype', str(obj.dtype)), | ||
| ('shape', obj.shape), | ||
| ('0dim', obj.ndim == 0), | ||
| )) | ||
| if len(obj.shape) > 1: | ||
| dct['Corder'] = obj.flags['C_CONTIGUOUS'] | ||
| if use_compact and store_endianness != 'suppress': | ||
| dct['endian'] = store_endianness or sys.byteorder | ||
| return dct | ||
| elif isinstance(obj, generic): | ||
| if NumpyEncoder.SHOW_SCALAR_WARNING: | ||
| NumpyEncoder.SHOW_SCALAR_WARNING = False | ||
| warnings.warn('json-tricks: numpy scalar serialization is experimental and may work differently in future versions') | ||
| return obj.item() | ||
| elif isinstance(obj, scalar_types): | ||
| return hashodict(( | ||
| ('__ndarray__', obj.item()), | ||
| ('dtype', str(obj.dtype)), | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This new approach seems great, and indeed in an ideal world it would work perfectly. ... However, it was found in issue #18 that Python treats some numpy scalars as primitives and refuses to call encoders for them (presumably for performance). Which ones depends on the Python version, for extra confusion, although I guess Python 2 is less important now. In any case, this function in Python 3 works for a lot of types, but not for float64, which is an important one. There are two concerns with this:
I think we'll need to think a bit more about this: maybe make it opt-in, or skip the scalars and handle 0-dimensional arrays only, if that's possible.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Omg, I feel so ashamed not to have tested float64... I literally tested uint8 through uint64, int8 through int64, datetime64s, and float32... but not float64. For reference, this is the current output: {
"uint32": {
"__numpy_scalar__": 1,
"dtype": "uint32"
},
"int32": {
"__numpy_scalar__": 1,
"dtype": "int32"
},
"float32": {
"__numpy_scalar__": 1.0,
"dtype": "float32"
},
"float64": 1.0,
"datetime64[ns]": {
"__numpy_scalar__": 1704235669639528000,
"dtype": "datetime64[ns]"
},
"datetime64[us]": {
"__numpy_scalar__": {
"__datetime__": null,
"year": 2024,
"month": 1,
"day": 2,
"hour": 22,
"minute": 47,
"second": 49,
"microsecond": 639546
},
"dtype": "datetime64[us]"
}
} |
||
| ('0dim', False), | ||
| )) | ||
| return obj | ||
|
|
||
|
|
||
|
|
@@ -476,3 +529,196 @@ def default(self, obj, *args, **kwargs): | |
| warnings.warn('`NoNumpyEncoder` is deprecated, use `nonumpy_encode`', JsonTricksDeprecation) | ||
| obj = nonumpy_encode(obj) | ||
| return super(NoNumpyEncoder, self).default(obj, *args, **kwargs) | ||
|
|
||
| def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, | ||
| _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot, | ||
| ## HACK: hand-optimized bytecode; turn globals into locals | ||
| ValueError=ValueError, | ||
| dict=dict, | ||
| float=float, | ||
| id=id, | ||
| int=int, | ||
| isinstance=isinstance, | ||
| list=list, | ||
| str=str, | ||
| tuple=tuple, | ||
| _intstr=int.__repr__, | ||
| ): | ||
|
|
||
| try: | ||
| import numpy | ||
| def isfloatinstance(obj): | ||
| return isinstance(obj, float) and not isinstance(obj, numpy.number) | ||
| except ImportError: | ||
| def isfloatinstance(obj): | ||
| return isinstance(obj, float) | ||
|
|
||
| if _indent is not None and not isinstance(_indent, str): | ||
| _indent = ' ' * _indent | ||
|
|
||
| def _iterencode_list(lst, _current_indent_level): | ||
| if not lst: | ||
| yield '[]' | ||
| return | ||
| if markers is not None: | ||
| markerid = id(lst) | ||
| if markerid in markers: | ||
| raise ValueError("Circular reference detected") | ||
| markers[markerid] = lst | ||
| buf = '[' | ||
| if _indent is not None: | ||
| _current_indent_level += 1 | ||
| newline_indent = '\n' + _indent * _current_indent_level | ||
| separator = _item_separator + newline_indent | ||
| buf += newline_indent | ||
| else: | ||
| newline_indent = None | ||
| separator = _item_separator | ||
| first = True | ||
| for value in lst: | ||
| if first: | ||
| first = False | ||
| else: | ||
| buf = separator | ||
| if isinstance(value, str): | ||
| yield buf + _encoder(value) | ||
| elif value is None: | ||
| yield buf + 'null' | ||
| elif value is True: | ||
| yield buf + 'true' | ||
| elif value is False: | ||
| yield buf + 'false' | ||
| elif isinstance(value, int): | ||
| # Subclasses of int/float may override __repr__, but we still | ||
| # want to encode them as integers/floats in JSON. One example | ||
| # within the standard library is IntEnum. | ||
| yield buf + _intstr(value) | ||
| elif isfloatinstance(value): | ||
| # see comment above for int | ||
| yield buf + _floatstr(value) | ||
| else: | ||
| yield buf | ||
| if isinstance(value, (list, tuple)): | ||
| chunks = _iterencode_list(value, _current_indent_level) | ||
| elif isinstance(value, dict): | ||
| chunks = _iterencode_dict(value, _current_indent_level) | ||
| else: | ||
| chunks = _iterencode(value, _current_indent_level) | ||
| yield from chunks | ||
| if newline_indent is not None: | ||
| _current_indent_level -= 1 | ||
| yield '\n' + _indent * _current_indent_level | ||
| yield ']' | ||
| if markers is not None: | ||
| del markers[markerid] | ||
|
|
||
| def _iterencode_dict(dct, _current_indent_level): | ||
| if not dct: | ||
| yield '{}' | ||
| return | ||
| if markers is not None: | ||
| markerid = id(dct) | ||
| if markerid in markers: | ||
| raise ValueError("Circular reference detected") | ||
| markers[markerid] = dct | ||
| yield '{' | ||
| if _indent is not None: | ||
| _current_indent_level += 1 | ||
| newline_indent = '\n' + _indent * _current_indent_level | ||
| item_separator = _item_separator + newline_indent | ||
| yield newline_indent | ||
| else: | ||
| newline_indent = None | ||
| item_separator = _item_separator | ||
| first = True | ||
| if _sort_keys: | ||
| items = sorted(dct.items()) | ||
| else: | ||
| items = dct.items() | ||
| for key, value in items: | ||
| if isinstance(key, str): | ||
| pass | ||
| # JavaScript is weakly typed for these, so it makes sense to | ||
| # also allow them. Many encoders seem to do something like this. | ||
| elif isinstance(key, float): | ||
| # see comment for int/float in _make_iterencode | ||
| key = _floatstr(key) | ||
| elif key is True: | ||
| key = 'true' | ||
| elif key is False: | ||
| key = 'false' | ||
| elif key is None: | ||
| key = 'null' | ||
| elif isinstance(key, int): | ||
| # see comment for int/float in _make_iterencode | ||
| key = _intstr(key) | ||
| elif _skipkeys: | ||
| continue | ||
| else: | ||
| raise TypeError(f'keys must be str, int, float, bool or None, ' | ||
| f'not {key.__class__.__name__}') | ||
| if first: | ||
| first = False | ||
| else: | ||
| yield item_separator | ||
| yield _encoder(key) | ||
| yield _key_separator | ||
| if isinstance(value, str): | ||
| yield _encoder(value) | ||
| elif value is None: | ||
| yield 'null' | ||
| elif value is True: | ||
| yield 'true' | ||
| elif value is False: | ||
| yield 'false' | ||
| elif isinstance(value, int): | ||
| # see comment for int/float in _make_iterencode | ||
| yield _intstr(value) | ||
| elif isfloatinstance(value): | ||
| # see comment for int/float in _make_iterencode | ||
| yield _floatstr(value) | ||
| else: | ||
| if isinstance(value, (list, tuple)): | ||
| chunks = _iterencode_list(value, _current_indent_level) | ||
| elif isinstance(value, dict): | ||
| chunks = _iterencode_dict(value, _current_indent_level) | ||
| else: | ||
| chunks = _iterencode(value, _current_indent_level) | ||
| yield from chunks | ||
| if newline_indent is not None: | ||
| _current_indent_level -= 1 | ||
| yield '\n' + _indent * _current_indent_level | ||
| yield '}' | ||
| if markers is not None: | ||
| del markers[markerid] | ||
|
|
||
| def _iterencode(o, _current_indent_level): | ||
| if isinstance(o, str): | ||
| yield _encoder(o) | ||
| elif o is None: | ||
| yield 'null' | ||
| elif o is True: | ||
| yield 'true' | ||
| elif o is False: | ||
| yield 'false' | ||
| elif isinstance(o, int): | ||
| # see comment for int/float in _make_iterencode | ||
| yield _intstr(o) | ||
| elif isfloatinstance(o): | ||
| # see comment for int/float in _make_iterencode | ||
| yield _floatstr(o) | ||
| elif isinstance(o, (list, tuple)): | ||
| yield from _iterencode_list(o, _current_indent_level) | ||
| elif isinstance(o, dict): | ||
| yield from _iterencode_dict(o, _current_indent_level) | ||
| else: | ||
| if markers is not None: | ||
| markerid = id(o) | ||
| if markerid in markers: | ||
| raise ValueError("Circular reference detected") | ||
| markers[markerid] = o | ||
| o = _default(o) | ||
| yield from _iterencode(o, _current_indent_level) | ||
| if markers is not None: | ||
| del markers[markerid] | ||
| return _iterencode | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about scalars that were serialized with
`encode_scalars_inplace` before these changes?