112 lines
4.0 KiB
Python
112 lines
4.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
from __future__ import absolute_import, print_function, unicode_literals
|
|
|
|
from typing import Any, Dict
|
|
|
|
import msgpack
|
|
import sys
|
|
|
|
|
|
def _trim_dict_in_dict(data: Dict, max_val_size: int, replace_with: Any):
|
|
"""
|
|
Takes a dictionary, max_val_size and replace_with
|
|
and recursively loops through and replaces any values
|
|
that are greater than max_val_size.
|
|
"""
|
|
for key in data:
|
|
if isinstance(data[key], dict):
|
|
_trim_dict_in_dict(data[key], max_val_size, replace_with)
|
|
else:
|
|
if sys.getsizeof(data[key]) > max_val_size:
|
|
data[key] = replace_with
|
|
|
|
|
|
def trim_dict(
    data: Any,
    max_dict_bytes: int,
    percent: float = 50.0,
    stepper_size: int = 10,
    replace_with: str = "VALUE_TRIMMED",
    is_msgpacked: bool = False,
    use_bin_type: bool = False,
):
    """
    Takes a dictionary and iterates over its keys, looking for
    large values and replacing them with a trimmed string.

    The first pass replaces every value larger than ``percent`` percent
    of ``max_dict_bytes``. If the dictionary is still too large, the
    percentage is lowered by ``stepper_size`` and the dictionary is
    rescanned. This allows for progressive scanning, removing large items
    first and only making additional passes for smaller items if necessary.
    Trimming stops as soon as the serialized dictionary fits, or when the
    threshold can no longer be lowered (it would reach zero or below, or
    ``stepper_size`` is not positive). In the latter case the result is
    returned on a best-effort basis and may still exceed ``max_dict_bytes``.

    The overall size is measured by serializing the dictionary with
    msgpack and calling ``sys.getsizeof()`` on the result, because
    ``sys.getsizeof()`` on the dictionary itself does not account for the
    objects it references.

    Ex:
    >>> dict_tools.trim.trim_dict({'a': 'b', 'c': 'x' * 10000}, 100)
    {'a': 'b', 'c': 'VALUE_TRIMMED'}

    To improve performance, it is advisable to pass in msgpacked
    data structures instead of raw dictionaries. If a msgpack
    structure is passed in, it will not be unserialized unless
    necessary.

    If a msgpack is passed in, it will be repacked if necessary
    before being returned.

    :param data: The dictionary to trim, or its msgpack serialization
                 when ``is_msgpacked`` is true. A dictionary passed in
                 directly is modified in place.
    :param max_dict_bytes: Target size for the serialized dictionary.
    :param percent: Initial per-value threshold, as a percentage of
                    ``max_dict_bytes``.
    :param stepper_size: Amount subtracted from ``percent`` after each pass.
    :param replace_with: Value substituted for every trimmed value.
    :param is_msgpacked: Set this to true if ``data`` is already msgpack
                         serialized.
    :param use_bin_type: Set this to true if "is_msgpacked=True"
                         and the msgpack data has been encoded
                         with "use_bin_type=True". The data is then
                         decoded with "raw=False" (strings come back
                         as ``str``) and re-encoded with
                         "use_bin_type=True".
    :return: ``data`` unchanged if it already fits; otherwise the trimmed
             dictionary, or its msgpack serialization when
             ``is_msgpacked`` is true.
    :raises ValueError: If msgpack still cannot serialize the dictionary
                        on the final pass.
    """
    if is_msgpacked:
        dict_size = sys.getsizeof(data)
    else:
        dict_size = sys.getsizeof(msgpack.dumps(data))

    if dict_size <= max_dict_bytes:
        # Already small enough: hand the input back untouched so packed
        # input is neither unpacked nor repacked.
        return data

    if is_msgpacked:
        if use_bin_type:
            # ``encoding=`` was removed from msgpack's unpacker in 1.0;
            # ``raw=False`` is the supported way to decode strings as UTF-8.
            data = msgpack.loads(data, raw=False)
        else:
            data = msgpack.loads(data)

    percent = float(percent)
    while True:
        max_val_size = max_dict_bytes * (percent / 100)
        _trim_dict_in_dict(data, max_val_size, replace_with)

        percent -= stepper_size
        # No further pass is useful once the next threshold would be zero
        # or negative, or when the step does not lower the threshold at all.
        # Testing ``<= 0`` instead of ``== 0`` guarantees termination even
        # when ``percent`` is not a multiple of ``stepper_size``.
        last_pass = stepper_size <= 0 or max_dict_bytes * (percent / 100) <= 0

        try:
            if use_bin_type:
                dump_data = msgpack.dumps(data, use_bin_type=True)
            else:
                dump_data = msgpack.dumps(data)
        except ValueError:
            # Best effort: a more aggressive pass may remove the offending
            # value, but never retry forever.
            if last_pass:
                raise
            continue

        # ``<=`` matches the acceptance test applied to the untrimmed input.
        if last_pass or sys.getsizeof(dump_data) <= max_dict_bytes:
            # Repack only if the caller handed us packed data.
            return dump_data if is_msgpacked else data
|