Skip to content

Commit

Permalink
minor improvements to formatting.
Browse files Browse the repository at this point in the history
  • Loading branch information
root-11 committed Dec 6, 2023
1 parent 50f5816 commit 7167800
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 27 deletions.
4 changes: 3 additions & 1 deletion tablite/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def synthetic_order_data(rows=100_000):
"""Creates a synthetic dataset for testing that looks like this:
(depending on number of rows)
```
+=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+
| ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
| row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |
Expand All @@ -30,12 +31,13 @@ def synthetic_order_data(rows=100_000):
|7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|
|7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|
+=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+
```
Args:
rows (int, optional): number of rows wanted. Defaults to 100_000.
Returns:
Table: Populated table.
Table (Table): Populated table.
""" # noqa
rows = int(rows)

Expand Down
99 changes: 73 additions & 26 deletions tablite/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import defaultdict, Counter
import numpy as np
import pickle
from typing import Any


class DataTypes(object):
Expand Down Expand Up @@ -214,6 +215,7 @@ def from_type_code(cls, value, code):
"NNNNNNNN": lambda x: date(*(int(x[:4]), int(x[4:6]), int(x[6:]))),
}

# fmt:off
datetime_formats = {
# Note: Only recognised ISO8601 formats are accepted.
# year first
Expand Down Expand Up @@ -261,6 +263,7 @@ def from_type_code(cls, value, code):
# compact formats - type 3
"NNNNNNNNTNN:NN:NN": lambda x: DataTypes.pattern_to_datetime(x, compact=3),
}
# fmt:on

@staticmethod
def pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False):
Expand Down Expand Up @@ -310,7 +313,9 @@ def pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False
if 0 < dot < 10:
ix = len(iso_string) - dot
microsecond = int(float(f"0{iso_string[ix - 1:]}") * 10**6)
# fmt:off
iso_string = iso_string[: len(iso_string) - dot] + str(microsecond).rjust(6, "0")
# fmt:on
if ymd:
iso_string = iso_string.replace(ymd, "-", 2)
if T:
Expand All @@ -321,23 +326,26 @@ def pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False
def round(cls, value, multiple, up=None):
"""a nicer way to round numbers.
:param value: float, integer or datetime to be rounded.
:param multiple: float, integer or timedelta to be used as the base of the rounding.
:param up: None (default) or boolean rounds half, up or down.
round(1.6, 1) rounds to 2.
round(1.4, 1) rounds to 1.
round(1.5, 1, up=True) rounds to 2.
round(1.5, 1, up=False) rounds to 1.
:return: rounded value
Examples:
[1] multiple = 1 is the same as rounding to whole integers.
[2] multiple = 0.001 is the same as rounding to 3 digits precision.
[3] multiple = 3.1415 is rounding to the nearest multiple of 3.1415
[4] value = datetime(2022,8,18,11,14,53,440)
[5] multiple = timedelta(hours=0.5)
[6] xround(value,multiple) is datetime(2022,8,18,11,0)
Args:
value (float,integer,datetime): value to be rounded
multiple (float,integer,timedelta): value to be used as the base of the rounding.
1) multiple = 1 is the same as rounding to whole integers.
2) multiple = 0.001 is the same as rounding to 3 digits precision.
3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415
4) value = datetime(2022,8,18,11,14,53,440)
5) multiple = timedelta(hours=0.5)
6) xround(value,multiple) is datetime(2022,8,18,11,0)
up (None, bool, optional):
None (default) or boolean rounds half, up or down.
round(1.6, 1) rounds to 2.
round(1.4, 1) rounds to 1.
round(1.5, 1, up=True) rounds to 2.
round(1.5, 1, up=False) rounds to 1.
Returns:
float,integer,datetime: rounded value in same type as input.
"""
epoch = 0
if isinstance(value, (datetime)) and isinstance(multiple, timedelta):
Expand Down Expand Up @@ -505,7 +513,7 @@ def infer(cls, v, dtype):

if dtype not in matched_types:
raise TypeError(f"The datatype {str(dtype)} is not supported.")

return matched_types[dtype](v)

@classmethod
Expand Down Expand Up @@ -651,8 +659,9 @@ def _infer_datetime(cls, value):
dot = value.find(",", 11)
else:
dot = len(value)

# fmt:off
pattern = "".join(["N" if n in DataTypes.digits else n for n in value[:dot]])
# fmt:on
f = DataTypes.datetime_formats.get(pattern, None)
if f:
return f(value)
Expand Down Expand Up @@ -687,15 +696,31 @@ def _infer_none(cls, value):
raise ValueError()


def numpy_to_python(obj: Any) -> Any:
    """Converts a numpy scalar to its native python equivalent.

    See https://numpy.org/doc/stable/reference/arrays.scalars.html

    Args:
        obj (Any): A numpy scalar, or any other object.

    Returns:
        Any: ``obj.item()`` when ``obj`` is an instance of ``np.generic``;
            otherwise ``obj`` itself, unchanged.
    """
    # np.generic is the base class of all numpy scalar types.
    if isinstance(obj, np.generic):
        return obj.item()
    return obj


def pytype(obj):
    """Returns the python type of any object.

    Args:
        obj (Any): any numpy or python object.

    Returns:
        type: for instances of ``np.generic`` the type of ``obj.item()``
            (the native python value); otherwise ``type(obj)``.
    """
    # numpy scalars are reported as the type of their python counterpart.
    if isinstance(obj, np.generic):
        return type(obj.item())
    return type(obj)
Expand All @@ -714,7 +739,9 @@ def match(self, k): # k+=1

if ix > 0:
p = self.items_list
while r[ix] > r[ix - 1] and ix > 0: # use a simple bubble sort to maintain rank
while (
r[ix] > r[ix - 1] and ix > 0
): # use a simple bubble sort to maintain rank
r[ix], r[ix - 1] = r[ix - 1], r[ix]
p[ix], p[ix - 1] = p[ix - 1], p[ix]
old = p[ix]
Expand All @@ -726,7 +753,18 @@ def __iter__(self):
return iter(self.items_list)


def pytype_from_iterable(iterable):
def pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict}:
"""helper to make correct np array from python types.
Args:
iterable (tuple,list): values to be converted to numpy array.
Raises:
NotImplementedError: if datatype is not supported.
Returns:
np.dtype: python type of the iterable.
"""
py_types = {}
if isinstance(iterable, (tuple, list)):
type_counter = Counter((pytype(v) for v in iterable))
Expand Down Expand Up @@ -795,6 +833,14 @@ def list_to_np_array(iterable):


def np_type_unify(arrays):
"""unifies numpy types.
Args:
arrays (list): List of numpy arrays
Returns:
np.ndarray: numpy array of a single type.
"""
dtypes = {arr.dtype: len(arr) for arr in arrays}
if len(dtypes) == 1:
dtype, _ = dtypes.popitem()
Expand All @@ -808,7 +854,7 @@ def np_type_unify(arrays):
def multitype_set(arr):
"""prevents loss of True, False when calling sets.
python loses values when called returning a set:
python loses values when called returning a set. Example:
>>> {1, True, 0, False}
{0,1}
Expand All @@ -823,6 +869,7 @@ def multitype_set(arr):
L = [v for _, v in L]
return np.array(L, dtype=object)


matched_types = {
int: DataTypes._infer_int,
str: DataTypes._infer_str,
Expand All @@ -831,4 +878,4 @@ def multitype_set(arr):
date: DataTypes._infer_date,
datetime: DataTypes._infer_datetime,
time: DataTypes._infer_time,
}
}

0 comments on commit 7167800

Please sign in to comment.