Skip to content

Latest commit

 

History

History
3018 lines (2457 loc) · 43.5 KB

README.md

File metadata and controls

3018 lines (2457 loc) · 43.5 KB

Qfrom_slim

Qfrom is a unified and simple to use tool for data manipulation and data analysis. This Project is based on Python 3.10.0

I come from C# and I'm used to, and in love with, the LINQ library. pandas DataFrames annoyed me, and so the Qfrom package was born. This is my take on, and adventure into, data management, NumPy and Python, from which, not least, a better understanding of pandas has emerged. Warning: this project also violates some of the holy Python conventions. There is also a bit of coding black magic involved. In some cases it even feels like Qfrom can read your mind (#args-not-specified, #function-returning-multiple-columns). But just in case you asked yourself, I can assure you there is no AI involved. Now I wish you the best of fun with a first example of Qfrom in use.

For a first impression, here is an example of an imaginary company. This company consists of four people.

data = {
    'name': ['Emma', 'Bob', 'Steve', 'Ann'],
    'job': ['manager', 'employee', 'employee', 'freelancer'],
    'salary': [90_000, 61_000, 48_000, 56_000]
}

To turn this data into a Qfrom object, just pass the data into the constructor.

q = Qfrom(data)
q
> Qfrom
> name	job	salary
> Emma	manager	90000
> Bob	employee	61000
> Steve	employee	48000
> Ann	freelancer	56000

One day Emma the manager asks Bob the data scientist for a list of all employees. Bob just enters the following into his IDE

q.where(func=lambda job: job=='employee')
> Qfrom
> name	job	salary
> Bob	employee	61000
> Steve	employee	48000

Because Bob finished his task so quickly, Emma wants to increase everyone's salary by $1,000.

q = q.map('salary', lambda x: x+1_000)
q
> Qfrom
> name	job	salary
> Emma	manager	91000
> Bob	employee	62000
> Steve	employee	49000
> Ann	freelancer	57000

Now Emma wants to see the average salary per job title. A tricky one, but not for Bob...

(q.groupby('job')
    .map('group', func.vec(lambda x: x['salary']))
    .map('group', func.vec(lambda x: x.agg(agg.mean)))
    .rename({'key': 'job', 'group': 'mean salary'}))
> Qfrom
> job	mean salary
> manager	91000.0
> employee	55500.0
> freelancer	57000.0

For a full description of all Qfrom functionality, just browse through the following documentation.


Contents


class Qfrom

import Qfrom like this

from QfromPackage.Qfrom_slim import Qfrom

import list

l = [1, 2, 3]
Qfrom(l)
> Qfrom
> y
> 1
> 2
> 3
l = [
    (1, 4),
    (2, 5),
    (3, 6)
    ]
Qfrom(l)
> Qfrom
> y0	y1
> 1	4
> 2	5
> 3	6
l = [
    {'a': 1, 'b': 4},
    {'a': 2, 'b': 5},
    {'a': 3, 'b': 6}
    ]
Qfrom(l)
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6

Contents


import dict

d = {
    'a': [1, 2, 3],
    'b': [4, 5, 6]
    }
Qfrom(d)
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6

Contents


import set

s = {1, 2, 3}
Qfrom(s)
> Qfrom
> y
> 1
> 2
> 3

Contents


import array

a = np.array([1, 2, 3])
Qfrom(a)
> Qfrom
> y
> 1
> 2
> 3

Contents


import matrix

mtx = np.array([
    [1, 4],
    [2, 5],
    [3, 6]
    ])
Qfrom(mtx)
> Qfrom
> y0	y1
> 1	4
> 2	5
> 3	6

Contents


import DataFrame

df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
    })
Qfrom(df)
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6

Contents


import csv

csv = '''
a,b
1,4
2,5
3,6
'''
Qfrom(csv)
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6

Contents


import json

json = """
{
    'a': [1, 2, 3],
    'b': [4, 5, 6]
}
"""
Qfrom(json)
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6

Contents


import generator

Qfrom(range(3))
> Qfrom
> y
> 0
> 1
> 2

Contents


eq

q1 = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q2 = Qfrom([
    {'a': 1, 'b': 4},
    {'a': 2, 'b': 5},
    {'a': 3, 'b': 6}
])

q1 == q2
> True

Contents


str

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
str(q)
> 'Qfrom\na\tb\n1\t4\n2\t5\n3\t6'

Contents


repr

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
print(q)
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6

Contents


append

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q.append((4, 7))
q
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6
> 4	7
q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q.append({'a': 4, 'b':7})
q
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6
> 4	7

Performance test

Contents


setitem

set row

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q[1] = (7, 8)
q
> Qfrom
> a	b
> 1	4
> 7	8
> 3	6
q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q[1] = 7, 8
q
> Qfrom
> a	b
> 1	4
> 7	8
> 3	6
q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q[1] = {'a': 7, 'b': 8}
q
> Qfrom
> a	b
> 1	4
> 7	8
> 3	6

set column

set single column

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q['a'] = [7, 8, 9]
q
> Qfrom
> a	b
> 7	4
> 8	5
> 9	6

add new column

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q['c'] = [7, 8, 9]
q
> Qfrom
> a	b	c
> 1	4	7
> 2	5	8
> 3	6	9

set multiple columns

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q['a, b'] = [(4, 1), (5, 2), (6, 3)]
q
> Qfrom
> a	b
> 4	1
> 5	2
> 6	3

the order of the key-value-pairs in the dictionary does not matter.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q['a, b'] = {'b': [1, 2, 3], 'a': [4, 5, 6]}
q
> Qfrom
> a	b
> 4	1
> 5	2
> 6	3

set cell

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q['a', 1] = 7
q
> Qfrom
> a	b
> 1	4
> 7	5
> 3	6
q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

q[1] = {'a': 7}
q
> Qfrom
> a	b
> 1	4
> 7	5
> 3	6

Contents


getitem

Performance test

get row

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

print(q[1])

a, b = q[1]
print(f'{a=}, {b=}')
> (2, 5)
> a=2, b=5

It is possible to use slice notation. The result is returned as a new Qfrom.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q[1:]
> Qfrom
> a	b
> 2	5
> 3	6

get column

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q['a']
> Qfrom
> a
> 1
> 2
> 3

select multiple columns

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q['a,c']
> Qfrom
> a c
> 1 7
> 2 8
> 3 9

it is possible to use dynamic column selection. More information in section dynamic column selection

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q['...,c']
> Qfrom
> a b   c
> 1 4   7
> 2 5   8
> 3 6   9

get cell

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q['a', 0]
> 1

Contents


contains

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
(2, 5) in q
> True

the order of the key-value-pairs in the dictionary does not matter.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
{'b':5, 'a': 2} in q
> True

Contents


iter

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

for a, b in q:
    print(a, b)
> 1 4
> 2 5
> 3 6

Performance test

Contents


len

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
len(q)
> 3

Contents


keys

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

for k in q.keys():
    print(k)
> a
> b

Contents


values

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

for v in q.values():
    print(v)
> [1 2 3]
> [4 5 6]

Contents


items

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})

for k, v in q.items():
    print(k, v)
> a [1 2 3]
> b [4 5 6]

Contents


remove

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.remove('a')
> Qfrom
> b
> 4
> 5
> 6

remove multiple columns

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.remove('a, c')
> Qfrom
> b	d	e
> 4	10	13
> 5	11	14
> 6	12	15

it is possible to use dynamic column selection. More information in section dynamic column selection

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.remove('...,c')
> Qfrom
> d	e
> 10	13
> 11	14
> 12	15

Contents


rename

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.rename({'b': 'c'})
> Qfrom
> a	c
> 1	4
> 2	5
> 3	6

rename multiple columns

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.rename({'a': 'x', 'b': 'y'})
> Qfrom
> x	y
> 1	4
> 2	5
> 3	6

Contents


select

args

  • selection: str|tuple[str]|list[str]

    -> determines which columns will be passed to the new Qfrom

  • return: Qfrom

Performance test

string

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.select('a')
> Qfrom
> a
> 1
> 2
> 3

select multiple columns

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.select('a, c')
> Qfrom
> a	c
> 1	7
> 2	8
> 3	9

dynamic column selection

... notation for a slice of the keys

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.select('...,c')
> Qfrom
> a	b	c
> 1	4	7
> 2	5	8
> 3	6	9
q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.select('b,...,d')
> Qfrom
> b	c	d
> 4	7	10
> 5	8	11
> 6	9	12

. will be replaced by the next occurring key

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.select('a,.,c')
> Qfrom
> a	b	c
> 1	4	7
> 2	5	8
> 3	6	9

* will be replaced by all keys

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.select('*')
> Qfrom
> a	b	c	d	e
> 1	4	7	10	13
> 2	5	8	11	14
> 3	6	9	12	15

tuple

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
    'd': [10, 11, 12],
    'e': [13, 14, 15],
})
q.select(('a', 'c'))
> Qfrom
> a	c
> 1	7
> 2	8
> 3	9

Contents


map

args

  • args: str | tuple[str] | list[str] = None

    -> determines which columns will be passed to func

  • func: callable = None,

    -> function that maps the passed columns to one or more new columns

  • out: str | tuple[str] | list[str] = None

    -> names for the output columns

  • return: Qfrom

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.map('a,b', lambda x,y: x+y, 'c')
> Qfrom
> a	b	c
> 1	4	5
> 2	5	7
> 3	6	9

Performance test

out not specified

If out is not specified, the result will be written into the first column of the specified args.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.map('a,b', lambda x,y: x+y)
> Qfrom
> a	b
> 5	4
> 7	5
> 9	6

args not specified

If args is not specified, the passed columns will be chosen by the names of the arguments of the given function.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.map(func=lambda b,a: b+a, out='c')
> Qfrom
> a	b	c
> 1	4	5
> 2	5	7
> 3	6	9

if * notation is used in the args of the given function, all not used columns will be passed to the function.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9]
})
q.map(func=lambda a, *args: a+sum(args), out='d')
> Qfrom
> a	b	c	d
> 1	4	7	12
> 2	5	8	15
> 3	6	9	18

if ** notation is used in the args of the given function, all not used columns will be passed as a dict to the function.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9]
})
q.map(func=lambda a, **kwrgs: kwrgs['c'], out='d')
> Qfrom
> a	b	c	d
> 1	4	7	7
> 2	5	8	8
> 3	6	9	9

args dynamic column selection

it is possible to use dynamic column selection to specify the parameter args. More information in section dynamic column selection

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9]
})
q.map('*', lambda x, *args: x+sum(args), out='d')
> Qfrom
> a	b	c	d
> 1	4	7	12
> 2	5	8	15
> 3	6	9	18

func not specified respectively copying column

if func is not specified map will write the selected columns to the specified out keys

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.map('a, b', out='c, d')
> Qfrom
> a	b	c	d
> 1	4	1	4
> 2	5	2	5
> 3	6	3	6

vectorize function

By default, the columns passed to the function are of type np.ndarray. If the given function is defined for a single element, not for whole columns, the function must first be vectorized. More information in section vec

q = Qfrom({'a': ['ab', 'cd', 'fg']})
q.map('a', func.vec(lambda x: x.upper()))
> Qfrom
> a
> AB
> CD
> FG

function returning multiple columns

If a function returns a tuple or a dict of np.ndarray, the result will be treated as multiple columns.

q = Qfrom({'a': [1, 2, 3]})
q.map('a', lambda x: (x+1, x+2))
> Qfrom
> a	a0	a1
> 1	2	3
> 2	3	4
> 3	4	5
q = Qfrom({'a': [1, 2, 3]})
q.map('a', lambda x: {'b': x+1, 'c': x+2})
> Qfrom
> a	b	c
> 1	2	3
> 2	3	4
> 3	4	5

Multiple keys can be specified in the out parameter.

q = Qfrom({'a': [1, 2, 3]})
q.map('a', lambda x: (x+1, x+2), 'b, c')
> Qfrom
> a	b	c
> 1	2	3
> 2	3	4
> 3	4	5

function returning a scalar

If the function returns a scalar instead of a np.ndarray, the scalar will be broadcast to a np.ndarray of the size of a column.

q = Qfrom({'a': [1, 2, 3]})
q.map(func=lambda: 1, out='b')
> Qfrom
> a	b
> 1	1
> 2	1
> 3	1

function returning a generator

If the function returns a generator instead of a np.ndarray, map will pull as many elements from the generator as needed to fill a np.ndarray of the size of a column.

q = Qfrom({'a': [1, 2, 3]})
q.map(func=lambda: (c for c in 'python'), out='b')
> Qfrom
> a	b
> 1	p
> 2	y
> 3	t

using the generator col.id is a simple way to get a id column. More information in section id

q = Qfrom({'a': [1, 2, 3]})
q.map(func=col.id, out='i')
> Qfrom
> a	i
> 1	0
> 2	1
> 3	2

Contents


orderby

data = [
    {'a': 3, 'b': 4, 'c': 1},
    {'a': 2, 'b': 3, 'c': 2},
    {'a': 2, 'b': 2, 'c': 3},
    {'a': 1, 'b': 1, 'c': 4},
]
q = Qfrom(data)
q.orderby('a')
> Qfrom
> a	b	c
> 1	1	4
> 2	3	2
> 2	2	3
> 3	4	1
data = [
    {'a': 3, 'b': 4, 'c': 1},
    {'a': 2, 'b': 3, 'c': 2},
    {'a': 2, 'b': 2, 'c': 3},
    {'a': 1, 'b': 1, 'c': 4},
]
q = Qfrom(data)
q.orderby('a', reverse=True)
> Qfrom
> a	b	c
> 3	4	1
> 2	2	3
> 2	3	2
> 1	1	4

it is possible to order by multiple keys.

data = [
    {'a': 3, 'b': 4, 'c': 1},
    {'a': 2, 'b': 3, 'c': 2},
    {'a': 2, 'b': 2, 'c': 3},
    {'a': 1, 'b': 1, 'c': 4},
]
q = Qfrom(data)
q.orderby('a, b')
> Qfrom
> a	b	c
> 1	1	4
> 2	2	3
> 2	3	2
> 3	4	1

it is possible to transform the key column through a function.

data = [
    {'a': 3, 'b': 4, 'c': 1},
    {'a': 2, 'b': 3, 'c': 2},
    {'a': 2, 'b': 2, 'c': 3},
    {'a': 1, 'b': 1, 'c': 4},
]
q = Qfrom(data)
q.orderby('a', lambda x: x%2)
> Qfrom
> a	b	c
> 2	3	2
> 2	2	3
> 3	4	1
> 1	1	4

If selection is not specified, the passed columns will be chosen by the names of the arguments of the given function.

data = [
    {'a': 3, 'b': 4, 'c': 1},
    {'a': 2, 'b': 3, 'c': 2},
    {'a': 2, 'b': 2, 'c': 3},
    {'a': 1, 'b': 1, 'c': 4},
]
q = Qfrom(data)
q.orderby(func=lambda a: a%2)
> Qfrom
> a	b	c
> 2	3	2
> 2	2	3
> 3	4	1
> 1	1	4

Performance test

Contents


where

q = Qfrom({
    'a': [True, False, True, False, True],
    'b': [1, 1, 1, 1, 0],
    'c': [1, 2, 3, 4, 5]
})
q.where('a')
> Qfrom
> a	b	c
> True	1	1
> True	1	3
> True	0	5

It is possible to pass multiple keys into the where method. The values in all selected columns will first be parsed to booleans. The parsed columns will then be combined through a logical AND operation to obtain the final boolean key array, which determines which rows will be passed to the resulting Qfrom.

q = Qfrom({
    'a': [True, False, True, False, True],
    'b': [1, 1, 1, 1, 0],
    'c': [1, 2, 3, 4, 5]
})
q.where('a, b')
> Qfrom
> a	b	c
> True	1	1
> True	1	3

it is possible to transform the key column through a function.

q = Qfrom({
    'a': [True, False, True, False, True],
    'b': [1, 1, 1, 1, 0],
    'c': [1, 2, 3, 4, 5]
})
q.where('c', lambda x: x < 3)
> Qfrom
> a	b	c
> True	1	1
> False	1	2

If selection is not specified, the passed columns will be chosen by the names of the arguments of the given function.

q = Qfrom({
    'a': [True, False, True, False, True],
    'b': [1, 1, 1, 1, 0],
    'c': [1, 2, 3, 4, 5]
})
q.where(func=lambda c: c < 3)
> Qfrom
> a	b	c
> True	1	1
> False	1	2

Performance test

Contents


groupby

q = Qfrom({
    'a': [1, 1, 2, 2],
    'b': [3, 3, 3, 4],
    'c': [5, 6, 7, 8]
})
q.groupby('a')
> Qfrom
> key	group
> 1	Qfrom
> a	b	c
> 1	3	5
> 1	3	6
> 2	Qfrom
> a	b	c
> 2	3	7
> 2	4	8

It is possible to group by multiple keys. To do so, the selected columns will be transformed into one column of tuples holding the items from the selected columns.

q = Qfrom({
    'a': [1, 1, 2, 2],
    'b': [3, 3, 3, 4],
    'c': [5, 6, 7, 8]
})
q.groupby('a, b')
> Qfrom
> key	group
> (1, 3)	Qfrom
> a	b	c
> 1	3	5
> 1	3	6
> (2, 3)	Qfrom
> a	b	c
> 2	3	7
> (2, 4)	Qfrom
> a	b	c
> 2	4	8

it is possible to transform the key column through a function.

q = Qfrom({
    'a': [1, 1, 2, 2],
    'b': [3, 3, 3, 4],
    'c': [5, 6, 7, 8]
})
q.groupby('c', lambda x: x%2)
> Qfrom
> key	group
> 1	Qfrom
> a	b	c
> 1	3	5
> 2	3	7
> 0	Qfrom
> a	b	c
> 1	3	6
> 2	4	8

If selection is not specified, the passed columns will be chosen by the names of the arguments of the given function.

q = Qfrom({
    'a': [1, 1, 2, 2],
    'b': [3, 3, 3, 4],
    'c': [5, 6, 7, 8]
})
q.groupby(func=lambda c: c%2)
> Qfrom
> key	group
> 1	Qfrom
> a	b	c
> 1	3	5
> 2	3	7
> 0	Qfrom
> a	b	c
> 1	3	6
> 2	4	8

Performance test

Contents


flatten

q = Qfrom({
    'a': [1, 2],
    'b': [[3, 4], [5, 6]]
})
q.flatten('b')
> Qfrom
> a	b
> 1	3
> 1	4
> 2	5
> 2	6
q = Qfrom({
    'a': [1, 2],
    'b': [[3, 4], [5, 6]]
})
q.flatten('b', 'c')
> Qfrom
> a	b	c
> 1	[3, 4]	3
> 1	[3, 4]	4
> 2	[5, 6]	5
> 2	[5, 6]	6
q = Qfrom({
    'a': [1, 2],
    'b': [3, 4],
})
q.flatten('a, b', 'c')
> Qfrom
> a	b	c
> 1	3	1
> 1	3	3
> 2	4	2
> 2	4	4

Contents


unique

collects first appearing items in Qfrom with a unique key.

q = Qfrom({
    'a': [1, 2, 2, 3, 3],
    'b': [4, 5, 5, 6, 7]
})
q.unique('a')
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6

it is possible to pass multiple keys.

q = Qfrom({
    'a': [1, 2, 2, 3, 3],
    'b': [4, 5, 5, 6, 7]
})
q.unique('a, b')
> Qfrom
> a	b
> 1	4
> 2	5
> 3	6
> 3	7

value counts

count how often each key appears in the given Qfrom.

q = Qfrom({
    'a': [1, 2, 2, 3, 3],
    'b': [4, 5, 5, 6, 7]
})
q.value_counts('a')
> Qfrom
> value	count
> 1	1
> 2	2
> 3	2

It is possible to pass multiple keys. To do so, the selected columns will be transformed into one column of tuples holding the items from the selected columns.

q = Qfrom({
    'a': [1, 2, 2, 3, 3],
    'b': [4, 5, 5, 6, 7]
})
q.value_counts('a, b')
> Qfrom
> value	count
> (1, 4)	1
> (2, 5)	2
> (3, 6)	1
> (3, 7)	1

Contents


agg

if one function is passed to agg, the function will be applied to every column.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.agg(agg.sum)
> (6, 15)

Multiple functions can be passed as a tuple of functions. Each function will be applied to the corresponding column, in the order in which the keys appear in the Qfrom.

q = Qfrom({
    'a': [1, 2, 3],
    'b': [4, 5, 6]
})
q.agg((agg.max, agg.min))
> (3, 4)

Performance test

Contents


join

Contents


join cross

Contents


join outer

Contents


join outer left

Contents


join outer right

Contents


join id

Contents


join id outer

Contents


join id outer left

Contents


join id outer right

Contents


concat

Contents


concat outer

Contents


concat outer left

Contents


concat outer right

Contents


calculate

Contents


call

Contents


class col

The col class is a collection of functions which can easily be applied to the columns of a Qfrom.

import col like this

from QfromPackage.Qfrom_slim import col

0 -> 1 functions

id

g = col.id()
print(next(g))
print(next(g))
print(next(g))
> 0
> 1
> 2
q = Qfrom({'a': ['x', 'y', 'z']})
q.map(func=col.id, out='id')
> Qfrom
> a	id
> x	0
> y	1
> z	2

Contents


1 -> 1 functions

Functions which receive one np.ndarray and return one np.ndarray of the same length.

normalize

a = np.array([1, 2, 3, 4])
col.normalize(a)
> array([0.25, 0.5 , 0.75, 1.  ])
q = Qfrom({'a': [1, 2, 3, 4]})
q.map('a', col.normalize)
> Qfrom
> a
> 0.25
> 0.5
> 0.75
> 1.0
a = np.array([1, -2, 3, -4])
col.normalize(a)
> array([ 0.25, -0.5 ,  0.75, -1.  ])

Contents


abs

a = np.array([1, -2, 3, -4])
col.abs(a)
> array([1, 2, 3, 4])

Contents


shift

a = np.array([1, 2, 3, 4])
col.shift(steps=1, default_value=0)(a)
> array([0, 1, 2, 3])

Contents


not

Contents


n -> 1 functions

Functions which receive multiple np.ndarray and return one np.ndarray of the same length.

any

Contents


all

Contents


min

Contents


min_colname

a = np.array([1, 2, 3, 4])
b = np.array([4, 3, 2, 1])

col.min_colname(a=a, b=b)
> array(['a', 'a', 'b', 'b'], dtype=object)
q = Qfrom({
    'a': [1, 2, 3, 4],
    'b': [4, 3, 2, 1]
})
q.map('*', col.min_colname, 'min')
> Qfrom
> a	b	min
> 1	4	a
> 2	3	a
> 3	2	b
> 4	1	b

Contents


max

Contents


max_colname

a = np.array([1, 2, 3, 4])
b = np.array([4, 3, 2, 1])

col.max_colname(a=a, b=b)
> array(['b', 'b', 'a', 'a'], dtype=object)
q = Qfrom({
    'a': [1, 2, 3, 4],
    'b': [4, 3, 2, 1]
})
q.map('*', col.max_colname, 'max')
> Qfrom
> a	b	max
> 1	4	b
> 2	3	b
> 3	2	a
> 4	1	a

Contents


sum

Contents


mean

Contents


median

Contents


var

Contents


eq

Contents


agg

Contents


state

Contents


lod_and

Contents


lod_or

Contents


lod_xor

Contents


1 -> n functions

copy

Contents


flatten

Contents


n -> m functions

ml_models

Contents


class func

import func like this

from QfromPackage.Qfrom_slim import func

vec

Contents


multicol

Contents


class agg

import agg like this

from QfromPackage.Qfrom_slim import agg

any

Contents


all

Contents


min

Contents


min_id

Contents


max

Contents


max_id

Contents


sum

Contents


mean

Contents


median

Contents


var

Contents


len

Contents


size

Contents


state

Contents


class plot

import plot like this

from QfromPackage.Qfrom_slim import plot

plot

Contents


bar

Contents


hist

Contents


box

Contents


scatter

Contents


class out

import out like this

from QfromPackage.Qfrom_slim import out

list

Contents


set

Contents


dict

Contents


array

Contents


mtx

Contents


df

Contents


csv

Contents


csv file

Contents


json

Contents


json file

Contents


class trans

import trans like this

from QfromPackage.Qfrom_slim import trans

shuffle

Contents


Performance Tests

In this section several modules for data manipulation are compared. To that end, several common data manipulation methods are explored with regard to their runtimes.

The explored modules are numpy, pandas, python lists and Qfrom.

setup

This section describes the test data.

data set generation

def get_p(n):
    """Return ``n`` halving weights that sum to 1.

    Entry ``k`` (1-based) is ``1 / 2**k``, except the last entry, which
    repeats the weight of the one before it (``1 / 2**(n - 1)``) so that the
    whole list adds up to exactly 1.
    """
    weights = []
    for position in range(1, n + 1):
        # The final slot reuses the previous exponent to close the gap to 1.
        exponent = position - 1 if position == n else position
        weights.append(1 / (2 ** exponent))
    return weights

def get_tab_data(n):
    """Generate ``n`` rows of random test data as a dict of NumPy columns.

    The returned dict has the keys ``'name'``, ``'age'``, ``'job'`` and
    ``'salary'``. Names and jobs are drawn with the skewed probabilities
    produced by ``get_p``; ages and salaries are random integers below
    ``max_age`` and ``max_salary`` respectively.
    """
    # NOTE: 'Anna' and 'Lena' are two separate names. Without the comma the
    # adjacent string literals were silently concatenated to 'AnnaLena'.
    name_list = ['Ann', 'Steven', 'Max', 'Jack', 'Julia', 'Clara', 'Emma', 'Bob', 'Anna', 'Lena']
    job_list = ['employee', 'jobless', 'freelancer', 'artist', 'technician', 'leader', 'coach', 'manager']
    max_age = 100
    max_salary = 1_000_000

    return {
        'name': np.random.choice(name_list, n, p=get_p(len(name_list))),
        'age': np.random.randint(max_age, size=n),
        'job': np.random.choice(job_list, n, p=get_p(len(job_list))),
        'salary': np.random.randint(max_salary, size=n),
        }

How data gets transformed to meet the requirements of the different modules.

class setup():
    """Convert the generated column dict into the input each benchmark expects.

    Every method is a classmethod. The ``*_tpl`` variants take a tuple whose
    first element is the column dict; they convert that element and pass the
    remaining tuple elements through unchanged.

    NOTE: annotations deliberately use ``numpy.ndarray`` rather than
    ``np.ndarray``. Inside this class body the name ``np`` is rebound to the
    ``np`` classmethod below, so ``np.ndarray`` would not resolve here.
    """

    @classmethod
    def np(cls, data: dict[str, numpy.ndarray]):
        """Return a dict holding a copy of every column array."""
        return {k: np.copy(v) for k,v in data.items()}
    @classmethod
    def np_tpl(cls, data: tuple):
        """Copy the column arrays of ``data[0]``; keep ``data[1:]`` as is."""
        return ({k:np.copy(v) for k,v in data[0].items()}, *data[1:])
    @classmethod
    def np_mtx(cls, data: dict[str, numpy.ndarray]):
        """Stack all columns via ``np_ext.col_stack``."""
        cols = list(data.values())
        return np_ext.col_stack(cols)
    @classmethod
    def df(cls, data: dict[str, numpy.ndarray]):
        """Build a pandas DataFrame from the column dict."""
        return pd.DataFrame(data)
    @classmethod
    def df_tpl(cls, data: tuple):
        """Build a DataFrame from ``data[0]``; keep ``data[1:]`` as is."""
        return (pd.DataFrame(data[0]), *data[1:])
    @classmethod
    def l(cls, data: dict[str, numpy.ndarray]):
        """Convert every column to a plain Python list."""
        return {key: list(col) for key, col in data.items()}
    @classmethod
    def l_tpl(cls, data: tuple):
        """Convert the columns of ``data[0]`` to lists; keep ``data[1:]`` as is."""
        return ({key: list(col) for key, col in data[0].items()}, *data[1:])
    @classmethod
    def qs(cls, data: dict[str, numpy.ndarray]):
        """Wrap the column dict in a ``Qfrom_slim``."""
        return Qfrom_slim(data)
    @classmethod
    def qs_tpl(cls, data: tuple):
        """Wrap ``data[0]`` in a ``Qfrom_slim``; keep ``data[1:]`` as is."""
        return (Qfrom_slim(data[0]), *data[1:])
    @classmethod
    def list_items(cls, data: dict[str, numpy.ndarray]):
        """Materialise the output of ``iter_table_dict(data)`` as a list."""
        return list(iter_table_dict(data))

Contents


runtime tests

the different data manipulation methods get executed on multiple datasets of varying sizes

append

append method implementations

def append_np(data):
    """Build four NumPy columns by appending the rows of ``data`` one by one.

    ``data`` is a non-empty sequence of ``(name, age, job, salary)`` rows.
    The first row seeds the column arrays; every further row is added with
    ``np.append``. The row-by-row append is intentional: it is the operation
    this benchmark measures.

    Returns a dict with the keys ``'name'``, ``'age'``, ``'job'``, ``'salary'``.
    """
    result = {
        'name': np.array([data[0][0]]),
        'age': np.array([data[0][1]]),
        'job': np.array([data[0][2]]),
        'salary': np.array([data[0][3]]),
        }
    for name, age, job, salary in data[1:]:
        # Each value goes to its own column. Previously all four results
        # were stored under 'name', overwriting one another.
        result['name'] = np.append(result['name'], [name])
        result['age'] = np.append(result['age'], [age])
        result['job'] = np.append(result['job'], [job])
        result['salary'] = np.append(result['salary'], [salary])
    return result

def append_df(data):
    """Build a DataFrame by appending the rows of ``data`` one by one.

    ``data`` is a non-empty sequence of ``(name, age, job, salary)`` rows.
    The first row seeds the frame; every further row is wrapped in a one-row
    DataFrame and concatenated. The row-by-row growth is intentional: it is
    the operation this benchmark measures.
    """
    columns = ['name', 'age', 'job', 'salary']
    result = pd.DataFrame([data[0]], columns=columns)
    for t in data[1:]:
        row = pd.DataFrame([t], columns=columns)
        # DataFrame.append never modified the frame in place (its return
        # value was being dropped) and was removed in pandas 2.0; pd.concat
        # is the documented replacement.
        result = pd.concat([result, row], ignore_index=True)
    return result

def append_qs(data):
    """Append every row of ``data`` to an empty ``Qfrom_slim``.

    The filled object is then called with ``out.dict`` and that call's
    result is returned.
    """
    table = Qfrom_slim()
    for row in data:
        table.append(row)
    exporter = out.dict
    return table(exporter)

def append_l(data):
    """Collect ``(name, age, job, salary)`` rows into four Python lists.

    Returns a dict with the keys ``'name'``, ``'age'``, ``'job'`` and
    ``'salary'``, each mapping to the list of values in row order.
    """
    names, ages, jobs, salaries = [], [], [], []
    for name, age, job, salary in data:
        names.append(name)
        ages.append(age)
        jobs.append(job)
        salaries.append(salary)
    return {
        'name': names,
        'age': ages,
        'job': jobs,
        'salary': salaries,
    }

Measured runtimes depending on the size of the input data sets

runtimes for max data set size n=10 000

np 0.174 s 1.0%
df 8.22 s 47.184%
qs 0.492 s 2.823%
l 0.003 s 0.016%

Contents


getitem

getitem method implementations

def getitem_np(t):
    """Read one row per requested id from a dict of columns.

    ``t`` is a ``(columns, ids)`` pair. For every id a tuple holding that
    position of each column is built and discarded; nothing is returned.
    """
    columns, row_ids = t
    for row_id in row_ids:
        _row = tuple(column[row_id] for column in columns.values())

def getitem_df(t):
    """Look up one row per requested id through ``DataFrame.iloc``.

    ``t`` is a ``(frame, ids)`` pair. Each looked-up row is discarded;
    nothing is returned.
    """
    frame, row_ids = t
    for row_id in row_ids:
        _row = frame.iloc[row_id]

def getitem_qs(t):
    """Index the first element of ``t`` once per requested id.

    ``t`` is a ``(q, ids)`` pair. Each ``q[id]`` result is discarded;
    nothing is returned.
    """
    table, row_ids = t
    for row_id in row_ids:
        _row = table[row_id]

def getitem_l(t):
    """Read one row per requested id from a dict of list columns.

    ``t`` is a ``(columns, ids)`` pair. For every id a tuple holding that
    position of each column is built and discarded; nothing is returned.
    """
    columns, row_ids = t
    for row_id in row_ids:
        _row = tuple(column[row_id] for column in columns.values())

runtimes for max data set size n=100 000

np 0.144 s 1.0%
df 5.486 s 38.089%
qs 0.161 s 1.119%
l 0.119 s 0.826%

Contents


iter

iter method implementations

def iter_np(data):
    """Count the steps of an ``np.nditer`` run over all columns together.

    The column arrays of ``data`` are passed to ``np.nditer`` as a list of
    operands (with the ``refs_ok`` flag) and the number of iteration steps
    is returned.
    """
    operands = list(data.values())
    return sum(1 for _ in np.nditer(operands, flags=["refs_ok"]))
def iter_np_mtx(data):
    """Count the steps of an ``np.nditer`` run over a single array.

    Note that ``np.nditer`` visits every element of ``data``, so for a
    two-dimensional input the count is the number of cells, not rows.
    """
    return sum(1 for _ in np.nditer(data, flags=["refs_ok"]))

def iter_df(df: pd.DataFrame):
    """Iterate over ``df.values`` and return how many rows were visited."""
    return sum(1 for _ in df.values)

def iter_qs(q: Qfrom_slim):
    """Iterate over ``q`` and return how many items were yielded."""
    return sum(1 for _ in q)

def iter_l(data):
    """Zip all columns of ``data`` together and return the number of rows."""
    rows = zip(*data.values())
    return sum(1 for _ in rows)

runtimes for max data set size n=1 000 000

np 0.16 s 1.0%
df 0.163 s 1.015%
qs 0.438 s 2.736%
l 0.044 s 0.275%
np_mtx 0.06 s 0.375%

Contents


select

select method implementations

cols = 'name'

def select_np(data):
    """Return the entries of ``data`` whose key is selected by ``cols``.

    ``cols`` is the module-level selection and may be a single column name
    or a collection of names.
    """
    # A bare string is wrapped first: `key in 'name'` would be a substring
    # test and would also accept keys such as 'a' or 'me'.
    wanted = (cols,) if isinstance(cols, str) else cols
    return {key: value for key, value in data.items() if key in wanted}

def select_df(df):
    """Return ``df[cols]``, where ``cols`` is the module-level column selection."""
    return df[cols]

def select_qs(q: Qfrom_slim):
    """Select the module-level ``cols`` from ``q`` and call the result with ``out.dict``."""
    return q.select(cols)(out.dict)

def select_l(data):
    """Return the entries of ``data`` whose key is selected by ``cols``.

    ``cols`` is the module-level selection and may be a single column name
    or a collection of names.
    """
    # A bare string is wrapped first: `key in 'name'` would be a substring
    # test and would also accept keys such as 'a' or 'me'.
    wanted = (cols,) if isinstance(cols, str) else cols
    return {key: value for key, value in data.items() if key in wanted}

runtimes for max data set size n=1 000 000

np 0.0 s 0%
df 0.0 s 0%
qs 0.0 s 0%
l 0.0 s 0%

select multiple columns method implementations

cols = ['name', 'age']

def select_np(data):
    """Return the entries of ``data`` whose key is contained in module-level ``cols``."""
    selected = {}
    for column_name, column in data.items():
        if column_name in cols:
            selected[column_name] = column
    return selected

def select_df(df):
    """Return ``df[cols]``, where ``cols`` is the module-level list of column names."""
    return df[cols]

def select_qs(q: Qfrom_slim):
    """Select the module-level ``cols`` from ``q`` and call the result with ``out.dict``."""
    return q.select(cols)(out.dict)

def select_l(data):
    """Return the entries of ``data`` whose key is contained in module-level ``cols``."""
    selected = {}
    for column_name, column in data.items():
        if column_name in cols:
            selected[column_name] = column
    return selected

runtimes for max data set size n=10 000 000

np 0.0 s 0%
df 0.106 s 0%
qs 0.0 s 0%
l 0.0 s 0%

Contents


map

map add method implementations

def map_add_np(data):
    """Replace ``data['age']`` with ``data['age'] + 10`` and return ``data``.

    The dict is updated in place; the addition itself produces a new value
    that is stored back under ``'age'``.
    """
    shifted_ages = data['age'] + 10
    data['age'] = shifted_ages
    return data

def map_add_df(df: pd.DataFrame):
    """Add 10 to the ``'age'`` column of ``df`` in place and return ``df``."""
    # Operate on the `df` parameter. The body previously referred to an
    # undefined name `data`, so the frame passed in was never touched.
    df['age'] = df['age']+10
    return df

def map_add_qs(q: Qfrom_slim):
    """Map ``age -> age + 10`` over ``q`` and call the result with ``out.dict``."""
    # The lambda parameter must stay named `age`: no args are passed to map,
    # and the README states that columns are then chosen by argument name.
    mapped = q.map(func=lambda age: age+10)
    return mapped(out.dict)

def map_add_l(data):
    """Replace ``data['age']`` with a new list of ages plus 10 and return ``data``."""
    shifted_ages = []
    for age in data['age']:
        shifted_ages.append(age + 10)
    data['age'] = shifted_ages
    return data

runtimes for max data set size n=10 000 000

np 0.008 s 1.0%
df 0.016 s 2.05%
qs 0.008 s 1.0%
l 1.223 s 152.791%

map by func method implementations

def test_func(x): f'i am {x} years old'

def map_func_np(data):
    """Apply ``test_func`` element-wise to ``data['age']`` via ``np.frompyfunc``.

    The result is stored back under ``'age'`` and ``data`` is returned.
    """
    elementwise = np.frompyfunc(test_func, 1, 1)
    ages = data['age']
    data['age'] = elementwise(ages)
    return data

def map_func_df(df: pd.DataFrame):
    """Apply ``test_func`` to the ``'age'`` column with ``Series.apply``.

    The result is stored back under ``'age'`` and ``df`` is returned.
    """
    ages = df['age']
    df['age'] = ages.apply(test_func)
    return df

def map_func_qs(q: Qfrom_slim):
    """Map the vectorized ``test_func`` over column ``'age'`` of ``q``.

    The mapped object is called with ``out.dict`` and that result is returned.
    """
    vectorized = func.vec(test_func)
    mapped = q.map('age', vectorized)
    return mapped(out.dict)

def map_func_l(data):
    """Replace ``data['age']`` with ``test_func`` applied to each age and return ``data``."""
    described = []
    for age in data['age']:
        described.append(test_func(age))
    data['age'] = described
    return data

runtimes for max data set size n=10 000 000

np 1.29 s 1.0%
df 2.021 s 1.567%
qs 1.28 s 0.993%
l 2.547 s 1.975%

map by func two arguments method implementations

def test_func(x, y): return f'My name is {x} and i am {y} years old'

def map_func_np(data):
    """Build ``data['msg']`` from the name and age columns via ``np.frompyfunc``.

    ``test_func`` is wrapped as a two-input, one-output ufunc and applied to
    ``data['name']`` and ``data['age']``; ``data`` is returned.
    """
    elementwise = np.frompyfunc(test_func, 2, 1)
    names = data['name']
    ages = data['age']
    data['msg'] = elementwise(names, ages)
    return data

def map_func_df(df: pd.DataFrame):
    """Build ``df['msg']`` row by row with ``DataFrame.apply(axis=1)`` and return ``df``."""
    def describe_row(x):
        # One call of test_func per row, fed from the name and age cells.
        return test_func(x['name'], x['age'])

    df['msg'] = df.apply(describe_row, axis=1)
    return df

def map_func_qs(q: Qfrom_slim):
    """Map the vectorized ``test_func`` over ``'name, age'`` into column ``'msg'``.

    The mapped object is called with ``out.dict`` and that result is returned.
    """
    vectorized = func.vec(test_func)
    mapped = q.map('name, age', vectorized, 'msg')
    return mapped(out.dict)

def map_func_l(data):
    """Build ``data['msg']`` from the zipped name and age lists and return ``data``."""
    messages = []
    for name, age in zip(data['name'], data['age']):
        messages.append(test_func(name, age))
    data['msg'] = messages
    return data

def map_func_df_lcph(df: pd.DataFrame):
    """Build ``df['msg']`` with a plain Python loop over the zipped columns.

    Uses ordinary iteration over ``df['name']`` and ``df['age']`` instead of
    ``DataFrame.apply``; ``df`` is returned.
    """
    name_age_pairs = zip(df['name'], df['age'])
    df['msg'] = [test_func(name, age) for name, age in name_age_pairs]
    return df

def map_func_df_np(df: pd.DataFrame):
    """Build ``df['msg']`` by applying an ``np.frompyfunc`` wrapper to two columns.

    ``test_func`` is wrapped as a two-input, one-output ufunc and called with
    ``df['name']`` and ``df['age']``; ``df`` is returned.
    """
    elementwise = np.frompyfunc(test_func, 2, 1)
    names = df['name']
    ages = df['age']
    df['msg'] = elementwise(names, ages)
    return df
    
def map_func_np_iter(data):
    """Build ``data['msg']`` by stepping an ``np.nditer`` by hand.

    The name column, the age column and a pre-built output array are
    iterated together; at every step ``test_func`` is called with the
    current name and age and its result is written to the output operand.
    ``data`` is returned with the output stored under ``'msg'``.
    """
    # Output buffer with one slot per name; the 'U40' dtype holds at most
    # 40 characters per entry.
    o = np.full(data['name'].size, '', dtype=np.dtype('U40'))
    # Three operands: the two inputs are flagged read-only, the output
    # operand is flagged 'writeonly' and 'allocate'.
    it = np.nditer([data['name'], data['age'], o], [],
                [['readonly']]*2+ [['writeonly','allocate']])
    while not it.finished:
        # it[0] / it[1] are the current name and age, it[2] the output slot.
        it[2] = test_func(it[0], it[1])
        it.iternext()
    # Read the result back from the iterator's third operand.
    data['msg'] = it.operands[2]
    return data

runtimes for max data set size n=1 000 000

np 0.244 s 1.0%
df 6.877 s 28.15%
qs 0.234 s 0.958%
l 0.431 s 1.764%
df_lcph 0.357 s 1.461%
df_np 0.212 s 0.867%
np_iter 1.419 s 5.809%

Contents


orderby

orderby method implementations

def orderby_np(data):
    """Return a new dict with every column reordered by ascending ``'age'``."""
    order = np.argsort(data['age'])
    reordered = {}
    for column_name, column in data.items():
        reordered[column_name] = column[order]
    return reordered

def orderby_df(df):
    """Return the result of ``df.sort_values('age')``."""
    return df.sort_values('age')

def orderby_qs(q: Qfrom_slim):
    """Order ``q`` by ``'age'`` and call the result with ``out.dict``."""
    return q.orderby('age')(out.dict)

def orderby_l(data):
    """Return a new dict of lists with every column reordered by ascending ``'age'``."""
    ages = data['age']
    # Sort the row positions by the age found at each position.
    order = sorted(range(len(ages)), key=ages.__getitem__)
    reordered = {}
    for column_name, column in data.items():
        reordered[column_name] = [column[position] for position in order]
    return reordered

runtimes for max data set size n=1 000 000

np 0.094 s 1.0%
df 0.155 s 1.648%
qs 0.116 s 1.242%
l 1.156 s 12.33%

orderby multiple columns method implementations

def orderby_mult_np(data):
    """Return a new dict with every column ordered by ``'name'``, then ``'age'``."""
    # np.lexsort uses its last key as the primary sort key.
    sort_keys = [data['age'], data['name']]
    order = np.lexsort(sort_keys)
    reordered = {}
    for column_name, column in data.items():
        reordered[column_name] = column[order]
    return reordered

def orderby_mult_df(df):
    """Return the result of ``df.sort_values(['name', 'age'])``."""
    return df.sort_values(['name', 'age'])

def orderby_mult_qs(q: Qfrom_slim):
    """Order ``q`` by ``'name, age'`` and call the result with ``out.dict``."""
    return q.orderby('name, age')(out.dict)

runtimes for max data set size n=10 000 000

np 8.315 s 1.0%
df 2.378 s 0.286%
qs 7.396 s 0.889%

Contents


where

where method implementations

def where_np(data):
    """Return a new dict holding only the rows whose ``'job'`` equals ``'manager'``."""
    is_manager = data['job'] == 'manager'
    hits = np.where(is_manager)
    filtered = {}
    for column_name, column in data.items():
        filtered[column_name] = column[hits]
    return filtered

def where_df(df):
    """Return the rows of ``df`` selected by the mask ``df['job']=='manager'``."""
    return df[df['job']=='manager']

def where_qs(q: Qfrom_slim):
    """Filter ``q`` on column ``'job'`` equal to ``"manager"`` and call the result with ``out.dict``."""
    return q.where('job', lambda x: x=="manager")(out.dict)

def where_l(data):
    """Return a new dict of lists holding only the rows whose ``'job'`` equals ``'manager'``."""
    hits = []
    for position, job in enumerate(data['job']):
        if job == 'manager':
            hits.append(position)
    filtered = {}
    for column_name, column in data.items():
        filtered[column_name] = [column[position] for position in hits]
    return filtered

runtimes for max data set size n=10 000 000

np 0.232 s 1.0%
df 0.619 s 2.672%
qs 0.232 s 1.002%
l 1.428 s 6.166%

Contents


groupby

Not easily comparable, because pandas groupby only returns ids.

groupby method implementations

cols = 'job'

def groupby_np(data):
    """Group the columns of ``data`` by the column named by module-level ``cols``.

    Returns a dict with the unique key values under ``'key'`` and, under
    ``'group'``, an array with one ``{column name: rows}`` dict per key.
    """
    # Row positions that put the key column in sorted order.
    sorted_ids = np.argsort(data[cols])
    sorted_key_array = data[cols][sorted_ids]
    # return_index yields, per unique key, the position of its first
    # occurrence within the sorted key column.
    unique_keys, unique_key_ids = np.unique(sorted_key_array, return_index=True)
    # Cut the sorted row positions at those boundaries; the first boundary
    # is skipped by the [1:] slice.
    id_groups = np.split(sorted_ids, unique_key_ids[1:])
    group_dict = {
        'key': unique_keys,
        'group': np.array([{key:col[ids] for key, col in data.items()} for ids in id_groups])
        }

    return group_dict

def groupby_df(df: pd.DataFrame):
    """Return the ``groups`` attribute of ``df.groupby(cols)`` for module-level ``cols``."""
    return df.groupby(cols).groups

def groupby_qs(q: Qfrom_slim):
    """Group ``q`` by module-level ``cols`` and call the result with ``out.dict``."""
    return q.groupby(cols)(out.dict)

runtimes for max data set size n=10 000 000

np 2.466 s 1.0%
df 0.627 s 0.254%
qs 7.511 s 3.046%

groupby multiple columns method implementations

cols = ['job', 'name']

def groupby_np(data):
    """Group the columns of ``data`` by the columns listed in module-level ``cols``.

    Returns a dict with the unique composite keys under ``'key'`` and, under
    ``'group'``, an array with one ``{column name: rows}`` dict per key.
    """
    # np.lexsort uses its LAST key as the primary one, hence the reversal:
    # rows end up ordered by cols[0], then cols[1], ...
    sorted_ids = np.lexsort([data[c] for c in cols[::-1]])
    # Build the composite keys from the key columns in `cols` order and in
    # sorted row order. Previously the columns were taken unsorted and in
    # dict order, so the first-occurrence indices from np.unique did not line
    # up with `sorted_ids` and the split below produced wrong groups.
    # NOTE(review): assumes array_tuple_to_tuple_array keeps row order and
    # returns something np.unique orders like the lexsort above; confirm.
    sorted_key_array = array_tuple_to_tuple_array([data[c][sorted_ids] for c in cols])
    unique_keys, unique_key_ids = np.unique(sorted_key_array, return_index=True)
    # Cut the sorted row positions at each key's first occurrence; the first
    # boundary is skipped by the [1:] slice.
    id_groups = np.split(sorted_ids, unique_key_ids[1:])
    group_dict = {
        'key': unique_keys,
        'group': np.array([{key:col[ids] for key, col in data.items()} for ids in id_groups])
        }

    return group_dict


def groupby_df(df: pd.DataFrame):
    """Return the ``groups`` attribute of ``df.groupby(cols)`` for the module-level list ``cols``."""
    return df.groupby(cols).groups

def groupby_qs(q: Qfrom_slim):
    """Group ``q`` by the module-level list ``cols`` and call the result with ``out.dict``."""
    return q.groupby(cols)(out.dict)

runtimes for max data set size n=1 000 000

np 3.431 s 1.0%
df 0.917 s 0.267%
qs 1.089 s 0.318%

Contents


agg

agg method implementations

def agg_np(data):
    """Return ``np.mean`` of the ``'age'`` column."""
    return np.mean(data['age'])

def agg_df(df: pd.DataFrame):
    """Return the ``'mean'`` aggregation of the ``'age'`` column."""
    return df['age'].agg('mean')

def agg_qs(q: Qfrom_slim):
    """Aggregate column ``'age'`` of ``q`` with ``agg.mean``."""
    return q['age'].agg(agg.mean)

def agg_l(data):
    """Return the arithmetic mean of the ``'age'`` list.

    Raises ``ZeroDivisionError`` when the list is empty.
    """
    ages = data['age']
    total = sum(ages)
    return total / len(ages)

runtimes for max data set size n=10 000 000

np 0.005 s 1.0%
df 0.006 s 1.2%
qs 0.005 s 1.0%
l 0.372 s 74.305%

Contents