diff --git a/sampling/__init__.py b/sampling/__init__.py
index 38bfb78..e0df4ea 100644
--- a/sampling/__init__.py
+++ b/sampling/__init__.py
@@ -1,3 +1,3 @@
-from .core import jackknife
+from .core import jackknife, shuffle, Reservoir
 
 __version__ = '0.0.1'
diff --git a/sampling/core.py b/sampling/core.py
index 8fe4f3c..df45f38 100644
--- a/sampling/core.py
+++ b/sampling/core.py
@@ -1,4 +1,5 @@
 import itertools
+import random as core_random
 
 no_replace = '__no__replace__'
 
@@ -45,3 +46,80 @@ def jackknife(seq, replace=no_replace):
         it = iter(seq)
         yield itertools.chain(itertools.islice(it, i), replace,
                               itertools.islice(it, 1, None))
+
+
+def shuffle(x, random=None):
+    """ Return a new list holding the values of x in random order
+
+    Pure version of standard ``random.shuffle``: ``x`` may be any iterable
+    and is never modified.
+
+    ``random`` is an optional zero-argument callable returning floats in
+    ``[0.0, 1.0)``.  When omitted the generator of the standard ``random``
+    module is used.
+
+    >>> sorted(shuffle((3, 1, 2)))
+    [1, 2, 3]
+    """
+    x = list(x)  # always a fresh list, so the caller's object is untouched
+    if random is None:
+        core_random.shuffle(x)
+    else:
+        # Fisher-Yates by hand: the ``random`` argument of the standard
+        # ``random.shuffle`` was removed in Python 3.11.
+        for i in reversed(range(1, len(x))):
+            j = int(random() * (i + 1))
+            x[i], x[j] = x[j], x[i]
+    return x
+
+
+class Reservoir(object):
+    """ Basic object for Reservoir Sampling
+
+    Keeps a uniform random sample of at most ``size`` of the items passed
+    to ``add``, without knowing in advance how many items will arrive.
+
+    >>> res = Reservoir(3)  # Reservoir of size 3
+    >>> for item in range(10):
+    ...     res.add(item)
+
+    Res contains three elements randomly chosen from ``range(10)``
+
+    >>> list(res)  # doctest: +SKIP
+    [8, 3, 5]
+    """
+    __slots__ = 'size', 'random', 'storage', 'count'
+
+    def __init__(self, size, random=core_random.random):
+        """ Create an empty reservoir
+
+        ``size`` is the maximum number of items kept.  ``random`` is a
+        zero-argument callable returning floats in ``[0.0, 1.0)``.
+        """
+        self.size = size
+        self.random = random
+        # A list, not a set: duplicates and unhashable items must be kept,
+        # and eviction needs a slot chosen by ``random``.
+        self.storage = []
+        self.count = 0
+
+    def add(self, item):
+        """ Offer item to the reservoir
+
+        The first ``size`` items are always stored.  Item number ``count``
+        after that replaces a uniformly chosen stored item with probability
+        ``size / count`` (Algorithm R).
+        """
+        self.count += 1
+        if self.count <= self.size:
+            self.storage.append(item)
+        else:
+            # One draw picks a position among all items seen so far; the
+            # new item is kept only if that position is inside the sample.
+            j = int(self.random() * self.count)
+            if j < self.size:
+                self.storage[j] = item
+
+    def __iter__(self):
+        """ Iterate over the items currently held """
+        return iter(self.storage)
diff --git a/sampling/tests/test_core.py b/sampling/tests/test_core.py
index 08baf7b..2040e9a 100644
--- a/sampling/tests/test_core.py
+++ b/sampling/tests/test_core.py
@@ -1,4 +1,4 @@
-from sampling.core import jackknife
+from sampling import jackknife, shuffle, Reservoir
 
 
 def test_jacknife():
@@ -10,3 +10,36 @@ def test_jacknife():
             (0, 2, 3), (1, 0, 3), (1, 2, 0))
     assert tuple(tuple(x) for x in jackknife([])) == ()
     assert tuple(tuple(x) for x in jackknife([1], replace=0)) == ((0,),)
+
+
+def test_shuffle():
+    data = [1, 2, 3]
+    assert set(shuffle(data)) == set((1, 2, 3))
+    assert data == [1, 2, 3]  # shuffle must leave its input untouched
+    assert set(shuffle((1, 2, 3))) == set((1, 2, 3))
+    assert sorted(shuffle(range(5), random=lambda: 0.5)) == list(range(5))
+
+
+def test_Reservoir():
+    r = Reservoir(2)
+    r.add(1)
+    assert r.count == 1
+    assert r.size == 2
+    assert set(r) == set([1])
+
+    r.add(2)
+    assert r.count == 2
+    assert r.size == 2
+    assert set(r) == set([1, 2])
+
+    r.add(3)
+    assert r.count == 3
+    assert r.size == 2
+    assert tuple(sorted(r)) in ((1, 2), (1, 3), (2, 3))
+
+
+def test_Reservoir_keeps_duplicates():
+    r = Reservoir(2)
+    r.add('a')
+    r.add('a')
+    assert list(r) == ['a', 'a']