Skip to content

Commit

Permalink
CLN: ASV reindex (pandas-dev#18938)
Browse files (browse the repository at this point in the history)
  • Loading branch information
mroeschke authored and jreback committed Dec 26, 2017
1 parent a088c7b commit 95e79a7
Showing 1 changed file with 86 additions and 117 deletions.
203 changes: 86 additions & 117 deletions asv_bench/benchmarks/reindex.py
Original file line number | Diff line number | Diff line change
@@ -1,89 +1,77 @@
from .pandas_vb_common import *
from random import shuffle
import numpy as np
import pandas.util.testing as tm
from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index,
date_range)
from .pandas_vb_common import setup, lib # noqa


class Reindexing(object):
class Reindex(object):

goal_time = 0.2

def setup(self):
self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
self.df = DataFrame(np.random.rand(10000, 10), index=self.rng,
rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
self.df = DataFrame(np.random.rand(10000, 10), index=rng,
columns=range(10))
self.df['foo'] = 'bar'
self.rng2 = Index(self.rng[::2])

self.rng_subset = Index(rng[::2])
self.df2 = DataFrame(index=range(10000),
data=np.random.rand(10000, 30), columns=range(30))

# multi-index
N = 5000
K = 200
level1 = tm.makeStringIndex(N).values.repeat(K)
level2 = np.tile(tm.makeStringIndex(K).values, N)
index = MultiIndex.from_arrays([level1, level2])
self.s1 = Series(np.random.randn((N * K)), index=index)
self.s2 = self.s1[::2]
self.s = Series(np.random.randn(N * K), index=index)
self.s_subset = self.s[::2]

def time_reindex_dates(self):
self.df.reindex(self.rng2)
self.df.reindex(self.rng_subset)

def time_reindex_columns(self):
self.df2.reindex(columns=self.df.columns[1:5])

def time_reindex_multiindex(self):
self.s1.reindex(self.s2.index)
self.s.reindex(self.s_subset.index)


#----------------------------------------------------------------------
# Pad / backfill
class ReindexMethod(object):


class FillMethod(object):
goal_time = 0.2
params = ['pad', 'backfill']
param_names = ['method']

def setup(self):
self.rng = date_range('1/1/2000', periods=100000, freq='1min')
self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
self.ts2 = self.ts[::2]
self.ts3 = self.ts2.reindex(self.ts.index)
self.ts4 = self.ts3.astype('float32')

def pad(self, source_series, target_index):
try:
source_series.reindex(target_index, method='pad')
except:
source_series.reindex(target_index, fillMethod='pad')

def backfill(self, source_series, target_index):
try:
source_series.reindex(target_index, method='backfill')
except:
source_series.reindex(target_index, fillMethod='backfill')

def time_backfill_dates(self):
self.backfill(self.ts2, self.ts.index)
def setup(self, method):
N = 100000
self.idx = date_range('1/1/2000', periods=N, freq='1min')
self.ts = Series(np.random.randn(N), index=self.idx)[::2]

def time_pad_daterange(self):
self.pad(self.ts2, self.ts.index)
def time_reindex_method(self, method):
self.ts.reindex(self.idx, method=method)

def time_backfill(self):
self.ts3.fillna(method='backfill')

def time_backfill_float32(self):
self.ts4.fillna(method='backfill')
class Fillna(object):

def time_pad(self):
self.ts3.fillna(method='pad')
goal_time = 0.2
params = ['pad', 'backfill']
param_names = ['method']

def time_pad_float32(self):
self.ts4.fillna(method='pad')
def setup(self, method):
N = 100000
self.idx = date_range('1/1/2000', periods=N, freq='1min')
ts = Series(np.random.randn(N), index=self.idx)[::2]
self.ts_reindexed = ts.reindex(self.idx)
self.ts_float32 = self.ts_reindexed.astype('float32')

def time_reindexed(self, method):
self.ts_reindexed.fillna(method=method)

#----------------------------------------------------------------------
# align on level
def time_float_32(self, method):
self.ts_float32.fillna(method=method)


class LevelAlign(object):

goal_time = 0.2

def setup(self):
[unchanged lines collapsed]
@@ -92,7 +80,6 @@ def setup(self):
labels=[np.arange(10).repeat(10000),
np.tile(np.arange(100).repeat(100), 10),
np.tile(np.tile(np.arange(100), 100), 10)])
random.shuffle(self.index.values)
self.df = DataFrame(np.random.randn(len(self.index), 4),
index=self.index)
self.df_level = DataFrame(np.random.randn(100, 4),
[unchanged lines collapsed]
@@ -102,103 +89,85 @@ def time_align_level(self):
self.df.align(self.df_level, level=1, copy=False)

def time_reindex_level(self):
self.df_level.reindex(self.df.index, level=1)
self.df_level.reindex(self.index, level=1)


#----------------------------------------------------------------------
# drop_duplicates
class DropDuplicates(object):


class Duplicates(object):
goal_time = 0.2

def setup(self):
self.N = 10000
self.K = 10
self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
self.df = DataFrame({'key1': self.key1, 'key2': self.key2,
'value': np.random.randn((self.N * self.K)),})
self.col_array_list = list(self.df.values.T)

self.df2 = self.df.copy()
self.df2.ix[:10000, :] = np.nan
params = [True, False]
param_names = ['inplace']

def setup(self, inplace):
N = 10000
K = 10
key1 = tm.makeStringIndex(N).values.repeat(K)
key2 = tm.makeStringIndex(N).values.repeat(K)
self.df = DataFrame({'key1': key1, 'key2': key2,
'value': np.random.randn(N * K)})
self.df_nan = self.df.copy()
self.df_nan.iloc[:10000, :] = np.nan

self.s = Series(np.random.randint(0, 1000, size=10000))
self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))

np.random.seed(1234)
self.N = 1000000
self.K = 10000
self.key1 = np.random.randint(0, self.K, size=self.N)
self.df_int = DataFrame({'key1': self.key1})
self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K,
dtype=bool)
for i in range(10)})
self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

def time_frame_drop_dups(self):
self.df.drop_duplicates(['key1', 'key2'])
N = 1000000
K = 10000
key1 = np.random.randint(0, K, size=N)
self.df_int = DataFrame({'key1': key1})
self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
dtype=bool))

def time_frame_drop_dups_inplace(self):
self.df.drop_duplicates(['key1', 'key2'], inplace=True)
def time_frame_drop_dups(self, inplace):
self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

def time_frame_drop_dups_na(self):
self.df2.drop_duplicates(['key1', 'key2'])
def time_frame_drop_dups_na(self, inplace):
self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

def time_frame_drop_dups_na_inplace(self):
self.df2.drop_duplicates(['key1', 'key2'], inplace=True)
def time_series_drop_dups_int(self, inplace):
self.s.drop_duplicates(inplace=inplace)

def time_series_drop_dups_int(self):
self.s.drop_duplicates()
def time_series_drop_dups_string(self, inplace):
self.s_str.drop_duplicates(inplace=inplace)

def time_series_drop_dups_string(self):
self.s2.drop_duplicates()
def time_frame_drop_dups_int(self, inplace):
self.df_int.drop_duplicates(inplace=inplace)

def time_frame_drop_dups_int(self):
self.df_int.drop_duplicates()

def time_frame_drop_dups_bool(self):
self.df_bool.drop_duplicates()

#----------------------------------------------------------------------
# blog "pandas escaped the zoo"
def time_frame_drop_dups_bool(self, inplace):
self.df_bool.drop_duplicates(inplace=inplace)


class Align(object):
# blog "pandas escaped the zoo"
goal_time = 0.2

def setup(self):
n = 50000
indices = tm.makeStringIndex(n)
subsample_size = 40000

def sample(values, k):
sampler = np.arange(len(values))
shuffle(sampler)
return values.take(sampler[:k])

self.x = Series(np.random.randn(50000), indices)
self.x = Series(np.random.randn(n), indices)
self.y = Series(np.random.randn(subsample_size),
index=sample(indices, subsample_size))
index=np.random.choice(indices, subsample_size,
replace=False))

def time_align_series_irregular_string(self):
(self.x + self.y)
self.x + self.y


class LibFastZip(object):

goal_time = 0.2

def setup(self):
self.N = 10000
self.K = 10
self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
self.col_array_list = list(self.df.values.T)

self.df2 = self.df.copy()
self.df2.ix[:10000, :] = np.nan
self.col_array_list2 = list(self.df2.values.T)
N = 10000
K = 10
key1 = tm.makeStringIndex(N).values.repeat(K)
key2 = tm.makeStringIndex(N).values.repeat(K)
col_array = np.vstack([key1, key2, np.random.randn(N * K)])
col_array2 = col_array.copy()
col_array2[:, :10000] = np.nan
self.col_array_list = list(col_array)
self.col_array_list2 = list(col_array2)

def time_lib_fast_zip(self):
lib.fast_zip(self.col_array_list)
[remaining unchanged lines collapsed]

0 comments on commit 95e79a7

Please sign in to comment.