-
Notifications
You must be signed in to change notification settings - Fork 20
/
features.py
96 lines (74 loc) · 3.83 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from enum import Enum
import numpy as np
from sensai import VectorRegressionModel
from sensai.data_transformation import DFTNormalisation, SkLearnTransformerFactoryFactory
from sensai.featuregen import FeatureGeneratorRegistry, FeatureGeneratorTakeColumns, FeatureGenerator
from .data import *
class FeatureName(Enum):
    """Identifiers of the features that can be requested from the feature registry.

    Each member's value is the feature's string identifier.
    """

    # features taken directly from columns of the input data
    MUSICAL_DEGREES = "musical_degrees"
    MUSICAL_CATEGORIES = "musical_categories"
    LOUDNESS = "loudness"
    TEMPO = "tempo"
    DURATION = "duration"
    YEAR = "year"
    # feature derived from the popularity labels of an artist's songs
    MEAN_ARTIST_FREQ_POPULAR = "mean_artist_freq_popular"
class FeatureGeneratorMeanArtistPopularity(FeatureGenerator):
    """
    Generates the single feature column "mean_artist_popularity": the fraction of an artist's
    songs (as seen during fitting) whose target class equals CLASS_POPULAR.

    While the model is being fitted, the value for a data point is computed from the artist's
    *other* songs only, i.e. the data point's own label is subtracted from the artist's totals;
    the value is NaN if the artist has no other song.
    Otherwise (inference), the artist's mean over all fitted songs is used, with NaN for artists
    that were not present during fitting.

    The generated column is normalised using a MaxAbsScaler.
    """

    def __init__(self):
        super().__init__(normalisation_rule_template=DFTNormalisation.RuleTemplate(
            transformer_factory=SkLearnTransformerFactoryFactory.MaxAbsScaler()))
        self.col_target = COL_GEN_POPULARITY_CLASS
        # binary labels per data point; only retained between _fit and the training-time _generate
        self._y = None
        # per-artist statistics computed in _fit (indexed by artist name)
        self._values = None

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Computes, per artist, the number of popular songs ("sum"), the number of songs ("cnt")
        and their ratio ("mean"), and stores the binarised labels for use in `_generate`.

        :param x: the input data; must contain the artist name column
        :param y: the target data; must contain the column `self.col_target`
        :param ctx: the fitting context (unused here)
        """
        df: pd.DataFrame = pd.concat([x, y], axis=1)[[COL_ARTIST_NAME, self.col_target]]
        # binarise the target: 1 for popular songs, 0 for all others
        df[self.col_target] = df[self.col_target].apply(lambda cls: 1 if cls == CLASS_POPULAR else 0)

        target_by_artist = df.groupby(COL_ARTIST_NAME)[self.col_target]
        s = target_by_artist.sum().rename("sum")
        c = target_by_artist.count().rename("cnt")
        m = (s / c).rename("mean")

        self._y = df[[self.col_target]]
        self._values = pd.concat([s, c, m], axis=1)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        :param df: the data for which to generate the feature; must contain the artist name column
        :param ctx: the model for which features are generated; its `is_being_fitted()` result
            selects between the leave-one-out (training) and the plain per-artist mean (inference)
        :return: a data frame with the single column "mean_artist_popularity", indexed like `df`
        """
        ctx: VectorRegressionModel
        is_training = ctx.is_being_fitted()

        if is_training:
            def val_t(t):
                # leave-one-out mean: remove the data point's own label from the artist's totals
                lookup = self._values.loc[getattr(t, COL_ARTIST_NAME)]
                s = lookup["sum"] - self._y.loc[t.Index][self.col_target]
                c = lookup["cnt"] - 1
                if c == 0:
                    # the artist has no other songs, so there is nothing to average
                    return np.nan
                return s / c

            values = [val_t(t) for t in df.itertuples()]

            # clean up: the labels and the raw totals are needed for training-time generation only.
            # DataFrame.drop returns a new frame, so the result must be assigned back
            # (previously the result was discarded and the columns were never removed).
            self._y = None
            self._values = self._values.drop(columns=["sum", "cnt"])
        else:
            artist_means = self._values["mean"]

            def val_i(artist_name):
                try:
                    return artist_means.loc[artist_name]
                except KeyError:
                    # artist was not seen during fitting
                    return np.nan

            values = df[COL_ARTIST_NAME].apply(val_i)

        return pd.DataFrame({"mean_artist_popularity": values}, index=df.index)
# Registry mapping each FeatureName to a factory which creates the corresponding feature generator
registry = FeatureGeneratorRegistry()


def _create_musical_degrees_generator():
    """Creates the generator taking the musical degree columns, using a rule template with skip=True."""
    skip_rule = DFTNormalisation.RuleTemplate(skip=True)
    return FeatureGeneratorTakeColumns(COLS_MUSICAL_DEGREES, normalisation_rule_template=skip_rule)


def _create_musical_categories_generator():
    """Creates the generator taking the musical category columns, all declared as categorical."""
    return FeatureGeneratorTakeColumns(COLS_MUSICAL_CATEGORIES,
        categorical_feature_names=COLS_MUSICAL_CATEGORIES)


def _standard_scaled_column_factory(column):
    """Returns a factory creating a generator which takes `column` and normalises it via a StandardScaler."""
    def create():
        scaler_rule = DFTNormalisation.RuleTemplate(
            transformer_factory=SkLearnTransformerFactoryFactory.StandardScaler())
        return FeatureGeneratorTakeColumns(column, normalisation_rule_template=scaler_rule)

    return create


registry.register_factory(FeatureName.MUSICAL_DEGREES, _create_musical_degrees_generator)
registry.register_factory(FeatureName.MUSICAL_CATEGORIES, _create_musical_categories_generator)
# numeric columns which are all standardised in the same way
registry.register_factory(FeatureName.LOUDNESS, _standard_scaled_column_factory(COL_LOUDNESS))
registry.register_factory(FeatureName.TEMPO, _standard_scaled_column_factory(COL_TEMPO))
registry.register_factory(FeatureName.DURATION, _standard_scaled_column_factory(COL_DURATION_MS))
registry.register_factory(FeatureName.YEAR, _standard_scaled_column_factory(COL_YEAR))
# the generator class itself serves as the (zero-argument) factory
registry.register_factory(FeatureName.MEAN_ARTIST_FREQ_POPULAR, FeatureGeneratorMeanArtistPopularity)