forked from Trusted-AI/AIX360
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenerateData.py
411 lines (342 loc) · 16.9 KB
/
GenerateData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# This file will generate a synthetic dataset to predict employee attrition
# Like most datasets it will have a feature vector and a Y label for each instance.
# However, unlike most datasets it will also have an Explanation (E) for each instance, encoded as an non-negative integer.
# This is motivated by the TED framework, but can be used by other explainability algorithms as a metric for explainability
# See the AIES'19 paper by Hind et al for more information on the TED framework.
# See the tutorial notebook TED_Cartesian_test for information about how to use this dataset and the TED framework.
# The comments in this code also provide some insight into how this dataset is generated
import random
from random import choices
import pandas as pd
Any = -99 # This is only applicable in the rule
Low = -1 # These 3, Low, Med, High, can be values in the dataset and are used in the rules
Med = -2
High = -3
Yes = -10 # This is the positive Y label
No = -11 # This is the negative Y label
Random = -12 # This signfiies a random choice should be made for the Y label (either Yes or No) ]
# Features, values, and distribution, details below
featureThresholds = [
# 1 Position: 4(5%), 3(20%), 2(30%), 1(45%)
[4, [0.05, 0.20, 0.30, 0.45]],
# 2 Organization "Org": 3(30%); 2(30%); 1(40%)
[3, [0.30, 0.30, 0.40]],
# 3 Potential "Pot": Yes (50%), No (50%)
[2, [0.50, 0.50]],
# 4 Rating value "Rat": High(15%), Med(80%), Low(5%)
[3, [0.15, 0.80, 0.05]],
# 5 Rating Slope "Slope": High (15%), Med(80%), Low(5%)
[3, [0.15, 0.80, 0.05]],
# 6 Salary Competitiveness "Sal": High (10%); Med(70%); Low(20%)
[3, [0.10, 0.70, 0.20]],
# 7 Tenure Low "TenL" & High Values "TenH": [0..360], 30% in 0..24; 30% in 25..60; 40% in 61..360
[3, [0.30, 0.30, 0.40], [[0, 24], [25, 60], [61, 360]]],
# 8 Position Tenure Low "BTenL" & High Values "BTenH": [0..360], 70% in 0..12; 20% in 13..24; 10% in 25..360
# Position tenure needs to be lower than tenure, ensured in generation code below
[3, [0.70, 0.20, 0.10], [[0, 12], [13, 24], [25, 360]]]
]
# Some convenient population lists
HighMedLowPopulation = [High, Med, Low]
YesNoPopulation = [Yes, No]
Index3Population = [0, 1, 2]
Integer4Population = [4, 3, 2, 1]
Integer3Population = [3, 2, 1]
# Rules used to label a feature vector with a label and an explanation
# Format: features, label, explanation #, Explanation String
RetentionRules = [
#POS ORG Pot RAT Slope SALC TENL H BTEN LH
[Any, 1, Any, High, Any, Low, Any, Any, Any, Any, #0
Yes, 2, "Seeking Higher Salary in Org 1"],
[1, 1, Any, Any, Any, Any, Any, Any, 15, Any, #1
Yes, 3, "Promotion Lag, Org 1, Position 1"],
[2, 1, Any, Any, Any, Any, Any, Any, 15, Any, #2
Yes, 3, "Promotion Lag, Org 1, Position 2"],
[3, 1, Any, Any, Any, Any, Any, Any, 15, Any, #3
Yes, 3, "Promotion Lag, Org 1, Position 3"],
[1, 2, Any, Any, Any, Any, Any, Any, 20, Any, #4
Yes, 4, "Promotion Lag, Org 2, Position 1"],
[2, 2, Any, Any, Any, Any, Any, Any, 20, Any, #5
Yes, 4, "Promotion Lag, Org 2, Position 2"],
[3, 2, Any, Any, Any, Any, Any, Any, 30, Any, #6
Yes, 5, "Promotion Lag, Org 2, Position 3"],
[1, 3, Any, Any, Any, Any, Any, Any, 20, Any, #7
Yes, 6, "Promotion Lag, Org 3, Position 1"],
[2, 3, Any, Any, Any, Any, Any, Any, 30, Any, #8
Yes, 7, "Promotion Lag, Org 3, Position 2"],
[3, 3, Any, Any, Any, Any, Any, Any, 30, Any, #9
Yes, 7, "Promotion Lag, Org 3, Position 3"],
[1, 1, Any, Any, Any, Any, 0, 12, Any, Any, #10
Yes, 8, "New employee, Org 1, Position 1"],
[2, 1, Any, Any, Any, Any, 0, 12, Any, Any, #11
Yes, 8, "New employee, Org 1, Position 2"],
[3, 1, Any, Any, Any, Any, 0, 30, Any, Any, #12
Yes, 9, "New employee, Org 1, Position 3"],
[1, 2, Any, Any, Any, Any, 0, 24, Any, Any, #13
Yes, 10, "New employee, Org 2, Position 1"],
[2, 2, Any, Any, Any, Any, 0, 30, Any, Any, #14
Yes, 11, "New employee, Org 2, Position 2"],
[Any, 1, Any, Low, High, Any, Any, Any, Any, Any, #15
Yes, 13, "Disappointing evaluation, Org 1"],
[Any, 2, Any, Low, High, Any, Any, Any, Any, Any, #16
Yes, 14, "Disappointing evaluation, Org 2"],
[Any, Any, Yes, Med, High, Low, Any, Any, Any, Any, #17
Yes, 15, "Compensation doesn't match evaluations, Med rating"],
[Any, Any, Yes, High, High, Low, Any, Any, Any, Any, #18
Yes, 15, "Compensation doesn't match evaluations, High rating"],
[Any, 1, Yes, Med, High, Med, Any, Any, Any, Any, #19
Yes, 16, "Compensation doesn't match evaluations, Org 1, Med rating"],
[Any, 2, Yes, Med, High, Med, Any, Any, Any, Any, #20
Yes, 16, "Compensation doesn't match evaluations, Org 2, Med rating"],
[Any, 1, Yes, High, High, Med, Any, Any, Any, Any, #21
Yes, 16, "Compensation doesn't match evaluations, Org 1, High rating"],
[Any, 2, Yes, High, High, Med, Any, Any, Any, Any, #22
Yes, 16, "Compensation doesn't match evaluations, Org 2, High rating"],
[Any, 1, Any, Any, Med, Med, 120, 180, Any, Any, #23
Yes, 17, "Mid-career crisis, Org 1"],
[Any, 2, Yes, Any, Any, Med, 130, 190, Any, Any, #24
Yes, 18, "Mid-career crisis, Org 2"]
]
def ruleValToString(val):
""" Convert the value passed into a string """
if val == Any :
return "Any"
elif val == Low :
return "Low"
elif val == Med :
return "Med"
elif val == High :
return "High"
elif val == Yes :
return "Yes"
elif val == No :
return "No"
elif val == Random :
return "Random"
else :
return str(val)
def printFeatureStringHeader() :
""" Print the feature headings """
print(" Feature Headings")
print("[Pos, Org, Pot, Rating, Slope, Salary Competitiveness, Tenure, Position Tenure]")
def featuresToString(featureVector) :
""" Convert a feature vector into is string format"""
val = "["
for i in range(0, 2) : # These features are just ints, Position, Organization
val += str(featureVector[i])
val += " "
for i in range(2, 6) : # show encoding for these: Potential, Rating, Rating Slope, Salary Competiveness
val += ruleValToString(featureVector[i])
val += " "
for i in range(6, 8) : # These features are just ints: Tenure and Position Tenure
val += str(featureVector[i])
val += " "
val += "]"
return val
def printRule(rule) :
""" Print the passed rule """
print("Rule: ", end='')
for i in rule[0:1]: # ints or Any: Position and Organization
if i == Any:
print(ruleValToString(i) + ", ", end='')
for i in rule[2:5]: # encoded: Potentional, Rating, Rating Slope, Salary Competitiveness
print(ruleValToString(i) + ", ", end='')
for i in rule[6:9]: # next 4 are ints or ANY: Tenure Low, Tenure High, Position Tenure Low, Position Tenure High
if i == Any :
print(ruleValToString(i) + ", ", end='')
else :
print(str(i) + ", ", end='')
print("==> "+ ruleValToString(rule[10]) + "[" + str(rule[11]) + "] " + str(rule[12]))
def printRules(rules) :
""" print all rules"""
for r in rules:
printRule(r)
########################################################################
def chooseRangeValue(thresholds, rangeList):
""" Generate a random value based on the probability weights (thresholds) and list of ranges passed
Args:
thresholds : list of probalities for each choice
rangeList: a list of pair lists giving the lower and upper bounds to choose value from
"""
# pick a number 1..3 from weights
rangeVal = choices(Index3Population, thresholds)
# get the appropriate range given rangeVal
interval = rangeList[rangeVal[0]]
# construct a population list from the result
intervalPopulation = list(range(interval[0], interval[1]))
# construct a equally prob weights list
numElements = interval[1] - interval[0]
probVal = 1.0 / numElements
probList = [probVal] * numElements
# now choose the value from the population based on the weights
val = choices(intervalPopulation, probList)
return val[0]
def chooseValueAndAppend(instance, population, weights) :
""" Choose a random value from the population using weights list and append it to the passed instance
"""
val = choices(population, weights)
instance.append(val[0])
def generateFeatures(numInstances) :
""" generate the features (X) values for the dataset
Args:
numInstances (int) : number of instances to genreate
Returns:
dataset (list of lists) : the dataset with features, but no labels or explanations yet
"""
assert(numInstances > 0)
dataset = []
for i in range(numInstances) :
instance = []
#POS ORG Pot Rating Slope SALC TENL H BTEN LH
chooseValueAndAppend(instance, Integer4Population, featureThresholds[0][1]) # Position
chooseValueAndAppend(instance, Integer3Population, featureThresholds[1][1]) # Org
chooseValueAndAppend(instance, YesNoPopulation, featureThresholds[2][1]) # Potential
chooseValueAndAppend(instance, HighMedLowPopulation, featureThresholds[3][1]) # Rating
chooseValueAndAppend(instance, HighMedLowPopulation, featureThresholds[4][1]) # Rating slope
chooseValueAndAppend(instance, HighMedLowPopulation, featureThresholds[5][1]) # Sal competitiveness
val1 = chooseRangeValue(featureThresholds[6][1], featureThresholds[6][2]) # Tenure
instance.append(val1)
# Position tenure needs to be <= Tenure
val2 = chooseRangeValue(featureThresholds[7][1], featureThresholds[7][2]) # Pos Tenure
if val2 > val1 :
val2 = val1
instance.append(val2)
dataset.append(instance)
return dataset
#####################################################################################################
def match(ruleVal, featureVal) :
""" Check if passed ruleVal matches the featureVal or if ruleVal is Any, which matches everything
"""
# print("Match called: "+ ruleValToString(ruleVal) + " " + ruleValToString(featureVal))
if ruleVal == Any :
return True
return (ruleVal == featureVal)
def intervalMatch(ruleValLower, ruleValUpper, featureVal) :
""" Check to see if featureVal is in the interval defined by [ruleValLower, ruleValUpper)
"""
# Any in lower bound matches all values, (upper bound doesn't matter)
if ruleValLower == Any :
return True
if ruleValLower <= featureVal :
# Any in upper bound means infinitity
if featureVal < ruleValUpper or ruleValUpper == Any :
return True
return False
def ruleMatch(rule, featureVector) :
""" Determine if the passed featureVector matches the passed rule
"""
if (False) :
print("ruleMatch called, ", end="")
printRule(rule)
print(" feature vector: " + featuresToString(featureVector) )
for i in range(0, 6) : # loop over first 6 features, 0..5
if not match(rule[i], featureVector[i]) : # if we don't find a feature match, the rule doesn't match
# print("Didn't match feature #", i, ruleValToString(featureVector[i]))
return False
# These features are interval-based, so need a different matching routine
if not intervalMatch(rule[6], rule[7], featureVector[6]) : # rule[6] and rule[7] have the lower and upper bounds of interval
# print("Didn't match feature # 6: ", featureVector[6])
return False
if not intervalMatch(rule[8], rule[9], featureVector[7]) : # rule[8] and rule[9] have the lower and upper bounds of interval
# print("Didn't match feature # 7: ", featureVector[7])
return False
# print("Matched all features")
return True # if we didn't find a non-match by now, we found a match
def findRule(instance, ruleSet) :
""" find the rule(s) that matches the feture vector passed
"""
# print("*Looking for rule match for Feature vector: " + featuresToString(instance))
ruleNumber = 0 # counter to track rule number
ruleMatches = [] # will hold all rule numbers that matched
for rule in ruleSet :
if (ruleMatch(rule, instance)) :
ruleMatches.append(ruleNumber)
counts[ruleNumber] += 1 # update global histogram of rule matches for stats reporting
if (False) :
print(" ruleMatch found at rule #" + str(ruleNumber))
print(" ", end="")
printRule(rule)
ruleNumber += 1
return ruleMatches
def countAnys(rule) :
""" Count the number of Anys in the passed rule. An "Any" is a wildcard that matches all values
"""
count = 0
for feature in RetentionRules[rule] :
if feature == Any :
count += 1
return count
def pickBestRule(ruleList) :
""" Choose the rule with the least number of Any's in it
"""
assert(len(ruleList) > 0)
# print("ruleList: ", ruleList)
minAnys = len(RetentionRules[0]) + 1 # initialize to a value larger than possible # of Anys in a rule
bestRule = -1
for rule in ruleList :
# Count # of Any's in rule # rule
count = countAnys(rule)
if count < minAnys :
minAnys = count
bestRule = rule
assert(bestRule != -1) # We should find a best rule
return bestRule
def addLabelsAndExplanations(dataset, rules) :
""" This function will use a ruleset to add labels (Y) and explanations/rules (E) to a passed dataset
Arg:
dataset (list of lists) : a list of feature vectors (list)
rules (list of lists) : a list of rules
"""
noMatches = 0 # Counters to record how often there are no (Yes) matches, 1 (Yes) match, and multiple (Yes) matches
multiMatches = 0
oneMatches = 0
for instance in dataset :
ruleMatches = findRule(instance, rules)
if len(ruleMatches) == 0 : # We didn't match a (Yes) rule, so this ia No situation
rule = NoRiskRuleNum
label = No
noMatches +=1
elif len(ruleMatches) > 1 : # Matched multiple Yes rules, need to pick one
rule = pickBestRule(ruleMatches)
assert(rule >= 0 and rule < len(rules)) # Ensure rule number is valid
label = Yes
multiMatches += 1
else : # Found 1 Yes rule match, it's the winner
rule = ruleMatches[0]
label = Yes
oneMatches += 1
assert(rule >= 0 and rule < len(rules)) # Ensure rule number is valid
# print("Label: " + ruleValToString(label) + ", Rule: " + ruleValToString(rule))
instance.append(label)
instance.append(rule) # add the label and explanation (rule #) to the featureVector
if (True) :
print("\nRule matching statistics: ")
totalYes = oneMatches + multiMatches
total = oneMatches + multiMatches + noMatches
print(" Yes Labels: {}/{} ({:.2f}%)".format(totalYes, total, totalYes/total*100))
print(" Matched 1 Yes rule: {}/{} ({:.2f}%)".format(oneMatches, totalYes, oneMatches/totalYes*100))
print(" Matched multiple Yes rules: {}/{} ({:.2f}%)".format(multiMatches, totalYes, multiMatches/totalYes*100))
print(" No Laels: {}/{} ({:.2f}%)".format(noMatches, total, noMatches/total*100))
def printRuleUsage(counts, total) :
print("\nHistogram of rule usage:")
ruleNum = 0
for num in counts :
print(" Rule {} was used {} times, {:.2f}%".format(ruleNum, num, num/total*100))
ruleNum += 1
numRentionRules = len(RetentionRules)
counts = [0]*numRentionRules
NoRiskRuleNum = numRentionRules # the No Risk to leave rule is 1 more than than the total rules [0..]
random.seed(1)
# printFeatureStringHeader()
numInstances = 10000
dataset = generateFeatures(numInstances)
addLabelsAndExplanations(dataset, RetentionRules)
printRuleUsage(counts, numInstances)
# insert TED headers
NumFeatures = len(featureThresholds)
header = list(range(NumFeatures))
header.append("Y")
header.append("E")
dataset.insert(0, header)
# write to csv file
my_df = pd.DataFrame(dataset)
my_df.to_csv('Retention.csv', index=False, header=False)