-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathbase.py
5559 lines (4979 loc) · 254 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
from __future__ import annotations
from abc import (
ABC,
abstractmethod,
)
from collections.abc import (
Callable,
Collection,
Iterable,
Mapping,
MutableMapping,
Sized,
)
from pathlib import Path
import typing as t
from uuid import UUID
import warnings
import numpy as np
from pandas import DataFrame
from howso.utilities import internals, utilities as util
from howso.utilities.constants import _RENAMED_DETAIL_KEYS, _RENAMED_DETAIL_KEYS_EXTRA # type: ignore reportPrivateUsage
from howso.utilities.feature_attributes.base import (
MultiTableFeatureAttributes,
SingleTableFeatureAttributes,
)
from howso.utilities.features import serialize_cases
from howso.utilities.monitors import ProgressTimer
from .exceptions import (
HowsoError,
UnsupportedArgumentWarning,
)
from .schemas import (
HowsoVersion,
Project,
Reaction,
Session,
Trainee,
TraineeRuntime,
TraineeRuntimeOptions,
)
from .typing import (
AblationThresholdMap,
CaseIndices,
Cases,
Distances,
Evaluation,
GenerateNewCases,
LibraryType,
Mode,
NewCaseThreshold,
Persistence,
Precision,
SeriesIDTracking,
TabularData2D,
TabularData3D,
TargetedModel,
TrainStatus,
)
if t.TYPE_CHECKING:
from .cache import TraineeCache
from .configuration import HowsoConfiguration
class AbstractHowsoClient(ABC):
"""The base definition of the Howso client interface."""
configuration: "HowsoConfiguration"
"""The client configuration options."""
ERROR_MESSAGES = {
"missing_session": "There is currently no active session. Begin a new session to continue."
}
"""Mapping of error code to default error message."""
WARNING_MESSAGES = {
"invalid_precision": (
'Supported values for `precision` are "exact" and "similar". The operation will be completed as '
'if the value of `%s` is "exact".')
}
"""Mapping of warning type to default warning message."""
SUPPORTED_PRECISION_VALUES = ["exact", "similar"]
"""Allowed values for precision."""
@property
def batch_scaler_class(self):
    """
    The batch scaling manager class used by operations that batch requests.

    Returns
    -------
    type
        ``internals.BatchScalingManager``. ``train`` instantiates this class
        with an initial batch size and a progress timer. Exposed as a
        property, presumably so subclasses can substitute another manager.
    """
    return internals.BatchScalingManager
@property
def verbose(self) -> bool:
    """Whether verbose output is enabled, as set on the client configuration."""
    # Retained for backwards compatibility: the flag itself lives on the
    # configuration object, this property simply forwards to it.
    config = self.configuration
    return config.verbose
@property
@abstractmethod
def trainee_cache(self) -> TraineeCache:
    """
    Return the Trainee cache.

    Returns
    -------
    TraineeCache
        The cache that ``resolve_feature_attributes`` reads per-Trainee
        entries from (via ``get_item``).
    """
@property
@abstractmethod
def active_session(self) -> Session:
    """
    Return the active Session.

    Returns
    -------
    Session
        The currently active session. Operations such as ``train``,
        ``impute`` and ``move_cases`` raise a ``HowsoError`` with code
        ``"missing_session"`` when this value is falsy.
    """
@property
@abstractmethod
def train_initial_batch_size(self) -> int:
    """
    The default number of cases in the first train batch.

    Returns
    -------
    int
        The starting batch size ``train`` uses when neither ``batch_size``
        nor ``initial_batch_size`` is supplied by the caller.
    """
@property
@abstractmethod
def react_initial_batch_size(self) -> int:
    """
    The default number of cases in the first react batch.

    Returns
    -------
    int
        The starting batch size for batched react operations.
    """
@staticmethod
def sanitize_for_json(payload: t.Any, *, exclude_null: bool = False) -> t.Any:
    """
    Make a payload safe for JSON serialization.

    The payload is first passed through ``internals.sanitize_for_json``.
    Optionally, top-level keys whose value is ``None`` are then removed.

    Parameters
    ----------
    payload : Any
        The payload to sanitize.
    exclude_null : bool, default False
        When True and the sanitized payload is a Mapping, drop its top
        level keys whose value is None. Nested mappings are not filtered.

    Returns
    -------
    Any
        The sanitized payload. When filtering is applied the result is a
        new ``dict``.
    """
    sanitized = internals.sanitize_for_json(payload)
    # Filtering only applies to mappings and only when explicitly requested.
    if not exclude_null or not isinstance(sanitized, Mapping):
        return sanitized
    return {key: value for key, value in sanitized.items() if value is not None}
def resolve_feature_attributes(self, trainee_id: str) -> dict[str, dict]:
    """
    Look up the feature attributes of a Trainee, using the cache.

    If the Trainee has a cache entry its feature attributes are returned
    from there. Otherwise the Trainee is resolved first, and its feature
    attributes are fetched and stored in the cache entry when they are
    not already present.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.

    Returns
    -------
    dict
        The Trainee feature attributes.
    """
    entry = self.trainee_cache.get_item(trainee_id, None)
    if entry is None:
        # No cache entry yet: resolving the Trainee is expected to populate
        # the cache under the resolved id.
        trainee_id = self._resolve_trainee(trainee_id).id
        entry = self.trainee_cache.get_item(trainee_id)

    attributes = entry["feature_attributes"]
    if attributes is None:
        # Entry exists but the attributes were never fetched; fetch and
        # remember them for subsequent calls.
        attributes = entry["feature_attributes"] = self.get_feature_attributes(trainee_id)
    return attributes
@abstractmethod
def _resolve_trainee(self, trainee_id: str, **kwargs) -> Trainee:
    """
    Resolve a Trainee resource.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee to resolve.
    **kwargs
        Additional implementation-specific options. The callers visible in
        this class pass only ``trainee_id``.

    Returns
    -------
    Trainee
        The Trainee object. Callers in this class read its ``id`` attribute
        and use that id for subsequent operations.
    """
@abstractmethod
def _auto_persist_trainee(self, trainee_id: str):
    """
    Automatically persists the Trainee if the persistence state allows.

    Called by the mutating operations of this class (for example ``train``
    and ``remove_cases``) after the engine call completes.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee to persist.
    """
@abstractmethod
def _store_session(self, trainee_id: str, session: Session):
    """
    Store a session for a Trainee.

    ``train`` calls this with the active session once all batches have
    been sent.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.
    session : Session
        The session to store.
    """
@abstractmethod
def _should_react_batch(self, params: dict, total_size: int) -> bool:
    """
    Determine if given react should be batched.

    Parameters
    ----------
    params : dict
        The react request parameters.
    total_size : int
        The size of the cases being reacted to.

    Returns
    -------
    bool
        Whether a react should be batched.
    """
@abstractmethod
def _get_trainee_thread_count(self, trainee_id: str) -> int:
    """
    Get the number of available cpu threads a Trainee has access to.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.

    Returns
    -------
    int
        The allocated number of cpu threads for a Trainee.
    """
@abstractmethod
def execute(self, trainee_id: str, label: str, payload: t.Any, **kwargs) -> t.Any:
    """
    Execute a label in Howso engine.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.
    label : str
        The label to execute.
    payload : Any
        The payload to send to label.
    **kwargs
        Additional implementation-specific options. The callers visible in
        this class do not pass any.

    Returns
    -------
    Any
        The label's response.
    """
@abstractmethod
def execute_sized(self, trainee_id: str, label: str, payload: t.Any, **kwargs) -> tuple[t.Any, int, int]:
    """
    Execute a label in Howso engine and return the request and response sizes.

    ``train`` feeds the two returned sizes to the batch scaler when the
    batch size is being scaled automatically.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.
    label : str
        The label to execute.
    payload : Any
        The payload to send to label.
    **kwargs
        Additional implementation-specific options. The callers visible in
        this class do not pass any.

    Returns
    -------
    Any
        The label's response.
    int
        The request payload size.
    int
        The response payload size.
    """
@abstractmethod
def is_tracing_enabled(self, trainee_id: str) -> bool:
    """
    Get if tracing is enabled for Trainee.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.

    Returns
    -------
    bool
        True, if tracing is enabled for provided Trainee.
    """
@abstractmethod
def get_version(self) -> HowsoVersion:
    """
    Get Howso version.

    Returns
    -------
    HowsoVersion
        The version information reported by the client implementation.
    """
@abstractmethod
def create_trainee(
    self,
    name: t.Optional[str] = None,
    features: t.Optional[Mapping[str, Mapping]] = None,
    *,
    id: t.Optional[str | UUID] = None,
    library_type: t.Optional[LibraryType] = None,
    max_wait_time: t.Optional[int | float] = None,
    metadata: t.Optional[MutableMapping[str, t.Any]] = None,
    overwrite_trainee: bool = False,
    persistence: Persistence = "allow",
    project: t.Optional[str | Project] = None,
    resources: t.Optional[Mapping[str, t.Any]] = None,
    runtime: t.Optional[TraineeRuntimeOptions] = None
) -> Trainee:
    """
    Create a trainee on the Howso service.

    A trainee can be thought of as "model" in traditional ML sense.

    Implementations of the client may honor different subsets of these
    parameters.

    Parameters
    ----------
    name : str, optional
        A name to use for the Trainee.
    features : dict, optional
        The Trainee feature attributes.
    id : str or UUID, optional
        A custom unique identifier to use with the Trainee, if the client
        implementation supports manually assigning the id.
    library_type : str, optional
        The library type of the Trainee, if the client implementation
        supports dynamically selecting this.

        .. deprecated:: 31.0
            Pass via `runtime` instead.
    max_wait_time : int or float, optional
        The number of seconds to wait for a trainee to be created
        and become available before aborting gracefully, if the client
        supports this.
        Set to `0` (or None) to wait as long as the system-configured maximum for
        sufficient resources to become available, which is typically 20 minutes.
        NOTE(review): earlier documentation listed a default of 30 seconds,
        but this signature defaults to None; the effective default is
        implementation-specific - confirm.
    metadata : MutableMapping, optional
        Arbitrary jsonifiable data to store along with the Trainee.
    overwrite_trainee : bool, default False
        If True, and if a trainee with name `name` already exists,
        the old trainee will be deleted and the new trainee created in
        its place.
    persistence : {"allow", "always", "never"}, default "allow"
        The requested persistence state of the Trainee.
    project : str or Project, optional
        The project to create this Trainee under, if the client
        implementation supports projects.
    resources : Mapping, optional
        Customize the resources provisioned for the Trainee instance.

        .. deprecated:: 80.0
            Pass via `runtime` instead.
    runtime : TraineeRuntimeOptions, optional
        Runtime options used by the Trainee, including the library type
        and resource and scaling options, if the client implementation
        supports setting these. Takes precedence over `library_type` and
        `resources` if these options are set.

    Returns
    -------
    Trainee
        The trainee object that was created.
    """
@abstractmethod
def update_trainee(self, trainee: Mapping | Trainee) -> Trainee:
    """
    Update an existing trainee in the Howso service.

    Parameters
    ----------
    trainee : Mapping or Trainee
        The Trainee to update, given either as a Trainee object or as a
        mapping.

    Returns
    -------
    Trainee
        The Trainee object returned by the implementation.
    """
@abstractmethod
def get_trainee(self, trainee_id: str) -> Trainee:
    """
    Get an existing trainee from the Howso service.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.

    Returns
    -------
    Trainee
        The Trainee object.
    """
@abstractmethod
def get_trainee_runtime(self, trainee_id: str) -> TraineeRuntime:
    """
    Get runtime details of a Trainee.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.

    Returns
    -------
    TraineeRuntime
        The Trainee runtime details, including the Trainee version and
        configuration parameters.
    """
@abstractmethod
def query_trainees(self, search_terms: t.Optional[str] = None) -> list[dict]:
    """
    Query accessible Trainees.

    Parameters
    ----------
    search_terms : str, optional
        Terms to filter the results by. How the terms are matched is up to
        the implementation.

    Returns
    -------
    list of dict
        One dict per matching Trainee.
    """
@abstractmethod
def delete_trainee(self, trainee_id: str, *, file_path: t.Optional[Path | str] = None):
    """
    Delete a Trainee from the Howso service.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee to delete.
    file_path : Path or str, optional
        NOTE(review): its meaning is not visible from this interface;
        presumably the location of persisted Trainee files to remove -
        confirm against the implementations.
    """
@abstractmethod
def copy_trainee(
    self,
    trainee_id: str,
    new_trainee_name: t.Optional[str] = None,
    new_trainee_id: t.Optional[str] = None,
    *,
    library_type: t.Optional[LibraryType] = None,
    resources: t.Optional[Mapping[str, t.Any]] = None,
    runtime: t.Optional[TraineeRuntimeOptions] = None
) -> Trainee:
    """
    Copy a trainee in the Howso service.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee to copy.
    new_trainee_name : str, optional
        The name to give the copy.
    new_trainee_id : str, optional
        The identifier to give the copy.
    library_type : LibraryType, optional
        The library type of the copy. Presumably superseded by `runtime`,
        as documented for ``create_trainee`` - confirm.
    resources : Mapping, optional
        Resources to provision for the copy. Presumably superseded by
        `runtime`, as documented for ``create_trainee`` - confirm.
    runtime : TraineeRuntimeOptions, optional
        Runtime options for the copy.

    Returns
    -------
    Trainee
        The Trainee object of the copy.
    """
@abstractmethod
def acquire_trainee_resources(self, trainee_id: str, *, max_wait_time: t.Optional[int | float] = None):
    """
    Acquire resources for a Trainee in the Howso service.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.
    max_wait_time : int or float, optional
        Presumably the number of seconds to wait for resources to become
        available, as for ``create_trainee`` - confirm.
    """
@abstractmethod
def release_trainee_resources(self, trainee_id: str):
    """
    Release a Trainee's resources from the Howso service.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee.
    """
@abstractmethod
def persist_trainee(self, trainee_id: str):
    """
    Persist a trainee in the Howso service.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee to persist.
    """
@abstractmethod
def begin_session(self, name: str | None = 'default', metadata: t.Optional[Mapping] = None) -> Session:
    """
    Begin a new session.

    Parameters
    ----------
    name : str or None, default "default"
        The name of the new session.
    metadata : Mapping, optional
        Metadata to associate with the session.

    Returns
    -------
    Session
        The session that was started.
    """
@abstractmethod
def query_sessions(
    self,
    search_terms: t.Optional[str] = None,
    *,
    trainee: t.Optional[str | Trainee] = None,
    **kwargs
) -> list[Session]:
    """
    Query all accessible sessions.

    Parameters
    ----------
    search_terms : str, optional
        Terms to filter the results by. How the terms are matched is up to
        the implementation.
    trainee : str or Trainee, optional
        A Trainee, or Trainee identifier, to filter the sessions by.
    **kwargs
        Additional implementation-specific options.

    Returns
    -------
    list of Session
        The matching sessions.
    """
@abstractmethod
def get_session(self, session_id: str) -> Session:
    """
    Get session details.

    Parameters
    ----------
    session_id : str
        The identifier of the session.

    Returns
    -------
    Session
        The session details.
    """
@abstractmethod
def update_session(self, session_id: str, *, metadata: t.Optional[Mapping] = None) -> Session:
    """
    Update a session.

    Parameters
    ----------
    session_id : str
        The identifier of the session to update.
    metadata : Mapping, optional
        The metadata to set on the session.

    Returns
    -------
    Session
        The session returned by the implementation.
    """
def set_random_seed(self, trainee_id: str, seed: int | float | str):
    """
    Set the random seed of a Trainee.

    Resolves the Trainee, sends the seed to the engine via the
    ``set_random_seed`` label and then auto-persists the Trainee.

    Parameters
    ----------
    trainee_id : str
        The ID of the Trainee to set the random seed for.
    seed : int or float or str
        The random seed.
        Ex: ``7998``, ``"myrandomseed"``
    """
    resolved_id = self._resolve_trainee(trainee_id).id

    if self.configuration.verbose:
        print(f'Setting random seed for Trainee with id: {resolved_id}')

    request = {"seed": seed}
    self.execute(resolved_id, "set_random_seed", request)
    self._auto_persist_trainee(resolved_id)
def train(  # noqa: C901
    self,
    trainee_id: str,
    cases: TabularData2D,
    features: t.Optional[Collection[str]] = None,
    *,
    accumulate_weight_feature: t.Optional[str] = None,
    batch_size: t.Optional[int] = None,
    derived_features: t.Optional[Collection[str]] = None,
    initial_batch_size: t.Optional[int] = None,
    input_is_substituted: bool = False,
    progress_callback: t.Optional[Callable] = None,
    series: t.Optional[str] = None,
    skip_auto_analyze: bool = False,
    skip_reduce_data: bool = False,
    train_weights_only: bool = False,
    validate: bool = True,
) -> TrainStatus:
    """
    Train one or more cases into a Trainee.

    Cases are serialized and sent to the engine in batches. Unless a fixed
    ``batch_size`` is given (or ``series`` is set), the batch size is scaled
    automatically from the request/response sizes of the previous batch.

    Parameters
    ----------
    trainee_id : str
        The ID of the target Trainee.
    cases : list of list of object or pandas.DataFrame
        One or more cases to train into the model. When a DataFrame is
        given, columns of features flagged as containing unsupported data
        are dropped from it in place before training.
    features : Collection of str, optional
        The feature names corresponding to the case values.
        This parameter should be provided in the following scenarios:

            a. When cases are not in the format of a DataFrame, or
               the DataFrame does not define named columns.
            b. You want to train only a subset of columns defined in your
               cases DataFrame.
            c. You want to re-order the columns that are trained.

    accumulate_weight_feature : str, optional
        Name of feature into which to accumulate neighbors'
        influences as weight for ablated cases. If unspecified, will not
        accumulate weights.
    batch_size : int, optional
        Define the number of cases to train at once. If left unspecified,
        the batch size will be determined automatically.
    derived_features : Collection of str, optional
        Feature names for which values should be derived in the specified
        order. If this list is not provided, features with the
        'auto_derive_on_train' feature attribute set to True will be
        auto-derived. If provided an empty list, no features are derived.
        Any derived_features that are already in the 'features' list will
        not be derived since their values are being explicitly provided.
    initial_batch_size : int, optional
        Define the number of cases to train in the first batch. If
        unspecified, the value of the ``train_initial_batch_size`` property
        is used. The number of cases in following batches will be
        automatically adjusted. This value is ignored if ``batch_size`` is
        specified.
    input_is_substituted : bool, default False
        If True, assumes provided nominal feature values have
        already been substituted.
    progress_callback : callable, optional
        A callback method that will be called before each
        batched call to train and at the end of training. The method is
        given a ProgressTimer containing metrics on the progress and timing
        of the train operation.
    series : str, optional
        Name of the series to pull features and case values
        from internal series storage. If specified, trains on all cases
        that are stored in the internal series store for the specified
        series. The trained feature set is the combined features from
        storage and the passed in features. If cases is of length one,
        the value(s) of this case are appended to all cases in the series.
        If cases is the same length as the series, the value of each case
        in cases is applied in order to each of the cases in the series.
    skip_auto_analyze : bool, default False
        When true, the Trainee will not auto-analyze when appropriate.
        Instead, the return dict will have a "needs_analyze" flag if an
        `analyze` is needed.
    skip_reduce_data : bool, default False
        When true, the Trainee will not call `reduce_data` when
        appropriate. Instead, the return dict will have a
        "needs_data_reduction" flag if a call to `reduce_data` is
        recommended.
    train_weights_only : bool, default False
        When true, and accumulate_weight_feature is provided,
        will accumulate all of the cases' neighbor weights instead of
        training the cases into the model.
    validate : bool, default True
        Whether to validate the data against the provided feature
        attributes. Issues warnings if there are any discrepancies between
        the data and the features dictionary.

    Returns
    -------
    dict
        A dict containing variable keys if there are important messages to
        share from the Engine, such as 'needs_analyze' and
        'needs_data_reduction'. Otherwise, an empty dict.

    Raises
    ------
    HowsoError
        If there is no active session.
    """
    trainee_id = self._resolve_trainee(trainee_id).id
    feature_attributes = self.resolve_feature_attributes(trainee_id)

    if not self.active_session:
        raise HowsoError(self.ERROR_MESSAGES["missing_session"], code="missing_session")

    # Make sure single table dicts are wrapped by SingleTableFeatureAttributes
    if (
        isinstance(feature_attributes, Mapping) and
        not isinstance(feature_attributes, MultiTableFeatureAttributes)
    ):
        feature_attributes = SingleTableFeatureAttributes(feature_attributes, {})

    # Check to see if the feature attributes still generally describe
    # the data, and warn the user if they do not
    if isinstance(cases, DataFrame) and validate:
        try:
            feature_attributes.validate(cases)
        except NotImplementedError:
            # MultiTableFeatureAttributes does not yet support DataFrame validation
            pass

    # See if any features were inferred to have data that is unsupported by the OS.
    # Issue a warning and drop the feature before training, if so.
    unsupported_features: list[str] = []
    if isinstance(feature_attributes, MultiTableFeatureAttributes):
        # Accumulate across every table; reassigning inside the loop would
        # keep only the last table's unsupported features.
        for stfa in feature_attributes.values():
            unsupported_features.extend(
                feat for feat in stfa.keys() if stfa.has_unsupported_data(feat))
    elif isinstance(feature_attributes, SingleTableFeatureAttributes):
        unsupported_features = [feat for feat in feature_attributes.keys()
                                if feature_attributes.has_unsupported_data(feat)]
    if isinstance(cases, DataFrame):
        for feature in unsupported_features:
            # Only drop columns actually present: the feature may belong to
            # another table, may not be part of this DataFrame, or may have
            # already been dropped (DataFrame.drop raises KeyError otherwise).
            if feature not in cases.columns:
                continue
            warnings.warn(
                f'Ignoring feature {feature} as it contains values that are too '
                'large or small for your operating system. Please evaluate the '
                'bounds for this feature.')
            # NOTE: this mutates the caller's DataFrame (pre-existing behavior).
            cases.drop(feature, axis=1, inplace=True)

    util.validate_list_shape(features, 1, "features", "str")
    util.validate_list_shape(cases, 2, "cases", "list", allow_none=False)
    if features is None:
        features = internals.get_features_from_data(cases)
    serialized_cases = serialize_cases(cases, features, feature_attributes, warn=True) or []

    status = {}

    if self.configuration.verbose:
        print(f'Training session(s) on Trainee with id: {trainee_id}')

    with ProgressTimer(len(serialized_cases)) as progress:
        gen_batch_size = None
        batch_scaler = None
        if series is not None:
            # If training series, always send full size
            batch_size = len(serialized_cases)
        if not batch_size:
            # Scale the batch size automatically
            start_batch_size = initial_batch_size or self.train_initial_batch_size
            batch_scaler = self.batch_scaler_class(start_batch_size, progress)
            gen_batch_size = batch_scaler.gen_batch_size()
            batch_size = next(gen_batch_size, None)

        while not progress.is_complete and batch_size:
            if isinstance(progress_callback, Callable):
                progress_callback(progress)
            start = progress.current_tick
            end = progress.current_tick + batch_size
            response, in_size, out_size = self.execute_sized(trainee_id, "train", {
                "cases": serialized_cases[start:end],
                "accumulate_weight_feature": accumulate_weight_feature,
                "derived_features": derived_features,
                "features": features,
                "input_is_substituted": input_is_substituted,
                "series": series,
                "session": self.active_session.id,
                "skip_auto_analyze": skip_auto_analyze,
                "skip_reduce_data": skip_reduce_data,
                "train_weights_only": train_weights_only,
            })

            if response and response.get('status') == 'analyze':
                status['needs_analyze'] = True
            if response and response.get('status') == 'reduce_data':
                status['needs_data_reduction'] = True

            if batch_scaler is None or gen_batch_size is None:
                # Fixed batch size: advance the progress manually
                progress.update(batch_size)
            else:
                # Auto-scaling: report payload sizes and get the next batch size
                batch_size = batch_scaler.send(
                    gen_batch_size,
                    batch_scaler.SendOptions(None, (in_size, out_size)))

    # Final call to batch callback on completion
    if isinstance(progress_callback, Callable):
        progress_callback(progress)

    self._store_session(trainee_id, self.active_session)
    self._auto_persist_trainee(trainee_id)
    return status
def impute(
    self,
    trainee_id: str,
    features: t.Optional[Collection[str]] = None,
    features_to_impute: t.Optional[Collection[str]] = None,
    batch_size: int = 1
):
    """
    Fill in the missing values of the specified features.

    When 'features' is omitted, every feature of the trainee is used for
    imputation. When 'features_to_impute' is omitted, all features given
    by 'features' are imputed.

    Parameters
    ----------
    trainee_id : str
        The ID of the Trainee to impute.
    features : Collection of str, optional
        A list of feature names to use for imputation.
        If not specified, all features will be used.
    features_to_impute : Collection of str, optional
        A list of feature names to impute.
        If not specified, `features` will be used (see above).
    batch_size : int, default 1
        How many rows to fill before recomputing conviction.
        The default value of 1 should return the best accuracy but
        might be slower. Higher values should improve performance but may
        decrease accuracy of results.

    Raises
    ------
    HowsoError
        If there is no active session.
    """
    resolved_id = self._resolve_trainee(trainee_id).id

    if not self.active_session:
        raise HowsoError(self.ERROR_MESSAGES["missing_session"], code="missing_session")

    # Both feature lists must be flat collections of strings (or None)
    for value, label in ((features, "features"), (features_to_impute, "features_to_impute")):
        util.validate_list_shape(value, 1, label, "str")

    if self.configuration.verbose:
        print(f'Imputing Trainee with id: {resolved_id}')

    request = {
        "features": features,
        "features_to_impute": features_to_impute,
        "session": self.active_session.id,
        "batch_size": batch_size,
    }
    self.execute(resolved_id, "impute", request)
    self._auto_persist_trainee(resolved_id)
def remove_cases(
    self,
    trainee_id: str,
    num_cases: int,
    *,
    case_indices: t.Optional[CaseIndices] = None,
    condition: t.Optional[Mapping] = None,
    condition_session: t.Optional[str] = None,
    distribute_weight_feature: t.Optional[str] = None,
    precision: t.Optional[Precision] = None,
) -> int:
    """
    Removes training cases from a Trainee.

    The training cases will be completely purged from the model and
    the model will behave as if it had never been trained with them.

    Parameters
    ----------
    trainee_id : str
        The ID of the Trainee to remove cases from.
    num_cases : int
        The number of cases to remove; minimum 1 case must be removed.
        Ignored by the engine if case_indices is specified, but still
        validated to be at least 1.
    case_indices : Sequence of tuple of {str, int}, optional
        A list of tuples containing session ID and session training index
        for each case to be removed.
    condition : Mapping of str to object, optional
        The condition map to select the cases to remove that meet all the
        provided conditions. Ignored if case_indices is specified.
        If a mutable mapping is given whose ``'.session'`` value is a
        Session instance, that value is replaced in place with the
        session's id.

        .. NOTE::
            The dictionary keys are the feature name and values are one of:

                - None
                - A value, must match exactly.
                - An array of two numeric values, specifying an inclusive
                  range. Only applicable to continuous and numeric ordinal
                  features.
                - An array of string values, must match any of these values
                  exactly. Only applicable to nominal and string ordinal
                  features.

        .. TIP::
            Example 1 - Remove all values belonging to `feature_name`::

                condition = {"feature_name": None}

            Example 2 - Remove cases that have the value 10::

                condition = {"feature_name": 10}

            Example 3 - Remove cases that have a value in range [10, 20]::

                condition = {"feature_name": [10, 20]}

            Example 4 - Remove cases that match one of ['a', 'c', 'e']::

                condition = {"feature_name": ['a', 'c', 'e']}

    condition_session : str, optional
        If specified, ignores the condition and operates on cases for
        the specified session id. Ignored if case_indices is specified.
    distribute_weight_feature : str, optional
        When specified, will distribute the removed cases' weights
        from this feature into their neighbors.
    precision : {"exact", "similar"}, optional
        The precision to use when removing the cases, defaults to "exact".
        Ignored if case_indices is specified. An unsupported value only
        triggers a warning; it is still forwarded to the engine.

    Returns
    -------
    int
        The number of cases removed.

    Raises
    ------
    ValueError
        If `num_cases` is not at least 1.
    """
    trainee_id = self._resolve_trainee(trainee_id).id

    if num_cases < 1:
        raise ValueError('num_cases must be a value greater than 0')

    if precision is not None and precision not in self.SUPPORTED_PRECISION_VALUES:
        # The message template uses a printf-style `%s` placeholder, so it
        # must be filled with `%`; `str.format` would leave `%s` verbatim.
        warnings.warn(self.WARNING_MESSAGES['invalid_precision'] % "precision")

    # Convert session instance to id
    if (
        isinstance(condition, MutableMapping) and
        isinstance(condition.get('.session'), Session)
    ):
        condition['.session'] = condition['.session'].id

    if self.configuration.verbose:
        print(f'Removing case(s) from Trainee with id: {trainee_id}')

    result = self.execute(trainee_id, "remove_cases", {
        "case_indices": case_indices,
        "condition": condition,
        "condition_session": condition_session,
        "precision": precision,
        "num_cases": num_cases,
        "distribute_weight_feature": distribute_weight_feature,
    })
    self._auto_persist_trainee(trainee_id)

    # An empty/None response means nothing was removed
    if not result:
        return 0
    return result.get('count', 0)
def move_cases(
    self,
    trainee_id: str,
    num_cases: int,
    *,
    case_indices: t.Optional[CaseIndices] = None,
    condition: t.Optional[Mapping] = None,
    condition_session: t.Optional[str] = None,
    precision: t.Optional[Precision] = None,
    preserve_session_data: bool = False,
    source_id: t.Optional[str] = None,
    source_path: t.Optional[Collection[str]] = None,
    target_path: t.Optional[Collection[str]] = None,
    target_id: t.Optional[str] = None
) -> int:
    """
    Moves training cases from one Trainee to another in the hierarchy.

    Parameters
    ----------
    trainee_id : str
        The identifier of the Trainee doing the moving.
    num_cases : int
        The number of cases to move; minimum 1 case must be moved.
        Ignored by the engine if case_indices is specified, but still
        validated to be at least 1.
    case_indices : Sequence of tuple of {str, int}, optional
        A list of tuples containing session ID and session training index
        for each case to be moved.
    condition : Mapping, optional
        The condition map to select the cases to move that meet all the
        provided conditions. Ignored if case_indices is specified.
        If a mutable mapping is given whose ``'.session'`` value is a
        Session instance, that value is replaced in place with the
        session's id.

        .. NOTE::
            The dictionary keys are the feature name and values are one of:

                - None
                - A value, must match exactly.
                - An array of two numeric values, specifying an inclusive
                  range. Only applicable to continuous and numeric ordinal
                  features.
                - An array of string values, must match any of these values
                  exactly. Only applicable to nominal and string ordinal
                  features.

        .. TIP::
            Example 1 - Move all values belonging to `feature_name`::

                condition = {"feature_name": None}

            Example 2 - Move cases that have the value 10::

                condition = {"feature_name": 10}

            Example 3 - Move cases that have a value in range [10, 20]::

                condition = {"feature_name": [10, 20]}

            Example 4 - Move cases that match one of ['a', 'c', 'e']::

                condition = {"feature_name": ['a', 'c', 'e']}

            Example 5 - Move cases using session name and index::

                condition = {'.session':'your_session_name',
                             '.session_index': 1}

    condition_session : str, optional
        If specified, ignores the condition and operates on cases for
        the specified session id. Ignored if case_indices is specified.
    precision : {"exact", "similar"}, optional
        The precision to use when moving the cases. Options are 'exact'
        or 'similar'. If not specified, "exact" will be used.
        Ignored if case_indices is specified. An unsupported value only
        triggers a warning; it is still forwarded to the engine.
    preserve_session_data : bool, default False
        When True, will move cases without cleaning up session data.
    source_id : str, optional
        The source trainee unique id from which to move cases. Ignored
        if source_path is specified. If neither source_path nor
        source_id are specified, moves cases from the trainee itself.
    source_path : list of str, optional
        List of strings specifying the user-friendly path of the child
        subtrainee from which to move cases.
    target_path : list of str, optional
        List of strings specifying the user-friendly path of the child
        subtrainee to move cases to.
    target_id : str, optional
        The target trainee id to move the cases to. Ignored if
        target_path is specified. If neither target_path nor
        target_id are specified, moves cases to the trainee itself.

    Returns
    -------
    int
        The number of cases moved.

    Raises
    ------
    HowsoError
        If there is no active session.
    ValueError
        If `num_cases` is not at least 1.
    """
    trainee_id = self._resolve_trainee(trainee_id).id

    if not self.active_session:
        raise HowsoError(self.ERROR_MESSAGES["missing_session"], code="missing_session")

    if num_cases < 1:
        raise ValueError('num_cases must be a value greater than 0')

    if precision is not None and precision not in self.SUPPORTED_PRECISION_VALUES:
        # The message template uses a printf-style `%s` placeholder, so it
        # must be filled with `%`; `str.format` would leave `%s` verbatim.
        warnings.warn(self.WARNING_MESSAGES['invalid_precision'] % "precision")

    # Convert session instance to id
    if (
        isinstance(condition, MutableMapping) and
        isinstance(condition.get('.session'), Session)
    ):
        condition['.session'] = condition['.session'].id

    if self.configuration.verbose:
        print(f'Moving case(s) from Trainee with id: {trainee_id}')

    result = self.execute(trainee_id, "move_cases", {
        "target_id": target_id,
        "case_indices": case_indices,
        "condition": condition,
        "condition_session": condition_session,
        "precision": precision,
        "num_cases": num_cases,
        "preserve_session_data": preserve_session_data,
        "session": self.active_session.id,
        "source_id": source_id,
        "source_path": source_path,
        "target_path": target_path
    })
    self._auto_persist_trainee(trainee_id)

    # An empty/None response means nothing was moved
    if not result:
        return 0
    return result.get('count', 0)
def edit_cases(
self,
trainee_id: str,
feature_values: Collection[t.Any] | DataFrame,
*,
case_indices: t.Optional[CaseIndices] = None,
condition: t.Optional[Mapping] = None,
condition_session: t.Optional[str] = None,
features: t.Optional[Collection[str]] = None,
num_cases: t.Optional[int] = None,
precision: t.Optional[Precision] = None,
) -> int:
"""
Edit feature values for the specified cases.
Updates the accumulated data mass for the model proportional to the
number of cases and features modified.
Parameters
----------
trainee_id : str
The ID of the Trainee to edit the cases of.
feature_values : list of object or pandas.DataFrame
The feature values to edit the case(s) with. If specified as a list,
the order corresponds with the order of the `features` parameter.
If specified as a DataFrame, only the first row will be used.
case_indices : Sequence of tuple of {str, int}, optional
Sequence of tuples containing the session id and index, where index