"""
This file manages the modeling-related parameters that are **fixed**.
If we want to run a parameter search over certain parameters, consider
parameterizing them in the CLI instead.
"""
from typing import Callable

import pandas as pd
EXPERIMENT_ROOT = "experiments"
MODEL_SIZE = "12b"
DATA_SCHEME = "deduped"
GENERATION_HF_DATASET_NAME = "usvsnsp/semantic-filters"
"""
Feature Catalog
"""
NATURAL_LANGUAGE_SCORE_COLUMN = "nl_scores"
# https://eleutherai.notion.site/Evaluate-NL-Code-Classifier-on-Memorized-Samples-7742b4b768d54131afc07b06f0610148
NATURAL_LANGUAGE_SCORE_THRESHOLDS = [0.525, 0.4]
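# Hedged usage sketch (an assumption, not part of the original file): a sample
# could be flagged as natural language by comparing its score against one of
# the thresholds above, e.g.
#
#     is_natural_language = df[NATURAL_LANGUAGE_SCORE_COLUMN] >= NATURAL_LANGUAGE_SCORE_THRESHOLDS[0]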
CONTINUOUS_FEATURE_COLUMNS = [
    "sequence_duplicates",
    "max_frequency",
    "avg_frequency",
    "min_frequency",
    "median_frequency",
    "p25_frequency",
    "p75_frequency",
    "generation_perplexity",
    "prompt_perplexity",
    "sequence_perplexity",
    "0_8_templates",
    "0_8_snowclones",
    "huffman_coding_length",
]
CATEGORICAL_FEATURE_COLUMNS = [
    # This feature is derived from the dataset; see `derive_is_templating_feature` below.
    "is_templating",
]
ALL_FEATURE_COLUMNS = CONTINUOUS_FEATURE_COLUMNS + CATEGORICAL_FEATURE_COLUMNS
"""
Derived Features
"""
def derive_is_templating_feature(row: pd.Series) -> int:
    """
    Derive the `is_templating` feature from the dataset.

    Args:
        row (pd.Series): A row of the dataset.

    Returns:
        int: 1 if the row exhibits templating behavior, 0 otherwise.
    """
    if row.is_incrementing or row.is_repeating:
        return 1
    return 0
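
# Hedged usage sketch (an assumption, not part of the original file): given a
# pandas DataFrame `df` loaded from GENERATION_HF_DATASET_NAME with boolean
# `is_incrementing` and `is_repeating` columns, the derived feature could be
# attached row-wise like this:
#
#     df["is_templating"] = df.apply(derive_is_templating_feature, axis=1)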
"""
Taxonomy Catalog
"""
TAXONOMIES = ["recitation", "reconstruction", "recollection"]
TAXONOMY_QUANTILES = [0.25, 0.5, 0.75]
TAXONOMY_SEARCH_FEATURES = [
    "max_frequency",
    "avg_frequency",
    "min_frequency",
    "median_frequency",
    "0_8_templates",
    "0_8_snowclones",
    "sequence_duplicates",
    "huffman_coding_length",
    "is_templating",
    "generation_perplexity",
]
"""
Taxonomy Function and Parameters
"""
def taxonomy_function(sequence_duplication_threshold: int = 10) -> Callable[[pd.Series], str]:
    """
    Get the taxonomy function for each sample.

    Args:
        sequence_duplication_threshold (int, optional): Samples with
            `sequence_duplicates` at or above this threshold are classified as
            recitation. Defaults to 10.

    Returns:
        Callable[[pd.Series], str]: The taxonomy function.
    """

    def classify_row(row: pd.Series) -> str:
        if row.sequence_duplicates >= sequence_duplication_threshold:
            return "recitation"
        if row.is_templating:
            return "reconstruction"
        return "recollection"

    return classify_row
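
# Hedged usage sketch (an assumption, not part of the original file): given a
# DataFrame `df` with `sequence_duplicates` and `is_templating` columns, each
# sample could be labeled with one of TAXONOMIES like this:
#
#     classify_fn = taxonomy_function(sequence_duplication_threshold=10)
#     df["taxonomy"] = df.apply(classify_fn, axis=1)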
"""
Model Training Hyper-parameters
"""
GLOBAL_SEED = 1024
TRAIN_SIZE = 0.8
VALIDATION_SIZE = 0.1
# Note: the three split sizes sum to 1.1, so they are presumably consumed by
# separate splitting steps rather than as a single train/validation/test partition.
TEST_SIZE = 0.2
MAX_MODEL_ITERATIONS = 10000
FIT_INTERCEPT = True
REG_NAME = "l2"
# Inverse of regularization strength; smaller values specify stronger regularization.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
REG_STRENGTH = 1.0
# num_samples / (num_classes * np.bincount(labels))
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
CLASS_WEIGHT_METHOD = "balanced"
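
# Hedged sketch (an assumption, not part of the original file): the
# hyper-parameters above mirror the sklearn.linear_model.LogisticRegression
# signature referenced in the comments, so a classifier could be constructed
# roughly like this (`df` and its `taxonomy` column are hypothetical):
#
#     from sklearn.linear_model import LogisticRegression
#
#     clf = LogisticRegression(
#         penalty=REG_NAME,
#         C=REG_STRENGTH,
#         class_weight=CLASS_WEIGHT_METHOD,
#         fit_intercept=FIT_INTERCEPT,
#         max_iter=MAX_MODEL_ITERATIONS,
#         random_state=GLOBAL_SEED,
#     )
#     clf.fit(df[ALL_FEATURE_COLUMNS], df["taxonomy"])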