Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/kuefmz/somef
Browse files Browse the repository at this point in the history
  • Loading branch information
lwdemo123 committed Jan 13, 2024
2 parents db54310 + 1f5b100 commit 8ef3559
Show file tree
Hide file tree
Showing 88 changed files with 1,164 additions and 881 deletions.
12 changes: 8 additions & 4 deletions experiments/create_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@
'tfdtc': make_pipeline(CountVectorizer(), DecisionTreeClassifier()),
'tflr': make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear')),
'tfnb': make_pipeline(TfidfVectorizer(), MultinomialNB()),
'tfper': make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0)),
'tfrfc': make_pipeline(TfidfVectorizer(), RandomForestClassifier()), #(max_depth=3, random_state=0))
'tfsgd': make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log')),
'tfsgd': make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log_loss')),
'tfxgb': make_pipeline(TfidfVectorizer(), XGBClassifier(use_label_encoder=False,eval_metric="logloss"))
}
#'tfper': make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0)),
evaluation_names = ('cvlr', 'tflr', 'tfnb', 'cvnb', 'cvbb', 'tfsgd', 'tfxgb', 'tfper', 'tfrfc', 'tfdtc', 'tfada')
evaluation_text = {
'cvbb': '"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,',
Expand All @@ -54,21 +54,25 @@
'tfdtc': '"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,',
'tflr': '"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,',
'tfnb': '"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,',
'tfper': '"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,',
'tfrfc': '"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,',
'tfsgd': '"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = \'log\',Allen,',
'tfxgb': '"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,'
}
#'tfper': '"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,',

def evaluate_category(corpora,category):
dec = 3
cv = StratifiedKFold(n_splits = 5, shuffle=True)
file_content = "sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID"
file_content = f"sklearn Primitive - {category},Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID"
limit = 0.0
file_to_copy = ""
for name in evaluation_text:
X = corpora[category].excerpt
Y = corpora[category][category]
#print(X)
#for e in X:
# print(e)
#Y = Y.astype(int)
x_train, x_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.2)
pipeline = pipelines[name]
pipeline.fit(x_train, y_train)
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
category,total correct,total incorrect,total missed,precision,recall
citation,33,7,3,0.825,0.917
run,20,3,1,0.870,0.952
install,72,16,10,0.818,0.878
download,2,3,0,0.400,1.000
requirements,29,2,2,0.935,0.935
contact,1,0,2,1.000,0.333
description,13,5,7,0.722,0.650
contributor,3,0,0,1.000,1.000
documentation,18,2,0,0.900,1.000
license,30,0,0,1.000,1.000
usage,55,32,20,0.632,0.733
faq,3,6,2,0.333,0.600
support,6,8,3,0.429,0.667
ack,7,0,3,1.000,0.700
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
24 changes: 12 additions & 12 deletions experiments/ranking/citation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.732,0.687,0.984,0.808,citcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.877,0.838,0.975,0.901,citcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.917,0.913,0.946,0.929,citcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.828,0.913,0.785,0.837,cittfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.792,0.77,0.908,0.833,cittfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.875,0.837,0.971,0.899,cittflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.904,0.869,0.981,0.921,cittfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.884,0.868,0.94,0.902,cittfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.814,0.782,0.937,0.852,cittfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.895,0.863,0.972,0.914,cittfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.783,0.783,0.867,0.817,cittfxgb.p
sklearn Primitive - citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.754,0.675,0.993,0.803,citcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.922,0.912,0.938,0.924,citcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.927,0.909,0.952,0.929,citcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.886,0.948,0.821,0.877,cittfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.879,0.95,0.8,0.868,cittfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.938,0.958,0.917,0.936,cittflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.92,0.91,0.934,0.922,cittfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.907,0.899,0.921,0.909,cittfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.879,0.864,0.907,0.882,cittfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.939,0.957,0.921,0.938,cittfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.839,0.906,0.759,0.825,cittfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/description_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.725,0.934,0.559,0.699,descvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.818,0.876,0.795,0.833,descvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.816,0.776,0.955,0.856,descvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.798,0.838,0.8,0.818,destfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.767,0.823,0.759,0.787,destfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.827,0.802,0.926,0.859,destflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.779,0.728,0.982,0.836,destfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.826,0.853,0.842,0.847,destfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.784,0.83,0.783,0.805,destfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.847,0.875,0.857,0.865,destfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.76,0.806,0.768,0.786,destfxgb.p
sklearn Primitive - description,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.733,0.914,0.516,0.657,descvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.813,0.854,0.758,0.801,descvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.784,0.716,0.954,0.816,descvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.731,0.73,0.737,0.733,destfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.686,0.701,0.648,0.67,destfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.802,0.788,0.836,0.809,destflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.775,0.695,0.982,0.814,destfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.804,0.777,0.854,0.814,destfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.745,0.744,0.751,0.747,destfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.815,0.799,0.843,0.82,destfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.72,0.729,0.726,0.724,destfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/installation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.75,0.7,0.986,0.819,inscvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.871,0.87,0.912,0.89,inscvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.879,0.844,0.968,0.902,inscvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.79,0.899,0.714,0.794,instfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.816,0.877,0.789,0.83,instfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.894,0.908,0.906,0.907,instflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.842,0.791,0.986,0.877,instfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.865,0.895,0.864,0.879,instfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.851,0.9,0.833,0.865,instfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.896,0.918,0.899,0.908,instfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.812,0.881,0.776,0.825,instfxgb.p
sklearn Primitive - installation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.76,0.678,0.991,0.805,inscvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.894,0.874,0.923,0.897,inscvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.898,0.853,0.961,0.904,inscvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.797,0.882,0.685,0.771,instfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.825,0.841,0.807,0.822,instfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.894,0.938,0.845,0.889,instflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.897,0.851,0.965,0.904,instfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.887,0.887,0.887,0.887,instfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.87,0.854,0.894,0.873,instfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.903,0.922,0.882,0.901,instfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.833,0.874,0.781,0.824,instfxgb.p
24 changes: 12 additions & 12 deletions experiments/ranking/invocation_classifier.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
sklearn Primitive - Citation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.754,0.718,0.939,0.814,invcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.84,0.817,0.929,0.869,invcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.865,0.875,0.891,0.883,invcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.773,0.753,0.899,0.819,invtfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.763,0.847,0.716,0.776,invtfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.841,0.822,0.923,0.869,invtflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.869,0.848,0.94,0.892,invtfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.841,0.862,0.861,0.861,invtfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.826,0.868,0.82,0.843,invtfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.867,0.859,0.919,0.888,invtfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.798,0.778,0.907,0.837,invtfxgb.p
sklearn Primitive - invocation,Hyperparameters,Input Data Used,Accuracy,Precision,Recall,F-measure,Pickle ID
"sklearnpipeline(CountVectorizer, BernoulliBayes)", -,Allen,0.755,0.681,0.962,0.797,invcvbb.p
"sklearnpipeline(CountVectorizer, LogisticRegression)", - ,Allen,0.858,0.828,0.907,0.865,invcvlr.p
"sklearnpipeline(CountVectorizer, NaiveBayes)", -,Allen,0.885,0.874,0.899,0.886,invcvnb.p
"sklearnpipeline(TFIDFVectorizer, AdaBoostClassifier)", -,Allen,0.771,0.74,0.836,0.785,invtfada.p
"sklearnpipeline(TFIDFVectorizer, DecisionTreeClassifier)", -,Allen,0.766,0.757,0.784,0.77,invtfdtc.p
"sklearnpipeline(TFIDFVectorizer, LogisticRegression)", - ,Allen,0.869,0.87,0.868,0.869,invtflr.p
"sklearnpipeline(TFIDFVectorizer, NaiveBayes)", - ,Allen,0.89,0.878,0.907,0.892,invtfnb.p
"sklearnpipeline(TFIDFVectorizer, Perceptron)", -,Allen,0.848,0.845,0.853,0.849,invtfper.p
"sklearnpipeline(TFIDFVectorizer, RandomForestClassifier)", -,Allen,0.823,0.807,0.85,0.827,invtfrfc.p
"sklearnpipeline(TFIDFVectorizer, StochasticGradientDescent)",loss = 'log',Allen,0.879,0.878,0.881,0.879,invtfsgd.p
"sklearnpipeline(TFIDFVectorizer, XGBClassifier)", -,Allen,0.803,0.771,0.863,0.815,invtfxgb.p
13 changes: 9 additions & 4 deletions experiments/setup_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ def build_corpora():
def build_corpus(selected_category):
categories_df = {cat: pd.read_csv(f'./training_corpus/{cat}.csv') for cat in categories}
negative_sample_size = int(len(categories_df[selected_category]) / 4)
print(f"Selected Category: {selected_category}")
print(f"Selected Category: {selected_category}. Negative sample size for category: {negative_sample_size}")
for category in categories_df:
categories_df[category].drop('URL', 1, inplace=True)
categories_df[category].drop('URL', axis=1, inplace=True)
# add negative samples to a category from the other ones
if category != selected_category:
categories_df[category] = categories_df[category].sample(negative_sample_size)
categories_df[category] = categories_df[category].assign(**{selected_category: category == selected_category})
Expand All @@ -30,9 +31,13 @@ def build_corpus(selected_category):
map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), negative_sample_size)),
columns=["excerpt"]).assign(description=False)
# print("Treebank has {} samples.".format(len(treebank_background)))
# print("categories_df")
# Rename the column to match the corpus when merging
treebank_background = treebank_background.rename(columns={'description': selected_category})
# print(categories_df)
corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
corpus.append(treebank_background, ignore_index=True, sort=False)
#corpus.append(treebank_background, ignore_index=True, sort=False)
corpus = pd.concat([corpus, treebank_background], ignore_index=True, sort=False)
corpus.fillna(value='', inplace=True)
# print(corpus)
return corpus

Binary file modified experiments/trained_models/citcvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/citcvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/citcvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfada.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/cittflr.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfper.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/cittfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/descvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/descvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/descvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/destfada.p
Binary file not shown.
Binary file modified experiments/trained_models/destfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/destflr.p
Binary file not shown.
Binary file modified experiments/trained_models/destfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/destfper.p
Binary file not shown.
Binary file modified experiments/trained_models/destfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/destfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/destfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/inscvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/instfada.p
Binary file not shown.
Binary file modified experiments/trained_models/instfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/instflr.p
Binary file not shown.
Binary file modified experiments/trained_models/instfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/instfper.p
Binary file not shown.
Binary file modified experiments/trained_models/instfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/instfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/instfxgb.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvbb.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvlr.p
Binary file not shown.
Binary file modified experiments/trained_models/invcvnb.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfada.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfdtc.p
Binary file not shown.
Binary file modified experiments/trained_models/invtflr.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfnb.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfper.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfrfc.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfsgd.p
Binary file not shown.
Binary file modified experiments/trained_models/invtfxgb.p
Binary file not shown.
Loading

0 comments on commit 8ef3559

Please sign in to comment.