Commit

changes to enable domain adaptation
bruce-edelman committed Jul 6, 2023
1 parent a9461a8 commit 5b60f95
Showing 3 changed files with 259 additions and 119 deletions.
215 changes: 96 additions & 119 deletions diploshic/diploSHIC
@@ -1,6 +1,20 @@
#!/usr/bin/env python

import argparse, time, sys, subprocess
import matplotlib
matplotlib.use("Agg")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import model_from_json

from network import construct_model
from domain_adaptive_dataloader import DADiploSHICDataLoader

pyExec = sys.executable
print(pyExec)
@@ -45,6 +59,12 @@ parser_a.add_argument(
help="max epochs for training CNN (default = 100)",
default=100,
)
parser_a.add_argument(
"--domain-adaptation",
action='store_true',
    help="optional flag to train the CNN with domain adaptation (default = False)",
default=False,
)
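
The flag is consumed at training time. A hypothetical invocation might look like the following (the positional arguments and paths are illustrative, not taken from this diff):

    python diploSHIC train --domain-adaptation trainingSetDir/ testSetDir/ myModel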
parser_a.add_argument(
"--numSubWins",
type=int,
@@ -311,21 +331,6 @@ argsDict = vars(args)
if argsDict["mode"] in ["train", "predict"]:
###########################################################
# Import a bunch of libraries if everything checks out
import matplotlib

matplotlib.use("Agg")
import numpy as np
import tensorflow as tf
from keras.models import Sequential, Model
from keras import optimizers
from keras.layers import Dense, Dropout, Activation, Flatten, Input
from keras.layers import Conv2D, MaxPooling2D, concatenate
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K
import fnmatch

# nDims = argsDict['nDims']
numSubWins = argsDict["numSubWins"]

@@ -349,7 +354,6 @@ if argsDict["mode"] == "train":
ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, numSubWins))
lhard = np.loadtxt(trainingDir + "linkedHard.fvec", skiprows=1)
lh1 = np.reshape(lhard, (lhard.shape[0], nDims, numSubWins))

both = np.concatenate((h1, n1, s1, ls1, lh1))
y = np.concatenate(
(
@@ -360,12 +364,24 @@ if argsDict["mode"] == "train":
np.repeat(4, len(lh1)),
)
)

# reshape to explicitly set the image depth (channels) dimension;
# needed for Theano, not sure whether TensorFlow requires it
both = both.reshape(both.shape[0], nDims, numSubWins, 1)
if trainingDir == testingDir:
if argsDict["domain_adaptation"]:
empirical = np.loadtxt(trainingDir + "empirical.fvec", skiprows=1)
emp1 = np.reshape(empirical, (empirical.shape[0], nDims, numSubWins))
emp1 = emp1.reshape(emp1.shape[0], nDims, numSubWins, 1)
if trainingDir == testingDir:
X_train, X_test, X_train_emp, X_test_emp, y_train, y_test = train_test_split(
both, emp1, y, test_size=0.2
)
else:
    X_train_emp = emp1
    empirical = np.loadtxt(testingDir + "empirical.fvec", skiprows=1)
    emp1 = np.reshape(empirical, (empirical.shape[0], nDims, numSubWins))
    X_test_emp = emp1.reshape(emp1.shape[0], nDims, numSubWins, 1)
elif trainingDir == testingDir:
X_train, X_test, y_train, y_test = train_test_split(
    both, y, test_size=0.2
)
else:
X_train = both
@@ -381,7 +397,6 @@ if argsDict["mode"] == "train":
ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, numSubWins))
lhard = np.loadtxt(testingDir + "linkedHard.fvec", skiprows=1)
lh1 = np.reshape(lhard, (lhard.shape[0], nDims, numSubWins))

both2 = np.concatenate((h1, n1, s1, ls1, lh1))
X_test = both2.reshape(both2.shape[0], nDims, numSubWins, 1)
y_test = np.concatenate(
@@ -394,94 +409,43 @@ if argsDict["mode"] == "train":
)
)

Y_train = tf.keras.utils.to_categorical(y_train, 5)
Y_test = tf.keras.utils.to_categorical(y_test, 5)
X_valid, X_test, Y_valid, Y_test = train_test_split(
X_test, Y_test, test_size=0.5
)
Y_train = to_categorical(y_train, 5)
Y_test = to_categorical(y_test, 5)

datagen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
horizontal_flip=True,
)
if argsDict["domain_adaptation"]:
X_valid, X_test, X_valid_emp, X_test_emp, Y_valid, Y_test = train_test_split(
X_test, X_test_emp, Y_test, test_size=0.5
)
datagen = DADiploSHICDataLoader(X_train, X_train_emp, Y_train, batch_size=32)
validation_gen = DADiploSHICDataLoader(X_test, X_test_emp, Y_test, batch_size=32)
test_gen = DADiploSHICDataLoader(X_valid, X_valid_emp, Y_valid, batch_size=32)
else:
X_valid, X_test, Y_valid, Y_test = train_test_split(
X_test, Y_test, test_size=0.5
)
datagen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
horizontal_flip=True,
)

validation_gen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
horizontal_flip=False,
)
test_gen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
horizontal_flip=False,
)
validation_gen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
horizontal_flip=False,
)
test_gen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
horizontal_flip=False,
)

# print(X_train.shape)
print("training set has %d examples" % X_train.shape[0])
print("validation set has %d examples" % X_valid.shape[0])
print("test set has %d examples" % X_test.shape[0])

model_in = Input(X_train.shape[1:])
h = Conv2D(128, 3, activation="relu", padding="same", name="conv1_1")(
model_in
)
h = Conv2D(64, 3, activation="relu", padding="same", name="conv1_2")(h)
h = MaxPooling2D(pool_size=3, name="pool1", padding="same")(h)
h = Dropout(0.15, name="drop1")(h)
h = Flatten(name="flaten1")(h)

dh = Conv2D(
128,
2,
activation="relu",
dilation_rate=[1, 3],
padding="same",
name="dconv1_1",
)(model_in)
dh = Conv2D(
64,
2,
activation="relu",
dilation_rate=[1, 3],
padding="same",
name="dconv1_2",
)(dh)
dh = MaxPooling2D(pool_size=2, name="dpool1")(dh)
dh = Dropout(0.15, name="ddrop1")(dh)
dh = Flatten(name="dflaten1")(dh)

dh1 = Conv2D(
128,
2,
activation="relu",
dilation_rate=[1, 4],
padding="same",
name="dconv4_1",
)(model_in)
dh1 = Conv2D(
64,
2,
activation="relu",
dilation_rate=[1, 4],
padding="same",
name="dconv4_2",
)(dh1)
dh1 = MaxPooling2D(pool_size=2, name="d1pool1")(dh1)
dh1 = Dropout(0.15, name="d1drop1")(dh1)
dh1 = Flatten(name="d1flaten1")(dh1)

h = concatenate([h, dh, dh1])
h = Dense(512, name="512dense", activation="relu")(h)
h = Dropout(0.2, name="drop7")(h)
h = Dense(128, name="last_dense", activation="relu")(h)
h = Dropout(0.1, name="drop8")(h)
output = Dense(5, name="out_dense", activation="softmax")(h)
model = Model(inputs=[model_in], outputs=[output])

model.compile(
loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
model = construct_model(X_train.shape[1:], domain_adaptation=argsDict["domain_adaptation"])
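
The architecture itself now lives in network.py via construct_model, but that file's diff is not shown in this view. As a rough sketch only: assuming the standard domain-adversarial (DANN) recipe of Ganin & Lempitsky, a gradient-reversal layer feeds a domain-discriminator head alongside the usual class-prediction head. Only the signature construct_model(input_shape, domain_adaptation=...) and the head names "predictor"/"discriminator" (which match the targets emitted by DADiploSHICDataLoader) come from this diff; every layer choice, the masked losses, and grad_reverse are illustrative assumptions:

    import tensorflow as tf
    from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Lambda
    from keras.models import Model

    @tf.custom_gradient
    def grad_reverse(x):
        # identity on the forward pass, sign-flipped gradient on the backward pass
        def grad(dy):
            return -dy
        return tf.identity(x), grad

    def masked_cce(y_true, y_pred):
        # rows the loader filled with -1 belong to discriminator-only samples
        mask = tf.reduce_all(tf.equal(y_true, -1.0), axis=-1)
        loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
        return tf.where(mask, tf.zeros_like(loss), loss)

    def masked_bce(y_true, y_pred):
        # -1 marks predictor-only samples that the discriminator should skip
        mask = tf.reshape(tf.equal(y_true, -1.0), [-1])
        loss = tf.keras.losses.binary_crossentropy(y_true, y_pred)
        return tf.where(mask, tf.zeros_like(loss), loss)

    def construct_model(input_shape, domain_adaptation=False):
        model_in = Input(input_shape)
        h = Conv2D(128, 3, activation="relu", padding="same")(model_in)
        h = MaxPooling2D(pool_size=3, padding="same")(h)
        h = Flatten()(h)
        feats = Dense(128, activation="relu")(h)
        pred = Dense(5, activation="softmax", name="predictor")(feats)
        if not domain_adaptation:
            model = Model(model_in, pred)
            model.compile(loss="categorical_crossentropy", optimizer="adam",
                          metrics=["accuracy"])
            return model
        rev = Lambda(grad_reverse)(feats)
        discr = Dense(1, activation="sigmoid", name="discriminator")(rev)
        model = Model(model_in, [pred, discr])
        model.compile(optimizer="adam",
                      loss={"predictor": masked_cce, "discriminator": masked_bce})
        return model

The reversed gradient pushes the shared features toward being indistinguishable between simulated and empirical windows, which is the point of the domain-adaptation flag.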

# define early stopping callback
earlystop = EarlyStopping(
@@ -507,24 +471,39 @@ if argsDict["mode"] == "train":

callbacks_list = [earlystop, checkpoint]
# callbacks_list = [earlystop] #turning off checkpointing-- just want accuracy assessment

datagen.fit(X_train)
validation_gen.fit(X_valid)
test_gen.fit(X_test)
start = time.time()
model.fit(
datagen.flow(X_train, Y_train, batch_size=32),
steps_per_epoch=len(X_train) / 32,
epochs=epochOption,
verbose=1,
callbacks=callbacks_list,
validation_data=validation_gen.flow(X_valid, Y_valid, batch_size=32),
validation_steps=len(X_test) / 32,
)
# model.fit(X_train, Y_train, batch_size=32, epochs=100,validation_data=(X_test,Y_test),callbacks=callbacks_list, verbose=1)
score = model.evaluate(
test_gen.flow(X_test, Y_test, batch_size=32), steps=len(Y_test) / 32
)

if argsDict["domain_adaptation"]:
    # the Sequence-based loaders define their own per-epoch length,
    # so Keras infers the number of steps from len() of each loader
    model.fit(
        datagen,
        epochs=epochOption,
        verbose=1,
        callbacks=callbacks_list,
        validation_data=validation_gen,
    )
    score = model.evaluate(test_gen)
else:
    # fit the feature-wise normalization statistics on each split
    datagen.fit(X_train)
    validation_gen.fit(X_valid)
    test_gen.fit(X_test)
    model.fit(
        datagen.flow(X_train, Y_train, batch_size=32),
        steps_per_epoch=len(X_train) // 32,
        epochs=epochOption,
        verbose=1,
        callbacks=callbacks_list,
        validation_data=validation_gen.flow(X_valid, Y_valid, batch_size=32),
        validation_steps=len(X_valid) // 32,
    )
    score = model.evaluate(
        test_gen.flow(X_test, Y_test, batch_size=32),
        steps=len(Y_test) // 32,
    )
sys.stderr.write(
"total time spent fitting and evaluating: %f secs\n"
% (time.time() - start)
@@ -555,8 +534,6 @@ if argsDict["mode"] == "train":
plt.savefig(confusionFile, bbox_inches="tight")

elif argsDict["mode"] == "predict":
import pandas as pd
from keras.models import model_from_json

# import data from predictFile
x_df = pd.read_table(argsDict["predictFile"])
55 changes: 55 additions & 0 deletions diploshic/domain_adaptive_dataloader.py
@@ -0,0 +1,55 @@
from keras.utils import Sequence
import numpy as np
import gc


class DADiploSHICDataLoader(Sequence):
def __init__(self, X_src, X_tgt, Y_pred, batch_size):
    self.src_data = X_src  # labeled source-domain (simulated) feature vectors
    self.tgt_data = X_tgt  # unlabeled target-domain (empirical) feature vectors
    self.y_pred = Y_pred   # one-hot class labels for the source data

self.batch_size = batch_size

src_size = self.src_data.shape[0]
tgt_size = self.tgt_data.shape[0]

# cap batches so the model sees each training sample at most once per epoch
self.no_batch = int(np.floor(np.minimum(src_size, tgt_size) / self.batch_size))
self.src_pred_idx = np.arange(src_size)
self.src_discr_idx = np.arange(src_size)
self.tgt_discr_idx = np.arange(tgt_size)

np.random.shuffle(self.src_pred_idx)
np.random.shuffle(self.src_discr_idx)
np.random.shuffle(self.tgt_discr_idx)

def __len__(self):
return self.no_batch

def on_epoch_end(self):
np.random.shuffle(self.src_pred_idx)
np.random.shuffle(self.src_discr_idx)
np.random.shuffle(self.tgt_discr_idx)
gc.collect()

def __getitem__(self, idx):
pred_batch_idx = self.src_pred_idx[idx*self.batch_size:(idx+1)*self.batch_size]
discrSrc_batch_idx = self.src_discr_idx[idx*(self.batch_size//2):(idx+1)*(self.batch_size//2)]
discrTgt_batch_idx = self.tgt_discr_idx[idx*(self.batch_size//2):(idx+1)*(self.batch_size//2)]

batch_X = np.concatenate((self.src_data[pred_batch_idx],
self.src_data[discrSrc_batch_idx],
self.tgt_data[discrTgt_batch_idx]))

# -1 rows mark predictor targets to ignore; keep the one-hot width so the
# concatenation with the (n, 5) label array stays shape-consistent
batch_Y_pred = np.concatenate((self.y_pred[pred_batch_idx],
                               -1*np.ones((len(discrSrc_batch_idx), self.y_pred.shape[-1])),
                               -1*np.ones((len(discrTgt_batch_idx), self.y_pred.shape[-1]))))

# discriminator labels: -1 = ignore, 0 = source (simulated), 1 = target (empirical)
batch_Y_discr = np.concatenate((-1*np.ones(len(pred_batch_idx)),
                                np.zeros(len(discrSrc_batch_idx)),
                                np.ones(len(discrTgt_batch_idx))))

assert batch_X.shape[0] == self.batch_size*2, batch_X.shape[0]
assert batch_Y_pred.shape[0] == batch_Y_discr.shape[0], (batch_Y_pred.shape, batch_Y_discr.shape)

return batch_X, {"predictor":batch_Y_pred, "discriminator":batch_Y_discr}
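
For reference, a minimal sketch of how this loader behaves; the sizes and feature-vector dimensions below are illustrative stand-ins, not values taken from the repo:

    import numpy as np

    # toy stand-ins: 100 labeled simulated windows, 80 unlabeled empirical windows
    X_src = np.random.rand(100, 12, 11, 1)
    X_emp = np.random.rand(80, 12, 11, 1)
    Y = np.eye(5)[np.random.randint(0, 5, 100)]  # one-hot labels, 5 classes

    loader = DADiploSHICDataLoader(X_src, X_emp, Y, batch_size=32)
    print(len(loader))        # floor(min(100, 80) / 32) = 2 batches per epoch
    X, targets = loader[0]
    print(X.shape)            # (64, 12, 11, 1): 32 predictor + 16 source + 16 empirical
    print(targets["predictor"].shape)      # (64, 5), -1 rows for discriminator-only samples
    print(targets["discriminator"].shape)  # (64,): -1 = ignore, 0 = simulated, 1 = empirical

Each epoch reshuffles all three index arrays in on_epoch_end, so the predictor stream and the two discriminator streams are drawn independently of one another.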
