Keras Model Weights for some layers become all “NaNs”



























I am training a triplet model similar to the one in the question "Keras model params are all 'NaN's after reloading", except that this model is built on top of the InceptionV3 base model.

(I am using Keras with the TensorFlow backend.)

After just two epochs, however, the model weights turn into NaN, and when I try to extract learnt features by passing an input image through the model, the features are all 0.
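
For reference, this is roughly how I check the extracted features (just a sketch; img stands for a single preprocessed 224x224x3 image array and stylenet is an instance of the StyleNet class defined below, after create_model() and training):

import numpy as np

# Run one image through the shared embedding model and inspect the output.
features = stylenet.triplet_model.predict(np.expand_dims(img, axis=0))
print(features)   # after 2 epochs this prints an array of all zeros
print('any NaN:', np.isnan(features).any(), 'any inf:', np.isinf(features).any())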



Model architecture:



import os
import pickle

import tensorflow as tf
from keras import optimizers
from keras.applications.inception_v3 import InceptionV3
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers import Input, Dense, BatchNormalization, Activation, Dropout, Lambda
from keras.models import Model
from keras.regularizers import l2

# DataGenerator is my custom data generator class (definition omitted here).


def triplet_loss(x, ALPHA=0.2):
    anchor, positive, negative = x

    pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), 1)
    neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), 1)

    basic_loss = tf.add(tf.subtract(pos_dist, neg_dist), ALPHA)
    loss = tf.reduce_mean(tf.maximum(basic_loss, 0.0), 0)

    return loss


class StyleNet():

    def __init__(self, input_shape_x, input_shape_y, input_shape_z, n_classes, reg_lambda):
        self.input_shape_x = input_shape_x
        self.input_shape_y = input_shape_y
        self.input_shape_z = input_shape_z
        self.n_classes = n_classes
        self.reg_lambda = reg_lambda

    def create_model(self):
        anchor_example = Input(shape=(self.input_shape_x, self.input_shape_y, self.input_shape_z), name='input_1')
        positive_example = Input(shape=(self.input_shape_x, self.input_shape_y, self.input_shape_z), name='input_2')
        negative_example = Input(shape=(self.input_shape_x, self.input_shape_y, self.input_shape_z), name='input_3')

        input_image = Input(shape=(self.input_shape_x, self.input_shape_y, self.input_shape_z))

        base_inception = InceptionV3(input_tensor=input_image,
                                     input_shape=(self.input_shape_x, self.input_shape_y, self.input_shape_z),
                                     weights=None, include_top=False, pooling='avg')
        base_pool5 = base_inception.output

        # Bottleneck layer on top of the global-average-pooled InceptionV3 features
        bottleneck_layer = Dense(256, kernel_regularizer=l2(self.reg_lambda), name='bottleneck_layer')(base_pool5)
        bottleneck_norm = BatchNormalization(name='bottleneck_norm')(bottleneck_layer)
        bottleneck_relu = Activation('relu', name='bottleneck_relu')(bottleneck_norm)
        bottleneck_drop = Dropout(0.5)(bottleneck_relu)

        # Classification head (not used by the triplet model below)
        fin = Dense(self.n_classes)(bottleneck_drop)
        fin_norm = BatchNormalization(name='fin_norm')(fin)
        fin_softmax = Activation('softmax')(fin_norm)

        # Shared embedding model, applied to the anchor, positive and negative inputs
        self.triplet_model = Model(input_image, bottleneck_drop)
        positive_embedding = self.triplet_model(positive_example)
        negative_embedding = self.triplet_model(negative_example)
        anchor_embedding = self.triplet_model(anchor_example)

        adam_opt = optimizers.Adam(lr=0.00001, clipnorm=1.0, amsgrad=False)

        # The triplet model which optimizes over the triplet loss
        loss = Lambda(triplet_loss, output_shape=(1,))([anchor_embedding, positive_embedding, negative_embedding])
        self.triplet_model_worker = Model(inputs=[anchor_example, positive_example, negative_example], outputs=loss)
        self.triplet_model_worker.compile(loss='mean_absolute_error', optimizer=adam_opt)

    def fit_model(self, pathname='./models/'):
        if not os.path.exists(pathname):
            os.makedirs(pathname)
        if not os.path.exists(pathname + '/weights'):
            os.makedirs(pathname + '/weights')
        if not os.path.exists(pathname + '/tb'):
            os.makedirs(pathname + '/tb')
        filepath = pathname + "weights/{epoch:02d}.hdf5"
        checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=False, mode='auto')
        tensorboard = TensorBoard(log_dir=pathname + '/tb', write_graph=True, write_images=True)
        callbacks_list = [checkpoint, tensorboard]

        # Parameters
        params = {'dim': (224, 224), 'batch_size': 32, 'n_classes': 11, 'n_channels': 3, 'shuffle': True}

        # Datasets
        partition = pickle.load(open('../../../data/bam_2_partition_triplet.pkl', 'rb'))
        labels = pickle.load(open('../../../data/bam_2_labels_triplet.pkl', 'rb'))

        # Generators
        training_generator = DataGenerator(partition['train'], labels, **params)
        self.triplet_model_worker.fit_generator(generator=training_generator, epochs=60,
                                                use_multiprocessing=True, workers=10,
                                                callbacks=callbacks_list, verbose=1)
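
To catch the exact moment training blows up, I am planning to extend the callbacks with a NaN watchdog (just a sketch, assuming standard Keras 2.x callbacks; BatchLossWatcher is a name I made up, and checkpoint/tensorboard are the same callbacks as in fit_model above):

import numpy as np
from keras.callbacks import Callback, TerminateOnNaN


class BatchLossWatcher(Callback):
    """Print a warning the moment the per-batch loss stops being finite."""

    def on_batch_end(self, batch, logs=None):
        loss = (logs or {}).get('loss')
        if loss is not None and not np.isfinite(loss):
            print('non-finite loss %r at batch %d' % (loss, batch))


# TerminateOnNaN aborts training as soon as the loss becomes NaN or inf.
callbacks_list = [checkpoint, tensorboard, TerminateOnNaN(), BatchLossWatcher()]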


What is troublesome is that, even after following the answer in the question linked above and using clipnorm=1.0, the gradients still seem to explode and the weights end up as NaN.
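
To rule out the data pipeline and watch the loss step by step, I am also checking a few batches directly (a sketch; it assumes training_generator is the same indexable DataGenerator as in fit_model, returning ([anchor, positive, negative], targets) batches, and stylenet is the StyleNet instance):

import numpy as np

# Inspect a few batches straight from the generator before blaming the optimizer.
for i in range(5):
    batch_inputs, batch_targets = training_generator[i]
    for name, arr in zip(['anchor', 'positive', 'negative'], batch_inputs):
        print(i, name, 'nan:', np.isnan(arr).any(), 'min:', arr.min(), 'max:', arr.max())

    # One manual training step; the returned loss should stay finite.
    step_loss = stylenet.triplet_model_worker.train_on_batch(batch_inputs, batch_targets)
    print('step', i, 'loss:', step_loss)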



I save the model, reload it, and then print the weights; the NaNs are clearly visible.
Loading code:



from keras.models import load_model

m = load_model('/scratch/models_inception_stage2/yo/weights/02.hdf5', custom_objects={"tf": tf})

for layer in m.layers:
    weights = layer.get_weights()
    print(weights)


A snippet of the printed weights, showing one layer whose second weight array has become all NaN:

[array([ 3.4517611e-04, 1.3431008e-03, -1.1081886e-03, 2.6104850e-04,
-2.1620051e-04, 1.6816283e-03, 8.8927911e-05, -3.8964470e-04,
1.7968584e-03, 1.0259283e-03, 5.0400384e-04, -3.6578919e-04,
-1.1292399e-03, 1.1509922e-03, 3.2478449e-04, -3.6580343e-05,
-4.4458261e-04, 4.8210021e-04, -9.5213606e-04, -6.4406055e-04,
5.0959276e-04, -3.4098624e-04, -7.0486858e-05, 2.8134760e-04,
-8.0100907e-04, 8.2962180e-04, -6.4140803e-04, 9.4872032e-04,
-3.3409546e-05, -3.0277384e-04, 5.2237371e-04, -8.3427120e-04,
-2.5856070e-04, -1.0346439e-03, 4.3354488e-05, -8.8099617e-04,
-6.8233605e-04, -1.2386916e-04, 8.2019303e-04, -1.9070004e-03,
1.5571159e-03, -3.4599879e-04, 6.2088901e-04, -8.4720332e-06,
1.6024955e-04, -1.2059419e-03, -1.4946899e-04, -6.7080715e-04,
-2.8154058e-05, 5.1517348e-04, 5.9993083e-05, 2.8555689e-04,
3.9626448e-04, -5.1538437e-04, 1.9132573e-04, 1.1226863e-03,
1.1591403e-03, -6.3404470e-04, 2.8910063e-04, -7.9366821e-04,
-1.7228167e-04, 6.2899920e-04, 1.7438219e-04, 1.1385380e-04],
dtype=float32), array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
dtype=float32), array([0.50248814, 0.48732147, 0.64627343, 0.49432775, 0.45906776,
0.5168214 , 0.8561428 , 0.7308014 , 0.5067555 , 0.516555 ,
1.3287057 , 0.5746422 , 0.55597156, 1.0038179 , 0.9859771 ,
0.6110601 , 0.7357226 , 0.6123694 , 0.90676117, 0.5439505 ,
0.48629472, 0.5434108 , 0.4934845 , 0.5407317 , 0.6443982 ,
1.0403991 , 0.48624724, 0.83786434, 0.72478205, 0.7294607 ,
0.536994 , 0.38235992, 1.0484552 , 0.45833316, 0.48205158,
0.48236838, 0.71035874, 0.9472658 , 0.78085536, 1.0207686 ,
0.5089741 , 0.97984046, 0.86524594, 0.9828817 , 0.49027866,
0.7367909 , 0.57438385, 0.5011991 , 0.47189236, 0.52376693,
0.45648402, 0.40523565, 0.8375675 , 0.57908285, 0.6055632 ,
1.0325785 , 0.5377976 , 0.47033092, 0.83586556, 1.2780553 ,
0.503384 , 0.54509026, 0.5375585 , 0.6091993 ], dtype=float32)]
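
Instead of dumping every array, a quicker way to see exactly which variables went bad is to print only the NaN ones (a sketch, using the same reloaded model m as above):

import numpy as np

# Report every weight variable that contains at least one NaN.
for layer in m.layers:
    for var, values in zip(layer.weights, layer.get_weights()):
        if np.isnan(values).any():
            print(layer.name, var.name, values.shape)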


I would be grateful for any help.










python tensorflow keras






asked Nov 20 '18 at 19:46 by Sid Gairola













  • Have you tried increasing your learning rate?

    – kvish
    Nov 20 '18 at 20:37











  • No, I have not tried that, although I am not sure; I may have tried 0.0001. Can increasing the learning rate solve this problem? I don't understand how it would.

    – Sid Gairola
    Nov 20 '18 at 20:56








  • There seems to be some numerical instability somewhere. Your learning rate coupled with a low gradient could drive that too if your rate is pretty low, which is why I was wondering if you had tried higher learning rates!

    – kvish
    Nov 20 '18 at 21:10











  • I tried with a 0.001 learning rate. Same result.

    – Sid Gairola
    Nov 21 '18 at 8:19






  • Thanks for sharing that link. It looks like the Keras team has logged this as a bug! In that case, hopefully they can shed some light on what is actually going on beyond our speculations :)

    – kvish
    Dec 31 '18 at 13:46


















