// Scale the predictions so that the class probabilities of each sample sum to 1.
y_pred /= K.sum(y_pred, axis=-1, keepdims=True)
// Clip the predictions to [K.epsilon(), 1] so that K.log below never receives 0.
y_pred = K.clip(y_pred, K.epsilon(), 1)
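// Weighted cross-entropy: scale y_true * log(y_pred) by self.weights, then negate the sum over the last axis.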
p = y_true*K.log(y_pred)
loss = p*self.weights
loss =-K.sum(loss,-1)
// Return the mean of the loss over all remaining elements.
return K.mean(loss)
// def nonzero_acc(y_true, y_pred):
// lab_true = K.argmax(y_true, axis=-1)
// lab_pred = K.argmax(y_pred, axis=-1)