results["logits"] = logits
return results
loss_func = self.model.loss_function or loss_function
base_loss = loss_func(logits, labels, aux_logits=aux_logits)
params = self.variable_mgr.trainable_variables_on_device(
rel_device_num, abs_device_num)
l2_loss = None
total_loss = base_loss
with tf.name_scope("l2_loss"):
fp32_params = params
if data_type == tf.float16 and self.params.fp16_vars:
// fp16 reductions are very slow on GPUs, so cast to fp32 before
// calling tf.nn.l2_loss and tf.add_n.
// TODO(b/36217816): Once the bug is fixed, investigate if we should do
// this reduction in fp16.
fp32_params = (tf.cast(p, tf.float32) for p in params)
if rel_device_num == len(self.devices) - 1:
// We compute the L2 loss for only one device instead of all of them,
// because the L2 loss for each device is the same. To adjust for this,
// we multiply the L2 loss by the number of devices. We choose the
// last device because for some reason, on a Volta DGX1, the first four
// GPUs take slightly longer to complete a step than the last four.
// TODO(reedwm): Shard the L2 loss computations across GPUs.
if self.params.single_l2_loss_op:
// TODO(reedwm): If faster, create a fused op that does the L2 loss
// on multiple tensors, and use that instead of concatenating
// tensors.
reshaped_params = [tf.reshape(p, (-1,)) for p in fp32_params]
l2_loss = tf.nn.l2_loss(tf.concat(reshaped_params, axis=0))
else:
l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in fp32_params])
weight_decay = self.params.weight_decay
if (weight_decay is not None and weight_decay != 0. and
l2_loss is not None):
total_loss += len(self.devices) * weight_decay * l2_loss
aggmeth = tf.AggregationMethod.DEFAULT
scaled_loss = (total_loss if self.loss_scale is None
else total_loss * self.loss_scale)
grads = tf.gradients(scaled_loss, params, aggregation_method=aggmeth)
if self.loss_scale is not None:
// TODO(reedwm): If automatic loss scaling is not used, we could avoid
// these multiplications by directly modifying the learning rate instead.
// If this is done, care must be taken to ensure that this scaling method
// is correct, as some optimizers square gradients and do other
// operations which might not be compatible with modifying both the
// gradients and the learning rate.
grads = [
grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads
]
if self.params.variable_update == "horovod":
import horovod.tensorflow as hvd // pylint: disable=g-import-not-at-top
if self.params.horovod_device:
horovod_device = "/%s:0" % self.params.horovod_device
else:
horovod_device = ""
// All-reduce gradients using Horovod.
grads = [hvd.allreduce(grad, average=False, device_dense=horovod_device)
for grad in grads]
if self.params.staged_vars:
grad_dtypes = [grad.dtype for grad in grads]
grad_shapes = [grad.shape for grad in grads]
grad_stage = data_flow_ops.StagingArea(grad_dtypes, grad_shapes)
grad_stage_op = grad_stage.put(grads)
// In general, this decouples the computation of the gradients and
// the updates of the weights.
// During the pipeline warm up, this runs enough training to produce
// the first set of gradients.
gpu_grad_stage_ops.append(grad_stage_op)
grads = grad_stage.get()
param_refs = self.variable_mgr.trainable_variables_on_device(
rel_device_num, abs_device_num, writable=True)
gradvars = list(zip(grads, param_refs))
if self.params.loss_type_to_report == "total_loss":
results["loss"] = total_loss
else:
results["loss"] = base_loss
results["gradvars"] = gradvars
return results
def get_image_preprocessor(self):
Returns the image preprocessor to use, based on the model.