58efec0f2bc7e8de2ea9b89ec638d4a0a2d60537,python/ray/util/sgd/torch/distributed_torch_runner.py,LocalDistributedRunner,_try_reserve_and_set_cuda,#LocalDistributedRunner#,278
Before Change
def _try_reserve_and_set_cuda(self):
    use_found_device = os.environ.get("CUDA_VISIBLE_DEVICES") is None \
        and torch.cuda.is_initialized()
    device = reserve_cuda_device()
    # This needs to be set even if torch.cuda is already
    # initialized because the env var is used later when
    # starting the DDP setup.
    os.environ["CUDA_VISIBLE_DEVICES"] = device
    if use_found_device:
        # Once cuda is initialized, torch.device ignores the os.env
        # so we have to set the right actual device.
        self._set_cuda_device(device)
    else:
        # If CUDA is not initialized, we can set the os.env.
        # Even if initialized, we want to set the device to use BatchNorm
        # and make Torch think it only sees 1 GPU.
        self._set_cuda_device("0")

def _set_cuda_device(self, device_str):
    """Sets the CUDA device for this current local worker."""
    if self._is_set:
        raise RuntimeError("CUDA devices already set.")
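Aside (not part of the diff): a minimal standalone sketch of the idea behind the branch above, assuming only PyTorch. The helper name set_single_gpu is hypothetical, and reserve_cuda_device is not reproduced here; the reserved device id is passed in directly.

import os
import torch

def set_single_gpu(reserved_device: str) -> None:
    # Hypothetical helper illustrating the pre-change behavior.
    cuda_already_initialized = torch.cuda.is_initialized()
    # Set the env var either way; the later DDP setup reads it.
    os.environ["CUDA_VISIBLE_DEVICES"] = reserved_device
    if cuda_already_initialized:
        # Once CUDA is initialized, the env var no longer changes which
        # devices this process sees, so select the actual index directly.
        torch.cuda.set_device(int(reserved_device))
    else:
        # The env var hides every other GPU, so the reserved device is
        # re-indexed as device 0 inside this process.
        torch.cuda.set_device(0)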
After Change
    # initialized because the env var is used later when
    # starting the DDP setup.
    os.environ["CUDA_VISIBLE_DEVICES"] = reserved_device
    if visible_devices:
        # We want to set the index on the visible devices list.
        if reserved_device not in visible_devices:
            raise RuntimeError(
                "TorchTrainer reserved a device {} that was not in the "
                "CUDA_VISIBLE_DEVICES {}. This may be because the "
                "Ray cluster is not set with the right env vars. "
                "If that is not the issue, please raise a "
                "Github issue.".format(reserved_device, visible_devices))
        devices = visible_devices.split(",")
        scoped_index = devices.index(reserved_device)
        self._set_cuda_device(str(scoped_index))
    else:
        # Once cuda is initialized, torch.device ignores the os.env
        # so we have to set the right actual device.
        self._set_cuda_device(reserved_device)

def _set_cuda_device(self, device_str):
    """Sets the CUDA device for this current local worker."""
    if self._is_set:
        raise RuntimeError("CUDA devices already set.")
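Aside (not part of the diff): the scoped-index mapping that this change introduces can be illustrated in isolation. The function name scoped_cuda_index is hypothetical, not a Ray API; it only mirrors the split/index logic above.

def scoped_cuda_index(reserved_device: str, visible_devices: str) -> int:
    # Map a global GPU id to its position within CUDA_VISIBLE_DEVICES,
    # which is the index PyTorch uses inside the process.
    devices = visible_devices.split(",")
    if reserved_device not in devices:
        raise RuntimeError(
            "Device {} is not listed in CUDA_VISIBLE_DEVICES {}.".format(
                reserved_device, visible_devices))
    return devices.index(reserved_device)

# With CUDA_VISIBLE_DEVICES="2,3,5", GPU "5" is torch device index 2.
assert scoped_cuda_index("5", "2,3,5") == 2
assert scoped_cuda_index("2", "2,3,5") == 0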
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 4
Instances
Project Name: ray-project/ray
Commit Name: 58efec0f2bc7e8de2ea9b89ec638d4a0a2d60537
Time: 2020-06-12
Author: rliaw@berkeley.edu
File Name: python/ray/util/sgd/torch/distributed_torch_runner.py
Class Name: LocalDistributedRunner
Method Name: _try_reserve_and_set_cuda
Project Name: ray-project/ray
Commit Name: 448013222978734615aaebc8e312f946fdce94fe
Time: 2021-03-31
Author: 74173148+iycheng@users.noreply.github.com
File Name: python/ray/util/client/server/dataservicer.py
Class Name: DataServicer
Method Name: Datapath
Project Name: pytorch/fairseq
Commit Name: 01be083e46d2e4614dc274b0edf29d0ddd516186
Time: 2020-10-27
Author: myleott@fb.com
File Name: fairseq/dataclass/utils.py
Class Name:
Method Name: convert_namespace_to_omegaconf