```python
import torch
import argparse
import os

from apex import amp
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# FOR DISTRIBUTED: If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if 'WORLD_SIZE' in os.environ:
    args.distributed = int(os.environ['WORLD_SIZE']) > 1

if args.distributed:
    # FOR DISTRIBUTED: Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will provide
    # environment variables, and requires that you use init_method='env://'.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')
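
    # Seed each process from its rank so every rank generates its own fake
    # batch below (torch.manual_seed also seeds the CUDA RNGs).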
    torch.manual_seed(torch.distributed.get_rank())
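
# cudnn.benchmark lets cuDNN autotune its kernels for fixed input shapes.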
torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16

# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
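
# opt_level="O2" casts the model to FP16 and keeps FP32 "master" weights
# inside the optimizer; amp enables dynamic loss scaling by default.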
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

if args.distributed:
    # FOR DISTRIBUTED: After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
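    # scale_loss multiplies the loss by the current loss scale so the FP16
    # gradients don't underflow; they are unscaled again before step().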
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)
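
# Each rank saves its FP16 model params and FP32 master params so they can be
# compared across processes afterwards.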
rank = torch.distributed.get_rank() if args.distributed else 0
torch.save(list(model.parameters()), "rank{}model.pth".format(rank))
torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(rank))
```
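
The script is meant to be launched with, e.g., `python -m torch.distributed.launch --nproc_per_node=2 main.py` (the name `main.py` is illustrative): the launcher sets the `WORLD_SIZE` environment variable and passes `--local_rank` to each process, which is exactly what the argument parsing above expects.

As a quick sanity check, the saved files can be compared across ranks: DistributedDataParallel averages gradients, so after the same number of steps every rank should hold identical parameters even though each rank trained on different fake data. A minimal sketch, assuming two processes were launched and the `.pth` files above were written to the working directory:

```python
import torch

# Load what each rank saved (map to CPU so no GPU is needed for the check).
model0 = torch.load("rank0model.pth", map_location="cpu")
model1 = torch.load("rank1model.pth", map_location="cpu")
master0 = torch.load("rank0master.pth", map_location="cpu")
master1 = torch.load("rank1master.pth", map_location="cpu")

# The FP16 model params should match across ranks...
for p0, p1 in zip(model0, model1):
    assert torch.allclose(p0.float(), p1.float()), "model params diverged across ranks"

# ...and so should the FP32 master params kept by amp.
for m0, m1 in zip(master0, master1):
    assert torch.allclose(m0, m1), "master params diverged across ranks"

print("rank 0 and rank 1 parameters match")
```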