dist_utils.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import

import os

import paddle.fluid as fluid


def nccl2_prepare(trainer_id, startup_prog, main_prog):
    """Transpile the programs for NCCL2-based distributed (GPU) training.

    The trainer endpoint list and this trainer's endpoint are read from the
    PADDLE_TRAINER_ENDPOINTS and PADDLE_CURRENT_ENDPOINT environment
    variables set by the distributed launcher.
    """
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
        startup_program=startup_prog,
        program=main_prog)


def collective_prepare(trainer_id, startup_prog, main_prog):
    """Transpile the programs for collective (grad_allreduce) training.

    Used instead of NCCL2 on devices without NCCL support, e.g. NPU.
    """
    config = fluid.DistributeTranspilerConfig()
    config.mode = "collective"
    config.collective_mode = "grad_allreduce"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
        startup_program=startup_prog,
        program=main_prog)


def prepare_for_multi_process(exe, build_strategy, startup_prog, main_prog):
    """Prepare the programs and build strategy for multi-process training.

    A no-op when PADDLE_TRAINERS_NUM is 1 (single-process run).
    """
    trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0))
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    if num_trainers < 2:
        return
    build_strategy.num_trainers = num_trainers
    build_strategy.trainer_id = trainer_id
    # NPU builds cannot use NCCL2; fall back to collective grad_allreduce.
    if fluid.core.is_compiled_with_npu():
        collective_prepare(trainer_id, startup_prog, main_prog)
    else:
        nccl2_prepare(trainer_id, startup_prog, main_prog)


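# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how prepare_for_multi_process might be wired into a
# training script under the fluid 1.x API. The toy network below and the
# FLAGS_selected_gpus environment variable (set by paddle.distributed.launch)
# are assumptions for illustration only.
def _example_usage():
    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
    exe = fluid.Executor(place)
    startup_prog = fluid.Program()
    main_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        # Toy regression network, purely a placeholder.
        x = fluid.data(name='x', shape=[None, 1], dtype='float32')
        y = fluid.data(name='y', shape=[None, 1], dtype='float32')
        pred = fluid.layers.fc(input=x, size=1)
        loss = fluid.layers.reduce_mean(
            fluid.layers.square_error_cost(input=pred, label=y))
        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
    build_strategy = fluid.BuildStrategy()
    # Rewrites the programs in place when PADDLE_TRAINERS_NUM > 1;
    # otherwise it returns without touching them.
    prepare_for_multi_process(exe, build_strategy, startup_prog, main_prog)
    exe.run(startup_prog)
    compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    return exe, compiled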