run_test.sh 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. #!/bin/bash
  #######################################
  # Print a message as a highlighted banner: black text on a green
  # background, with blank lines above and below it.
  # Arguments: $1 - text to display; it is printed literally.
  # Outputs:   writes the banner to stdout
  #######################################
  print_banner() {
    # The message is passed through %s instead of being spliced into the
    # format string, so a '%' or backslash inside it cannot be misread as a
    # printf directive.
    printf '\n\n\n\e[30m\e[42m%s\e[0m\n\n\n\n' "$1"
  }
  # Positional arguments:
  #   $1 - run mode: "single_gpu" or "distributed"
  #   $2 - data directory, appended as the last argument of every main_amp.py run
  #   $3 - optional; any non-empty value clears USE_BASELINE
  print_banner "Distributed status: $1"

  # Echo the data directory so it shows up in the log.
  printf '%s\n' "$2"
  DATADIR=$2

  # NOTE(review): USE_BASELINE is set here but not referenced anywhere else
  # in the visible script; the comparison step passes --use_baseline
  # literally. Confirm whether that step was meant to honour this variable.
  if [[ -n "$3" ]]; then
    USE_BASELINE=""
  else
    USE_BASELINE="--use_baseline"
  fi

  # BASE_CMD is a plain string that is later expanded unquoted so that it
  # word-splits into the launcher and its arguments.
  case "$1" in
    single_gpu)
      BASE_CMD="python main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
      ;;
    distributed)
      BASE_CMD="python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
      ;;
    *)
      # Without a known mode BASE_CMD would stay unset and every later run
      # would try to execute its first flag as a command; stop before the
      # install steps instead.
      printf 'Unknown mode "%s": expected "single_gpu" or "distributed"\n' "$1" >&2
      exit 1
      ;;
  esac
  # Fixed flag set used by the FusedAdam runs.
  ADAM_ARGS='--opt-level O2 --keep-batchnorm-fp32 False --fused-adam'

  # Flag tables for the test sweep. Each entry is expanded unquoted on the
  # command line, so an empty entry adds no argument at all.
  keep_batchnorms=('' '--keep-batchnorm-fp32 True' '--keep-batchnorm-fp32 False')
  loss_scales=('' '--loss-scale 1.0' '--loss-scale 128.0' '--loss-scale dynamic')

  # Values passed to --opt-level.
  opt_levels=(O0 O1 O2 O3)
  # Remove files in the current directory whose names start with "True" or
  # "False". -f keeps a directory with no such files from producing
  # "No such file" errors; this runs before `set -e`, so it never aborts.
  # NOTE(review): presumably these are outputs left by a previous run of
  # this script - confirm against main_amp.py / compare.py.
  rm -f -- True* False*

  # From here on, stop at the first command that fails.
  set -e

  print_banner "Installing Apex with --cuda_ext and --cpp_ext"

  # Install the package found three directories above the current one,
  # building both the C++ and CUDA extensions.
  pushd ../../..
  pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
  popd
  # Run main_amp.py once for every opt-level / loss-scale / keep-batchnorm
  # combination, passing --has-ext on each run.
  for opt_level in "${opt_levels[@]}"; do
    for loss_scale in "${loss_scales[@]}"; do
      for keep_batchnorm in "${keep_batchnorms[@]}"; do
        # O1 is only run with the empty keep-batchnorm entry.
        if [[ "$opt_level" == "O1" && -n "$keep_batchnorm" ]]; then
          print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
          continue
        fi

        print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR"

        # BASE_CMD, loss_scale and keep_batchnorm are left unquoted on
        # purpose: each holds several words (or none) that must split into
        # separate arguments. DATADIR is passed as exactly one argument when
        # set, and as no argument when empty, so paths containing spaces or
        # glob characters survive.
        # shellcheck disable=SC2086
        set -x
        ${BASE_CMD} --opt-level "${opt_level}" ${loss_scale} ${keep_batchnorm} --has-ext ${DATADIR:+"$DATADIR"}
        set +x
      done
    done
  done
  # FusedAdam is handled separately because of its limited support.
  # It is not checked for bitwise accuracy against the Python implementation;
  # the L0 tests already do that. These runs only confirm that it executes
  # and give an idea of performance.
  for loss_scale in "${loss_scales[@]}"; do
    print_banner "${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR"

    # BASE_CMD, ADAM_ARGS and loss_scale are intentionally unquoted so they
    # split into separate arguments (an empty loss_scale adds none). DATADIR
    # is passed as a single argument when set and omitted when empty.
    # shellcheck disable=SC2086
    set -x
    ${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext ${DATADIR:+"$DATADIR"}
    set +x
  done
  print_banner "Reinstalling apex without extensions"

  # Reinstall the package found three directories up, this time without the
  # extension build options.
  pushd ../../..
  pip install -v --no-cache-dir .
  popd

  # Repeat the opt-level / loss-scale / keep-batchnorm sweep, this time
  # without --has-ext.
  for opt_level in "${opt_levels[@]}"; do
    for loss_scale in "${loss_scales[@]}"; do
      for keep_batchnorm in "${keep_batchnorms[@]}"; do
        # O1 is only run with the empty keep-batchnorm entry.
        if [[ "$opt_level" == "O1" && -n "$keep_batchnorm" ]]; then
          print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
          continue
        fi

        print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR"

        # BASE_CMD, loss_scale and keep_batchnorm are left unquoted on
        # purpose so they split into separate arguments (or none). DATADIR
        # is passed as a single argument when set and omitted when empty.
        # shellcheck disable=SC2086
        set -x
        ${BASE_CMD} --opt-level "${opt_level}" ${loss_scale} ${keep_batchnorm} ${DATADIR:+"$DATADIR"}
        set +x
      done
    done
  done
  print_banner "Checking for bitwise accuracy between Python-only and cpp/cuda extension installs"

  # Call compare.py once for each combination that the sweeps above ran.
  # NOTE(review): --use_baseline is passed literally here; the USE_BASELINE
  # variable set from the third script argument is not consulted - confirm
  # whether that is intended.
  for opt_level in "${opt_levels[@]}"; do
    for loss_scale in "${loss_scales[@]}"; do
      for keep_batchnorm in "${keep_batchnorms[@]}"; do
        echo ""
        # Everything is compared except O1 paired with a non-empty
        # keep-batchnorm entry, which is reported as skipped.
        if [[ $opt_level != "O1" || -z $keep_batchnorm ]]; then
          echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
          # loss_scale and keep_batchnorm stay unquoted so they split into
          # separate arguments, or vanish when empty.
          set -x
          python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
          set +x
        else
          echo "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        fi
      done
    done
  done
  print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"

  # Finish by reinstalling the package found three directories up with both
  # extension build options, the same install the script started with.
  pushd ../../..
  pip install \
    -v \
    --no-cache-dir \
    --global-option="--cpp_ext" \
    --global-option="--cuda_ext" \
    .
  popd