CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ai-forever

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: ai-forever/sber-swap
Path: blob/main/apex/tests/L1/common/run_test.sh
Views: 794
1
#!/bin/bash
2
3
# Print the message in $1 as a black-on-green banner, padded with blank lines.
# Fix: pass the message as a printf ARGUMENT via %s instead of splicing it into
# the format string, so messages containing '%' or backslashes print verbatim.
print_banner() {
  printf '\n\n\n\e[30m\e[42m%s\e[0m\n\n\n\n' "$1"
}
6
7
print_banner "Distributed status: $1"

# $2 is the data directory passed through to main_amp.py below.
# Fix: quote the expansion so paths with spaces/globs echo correctly.
echo "$2"
DATADIR=$2

# A non-empty third argument clears the baseline flag.
# NOTE(review): verify that the compare.py invocation later in this file
# consumes $USE_BASELINE — as written here it is only assigned.
if [ -n "$3" ]
then
  USE_BASELINE=""
else
  USE_BASELINE="--use_baseline"
fi
18
19
# Select the training command for the requested mode ($1). Any other value
# leaves BASE_CMD unset, exactly as the original pair of if-tests did.
case "$1" in
  single_gpu)
    BASE_CMD="python main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
    ;;
  distributed)
    BASE_CMD="python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
    ;;
esac
28
29
# Extra flags for the FusedAdam runs (exercised separately below).
ADAM_ARGS="--opt-level O2 --keep-batchnorm-fp32 False --fused-adam"

# The test matrix: every combination of these three lists is run.
# Empty entries mean "flag omitted"; order is preserved from the original.
keep_batchnorms=("" "--keep-batchnorm-fp32 True" "--keep-batchnorm-fp32 False")
loss_scales=("" "--loss-scale 1.0" "--loss-scale 128.0" "--loss-scale dynamic")
opt_levels=("O0" "O1" "O2" "O3")
50
51
# Remove result files from previous runs (names starting True/False).
# Fix: -f and a single rm so a clean tree does not print "No such file" errors.
rm -f True* False*

# Abort on the first failing command from here on.
set -e

print_banner "Installing Apex with --cuda_ext and --cpp_ext"

# Build and install apex WITH the cpp/cuda extensions from the repo root.
pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd
61
62
# Run the full opt-level x loss-scale x batchnorm matrix against the
# extension install (--has-ext). Expansions are deliberately unquoted so
# multi-word flag strings split into separate arguments.
for opt_level in "${opt_levels[@]}"; do
  for loss_scale in "${loss_scales[@]}"; do
    for keep_batchnorm in "${keep_batchnorms[@]}"; do
      # O1 combined with an explicit keep-batchnorm flag is skipped.
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]; then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR"
      set -x
      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR
      set +x
    done
  done
done
80
81
# FusedAdam is exercised separately because of its limited support. It is not
# checked bitwise against the Python implementation here (the L0 tests already
# do that); these runs only confirm it executes and give a feel for speed.
for loss_scale in "${loss_scales[@]}"; do
  print_banner "${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR"
  set -x
  ${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR
  set +x
done
92
93
print_banner "Reinstalling apex without extensions"
94
95
pushd ../../..
96
pip install -v --no-cache-dir .
97
popd
98
99
# Re-run the full matrix against the Python-only install (note: no --has-ext
# flag this time). Expansions stay unquoted so multi-word flags split.
for opt_level in "${opt_levels[@]}"; do
  for loss_scale in "${loss_scales[@]}"; do
    for keep_batchnorm in "${keep_batchnorms[@]}"; do
      # O1 combined with an explicit keep-batchnorm flag is skipped.
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]; then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR"
      set -x
      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR
      set +x
    done
  done
done
117
118
print_banner "Checking for bitwise accuracy between Python-only and cpp/cuda extension installs"

# Compare the result files produced by the two installs for each matrix cell.
for opt_level in "${opt_levels[@]}"; do
  for loss_scale in "${loss_scales[@]}"; do
    for keep_batchnorm in "${keep_batchnorms[@]}"; do
      echo ""
      # O1 combined with an explicit keep-batchnorm flag is skipped.
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]; then
        echo "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
      set -x
      # Fix: honor $USE_BASELINE (computed from $3 at the top of the script)
      # instead of a hard-coded --use_baseline, so that passing a third
      # argument actually disables the baseline comparison as intended.
      python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} ${USE_BASELINE}
      set +x
    done
  done
done
139
140
print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"
141
142
pushd ../../..
143
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
144
popd
145
146