#!/bin/bash
# script_lra_others.sh: launch an LRA training run with torchrun.
export HYDRA_FULL_ERROR=1
export DATA_PATH=path_to_data_path   # TODO: set to your dataset root
program_path=path_to_lra_dir         # TODO: set to the directory containing train.py
# Positional arguments (mapped onto Hydra overrides in the torchrun call below)
TASK=${1}                # LRA task name
ARCH=${2}                # model architecture
BS=${3}                  # loader.batch_size
N_LAYERS=${4}            # model.n_layers
D_MODEL=${5}             # model.d_model
NORM=${6}                # model.norm
lr=${7}                  # optimizer.lr
wd=${8}                  # optimizer.weight_decay
cards=${9}               # number of GPUs (trainer.devices / --nproc_per_node)
n_works=${10}            # loader.num_workers
dropout=${11}            # model.dropout
PRENORM=${12}            # model.prenorm
warmup_steps=${13}       # scheduler.num_warmup_steps
training_steps=${14}     # scheduler.num_training_steps
expand_ratio_glu=${15}   # model.expand_ratio_glu
expand_ratio=${16}       # model.expand_ratio
training_epochs=${17}    # trainer.max_epochs
use_lower_bound=${18}    # model.use_lower_bound
encoder=${19}            # model.encoder
use_series=${20}         # model.use_series
gradient_clip=${21}      # trainer.gradient_clip_val
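
# Optional sanity check (a minimal sketch, not required by the launcher below):
# this wrapper expects all 21 positional arguments; fail fast with a usage hint
# if fewer are supplied.
if [ "$#" -lt 21 ]; then
    echo "Usage: $0 TASK ARCH BS N_LAYERS D_MODEL NORM lr wd cards n_works dropout PRENORM warmup_steps training_steps expand_ratio_glu expand_ratio training_epochs use_lower_bound encoder use_series gradient_clip" >&2
    exit 1
fi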
echo "cards=${cards}"
mkdir -p "logs_${TASK}"
START_TIME=$(date +%Y%m%d-%H:%M:%S)

# Launch distributed training; the key=value pairs are Hydra overrides applied
# on top of the selected experiment config.
torchrun --standalone \
    --nproc_per_node=${cards} \
    ${program_path}/train.py wandb=null experiment=${ARCH}-lra-${TASK} \
    trainer.devices=${cards} \
    trainer.precision=bf16 \
    loader.batch_size=${BS} \
    loader.num_workers=${n_works} \
    scheduler.num_warmup_steps=${warmup_steps} \
    scheduler.num_training_steps=${training_steps} \
    optimizer.lr=${lr} optimizer.weight_decay=${wd} \
    model.n_layers=${N_LAYERS} model.d_model=${D_MODEL} \
    model.norm=${NORM} model.prenorm=${PRENORM} train.seed=2222 \
    model.dropout=${dropout} \
    model.expand_ratio_glu=${expand_ratio_glu} \
    model.expand_ratio=${expand_ratio} \
    model.use_lower_bound=${use_lower_bound} \
    trainer.max_epochs=${training_epochs} \
    model.encoder=${encoder} \
    model.use_series=${use_series} \
    trainer.gradient_clip_val=${gradient_clip} \
    decoder.mode=pool | tee logs_${TASK}/${START_TIME}_${ARCH}-lra-${TASK}.log
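
# Example invocation (hypothetical values for illustration only; the TASK and
# ARCH names must match an existing experiment config, experiment=${ARCH}-lra-${TASK}):
#   bash script_lra_others.sh listops s4 32 6 128 layer 0.001 0.01 4 4 0.1 \
#       true 1000 50000 2 2 50 false conv true 1.0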