-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrun_experiments_parallel.sh
78 lines (67 loc) · 2.38 KB
/
run_experiments_parallel.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Experiment configuration.
#
# param_str selects one of the presets below; experiment_name is the local
# (and remote, under ~/naszilla/) directory holding the pickle files.
# Requires bash: indexed arrays are used for the instance/zone lists.
param_str=fifty_epochs
experiment_name=bananas

# set all instance names and zones
# The two arrays are indexed in lock-step: instances[j] is addressed with
# zones[j] when copying files to it and when running commands on it.
instances=(
  bananas-t4-1-vm bananas-t4-2-vm bananas-t4-3-vm bananas-t4-4-vm
  bananas-t4-5-vm bananas-t4-6-vm bananas-t4-7-vm bananas-t4-8-vm
  bananas-t4-9-vm bananas-t4-10-vm
)
zones=(
  us-west1-b us-west1-b us-west1-b us-west1-b us-west1-b us-west1-b
  us-west1-b us-west1-b us-west1-b us-west1-b
)

# A length mismatch would silently hand an empty --zone to gcloud later on.
if (( ${#instances[@]} != ${#zones[@]} )); then
  printf 'error: %d instances but %d zones configured\n' \
    "${#instances[@]}" "${#zones[@]}" >&2
  exit 1
fi

# set parameters based on the param string
# Values shared by every preset.
start_iteration=0
k=10
untrained_filename=untrained_spec
trained_filename=trained_spec

# Preset-specific values: the last iteration index (inclusive) and the
# number of epochs passed to the per-architecture training run.
case "$param_str" in
  test)
    end_iteration=1
    epochs=1
    ;;
  fifty_epochs)
    end_iteration=9
    epochs=50
    ;;
  *)
    # Without this arm an unknown preset leaves the loop bounds unset and
    # the script would do nothing while still exiting successfully.
    printf 'error: unknown param_str: %s\n' "$param_str" >&2
    exit 1
    ;;
esac
# start bananas
#
# Main driver loop. Each iteration i:
#   1. runs metann_runner.py locally with --query i*k,
#   2. copies k untrained pickle files to k instances (one per instance) and
#      launches train_arch_runner.py on all of them in parallel,
#   3. waits for every remote job, then copies the trained pickles back.
# Any failed step aborts the run, because later iterations consume the
# files produced by earlier ones.

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# nas_params is not assigned anywhere in this script, so --params used to be
# passed without a value. Honour a value inherited from the environment,
# otherwise fall back to the preset name.
# NOTE(review): assumes metann_runner.py expects the preset name here — confirm.
nas_params=${nas_params:-$param_str}

# Job j runs on instances[j]; more jobs than instances would index past the
# end of the array and call gcloud with an empty instance name.
(( k <= ${#instances[@]} )) \
  || die "k=$k exceeds the ${#instances[@]} configured instances"

for (( i = start_iteration; i <= end_iteration; i++ )); do
  start=$(( i * k ))

  # train the neural net
  # input: all pickle files with index from 0 to i*k-1
  # output: k pickle files for the architectures to train next (indices i*k to (i+1)*k-1)
  echo "about to run meta neural network in iteration $i"
  python3 metann_runner.py --experiment_name "$experiment_name" \
    --params "$nas_params" --k "$k" \
    --untrained_filename "$untrained_filename" \
    --trained_filename "$trained_filename" --query "$start" \
    || die "metann_runner.py failed in iteration $i"
  echo "outputted architectures to train in iteration $i"

  # train the k architectures
  pids=()
  pid_instances=()
  failed=0
  for (( j = 0; j < k; j++ )); do
    query=$(( i * k + j ))
    instance=${instances[j]}
    zone=${zones[j]}
    untrained_filepath=${experiment_name}/${untrained_filename}_${query}.pkl
    trained_filepath=${experiment_name}/${trained_filename}_${query}.pkl

    echo "about to copy file $untrained_filepath to instance $instance"
    if ! gcloud compute scp "$untrained_filepath" \
      "$instance:~/naszilla/$experiment_name/" --zone "$zone"; then
      # Do not exit here: jobs already launched are still running, so record
      # the failure and abort only after the barrier below.
      printf 'error: copy of %s to %s failed\n' \
        "$untrained_filepath" "$instance" >&2
      failed=1
      continue
    fi

    echo "about to ssh into instance $instance"
    # '&&' keeps python from running in the wrong directory if cd fails.
    remote_cmd="cd naszilla && python3 train_arch_runner.py"
    remote_cmd+=" --untrained_filepath $untrained_filepath"
    remote_cmd+=" --trained_filepath $trained_filepath --epochs $epochs"
    gcloud compute ssh "$instance" --zone "$zone" --command="$remote_cmd" &
    pids+=("$!")
    pid_instances+=("$instance")
  done

  # Barrier: wait on each job individually so its exit status is observed
  # (a bare 'wait' discards the statuses of the background jobs).
  for idx in "${!pids[@]}"; do
    if ! wait "${pids[idx]}"; then
      printf 'error: training on instance %s failed\n' \
        "${pid_instances[idx]}" >&2
      failed=1
    fi
  done
  (( failed == 0 )) || die "training failed in iteration $i"
  echo "all architectures trained in iteration $i"

  # copy results of trained architectures to the master CPU
  for (( j = 0; j < k; j++ )); do
    query=$(( i * k + j ))
    instance=${instances[j]}
    zone=${zones[j]}
    trained_filepath=${experiment_name}/${trained_filename}_${query}.pkl
    gcloud compute scp "$instance:~/naszilla/$trained_filepath" \
      "$experiment_name" --zone "$zone" \
      || die "could not fetch $trained_filepath from $instance"
  done
  echo "finished iteration $i"
done