diff --git a/example_run.sh b/example_run.sh index 59e567c..953c7aa 100755 --- a/example_run.sh +++ b/example_run.sh @@ -1,12 +1,13 @@ export MPICH_GPU_SUPPORT_ENABLED=1 srun \ + -t 0:10:0 \ -N $NODES \ - -n $((NODES*GPUS_PER_NODE)) \ + --ntasks-per-node $GPUS_PER_NODE \ --partition=standard-g \ --account=project_465000858 \ --gpus $((NODES*GPUS_PER_NODE)) \ singularity exec \ -B $(pwd):/myrun \ - /project/project_465000858/klajster.sif \ - /myrun/examples/torch/job.sh $((NODES*GPUS_PER_NODE)) + /flash/project_465000858/klajster.sif \ + /myrun/examples/torch/job.sh $GPUS_PER_NODE $NODES diff --git a/examples/torch/job.sh b/examples/torch/job.sh index 7bde81d..2120f40 100755 --- a/examples/torch/job.sh +++ b/examples/torch/job.sh @@ -1,6 +1,7 @@ #!/bin/bash $WITH_CONDA GPUS=$1 +NUM_NODES=$2 echo "GPUS: $GPUS" -MAX_GPU_ID=$((GPUS=1)) -python /myrun/examples/torch/train.py --rootdir /myrun/runs/torch-example --gpu_device $(seq -s ' ' 0 $MAX_GPU_ID) --batch_size 768 +MAX_GPU_ID=$((GPUS-1)) +python /myrun/examples/torch/train_lightning.py --rootdir /myrun/runs/torch-example --gpu_device $(seq -s ' ' 0 $MAX_GPU_ID) --batch_size 768 --num_nodes $NUM_NODES diff --git a/examples/torch/train_lightning.py b/examples/torch/train_lightning.py index 89eab95..686893a 100644 --- a/examples/torch/train_lightning.py +++ b/examples/torch/train_lightning.py @@ -16,13 +16,16 @@ parser.add_argument("--batch_size", type=int, default=768, help="") parser.add_argument("--num_workers", type=int, default=4, help="") parser.add_argument("--gpu_devices", type=int, nargs="+", default=None, help="") +parser.add_argument("--num_nodes", type=int, default=1, help="") +parser.add_argument('--rootdir', type=str, help='') class CIFARDataModule(pl.LightningDataModule): - def __init__(self, batch_size: int, num_workers: int): + def __init__(self, batch_size: int, num_workers: int, rootdir: str): super().__init__() self.batch_size = batch_size self.num_workers = num_workers + self.rootdir = rootdir self.train_data = None self.transforms = T.Compose( @@ -36,7 +39,7 @@ def __init__(self, batch_size: int, num_workers: int): def setup(self, stage: str) -> None: self.train_data = CIFAR10( - root="./data", + root=self.rootdir, train=True, download=True, transform=self.transforms, @@ -89,6 +92,7 @@ def main(): dm = CIFARDataModule( batch_size=args.batch_size, num_workers=args.num_workers, + rootdir=args.rootdir, ) model = Model(lr=args.lr) @@ -97,6 +101,7 @@ def main(): devices=len(args.gpu_devices), strategy="ddp", max_epochs=5, + num_nodes=args.num_nodes ) trainer.fit(model=model, datamodule=dm)