Skip to content

Commit

Permalink
idziemystad
Browse files Browse the repository at this point in the history
  • Loading branch information
MGniew committed Nov 28, 2023
1 parent 5ff437b commit e8699ca
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 7 deletions.
7 changes: 4 additions & 3 deletions example_run.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
export MPICH_GPU_SUPPORT_ENABLED=1

srun \
-t 0:10:0 \
-N $NODES \
-n $((NODES*GPUS_PER_NODE)) \
--ntasks-per-node $GPUS_PER_NODE \
--partition=standard-g \
--account=project_465000858 \
--gpus $((NODES*GPUS_PER_NODE)) \
singularity exec \
-B $(pwd):/myrun \
/project/project_465000858/klajster.sif \
/myrun/examples/torch/job.sh $((NODES*GPUS_PER_NODE))
/flash/project_465000858/klajster.sif \
/myrun/examples/torch/job.sh $GPUS_PER_NODE $NODES
5 changes: 3 additions & 2 deletions examples/torch/job.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash
$WITH_CONDA
GPUS=$1
NUM_NODES=$2
echo "GPUS: $GPUS"
-MAX_GPU_ID=$((GPUS=1))
-python /myrun/examples/torch/train.py --rootdir /myrun/runs/torch-example --gpu_device $(seq -s ' ' 0 $MAX_GPU_ID) --batch_size 768
+MAX_GPU_ID=$((GPUS-1))
+python /myrun/examples/torch/train_lightning.py --rootdir /myrun/runs/torch-example --gpu_device $(seq -s ' ' 0 $MAX_GPU_ID) --batch_size 768 --num_nodes $NUM_NODES
9 changes: 7 additions & 2 deletions examples/torch/train_lightning.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,16 @@
parser.add_argument("--batch_size", type=int, default=768, help="")
parser.add_argument("--num_workers", type=int, default=4, help="")
parser.add_argument("--gpu_devices", type=int, nargs="+", default=None, help="")
parser.add_argument("--num_nodes", type=int, default=1, help="")
parser.add_argument('--rootdir', type=str, help='')


class CIFARDataModule(pl.LightningDataModule):
-def __init__(self, batch_size: int, num_workers: int):
+def __init__(self, batch_size: int, num_workers: int, rootdir: str):
super().__init__()
self.batch_size = batch_size
self.num_workers = num_workers
self.rootdir = rootdir

self.train_data = None
self.transforms = T.Compose(
Expand All @@ -36,7 +39,7 @@ def __init__(self, batch_size: int, num_workers: int):

def setup(self, stage: str) -> None:
self.train_data = CIFAR10(
-root="./data",
+root=self.rootdir,
train=True,
download=True,
transform=self.transforms,
Expand Down Expand Up @@ -89,6 +92,7 @@ def main():
dm = CIFARDataModule(
batch_size=args.batch_size,
num_workers=args.num_workers,
rootdir=args.rootdir,
)
model = Model(lr=args.lr)

Expand All @@ -97,6 +101,7 @@ def main():
devices=len(args.gpu_devices),
strategy="ddp",
max_epochs=5,
num_nodes=args.num_nodes
)

trainer.fit(model=model, datamodule=dm)
Expand Down

0 comments on commit e8699ca

Please sign in to comment.