# Telemetry bootstrap template.
#
# The image build renders this file by substituting the brace-delimited
# FRAMEWORK, FRAMEWORK_VERSION and CONTAINER_TYPE anchors in the command string
# below, and the rendered copy is installed into the image as sitecustomize.py.
# The anchors are replaced by plain text substitution, so they must stay
# exactly as written and the command must remain a regular (non-f) string.
import os

try:
    # Tracking stays enabled unless OPT_OUT_TRACKING is "true" in any letter
    # case; an unset variable or any other value leaves it on.
    if (
        os.path.exists("/usr/local/bin/deep_learning_container.py")
        and os.getenv("OPT_OUT_TRACKING", "").lower() != "true"
    ):
        import threading

        # os.system hands the command to /bin/sh, so use the POSIX redirection
        # form. The bash-only "&>" is read by other shells as "run in the
        # background" followed by a separate, empty redirect, which leaves the
        # command's output unsilenced.
        cmd = "python /usr/local/bin/deep_learning_container.py --framework {FRAMEWORK} --framework-version {FRAMEWORK_VERSION} --container-type {CONTAINER_TYPE} >/dev/null 2>&1"
        # Pass daemon=True to the constructor: Thread.setDaemon() is deprecated
        # since Python 3.10. A daemon thread never delays interpreter exit.
        x = threading.Thread(target=lambda: os.system(cmd), daemon=True)
        x.start()
except Exception:
    # Deliberately best-effort: a telemetry failure must never prevent the
    # interpreter from starting.
    pass
a/pytorch/inference/docker/2.4/py3/cu124/Dockerfile.gpu b/pytorch/inference/docker/2.4/py3/cu124/Dockerfile.gpu index 4e87faf9004e..11a982cc98da 100644 --- a/pytorch/inference/docker/2.4/py3/cu124/Dockerfile.gpu +++ b/pytorch/inference/docker/2.4/py3/cu124/Dockerfile.gpu @@ -242,6 +242,8 @@ RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.4/l COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY sitecustomize.py /usr/local/lib/${PYTHON_SHORT_VERSION}/sitecustomize.py + RUN chmod +x /usr/local/bin/deep_learning_container.py RUN HOME_DIR=/root \ diff --git a/src/image_builder.py b/src/image_builder.py index 7c3d3b6109b8..11c94b50f611 100644 --- a/src/image_builder.py +++ b/src/image_builder.py @@ -240,6 +240,39 @@ def image_builder(buildspec, image_types=[], device_types=[]): } } ) + # job_type will be either inference or training, based on the repo URI + if "training" in image_repo_uri: + label_job_type = "training" + elif "inference" in image_repo_uri: + label_job_type = "inference" + else: + raise RuntimeError( + f"Cannot find inference or training job type in {image_repo_uri}. " + f"This is required to set job_type label." 
+ ) + + template_file = os.path.join( + os.sep, get_cloned_folder_path(), "miscellaneous_scripts", "dlc_template.py" + ) + + template_fw_version = ( + str(image_config["framework_version"]) + if image_config.get("framework_version") + else str(BUILDSPEC["version"]) + ) + template_fw = str(BUILDSPEC["framework"]) + post_template_file = utils.generate_dlc_cmd( + template_path=template_file, + output_path=os.path.join(image_config["root"], "out.py"), + framework=template_fw, + framework_version=template_fw_version, + container_type=label_job_type, + ) + + ARTIFACTS.update( + {"customize": {"source": post_template_file, "target": "sitecustomize.py"}} + ) + context = Context(ARTIFACTS, f"build/{image_name}.tar.gz", image_config["root"]) if "labels" in image_config: @@ -265,17 +298,6 @@ def image_builder(buildspec, image_types=[], device_types=[]): label_contributor = str(BUILDSPEC.get("contributor")) label_transformers_version = str(transformers_version).replace(".", "-") - # job_type will be either inference or training, based on the repo URI - if "training" in image_repo_uri: - label_job_type = "training" - elif "inference" in image_repo_uri: - label_job_type = "inference" - else: - raise RuntimeError( - f"Cannot find inference or training job type in {image_repo_uri}. " - f"This is required to set job_type label." 
def generate_dlc_cmd(template_path, output_path, framework, framework_version, container_type):
    """
    Render a template file by filling in its framework/container anchors.

    Every occurrence of the literal anchors ``{FRAMEWORK}``,
    ``{FRAMEWORK_VERSION}`` and ``{CONTAINER_TYPE}`` in the template is replaced
    with the corresponding argument, and the result is written to
    ``output_path``. Any other brace-delimited text is left untouched.

    :param template_path: path of the template file to read
    :param output_path: path the rendered file is written to (overwritten if present)
    :param framework: value substituted for ``{FRAMEWORK}``
    :param framework_version: value substituted for ``{FRAMEWORK_VERSION}``
    :param container_type: value substituted for ``{CONTAINER_TYPE}``
    :return: base name of ``output_path`` (not the full path)
    :raises TypeError: if any substitution value is None
    """
    # Explicit encoding keeps the rendered file independent of the build
    # host's locale settings.
    with open(template_path, "r", encoding="utf-8") as tf:
        content = tf.read()

    replacements = {
        "FRAMEWORK": framework,
        "FRAMEWORK_VERSION": framework_version,
        "CONTAINER_TYPE": container_type,
    }

    for anchor, value in replacements.items():
        if value is None:
            # Fail loudly instead of baking the text "None" into the output.
            raise TypeError(f"No value supplied for template anchor {anchor}")
        # Coerce to str so non-string values (e.g. a version parsed from YAML
        # as the float 2.4) are accepted; str.replace only takes strings.
        content = content.replace(f"{{{anchor}}}", str(value))

    with open(output_path, "w", encoding="utf-8") as out_f:
        out_f.write(content)

    # Return base path and set as artifact
    return os.path.basename(output_path)
opt_in_opt_out_test(): print("Opt-In/Opt-Out Test passed") -def perf_test(): +def perf_test(exec_cmd): os.environ["TEST_MODE"] = "0" os.environ["OPT_OUT_TRACKING"] = "False" NUM_ITERATIONS = 5 @@ -51,7 +51,7 @@ def perf_test(): for itr in range(NUM_ITERATIONS): total_time_in = 0 for x in range(NUM_ITERATIONS): - cmd = "python -c 'import torch'" + cmd = f"python -c '{exec_cmd}'" start = time.time() os.system(cmd) total_time_in += time.time() - start @@ -59,7 +59,7 @@ def perf_test(): total_time_out = 0 for x in range(NUM_ITERATIONS): - cmd = "export OPT_OUT_TRACKING='true' && python -c 'import torch'" + cmd = f"export OPT_OUT_TRACKING='true' && python -c '{exec_cmd}'" start = time.time() os.system(cmd) total_time_out += time.time() - start @@ -72,7 +72,11 @@ def perf_test(): print("DLC Telemetry performance test Passed") -perf_test() -opt_in_opt_out_test() +perf_test("import torch") +opt_in_opt_out_test("import torch") + +# Disabling os tests until it is added to all new images +# perf_test("import os") +# opt_in_opt_out_test("import os") print("All DLC telemetry test passed") diff --git a/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test.py b/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test.py index af1ba12f62ba..091e5e722134 100644 --- a/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test.py +++ b/test/dlc_tests/container_tests/bin/test_tf_dlc_telemetry_test.py @@ -10,13 +10,13 @@ def _clean_up_reports(): os.system("rm /tmp/test_tag_request.txt") -def opt_in_opt_out_test(): +def opt_in_opt_out_test(exec_cmd): os.environ["TEST_MODE"] = "1" for opt_out_value in ["True", "TRUE", "true"]: _clean_up_reports() os.environ["OPT_OUT_TRACKING"] = opt_out_value - cmd = "python -c 'import tensorflow'" + cmd = f"python -c '{exec_cmd}'" os.system(cmd) time.sleep(5) assert not os.path.exists( @@ -29,7 +29,7 @@ def opt_in_opt_out_test(): for opt_out_value in ["False", "XYgg"]: _clean_up_reports() os.environ["OPT_OUT_TRACKING"] = opt_out_value - 
cmd = "python -c 'import tensorflow'" + cmd = f"python -c '{exec_cmd}'" os.system(cmd) time.sleep(5) assert os.path.exists( @@ -43,7 +43,7 @@ def opt_in_opt_out_test(): print("Opt-In/Opt-Out Test passed") -def performance_test(): +def performance_test(exec_cmd): os.environ["TEST_MODE"] = "0" os.environ["OPT_OUT_TRACKING"] = "False" NUM_ITERATIONS = 5 @@ -51,7 +51,7 @@ def performance_test(): for itr in range(NUM_ITERATIONS): total_time_in = 0 for x in range(NUM_ITERATIONS): - cmd = "python -c 'import tensorflow'" + cmd = f"python -c '{exec_cmd}'" start = time.time() os.system(cmd) total_time_in += time.time() - start @@ -59,7 +59,7 @@ def performance_test(): total_time_out = 0 for x in range(NUM_ITERATIONS): - cmd = "export OPT_OUT_TRACKING='true' && python -c 'import tensorflow'" + cmd = f"export OPT_OUT_TRACKING='true' && python -c '{exec_cmd}'" start = time.time() os.system(cmd) total_time_out += time.time() - start @@ -72,7 +72,12 @@ def performance_test(): print("DLC Telemetry performance test Passed") -performance_test() -opt_in_opt_out_test() +# test framework functionality +performance_test("import tensorflow") +opt_in_opt_out_test("import tensorflow") + +# Disabling os tests until it is added to all new images +# performance_test("import os") +# opt_in_opt_out_test("import os") print("All DLC telemetry test passed")