diff --git a/gigl/src/validation_check/config_validator.py b/gigl/src/validation_check/config_validator.py index ec0ca4caf..bf6f3f5e1 100644 --- a/gigl/src/validation_check/config_validator.py +++ b/gigl/src/validation_check/config_validator.py @@ -16,6 +16,7 @@ assert_trained_model_exists, ) from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import ( + check_custom_launcher_config_requires_glt_backend, check_inferencer_graph_store_compatibility, check_trainer_graph_store_compatibility, ) @@ -23,6 +24,7 @@ check_if_kfp_pipeline_job_name_valid, ) from gigl.src.validation_check.libs.resource_config_checks import ( + check_custom_launcher_config_shape, check_if_inferencer_resource_config_valid, check_if_preprocessor_resource_config_valid, check_if_shared_resource_config_valid, @@ -202,25 +204,31 @@ GiGLComponents.ConfigPopulator.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_custom_launcher_config_requires_glt_backend, ], GiGLComponents.DataPreprocessor.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_custom_launcher_config_requires_glt_backend, ], GiGLComponents.SubgraphSampler.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_custom_launcher_config_requires_glt_backend, ], GiGLComponents.SplitGenerator.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_custom_launcher_config_requires_glt_backend, ], GiGLComponents.Trainer.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_custom_launcher_config_requires_glt_backend, ], GiGLComponents.Inferencer.value: [ check_inferencer_graph_store_compatibility, + check_custom_launcher_config_requires_glt_backend, ], # PostProcessor doesn't need graph store compatibility checks } @@ -347,6 +355,15 @@ def kfp_validation_checks( resource_config_wrapper=resource_config_wrapper, ) + # Validate any populated CustomLauncherConfig has a non-empty command. + # Unconditional — the check is shape-only and does not call out to any + # external service. + for component in (GiGLComponents.Trainer, GiGLComponents.Inferencer): + check_custom_launcher_config_shape( + resource_config_pb=resource_config_pb, + component=component, + ) + # check if trained model file exist when skipping training if gbml_config_pb.shared_config.should_skip_training == True: assert_trained_model_exists(gbml_config_pb=gbml_config_pb) diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index fc12d1939..290a68544 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -135,3 +135,63 @@ def check_inferencer_graph_store_compatibility( raise AssertionError( f"If one of GbmlConfig.inferencer_config.graph_store_storage_config or GiglResourceConfig.inferencer_resource_config is set, the other must also be set. GbmlConfig.inferencer_config.graph_store_storage_config is set: {gbml_has_graph_store}, GiglResourceConfig.inferencer_resource_config is set: {resource_has_graph_store}." ) + + +def check_custom_launcher_config_requires_glt_backend( + gbml_config_pb_wrapper: GbmlConfigPbWrapper, + resource_config_wrapper: GiglResourceConfigWrapper, +) -> None: + """Enforce that ``CustomLauncherConfig`` is only used with the GLT (v2) backend. + + The v1 trainer/inferencer dispatchers never consult the + ``custom_trainer_config`` / ``custom_inferencer_config`` oneof, so pairing + a ``CustomLauncherConfig`` with a task config that has + ``should_use_glt_backend=False`` would silently fall through the v1 path + and fail at runtime. Catch it up-front here so the failure is loud and + actionable at validation time. + + Note on naming: the wrapper exposes ``should_use_glt_backend`` (bool) but + the raw YAML key users set is ``feature_flags.should_run_glt_backend``. + The wrapper translates one into the other; this check always reads the + wrapper property and never the raw map. + + Args: + gbml_config_pb_wrapper: The GbmlConfig wrapper (template config). + resource_config_wrapper: The GiglResourceConfig wrapper (resource config). + + Raises: + ValueError: If either the trainer or inferencer resource config is a + ``CustomLauncherConfig`` and ``should_use_glt_backend`` is False. + """ + logger.info( + "Config validation check: CustomLauncherConfig requires GLT (v2) backend." + ) + trainer_is_custom = isinstance( + resource_config_wrapper.trainer_config, + gigl_resource_config_pb2.CustomLauncherConfig, + ) + inferencer_is_custom = isinstance( + resource_config_wrapper.inferencer_config, + gigl_resource_config_pb2.CustomLauncherConfig, + ) + if not (trainer_is_custom or inferencer_is_custom): + return + + if not gbml_config_pb_wrapper.should_use_glt_backend: + offending: list[str] = [] + if trainer_is_custom: + offending.append("trainer_resource_config.custom_trainer_config") + if inferencer_is_custom: + offending.append("inferencer_resource_config.custom_inferencer_config") + raise ValueError( + "CustomLauncherConfig is only wired into the GLT (v2) dispatchers " + "(glt_trainer.py / glt_inferencer.py); the v1 trainer/inferencer " + "never consult the custom oneof and would fall through to an " + "'Unsupported resource config' error at runtime. The following " + f"custom resource configs were set: {offending}, but the task " + "config has should_use_glt_backend=False (raw YAML key: " + "feature_flags.should_run_glt_backend). Either set " + "feature_flags.should_run_glt_backend='True' in the task config, " + "or replace the CustomLauncherConfig with a built-in resource " + "config." + ) diff --git a/gigl/src/validation_check/libs/resource_config_checks.py b/gigl/src/validation_check/libs/resource_config_checks.py index 660068bd4..90ae2ad41 100644 --- a/gigl/src/validation_check/libs/resource_config_checks.py +++ b/gigl/src/validation_check/libs/resource_config_checks.py @@ -3,6 +3,7 @@ from google.cloud.aiplatform_v1.types.accelerator_type import AcceleratorType from gigl.common.logger import Logger +from gigl.src.common.constants.components import GiGLComponents from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.types.pb_wrappers.gigl_resource_config import ( GiglResourceConfigWrapper, @@ -312,6 +313,57 @@ def _validate_machine_config( ) +def check_custom_launcher_config_shape( + resource_config_pb: gigl_resource_config_pb2.GiglResourceConfig, + component: GiGLComponents, +) -> None: + """Assert the trainer / inferencer's ``CustomLauncherConfig`` is well-shaped. + + Resolves the component's resource config through the wrapper; if it is not + a ``CustomLauncherConfig`` this helper is a no-op. Otherwise it asserts + ``command`` is non-empty so the launcher's runtime guard does not fail + on a typo in the YAML. + + Args: + resource_config_pb: The resource config to inspect. The trainer or + inferencer oneof (depending on ``component``) is pulled out of the + wrapper. + component: Which GiGL component to check. Must be Trainer or + Inferencer; other components never carry a ``CustomLauncherConfig``. + + Raises: + ValueError: If ``component`` is not Trainer or Inferencer, or if a + populated ``CustomLauncherConfig`` has an empty ``command``. + """ + if component not in {GiGLComponents.Trainer, GiGLComponents.Inferencer}: + raise ValueError( + f"check_custom_launcher_config_shape only supports " + f"Trainer and Inferencer components; got {component}." + ) + + wrapper = GiglResourceConfigWrapper(resource_config=resource_config_pb) + component_config: Union[ + gigl_resource_config_pb2.LocalResourceConfig, + gigl_resource_config_pb2.VertexAiResourceConfig, + gigl_resource_config_pb2.KFPResourceConfig, + gigl_resource_config_pb2.VertexAiGraphStoreConfig, + gigl_resource_config_pb2.DataflowResourceConfig, + gigl_resource_config_pb2.CustomLauncherConfig, + ] + if component == GiGLComponents.Trainer: + component_config = wrapper.trainer_config + else: + component_config = wrapper.inferencer_config + + if not isinstance(component_config, gigl_resource_config_pb2.CustomLauncherConfig): + return + + if not component_config.command: + raise ValueError( + f"CustomLauncherConfig.command must be set for {component.value}." + ) + + def check_if_trainer_graph_store_storage_command_valid( gbml_config_pb_wrapper: GbmlConfigPbWrapper, ) -> None: diff --git a/tests/unit/src/validation/config_validator_test.py b/tests/unit/src/validation/config_validator_test.py index 5373803f7..3de5d915c 100644 --- a/tests/unit/src/validation/config_validator_test.py +++ b/tests/unit/src/validation/config_validator_test.py @@ -352,5 +352,61 @@ def test_resource_config_validation_failure_with_mock_configs( ) +class TestCustomLauncherConfigInValidationChain(TestCase): + """Confirm the new CustomLauncherConfig gates are wired into the chain. + + The shape check (``check_custom_launcher_config_shape``) runs + unconditionally inside ``kfp_validation_checks``. A resource config + with a ``CustomLauncherConfig`` trainer that has an empty ``command`` + must surface a ``ValueError`` from the chain, not a launcher-side + runtime failure. + """ + + def setUp(self): + gigl.env.pipelines_config._resource_config = None + self._temp_dir = tempfile.mkdtemp() + self._proto_utils = ProtoUtils() + + def tearDown(self): + shutil.rmtree(self._temp_dir, ignore_errors=True) + gigl.env.pipelines_config._resource_config = None + + def _write_proto_to_file( + self, proto: google.protobuf.message.Message, filename: str + ) -> Uri: + filepath = os.path.join(self._temp_dir, filename) + uri = UriFactory.create_uri(filepath) + self._proto_utils.write_proto_to_yaml(proto, uri) + return uri + + def test_empty_custom_trainer_command_raises_via_chain(self) -> None: + # Build a live-SGS task config (so the chain runs through + # CustomLauncherConfig-aware paths). + task_config_uri = self._write_proto_to_file( + _create_valid_live_subgraph_sampling_task_config(), + "live_task_config.yaml", + ) + + # Resource config with a CustomLauncherConfig trainer whose + # command is empty — the new shape check must catch it. + resource_config = _create_valid_live_subgraph_sampling_resource_config() + resource_config.trainer_resource_config.ClearField("trainer_config") + resource_config.trainer_resource_config.custom_trainer_config.command = "" + resource_config_uri = self._write_proto_to_file( + resource_config, "live_custom_empty_resource_config.yaml" + ) + + with self.assertRaises(ValueError) as ctx: + kfp_validation_checks( + job_name="custom_launcher_config_chain_test", + task_config_uri=task_config_uri, + start_at="config_populator", + resource_config_uri=resource_config_uri, + ) + # Error message must come from check_custom_launcher_config_shape, + # not from a generic shape check elsewhere. + self.assertIn("CustomLauncherConfig.command", str(ctx.exception)) + + if __name__ == "__main__": absltest.main() diff --git a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index c70450501..5f4c5704a 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -5,12 +5,18 @@ GiglResourceConfigWrapper, ) from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import ( + check_custom_launcher_config_requires_glt_backend, check_inferencer_graph_store_compatibility, check_trainer_graph_store_compatibility, ) from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2 from tests.test_assets.test_case import TestCase +# Placeholder shell snippet used by CustomLauncherConfig fixtures in this +# module — these tests only exercise type-of-config dispatch, not actual +# subprocess execution. +_FAKE_COMMAND = "echo fake" + # Helper functions for creating VertexAiGraphStoreConfig @@ -214,5 +220,107 @@ def test_resource_has_inferencer_graph_store_template_does_not(self): ) +# Helper functions for custom + glt-backend compatibility tests + + +def _create_gbml_config_with_glt_flag(value: str) -> GbmlConfigPbWrapper: + """Create a GbmlConfig whose feature_flags.should_run_glt_backend is set. + + Note the raw YAML key is ``should_run_glt_backend`` (not + ``should_use_glt_backend``). The wrapper's ``should_use_glt_backend`` + property reads this key from the ``feature_flags`` map and converts it to + a bool. + """ + gbml_config = gbml_config_pb2.GbmlConfig() + gbml_config.feature_flags["should_run_glt_backend"] = value + return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) + + +def _create_resource_config_with_custom_trainer() -> GiglResourceConfigWrapper: + """Create a GiglResourceConfig whose trainer is a CustomLauncherConfig.""" + config = gigl_resource_config_pb2.GiglResourceConfig() + _create_shared_resource_config(config) + config.trainer_resource_config.custom_trainer_config.command = _FAKE_COMMAND + # Inferencer uses a built-in config so only the trainer path is custom. + config.inferencer_resource_config.vertex_ai_inferencer_config.CopyFrom( + _create_vertex_ai_resource_config() + ) + return GiglResourceConfigWrapper(resource_config=config) + + +def _create_resource_config_with_custom_inferencer() -> GiglResourceConfigWrapper: + """Create a GiglResourceConfig whose inferencer is a CustomLauncherConfig.""" + config = gigl_resource_config_pb2.GiglResourceConfig() + _create_shared_resource_config(config) + config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom( + _create_vertex_ai_resource_config() + ) + config.inferencer_resource_config.custom_inferencer_config.command = _FAKE_COMMAND + return GiglResourceConfigWrapper(resource_config=config) + + +class TestCustomLauncherConfigRequiresGltBackend(TestCase): + """Test suite for the CustomLauncherConfig + GLT-backend compatibility guard. + + Because v1 trainer/inferencer dispatchers don't consult the custom oneof, + pairing a ``CustomLauncherConfig`` with + ``feature_flags.should_run_glt_backend = "False"`` must be caught at + validation time rather than at runtime. + """ + + def test_custom_trainer_without_glt_raises(self): + """CustomLauncherConfig trainer + glt=False raises a clear ValueError.""" + gbml_config = _create_gbml_config_with_glt_flag("False") + resource_config = _create_resource_config_with_custom_trainer() + with self.assertRaises(ValueError) as ctx: + check_custom_launcher_config_requires_glt_backend( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("should_run_glt_backend", str(ctx.exception)) + self.assertIn("custom_trainer_config", str(ctx.exception)) + + def test_custom_inferencer_without_glt_raises(self): + """CustomLauncherConfig inferencer + glt=False raises a clear ValueError.""" + gbml_config = _create_gbml_config_with_glt_flag("False") + resource_config = _create_resource_config_with_custom_inferencer() + with self.assertRaises(ValueError) as ctx: + check_custom_launcher_config_requires_glt_backend( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("custom_inferencer_config", str(ctx.exception)) + + def test_custom_trainer_with_glt_passes(self): + """CustomLauncherConfig trainer + glt=True passes validation.""" + gbml_config = _create_gbml_config_with_glt_flag("True") + resource_config = _create_resource_config_with_custom_trainer() + # Should not raise any exception + check_custom_launcher_config_requires_glt_backend( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_custom_inferencer_with_glt_passes(self): + """CustomLauncherConfig inferencer + glt=True passes validation.""" + gbml_config = _create_gbml_config_with_glt_flag("True") + resource_config = _create_resource_config_with_custom_inferencer() + # Should not raise any exception + check_custom_launcher_config_requires_glt_backend( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_no_custom_config_passes_without_glt(self): + """No CustomLauncherConfig at all passes regardless of glt flag.""" + gbml_config = _create_gbml_config_with_glt_flag("False") + resource_config = _create_resource_config_without_graph_stores() + # Should not raise any exception: no custom oneof means nothing to enforce. + check_custom_launcher_config_requires_glt_backend( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + if __name__ == "__main__": absltest.main() diff --git a/tests/unit/src/validation/lib/resource_config_checks_test.py b/tests/unit/src/validation/lib/resource_config_checks_test.py index 7f00b7102..a8bb7c534 100644 --- a/tests/unit/src/validation/lib/resource_config_checks_test.py +++ b/tests/unit/src/validation/lib/resource_config_checks_test.py @@ -1,11 +1,15 @@ +from unittest.mock import patch + from absl.testing import absltest +from gigl.src.common.constants.components import GiGLComponents from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.validation_check.libs.resource_config_checks import ( _check_if_dataflow_resource_config_valid, _check_if_spark_resource_config_valid, _validate_accelerator_type, _validate_machine_config, + check_custom_launcher_config_shape, check_if_inferencer_graph_store_storage_command_valid, check_if_inferencer_resource_config_valid, check_if_preprocessor_resource_config_valid, @@ -18,6 +22,9 @@ from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2 from tests.test_assets.test_case import TestCase +# Placeholder shell snippet used by CustomLauncherConfig fixtures. +_FAKE_COMMAND = "echo fake" + # Helper functions for creating valid configurations @@ -204,6 +211,28 @@ def _create_valid_local_inferencer_config() -> ( return config +def _create_valid_custom_trainer_config( + command: str = _FAKE_COMMAND, + args: list[str] | None = None, +) -> gigl_resource_config_pb2.GiglResourceConfig: + """Create a GiglResourceConfig with a CustomLauncherConfig trainer.""" + config = gigl_resource_config_pb2.GiglResourceConfig() + config.trainer_resource_config.custom_trainer_config.command = command + config.trainer_resource_config.custom_trainer_config.args.extend(args or []) + return config + + +def _create_valid_custom_inferencer_config( + command: str = _FAKE_COMMAND, + args: list[str] | None = None, +) -> gigl_resource_config_pb2.GiglResourceConfig: + """Create a GiglResourceConfig with a CustomLauncherConfig inferencer.""" + config = gigl_resource_config_pb2.GiglResourceConfig() + config.inferencer_resource_config.custom_inferencer_config.command = command + config.inferencer_resource_config.custom_inferencer_config.args.extend(args or []) + return config + + def _create_valid_vertex_ai_config() -> gigl_resource_config_pb2.VertexAiResourceConfig: """Create a valid Vertex AI resource configuration.""" config = gigl_resource_config_pb2.VertexAiResourceConfig() @@ -791,6 +820,106 @@ def test_no_graph_store_config(self): check_if_inferencer_graph_store_storage_command_valid(gbml_config) +class TestCustomLauncherConfigBypass(TestCase): + """Test suite for CustomLauncherConfig caller-level bypass. + + ``CustomLauncherConfig`` is launcher-pluggable: it has no concrete machine + shape to validate. The callers (``check_if_trainer_resource_config_valid`` + and ``check_if_inferencer_resource_config_valid``) short-circuit before + reaching ``_validate_machine_config``, which keeps that helper's contract + ("validate a concrete machine spec") intact. + """ + + def test_trainer_custom_config_bypasses_machine_validation(self): + """CustomLauncherConfig trainer bypasses _validate_machine_config entirely.""" + config = _create_valid_custom_trainer_config(args=["--cluster_size=4"]) + with patch( + "gigl.src.validation_check.libs.resource_config_checks._validate_machine_config" + ) as mock_validate: + check_if_trainer_resource_config_valid(resource_config_pb=config) + mock_validate.assert_not_called() + + def test_inferencer_custom_config_bypasses_machine_validation(self): + """CustomLauncherConfig inferencer bypasses _validate_machine_config entirely.""" + config = _create_valid_custom_inferencer_config(args=["--cluster_size=4"]) + with patch( + "gigl.src.validation_check.libs.resource_config_checks._validate_machine_config" + ) as mock_validate: + check_if_inferencer_resource_config_valid(resource_config_pb=config) + mock_validate.assert_not_called() + + def test_vertex_ai_trainer_still_calls_machine_validation(self): + """Sanity: non-custom trainer still dispatches to _validate_machine_config.""" + config = _create_valid_vertex_ai_trainer_config() + with patch( + "gigl.src.validation_check.libs.resource_config_checks._validate_machine_config" + ) as mock_validate: + check_if_trainer_resource_config_valid(resource_config_pb=config) + mock_validate.assert_called_once() + + def test_vertex_ai_inferencer_still_calls_machine_validation(self): + """Sanity: non-custom inferencer still dispatches to _validate_machine_config.""" + config = _create_valid_vertex_ai_inferencer_config() + with patch( + "gigl.src.validation_check.libs.resource_config_checks._validate_machine_config" + ) as mock_validate: + check_if_inferencer_resource_config_valid(resource_config_pb=config) + mock_validate.assert_called_once() + + +class TestCustomLauncherConfigShape(TestCase): + """Test suite for ``check_custom_launcher_config_shape``. + + Shape-only check: a populated ``CustomLauncherConfig`` must have a + non-empty ``command``. Non-custom configs are no-ops. + """ + + def test_populated_command_passes(self): + config = _create_valid_custom_trainer_config(command="python -m my.cli") + check_custom_launcher_config_shape( + resource_config_pb=config, + component=GiGLComponents.Trainer, + ) + + def test_empty_trainer_command_raises(self): + config = _create_valid_custom_trainer_config(command="") + with self.assertRaises(ValueError): + check_custom_launcher_config_shape( + resource_config_pb=config, + component=GiGLComponents.Trainer, + ) + + def test_empty_inferencer_command_raises(self): + config = _create_valid_custom_inferencer_config(command="") + with self.assertRaises(ValueError): + check_custom_launcher_config_shape( + resource_config_pb=config, + component=GiGLComponents.Inferencer, + ) + + def test_non_custom_trainer_is_no_op(self): + config = _create_valid_vertex_ai_trainer_config() + check_custom_launcher_config_shape( + resource_config_pb=config, + component=GiGLComponents.Trainer, + ) + + def test_non_custom_inferencer_is_no_op(self): + config = _create_valid_vertex_ai_inferencer_config() + check_custom_launcher_config_shape( + resource_config_pb=config, + component=GiGLComponents.Inferencer, + ) + + def test_unsupported_component_raises(self): + config = _create_valid_custom_trainer_config() + with self.assertRaises(ValueError): + check_custom_launcher_config_shape( + resource_config_pb=config, + component=GiGLComponents.DataPreprocessor, + ) + + class TestReservationAffinityValidation(TestCase): """Validate VertexAiResourceConfig.reservation_affinity handling."""