17 changes: 17 additions & 0 deletions gigl/src/validation_check/config_validator.py
@@ -16,13 +16,15 @@
assert_trained_model_exists,
)
from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import (
check_custom_launcher_config_requires_glt_backend,
check_inferencer_graph_store_compatibility,
check_trainer_graph_store_compatibility,
)
from gigl.src.validation_check.libs.name_checks import (
check_if_kfp_pipeline_job_name_valid,
)
from gigl.src.validation_check.libs.resource_config_checks import (
check_custom_launcher_config_shape,
check_if_inferencer_resource_config_valid,
check_if_preprocessor_resource_config_valid,
check_if_shared_resource_config_valid,
@@ -202,25 +204,31 @@
GiGLComponents.ConfigPopulator.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.DataPreprocessor.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.SubgraphSampler.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.SplitGenerator.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.Trainer.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
GiGLComponents.Inferencer.value: [
check_inferencer_graph_store_compatibility,
check_custom_launcher_config_requires_glt_backend,
],
# PostProcessor doesn't need graph store compatibility checks
}
@@ -347,6 +355,15 @@ def kfp_validation_checks(
resource_config_wrapper=resource_config_wrapper,
)

# Validate that any populated CustomLauncherConfig has a non-empty command.
# This runs unconditionally: the check is shape-only and does not call out
# to any external service.
for component in (GiGLComponents.Trainer, GiGLComponents.Inferencer):
check_custom_launcher_config_shape(
resource_config_pb=resource_config_pb,
component=component,
)

# check if trained model file exists when skipping training
if gbml_config_pb.shared_config.should_skip_training == True:
assert_trained_model_exists(gbml_config_pb=gbml_config_pb)
gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py
@@ -135,3 +135,63 @@ def check_inferencer_graph_store_compatibility(
raise AssertionError(
f"If one of GbmlConfig.inferencer_config.graph_store_storage_config or GiglResourceConfig.inferencer_resource_config is set, the other must also be set. GbmlConfig.inferencer_config.graph_store_storage_config is set: {gbml_has_graph_store}, GiglResourceConfig.inferencer_resource_config is set: {resource_has_graph_store}."
)


def check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper: GbmlConfigPbWrapper,
resource_config_wrapper: GiglResourceConfigWrapper,
) -> None:
"""Enforce that ``CustomLauncherConfig`` is only used with the GLT (v2) backend.

The v1 trainer/inferencer dispatchers never consult the
``custom_trainer_config`` / ``custom_inferencer_config`` oneof, so pairing
a ``CustomLauncherConfig`` with a task config that has
``should_use_glt_backend=False`` would silently fall through the v1 path
and fail at runtime. Catch it up-front here so the failure is loud and
actionable at validation time.

Note on naming: the wrapper exposes ``should_use_glt_backend`` (bool) but
the raw YAML key users set is ``feature_flags.should_run_glt_backend``.
The wrapper translates one into the other; this check always reads the
wrapper property and never the raw map.

Args:
gbml_config_pb_wrapper: The GbmlConfig wrapper (template config).
resource_config_wrapper: The GiglResourceConfig wrapper (resource config).

Raises:
ValueError: If either the trainer or inferencer resource config is a
``CustomLauncherConfig`` and ``should_use_glt_backend`` is False.
"""
logger.info(
"Config validation check: CustomLauncherConfig requires GLT (v2) backend."
)
trainer_is_custom = isinstance(
resource_config_wrapper.trainer_config,
gigl_resource_config_pb2.CustomLauncherConfig,
)
inferencer_is_custom = isinstance(
resource_config_wrapper.inferencer_config,
gigl_resource_config_pb2.CustomLauncherConfig,
)
if not (trainer_is_custom or inferencer_is_custom):
return

if not gbml_config_pb_wrapper.should_use_glt_backend:
offending: list[str] = []
if trainer_is_custom:
offending.append("trainer_resource_config.custom_trainer_config")
if inferencer_is_custom:
offending.append("inferencer_resource_config.custom_inferencer_config")
raise ValueError(
"CustomLauncherConfig is only wired into the GLT (v2) dispatchers "
"(glt_trainer.py / glt_inferencer.py); the v1 trainer/inferencer "
"never consult the custom oneof and would fall through to an "
"'Unsupported resource config' error at runtime. The following "
f"custom resource configs were set: {offending}, but the task "
"config has should_use_glt_backend=False (raw YAML key: "
"feature_flags.should_run_glt_backend). Either set "
"feature_flags.should_run_glt_backend='True' in the task config, "
"or replace the CustomLauncherConfig with a built-in resource "
"config."
)
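
Side note for reviewers on the "Note on naming" caveat in the docstring above: a minimal sketch of the flag translation it describes, assuming the wrapper property simply reads the feature_flags map and parses the string value. This is illustrative only; the actual implementation lives in GbmlConfigPbWrapper.should_use_glt_backend and may differ in detail.

# Hypothetical sketch only; not part of this change. The field access below
# mirrors the tests in this PR (gbml_config.feature_flags["should_run_glt_backend"]).
from snapchat.research.gbml import gbml_config_pb2

def _sketch_should_use_glt_backend(pb: gbml_config_pb2.GbmlConfig) -> bool:
    # Map values arrive as strings from the YAML ("True"/"False"), so parse explicitly.
    raw = pb.feature_flags.get("should_run_glt_backend", "False")
    return raw.strip().lower() == "true"

pb = gbml_config_pb2.GbmlConfig()
pb.feature_flags["should_run_glt_backend"] = "True"
assert _sketch_should_use_glt_backend(pb)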
52 changes: 52 additions & 0 deletions gigl/src/validation_check/libs/resource_config_checks.py
@@ -3,6 +3,7 @@
from google.cloud.aiplatform_v1.types.accelerator_type import AcceleratorType

from gigl.common.logger import Logger
from gigl.src.common.constants.components import GiGLComponents
from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper
from gigl.src.common.types.pb_wrappers.gigl_resource_config import (
GiglResourceConfigWrapper,
@@ -312,6 +313,57 @@ def _validate_machine_config(
)


def check_custom_launcher_config_shape(
resource_config_pb: gigl_resource_config_pb2.GiglResourceConfig,
component: GiGLComponents,
) -> None:
"""Assert the trainer / inferencer's ``CustomLauncherConfig`` is well-shaped.

Resolves the component's resource config through the wrapper; if it is not
a ``CustomLauncherConfig``, this helper is a no-op. Otherwise it asserts that
``command`` is non-empty, so a typo in the YAML fails loudly here at
validation time instead of tripping the launcher's runtime guard.

Args:
resource_config_pb: The resource config to inspect. The trainer or
inferencer oneof (depending on ``component``) is pulled out of the
wrapper.
component: Which GiGL component to check. Must be Trainer or
Inferencer; other components never carry a ``CustomLauncherConfig``.

Raises:
ValueError: If ``component`` is not Trainer or Inferencer, or if a
populated ``CustomLauncherConfig`` has an empty ``command``.
"""
if component not in {GiGLComponents.Trainer, GiGLComponents.Inferencer}:
raise ValueError(
f"check_custom_launcher_config_shape only supports "
f"Trainer and Inferencer components; got {component}."
)

wrapper = GiglResourceConfigWrapper(resource_config=resource_config_pb)
component_config: Union[
gigl_resource_config_pb2.LocalResourceConfig,
gigl_resource_config_pb2.VertexAiResourceConfig,
gigl_resource_config_pb2.KFPResourceConfig,
gigl_resource_config_pb2.VertexAiGraphStoreConfig,
gigl_resource_config_pb2.DataflowResourceConfig,
gigl_resource_config_pb2.CustomLauncherConfig,
]
if component == GiGLComponents.Trainer:
component_config = wrapper.trainer_config
else:
component_config = wrapper.inferencer_config

if not isinstance(component_config, gigl_resource_config_pb2.CustomLauncherConfig):
return

if not component_config.command:
raise ValueError(
f"CustomLauncherConfig.command must be set for {component.value}."
)


def check_if_trainer_graph_store_storage_command_valid(
gbml_config_pb_wrapper: GbmlConfigPbWrapper,
) -> None:
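For context before the test file below, a minimal sketch (illustrative only, not part of this change) of the shape that check_custom_launcher_config_shape enforces: a CustomLauncherConfig selected in the trainer or inferencer oneof must carry a non-empty command. The launcher invocation shown is a hypothetical placeholder, not a recommended command.

# Illustrative only. Field paths mirror those used elsewhere in this diff.
from snapchat.research.gbml import gigl_resource_config_pb2

resource_config = gigl_resource_config_pb2.GiglResourceConfig()

# Misconfigured: selecting the custom oneof but leaving command empty now
# fails kfp_validation_checks with "CustomLauncherConfig.command must be set ...".
resource_config.trainer_resource_config.custom_trainer_config.command = ""

# Well-formed: any non-empty command satisfies the shape check.
# (Placeholder invocation with a hypothetical module name.)
resource_config.trainer_resource_config.custom_trainer_config.command = (
    "python -m my_org.launch_training"
)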
56 changes: 56 additions & 0 deletions tests/unit/src/validation/config_validator_test.py
@@ -352,5 +352,61 @@ def test_resource_config_validation_failure_with_mock_configs(
)


class TestCustomLauncherConfigInValidationChain(TestCase):
"""Confirm the new CustomLauncherConfig gates are wired into the chain.

The shape check (``check_custom_launcher_config_shape``) runs
unconditionally inside ``kfp_validation_checks``. A resource config
with a ``CustomLauncherConfig`` trainer that has an empty ``command``
must surface a ``ValueError`` from the chain, not a launcher-side
runtime failure.
"""

def setUp(self):
gigl.env.pipelines_config._resource_config = None
self._temp_dir = tempfile.mkdtemp()
self._proto_utils = ProtoUtils()

def tearDown(self):
shutil.rmtree(self._temp_dir, ignore_errors=True)
gigl.env.pipelines_config._resource_config = None

def _write_proto_to_file(
self, proto: google.protobuf.message.Message, filename: str
) -> Uri:
filepath = os.path.join(self._temp_dir, filename)
uri = UriFactory.create_uri(filepath)
self._proto_utils.write_proto_to_yaml(proto, uri)
return uri

def test_empty_custom_trainer_command_raises_via_chain(self) -> None:
# Build a live-SGS task config (so the chain runs through
# CustomLauncherConfig-aware paths).
task_config_uri = self._write_proto_to_file(
_create_valid_live_subgraph_sampling_task_config(),
"live_task_config.yaml",
)

# Resource config with a CustomLauncherConfig trainer whose
# command is empty — the new shape check must catch it.
resource_config = _create_valid_live_subgraph_sampling_resource_config()
resource_config.trainer_resource_config.ClearField("trainer_config")
resource_config.trainer_resource_config.custom_trainer_config.command = ""
resource_config_uri = self._write_proto_to_file(
resource_config, "live_custom_empty_resource_config.yaml"
)

with self.assertRaises(ValueError) as ctx:
kfp_validation_checks(
job_name="custom_launcher_config_chain_test",
task_config_uri=task_config_uri,
start_at="config_populator",
resource_config_uri=resource_config_uri,
)
# Error message must come from check_custom_launcher_config_shape,
# not from a generic shape check elsewhere.
self.assertIn("CustomLauncherConfig.command", str(ctx.exception))


if __name__ == "__main__":
absltest.main()
@@ -5,12 +5,18 @@
GiglResourceConfigWrapper,
)
from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import (
check_custom_launcher_config_requires_glt_backend,
check_inferencer_graph_store_compatibility,
check_trainer_graph_store_compatibility,
)
from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2
from tests.test_assets.test_case import TestCase

# Placeholder shell snippet used by CustomLauncherConfig fixtures in this
# module — these tests only exercise type-of-config dispatch, not actual
# subprocess execution.
_FAKE_COMMAND = "echo fake"

# Helper functions for creating VertexAiGraphStoreConfig


@@ -214,5 +220,107 @@ def test_resource_has_inferencer_graph_store_template_does_not(self):
)


# Helper functions for custom + glt-backend compatibility tests


def _create_gbml_config_with_glt_flag(value: str) -> GbmlConfigPbWrapper:
"""Create a GbmlConfig whose feature_flags.should_run_glt_backend is set.

Note the raw YAML key is ``should_run_glt_backend`` (not
``should_use_glt_backend``). The wrapper's ``should_use_glt_backend``
property reads this key from the ``feature_flags`` map and converts it to
a bool.
"""
gbml_config = gbml_config_pb2.GbmlConfig()
gbml_config.feature_flags["should_run_glt_backend"] = value
return GbmlConfigPbWrapper(gbml_config_pb=gbml_config)


def _create_resource_config_with_custom_trainer() -> GiglResourceConfigWrapper:
"""Create a GiglResourceConfig whose trainer is a CustomLauncherConfig."""
config = gigl_resource_config_pb2.GiglResourceConfig()
_create_shared_resource_config(config)
config.trainer_resource_config.custom_trainer_config.command = _FAKE_COMMAND
# Inferencer uses a built-in config so only the trainer path is custom.
config.inferencer_resource_config.vertex_ai_inferencer_config.CopyFrom(
_create_vertex_ai_resource_config()
)
return GiglResourceConfigWrapper(resource_config=config)


def _create_resource_config_with_custom_inferencer() -> GiglResourceConfigWrapper:
"""Create a GiglResourceConfig whose inferencer is a CustomLauncherConfig."""
config = gigl_resource_config_pb2.GiglResourceConfig()
_create_shared_resource_config(config)
config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom(
_create_vertex_ai_resource_config()
)
config.inferencer_resource_config.custom_inferencer_config.command = _FAKE_COMMAND
return GiglResourceConfigWrapper(resource_config=config)


class TestCustomLauncherConfigRequiresGltBackend(TestCase):
"""Test suite for the CustomLauncherConfig + GLT-backend compatibility guard.

Because v1 trainer/inferencer dispatchers don't consult the custom oneof,
pairing a ``CustomLauncherConfig`` with
``feature_flags.should_run_glt_backend = "False"`` must be caught at
validation time rather than at runtime.
"""

def test_custom_trainer_without_glt_raises(self):
"""CustomLauncherConfig trainer + glt=False raises a clear ValueError."""
gbml_config = _create_gbml_config_with_glt_flag("False")
resource_config = _create_resource_config_with_custom_trainer()
with self.assertRaises(ValueError) as ctx:
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)
self.assertIn("should_run_glt_backend", str(ctx.exception))
self.assertIn("custom_trainer_config", str(ctx.exception))

def test_custom_inferencer_without_glt_raises(self):
"""CustomLauncherConfig inferencer + glt=False raises a clear ValueError."""
gbml_config = _create_gbml_config_with_glt_flag("False")
resource_config = _create_resource_config_with_custom_inferencer()
with self.assertRaises(ValueError) as ctx:
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)
self.assertIn("custom_inferencer_config", str(ctx.exception))

def test_custom_trainer_with_glt_passes(self):
"""CustomLauncherConfig trainer + glt=True passes validation."""
gbml_config = _create_gbml_config_with_glt_flag("True")
resource_config = _create_resource_config_with_custom_trainer()
# Should not raise any exception
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)

def test_custom_inferencer_with_glt_passes(self):
"""CustomLauncherConfig inferencer + glt=True passes validation."""
gbml_config = _create_gbml_config_with_glt_flag("True")
resource_config = _create_resource_config_with_custom_inferencer()
# Should not raise any exception
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)

def test_no_custom_config_passes_without_glt(self):
"""No CustomLauncherConfig at all passes regardless of glt flag."""
gbml_config = _create_gbml_config_with_glt_flag("False")
resource_config = _create_resource_config_without_graph_stores()
# Should not raise any exception: no custom oneof means nothing to enforce.
check_custom_launcher_config_requires_glt_backend(
gbml_config_pb_wrapper=gbml_config,
resource_config_wrapper=resource_config,
)


if __name__ == "__main__":
absltest.main()