Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions gigl/src/validation_check/config_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import (
check_inferencer_graph_store_compatibility,
check_trainer_graph_store_compatibility,
check_vertex_ai_trainer_tensorboard_compatibility,
)
from gigl.src.validation_check.libs.name_checks import (
check_if_kfp_pipeline_job_name_valid,
Expand Down Expand Up @@ -202,22 +203,27 @@
GiGLComponents.ConfigPopulator.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_vertex_ai_trainer_tensorboard_compatibility,
],
GiGLComponents.DataPreprocessor.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_vertex_ai_trainer_tensorboard_compatibility,
],
GiGLComponents.SubgraphSampler.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_vertex_ai_trainer_tensorboard_compatibility,
],
GiGLComponents.SplitGenerator.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_vertex_ai_trainer_tensorboard_compatibility,
],
GiGLComponents.Trainer.value: [
check_trainer_graph_store_compatibility,
check_inferencer_graph_store_compatibility,
check_vertex_ai_trainer_tensorboard_compatibility,
],
GiGLComponents.Inferencer.value: [
check_inferencer_graph_store_compatibility,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
If graph store mode is set up for trainer or inferencer in one config, it must be set up in the other.
"""

from typing import Literal
import re
from typing import Final, Literal

from google.protobuf.message import Message

Expand All @@ -18,6 +19,12 @@

logger = Logger()

# Vertex AI Experiment IDs are MetadataStore Context IDs and must satisfy
# this regex: 1-128 characters of lowercase letters, digits, and hyphens,
# not starting with a hyphen.
# The end anchor is ``\Z`` rather than ``$`` because ``$`` also matches just
# before a trailing newline, which would let a value like "abc\n" pass when
# the pattern is used with ``.match``.
_VERTEX_RESOURCE_ID_PATTERN: Final[re.Pattern[str]] = re.compile(
    r"^[a-z0-9][a-z0-9-]{0,127}\Z"
)


def _gbml_config_has_graph_store(
gbml_config_pb_wrapper: GbmlConfigPbWrapper,
Expand Down Expand Up @@ -102,6 +109,76 @@ def check_trainer_graph_store_compatibility(
)


def check_vertex_ai_trainer_tensorboard_compatibility(
    gbml_config_pb_wrapper: GbmlConfigPbWrapper,
    resource_config_wrapper: GiglResourceConfigWrapper,
) -> None:
    """Check that Vertex AI trainer TensorBoard config is complete.

    ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` must be
    supplied together (or both unset). The trainer's chief-rank uploader needs
    both to call ``aiplatform.start_upload_tb_log``; setting only one
    produces no observable behavior.

    The check is a no-op when the trainer resource config is neither a
    ``VertexAiResourceConfig`` nor a ``VertexAiGraphStoreConfig``.

    Args:
        gbml_config_pb_wrapper: The GbmlConfig wrapper.
        resource_config_wrapper: The GiglResourceConfig wrapper.

    Raises:
        AssertionError: If exactly one of ``tensorboard_resource_name`` /
            ``tensorboard_experiment_name`` is set, or if
            ``tensorboard_experiment_name`` doesn't satisfy the Vertex AI
            resource-ID format, or if ``should_log_to_tensorboard`` is set
            without both TB fields.
    """
    logger.info(
        "Config validation check: Vertex AI trainer TensorBoard compatibility between template and resource configs."
    )

    trainer_resource_config = resource_config_wrapper.trainer_config
    if isinstance(
        trainer_resource_config, gigl_resource_config_pb2.VertexAiResourceConfig
    ):
        vertex_ai_config = trainer_resource_config
    elif isinstance(
        trainer_resource_config, gigl_resource_config_pb2.VertexAiGraphStoreConfig
    ):
        # Graph-store mode reads TB metaparams from the compute pool, the
        # same way it reads other Vertex AI resource fields.
        vertex_ai_config = trainer_resource_config.compute_pool
    else:
        # Not a Vertex AI trainer: there are no TensorBoard fields to check.
        return

    has_resource_name = bool(vertex_ai_config.tensorboard_resource_name)
    has_experiment_name = bool(vertex_ai_config.tensorboard_experiment_name)
    if has_resource_name != has_experiment_name:
        raise AssertionError(
            "VertexAiResourceConfig.tensorboard_resource_name and "
            "tensorboard_experiment_name must be set together. "
            f"tensorboard_resource_name set: {has_resource_name}, "
            f"tensorboard_experiment_name set: {has_experiment_name}."
        )

    # ``fullmatch`` (not ``match``) so that a trailing newline, which ``$``
    # tolerates, cannot slip through the format check.
    if has_experiment_name and not _VERTEX_RESOURCE_ID_PATTERN.fullmatch(
        vertex_ai_config.tensorboard_experiment_name
    ):
        raise AssertionError(
            "VertexAiResourceConfig.tensorboard_experiment_name "
            f"({vertex_ai_config.tensorboard_experiment_name!r}) is not a "
            f"valid Vertex AI Experiment ID; it must match "
            f"{_VERTEX_RESOURCE_ID_PATTERN.pattern}."
        )

    if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard:
        return

    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would silently disable this check.
    # Past the pairing check above, has_resource_name implies both fields are set.
    if not has_resource_name:
        raise AssertionError(
            "GbmlConfig.trainer_config.should_log_to_tensorboard is true, so a "
            "Vertex AI TensorBoard resource name and experiment name must be "
            "set in the trainer resource config."
        )


def check_inferencer_graph_store_compatibility(
gbml_config_pb_wrapper: GbmlConfigPbWrapper,
resource_config_wrapper: GiglResourceConfigWrapper,
Expand Down
16 changes: 16 additions & 0 deletions proto/snapchat/research/gbml/gigl_resource_config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,22 @@ message VertexAiResourceConfig {
// Compute Engine reservation affinity for the job.
// See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations
VertexAiReservationAffinity reservation_affinity = 9;

// Existing Vertex AI TensorBoard resource the job's chief rank streams
// TensorBoard events to.
// Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id}
// See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview
// for the Tensorboard data model.
// Must be set together with tensorboard_experiment_name (or both unset).
string tensorboard_resource_name = 10;

// Optional. Stable Vertex AI TensorboardExperiment name. Multiple jobs
// that share this value land in the same TensorboardExperiment, so they
// appear as comparable runs on one TensorBoard page. Must be 1-128
// characters of lowercase letters, digits, and hyphens, and must not start
// with a hyphen (Vertex AI Experiment ID rules).
// Must be set together with tensorboard_resource_name (or both unset).
// See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview.
string tensorboard_experiment_name = 11;
}

// Configuration for KFP job resources
Expand Down
5 changes: 4 additions & 1 deletion proto/snapchat/research/gbml/trained_model_metadata.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ message TrainedModelMetadata{
string scripted_model_uri = 2;
// The path where evaluation metrics are stored
string eval_metrics_uri = 3;
// Path where tensorboard logs will be stored
// Path where tensorboard logs will be stored. Vertex AI maps this URI to
// ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via
// ``CustomJobSpec.baseOutputDirectory``. See
// https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec.
string tensorboard_logs_uri = 4;
}
Loading