diff --git a/Makefile b/Makefile index dc3accd94..e76dbc37c 100644 --- a/Makefile +++ b/Makefile @@ -260,7 +260,7 @@ run_all_e2e_tests: # Example: # `make compiled_pipeline_path="/tmp/gigl/my_pipeline.yaml" compile_gigl_kubeflow_pipeline` # Can be a GCS URI as well -compile_gigl_kubeflow_pipeline: compile_jars push_new_docker_images +compile_gigl_kubeflow_pipeline: push_new_docker_images uv run python -m gigl.orchestration.kubeflow.runner \ --action=compile \ --container_image_cuda=${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG} \ diff --git a/docs/plans/20260506-drop-submit-tensorboard.md b/docs/plans/20260506-drop-submit-tensorboard.md new file mode 100644 index 000000000..c01605f95 --- /dev/null +++ b/docs/plans/20260506-drop-submit-tensorboard.md @@ -0,0 +1,178 @@ +# Drop `submit(tensorboard=...)`, single-uploader TB + +Date: 2026-05-06 +Predecessor PR: https://github.com/Snapchat/GiGL/pull/603 + +## Goal + +Eliminate Vertex AI's auto-uploader. Keep only the chief-rank +`aiplatform.start_upload_tb_log` uploader for both live streaming and multi-run +comparison. Single uploader, single experiment, less plumbing. + +## Why + +PR #603 ships a dual-uploader design: Vertex's auto-uploader (gated on +`submit(tensorboard=...)`) plus a chief-rank `start_upload_tb_log` uploader. +That's because the SDK forces `submit(tensorboard=)` and `submit(experiment=)` +to be mutually exclusive, so getting both R1 (per-job UI link) and R2 (multi-run +comparison) required two parallel uploaders streaming from the same log dir. + +We want to keep streaming and multi-run comparison, but we don't actually need +R1 (the "Open TensorBoard" button on the Vertex job page) — we can replace it +with a logged URL in trainer stdout. Dropping `submit(tensorboard=...)` removes +the dual-uploader oddity and most of the supporting plumbing in +`vertex_ai.py`. 

## Step 0 — Constraint check (resolved via docs)

**Question:** does Vertex AI populate `AIP_TENSORBOARD_LOG_DIR` inside the
worker container when `baseOutputDirectory` is set on `CustomJobSpec` but
`submit(tensorboard=)` is NOT?

**Answer: yes.** Vertex's training-code-requirements doc
(https://cloud.google.com/vertex-ai/docs/training/code-requirements) is
unambiguous: when `baseOutputDirectory` is configured, Vertex AI sets
`AIP_MODEL_DIR`, `AIP_CHECKPOINT_DIR`, and `AIP_TENSORBOARD_LOG_DIR` env vars
unconditionally. The `tensorboard` field on `CustomJobSpec` is not a
prerequisite. **No fallback step is therefore required**; the fallback step
originally planned here has been dropped and the remaining steps renumbered.
(The "Step 4" that appears below is an unrelated, still-required step.)

(If smoke testing later reveals a discrepancy, the dropped fallback step can
be re-introduced.)

## Step 1 — Tighten validation: both fields or neither

**File:** `gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py`

In `check_vertex_ai_trainer_tensorboard_compatibility`, replace the current
"experiment name requires resource name" rule with "both must be set together
(or both unset)":

- If exactly one of `tensorboard_resource_name` /
  `tensorboard_experiment_name` is set, raise.
- Add the Vertex resource-ID regex check on `tensorboard_experiment_name` here
  (moved from `_submit_job`).

This shifts the precondition out of submit-time into the validation-check
stage, where the rest of the resource-config rules live.

**Backwards compat:** zero risk. Both proto fields landed in PR #603 (this
branch); neither exists on `main`. No production config has
`tensorboard_resource_name` set without `tensorboard_experiment_name`.

**File:** `tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py`
- Add a "resource_name set, experiment_name unset" failure test.
- Existing "experiment_name set, resource_name unset" failure test stays.
- Add a regex-failure test for an invalid experiment name. 
+ +## Step 2 — Drop the `submit(tensorboard=...)` path + +**File:** `gigl/common/services/vertex_ai.py` + +In `_submit_job` (around lines 411-440): +- Delete `tensorboard=job_config.tensorboard_resource_name or None` kwarg from + `job.submit(...)`. +- Delete the experiment-name regex precondition block (lines 411-424); moved + to validation in Step 1. + +URL logging (lines 450-470): +- Delete the per-job URL log (lines 450-459). +- Keep the cross-job URL log (lines 460-470). Validation now guarantees both + names are present whenever either is, so the inner `if` simplifies. + +`VertexAiJobConfig` (around lines 213-214): +- Delete `tensorboard_resource_name` and `tensorboard_experiment_name` fields. + They were carriers from launcher into `_submit_job`; nothing reads them now. + +`_VERTEX_RESOURCE_ID_PATTERN` constant: delete from this file (only used by +validation now, which has its own copy or imports it). + +## Step 3 — Stop wiring TB names into VertexAiJobConfig (launcher) + +**File:** `gigl/src/common/vertex_ai_launcher.py` + +- `_build_job_config` (around lines 405-412): drop `tensorboard_resource_name=...` + and `tensorboard_experiment_name=...` kwargs to `VertexAiJobConfig`. +- Env-var injection block (lines 339-369): keep. The "both set" guard at line + 357-358 simplifies — since validation now enforces all-or-nothing, it's + exactly one condition (either field set implies both). +- `baseOutputDirectory` plumbing: unchanged. + +## Step 4 — Surface the named-experiment URL where users will see it + +**File:** `gigl/utils/tensorboard_writer.py` + +In `_maybe_start_uploader`, after `aiplatform.start_upload_tb_log(...)` +succeeds, log the cross-job experiment URL using the same format as +`vertex_ai.py:_build_tensorboard_experiment_url`. Either move that helper to a +shared location (`gigl/common/services/vertex_ai_url_helpers.py` or similar) +or inline the format string in the writer — it's three lines, duplication is +fine. 
+ +Compensates for losing the Vertex UI's "Open TensorBoard" button by putting +the link in trainer stdout, where engineers already look. + +## Step 5 — Tests + +**File:** `tests/unit/src/common/vertex_ai_launcher_test.py` +- Drop assertions on `cfg.tensorboard_resource_name` / + `cfg.tensorboard_experiment_name` (the dataclass fields are gone). Env-var + injection assertions stay and become the primary contract test. + +**File:** `tests/unit/utils/tensorboard_writer_test.py` +- Add coverage for the URL log line emitted by `_maybe_start_uploader` on + success (Step 4). + +## Step 6 — Verification + +- `make type_check` clean. +- Per-file: `make unit_test_py PY_TEST_FILES="vertex_ai_launcher_test.py"`, + `tensorboard_writer_test.py`, + `gbml_and_resource_config_compatibility_checks_test.py`. +- Smoke: rerun the same two-runs-on-one-experiment smoke from PR #603. Confirm: + - Vertex job page no longer renders a TB button (expected regression). + - Trainer stdout logs the named-experiment URL. + - Both runs land on the same TB page side-by-side. + - `printenv | grep AIP_` confirms `AIP_TENSORBOARD_LOG_DIR` is set even + without `submit(tensorboard=)` (sanity check on the Step 0 doc claim). +- Full e2e CORA pipeline regression. + +## Risk and rollback + +- **Step 0's claim is load-bearing.** Resolved via docs, but the smoke run in + Step 6 should cross-check `AIP_TENSORBOARD_LOG_DIR` actually appears in the + worker container before relying on it in production. +- **UX regression on the Vertex UI button.** Mitigated by Step 4's stdout + logging. Call out in the PR description so reviewers aren't surprised. +- **Rollback:** single PR, easy to revert. Proto is unchanged; both fields + stay as carriers for the chief-rank uploader. Reverting just adds back the + `submit(tensorboard=...)` kwarg and the dropped `VertexAiJobConfig` fields. + +## Critical files + +- `gigl/common/services/vertex_ai.py` — drop submit kwarg, drop dataclass + fields, drop URL helpers (Step 2). 
+- `gigl/src/common/vertex_ai_launcher.py` — drop dataclass kwargs (Step 3). +- `gigl/utils/tensorboard_writer.py` — surface URL on uploader start (Step 4). +- `gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py` + — tighten to all-or-nothing + regex check (Step 1). +- Tests under `tests/unit/src/common/`, `tests/unit/utils/`, + `tests/unit/src/validation/lib/`. + +## Out of scope + +- Structured "trainer output metadata" file for KFP UI surfacing of the TB + URL. Considered useful but separate; defer. +- Removing `tensorboard_resource_name` field entirely. The chief-rank uploader + needs it (it's how `start_upload_tb_log` knows which `Tensorboard` instance + to write to), so the field stays. + +## References + +- Vertex AI training code requirements (env vars): + https://cloud.google.com/vertex-ai/docs/training/code-requirements +- `CustomJobSpec` REST (`baseOutputDirectory`): + https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec +- TB data model: + https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview +- `aiplatform.start_upload_tb_log`: + https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform#google_cloud_aiplatform_start_upload_tb_log diff --git a/examples/link_prediction/README.md b/examples/link_prediction/README.md index cd730f595..9c1a5ba9b 100644 --- a/examples/link_prediction/README.md +++ b/examples/link_prediction/README.md @@ -23,6 +23,14 @@ are example inference and training loops for the DBLP dataset. The DBLP dataset You can follow along with [dblp.ipynb](./dblp.ipynb) to run an e2e GiGL pipeline on the DBLP dataset. It will guide you through running each component: `config_populator` -> `data_preprocessor` -> `trainer` -> `inferencer` +## Vertex AI TensorBoard + +The example trainer configs enable TensorBoard logging with `trainerConfig.shouldLogToTensorboard: true`. 
+ +To surface those events in Vertex AI TensorBoard, set `tensorboard_resource_name` on the trainer Vertex resource config, +use a regional bucket, and keep the bucket, CustomJob, and TensorBoard instance in the same region. The attached service +account should have `roles/storage.admin` and `roles/aiplatform.user`. + ```{toctree} :maxdepth: 2 :hidden: diff --git a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml index 606f13c29..7cdf22b03 100644 --- a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml @@ -18,6 +18,10 @@ trainerConfig: log_every_n_batch: "50" # Frequency in which we log batch information num_neighbors: "[10, 10]" # Fanout per hop, specified as a string representation of a list for the homogeneous use case command: python -m examples.link_prediction.homogeneous_training + # To enable cross-job TensorBoard comparison, set + # ``GiglResourceConfig.trainerResourceConfig...tensorboardExperimentName`` + # alongside the ``tensorboardResourceName`` on the same resource config. + # See ``proto/snapchat/research/gbml/gigl_resource_config.proto`` for details. 
inferencerConfig: inferencerArgs: # Example argument to inferencer diff --git a/examples/link_prediction/configs/example_resource_config.yaml b/examples/link_prediction/configs/example_resource_config.yaml index 2b7d7a02a..0d8531215 100644 --- a/examples/link_prediction/configs/example_resource_config.yaml +++ b/examples/link_prediction/configs/example_resource_config.yaml @@ -43,6 +43,8 @@ trainer_resource_config: gpu_type: NVIDIA_TESLA_T4 gpu_limit: 2 num_replicas: 2 + tensorboard_resource_name: "projects/USER_PROVIDED_PROJECT/locations/us-central1/tensorboards/USER_PROVIDED_TENSORBOARD_ID" + tensorboard_experiment_name: "USER_PROVIDED_EXPERIMENT_NAME" inferencer_resource_config: vertex_ai_inferencer_config: machine_type: n1-standard-16 diff --git a/examples/link_prediction/graph_store/configs/example_resource_config.yaml b/examples/link_prediction/graph_store/configs/example_resource_config.yaml index 869f627ca..68929311e 100644 --- a/examples/link_prediction/graph_store/configs/example_resource_config.yaml +++ b/examples/link_prediction/graph_store/configs/example_resource_config.yaml @@ -58,6 +58,8 @@ trainer_resource_config: gpu_type: NVIDIA_TESLA_T4 gpu_limit: 2 num_replicas: 2 + tensorboard_resource_name: "projects/USER_PROVIDED_PROJECT/locations/us-central1/tensorboards/USER_PROVIDED_TENSORBOARD_ID" + tensorboard_experiment_name: "USER_PROVIDED_EXPERIMENT_NAME" inferencer_resource_config: vertex_ai_graph_store_inferencer_config: graph_store_pool: diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index ec42cf45a..1c0e956a0 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -115,6 +115,7 @@ from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout +from 
gigl.utils.tensorboard_writer import TensorBoardWriter logger = Logger() @@ -459,12 +460,15 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) print(f"---Rank {rank} training process set device {device}") + is_chief_process = rank == 0 + tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, remove_accidental_hits=True, ) + batch_idx = 0 if not args.should_skip_training: train_main_loader, train_random_negative_loader = _setup_dataloaders( @@ -525,7 +529,6 @@ def _training_process( # Entering the training loop training_start_time = time.time() - batch_idx = 0 avg_train_loss = 0.0 last_n_batch_avg_loss: list[float] = [] last_n_batch_time: list[float] = [] @@ -567,17 +570,27 @@ def _training_process( if ( batch_idx % args.log_every_n_batch == 0 or batch_idx < 10 ): # Log the first 10 batches to ensure the model is initialized correctly + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) print( f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) if torch.cuda.is_available(): torch.cuda.synchronize() print( - f"rank={rank}, batch={batch_idx}, mean(batch_time)={statistics.mean(last_n_batch_time):.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, ) last_n_batch_time.clear() + # log the global average training loss print( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={statistics.mean(last_n_batch_avg_loss):.6f}" + 
f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) last_n_batch_avg_loss.clear() flush() @@ -585,7 +598,7 @@ def _training_process( if batch_idx % args.val_every_n_batch == 0: print(f"rank={rank}, batch={batch_idx}, validating...") model.eval() - _run_validation_loops( + global_avg_val_loss = _run_validation_loops( model=model, main_loader=val_main_loader_iter, random_negative_loader=val_random_negative_loader_iter, @@ -596,6 +609,9 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) model.train() else: print(f"rank={rank} ended training early - no break condition was met") @@ -674,6 +690,7 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -701,6 +718,7 @@ def _training_process( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) flush() + tensorboard_writer.close() # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. 
shutdown_compute_proccess() diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index e972edac2..8bc93f535 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -159,6 +159,7 @@ from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout +from gigl.utils.tensorboard_writer import TensorBoardWriter logger = Logger() @@ -450,12 +451,15 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) logger.info(f"---Rank {rank} training process set device {device}") + is_chief_process = rank == 0 + tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, remove_accidental_hits=True, ) + batch_idx = 0 if not args.should_skip_training: train_main_loader, train_random_negative_loader = _setup_dataloaders( @@ -517,7 +521,6 @@ def _training_process( # Entering the training loop training_start_time = time.time() - batch_idx = 0 avg_train_loss = 0.0 last_n_batch_avg_loss: list[float] = [] last_n_batch_time: list[float] = [] @@ -555,17 +558,27 @@ def _training_process( batch_start = time.time() batch_idx += 1 if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) logger.info( f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) if torch.cuda.is_available(): torch.cuda.synchronize() logger.info( - f"rank={rank}, mean(batch_time)={statistics.mean(last_n_batch_time):.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, 
max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, ) last_n_batch_time.clear() + # log the global average training loss logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={statistics.mean(last_n_batch_avg_loss):.6f}" + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) last_n_batch_avg_loss.clear() flush() @@ -573,7 +586,7 @@ def _training_process( if batch_idx % args.val_every_n_batch == 0: logger.info(f"rank={rank}, batch={batch_idx}, validating...") model.eval() - _run_validation_loops( + global_avg_val_loss = _run_validation_loops( model=model, main_loader=val_main_loader_iter, random_negative_loader=val_random_negative_loader_iter, @@ -582,6 +595,9 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) model.train() logger.info(f"---Rank {rank} finished training") @@ -657,6 +673,7 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -684,6 +701,7 @@ def _training_process( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) flush() + tensorboard_writer.close() # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. 
shutdown_compute_proccess() diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index f0d58ca5e..e8cf68c1d 100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -65,6 +65,7 @@ from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout +from gigl.utils.tensorboard_writer import TensorBoardWriter logger = Logger() @@ -400,11 +401,15 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) logger.info(f"---Rank {rank} training process set device {device}") + is_chief_process = args.machine_rank == 0 and local_rank == 0 + tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) + loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, remove_accidental_hits=True, ) + batch_idx = 0 if not args.should_skip_training: train_main_loader, train_random_negative_loader = _setup_dataloaders( @@ -469,7 +474,6 @@ def _training_process( # Entering the training loop training_start_time = time.time() - batch_idx = 0 avg_train_loss = 0.0 last_n_batch_avg_loss: list[float] = [] last_n_batch_time: list[float] = [] @@ -509,6 +513,8 @@ def _training_process( batch_start = time.time() batch_idx += 1 if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) logger.info( f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) @@ -516,19 +522,26 @@ def _training_process( # Wait for GPU operations to finish torch.cuda.synchronize() logger.info( - f"rank={rank}, batch={batch_idx}, mean(batch_time)={statistics.mean(last_n_batch_time):.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + 
f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, ) last_n_batch_time.clear() # log the global average training loss logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={statistics.mean(last_n_batch_avg_loss):.6f}" + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) last_n_batch_avg_loss.clear() if batch_idx % args.val_every_n_batch == 0: logger.info(f"rank={rank}, batch={batch_idx}, validating...") model.eval() - _run_validation_loops( + global_avg_val_loss = _run_validation_loops( model=model, main_loader=val_main_loader_iter, random_negative_loader=val_random_negative_loader_iter, @@ -538,6 +551,9 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) model.train() logger.info(f"---Rank {rank} finished training") @@ -619,6 +635,7 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -648,6 +665,7 @@ def _training_process( logger.info( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) + tensorboard_writer.close() torch.distributed.destroy_process_group() diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index b95a77489..6470ab1ef 100644 --- a/examples/link_prediction/homogeneous_training.py +++ 
b/examples/link_prediction/homogeneous_training.py @@ -61,6 +61,7 @@ from gigl.types.graph import to_homogeneous from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout +from gigl.utils.tensorboard_writer import TensorBoardWriter logger = Logger() @@ -359,12 +360,15 @@ def _training_process( logger.info(f"---Rank {rank} training process set device {device}") logger.info(f"---Rank {rank} training process group initialized") + is_chief_process = args.machine_rank == 0 and local_rank == 0 + tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, remove_accidental_hits=True, ) + batch_idx = 0 if not args.should_skip_training: train_main_loader, train_random_negative_loader = _setup_dataloaders( @@ -429,7 +433,6 @@ def _training_process( # Entering the training loop training_start_time = time.time() - batch_idx = 0 avg_train_loss = 0.0 last_n_batch_avg_loss: list[float] = [] last_n_batch_time: list[float] = [] @@ -468,6 +471,8 @@ def _training_process( batch_start = time.time() batch_idx += 1 if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) logger.info( f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) @@ -475,19 +480,26 @@ def _training_process( # Wait for GPU operations to finish torch.cuda.synchronize() logger.info( - f"rank={rank}, mean(batch_time)={statistics.mean(last_n_batch_time):.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, ) 
last_n_batch_time.clear() # log the global average training loss logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={statistics.mean(last_n_batch_avg_loss):.6f}" + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) last_n_batch_avg_loss.clear() if batch_idx % args.val_every_n_batch == 0: logger.info(f"rank={rank}, batch={batch_idx}, validating...") model.eval() - _run_validation_loops( + global_avg_val_loss = _run_validation_loops( model=model, main_loader=val_main_loader_iter, random_negative_loader=val_random_negative_loader_iter, @@ -496,6 +508,9 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) model.train() logger.info(f"---Rank {rank} finished training") @@ -573,6 +588,7 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -602,6 +618,7 @@ def _training_process( logger.info( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) + tensorboard_writer.close() torch.distributed.destroy_process_group() diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 6cbb968b3..7350f5ec3 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -60,6 +60,7 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name """ import datetime +import re import time from dataclasses import dataclass from typing import Final, Optional, Union @@ -87,6 +88,39 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name DEFAULT_PIPELINE_TIMEOUT_S: Final[int] = 60 
* 60 * 36 # 36 hours DEFAULT_CUSTOM_JOB_TIMEOUT_S: Final[int] = 60 * 60 * 24 # 24 hours +# Captures the trailing tensorboard ID from a fully-qualified resource name. +# Used only for building the human-readable TB UI URL. +_VERTEX_TENSORBOARD_ID_FROM_RESOURCE_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^projects/(?P[^/]+)" + r"/locations/(?P[^/]+)" + r"/tensorboards/(?P[^/]+)$" +) + + +def _build_tensorboard_experiment_url( + *, + tensorboard_resource_name: str, + experiment_id: str, +) -> Optional[str]: + """Return the TB UI URL for ``experiment_id`` under the given TB resource. + + Returns ``None`` if ``tensorboard_resource_name`` doesn't parse as + ``projects/.../locations/.../tensorboards/...`` — defensive so a stray + log line never breaks job submission. + """ + match = _VERTEX_TENSORBOARD_ID_FROM_RESOURCE_PATTERN.match( + tensorboard_resource_name + ) + if not match: + return None + return ( + f"https://{match['location']}.tensorboard.googleusercontent.com/experiment/" + f"projects+{match['project']}" + f"+locations+{match['location']}" + f"+tensorboards+{match['tensorboard_id']}" + f"+experiments+{experiment_id}" + ) + @dataclass class VertexAiJobConfig: @@ -94,7 +128,9 @@ class VertexAiJobConfig: Each field maps to a property on the ``WorkerPoolSpec`` / ``MachineSpec`` / ``DiskSpec`` / ``ContainerSpec`` protos that Vertex AI - uses to describe a CustomJob. + uses to describe a CustomJob. See + https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec + for the canonical reference. Example: >>> from google.cloud.aiplatform_v1.types import ReservationAffinity @@ -135,6 +171,13 @@ class VertexAiJobConfig: reservation_affinity: Optional ``ReservationAffinity`` that maps to ``MachineSpec.reservation_affinity``. ``None`` uses the Vertex AI default (no reservation). + base_output_dir: Optional CustomJob base output directory. 
When set, + Vertex AI derives ``AIP_MODEL_DIR``, ``AIP_CHECKPOINT_DIR``, and + ``AIP_TENSORBOARD_LOG_DIR`` from this directory. Setting this is + how GiGL trainers learn where to write TensorBoard events; the + chief-rank uploader (started inside the trainer) is what streams + them to a Vertex AI ``TensorboardExperiment`` for cross-job + comparison. """ job_name: str @@ -153,6 +196,7 @@ class VertexAiJobConfig: enable_web_access: bool = True scheduling_strategy: Optional[aiplatform.gapic.Scheduling.Strategy] = None reservation_affinity: Optional[ReservationAffinity] = None + base_output_dir: Optional[str] = None class VertexAIService: @@ -347,6 +391,7 @@ def _submit_job( location=self._location, labels=job_config.labels, staging_bucket=self._staging_bucket, + base_output_dir=job_config.base_output_dir, ) job.submit( service_account=self._service_account, diff --git a/gigl/src/common/constants/gcs.py b/gigl/src/common/constants/gcs.py index 146845428..8c375bcd9 100644 --- a/gigl/src/common/constants/gcs.py +++ b/gigl/src/common/constants/gcs.py @@ -979,7 +979,7 @@ def get_tensorboard_logs_gcs_path( """ return GcsUri.join( get_trainer_asset_dir_gcs_path(applied_task_identifier=applied_task_identifier), - "tensorboard_logs/", + "logs/", ) diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index 64aa86a23..032ab9ff0 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -1,8 +1,11 @@ """Shared functionality for launching Vertex AI jobs for training and inference.""" +import datetime +import re from collections.abc import Mapping from typing import Final, Optional +from google.cloud import aiplatform from google.cloud.aiplatform_v1.types import ( ReservationAffinity, Scheduling, @@ -39,6 +42,75 @@ {"NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"} ) +# The SDK TensorBoard uploader rewrites run names by replacing every char +# outside this character class with ``-`` +# 
(.venv/.../tensorboard/uploader_utils.py:46). We pre-sanitize the GCS +# subdir name to match what the SDK will produce, so the directory and +# the resulting TensorboardRun ID agree. +_VERTEX_RUN_NAME_REPLACE_PATTERN: Final[re.Pattern[str]] = re.compile( + r"[^a-zA-Z0-9\n-]" +) + +# Captures the project/location/tensorboard_id pieces of a fully-qualified +# Vertex AI TensorBoard resource name. Used to build the TensorBoard UI URL. +_TENSORBOARD_RESOURCE_NAME_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^projects/(?P[^/]+)" + r"/locations/(?P[^/]+)" + r"/tensorboards/(?P[^/]+)$" +) + + +def _maybe_log_tensorboard_url( + vertex_ai_resource_config: VertexAiResourceConfig, +) -> None: + """Log the cross-job TensorBoard UI URL when the experiment is configured. + + The chief-rank uploader inside the trainer container also logs this URL, + but that only surfaces in Vertex AI job logs (which take a minute to + materialize). Logging it here means the URL appears in the launcher's + local stdout immediately at submit time. + """ + tb_resource = vertex_ai_resource_config.tensorboard_resource_name + experiment_name = vertex_ai_resource_config.tensorboard_experiment_name + if not tb_resource or not experiment_name: + return + match = _TENSORBOARD_RESOURCE_NAME_PATTERN.match(tb_resource) + if not match: + return + url = ( + f"https://{match['location']}.tensorboard.googleusercontent.com/experiment/" + f"projects+{match['project']}" + f"+locations+{match['location']}" + f"+tensorboards+{match['tensorboard_id']}" + f"+experiments+{experiment_name}" + ) + logger.info( + f"View TensorBoard (cross-job comparison, experiment={experiment_name!r}): " + f"{url}" + ) + + +def _sanitize_for_vertex_run(value: str) -> str: + """Coerce ``value`` into the SDK's TensorboardRun-name character class. + + Mirrors ``google.cloud.aiplatform.tensorboard.uploader_utils.reformat_run_name`` + so the GCS subdir we create and the SDK-derived run name match. 
+ """ + return _VERTEX_RUN_NAME_REPLACE_PATTERN.sub("-", value) + + +def _build_unique_run_name(job_name: str) -> str: + """Return a launch-unique, sanitized run name for ``job_name``. + + The display ``job_name`` is not guaranteed unique across reruns of the + same task identifier, and the SDK reuses an existing + ``TensorboardRun`` by name (silently merging events). We append a UTC + timestamp so two launches of the same task always produce two distinct + runs in a shared experiment. + """ + timestamp = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + return _sanitize_for_vertex_run(f"{job_name}-{timestamp}") + def launch_single_pool_job( vertex_ai_resource_config: VertexAiResourceConfig, @@ -52,9 +124,14 @@ def launch_single_pool_job( cuda_docker_uri: Optional[str], component: GiGLComponents, vertex_ai_region: str, -) -> None: + tensorboard_logs_uri: Optional[Uri] = None, +) -> aiplatform.CustomJob: """Launch a single pool job on Vertex AI. + The ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` + fields on ``vertex_ai_resource_config`` drive TensorBoard wiring; the + launcher reads them directly off the proto. + Args: vertex_ai_resource_config: The Vertex AI resource configuration job_name: Full name for the Vertex AI job @@ -67,6 +144,12 @@ def launch_single_pool_job( cuda_docker_uri: Docker image URI for GPU execution component: The GiGL component (Trainer or Inferencer) vertex_ai_region: The Vertex AI region to launch the job in + tensorboard_logs_uri: Optional TensorBoard log URI for trainer jobs + + Returns: + The submitted ``aiplatform.CustomJob``. Useful for callers that need + the job's resource name to look up downstream artifacts (e.g. the + per-job ``TensorboardExperiment``). 
""" if component not in _LAUNCHABLE_COMPONENTS: raise ValueError( @@ -90,8 +173,10 @@ def launch_single_pool_job( vertex_ai_resource_config=vertex_ai_resource_config, env_vars=[env_var.EnvVar(name="TF_CPP_MIN_LOG_LEVEL", value="3")], labels=resource_config_wrapper.get_resource_labels(component=component), + tensorboard_logs_uri=tensorboard_logs_uri, ) logger.info(f"Launching {component.value} job with config: {job_config}") + _maybe_log_tensorboard_url(vertex_ai_resource_config) vertex_ai_service = VertexAIService( project=resource_config_wrapper.project, @@ -99,7 +184,7 @@ def launch_single_pool_job( service_account=resource_config_wrapper.service_account_email, staging_bucket=resource_config_wrapper.temp_assets_regional_bucket_path.uri, ) - vertex_ai_service.launch_job(job_config=job_config) + return vertex_ai_service.launch_job(job_config=job_config) def launch_graph_store_enabled_job( @@ -115,9 +200,16 @@ def launch_graph_store_enabled_job( cpu_docker_uri: Optional[str], cuda_docker_uri: Optional[str], component: GiGLComponents, + tensorboard_logs_uri: Optional[Uri] = None, ) -> None: """Launch a graph store enabled job on Vertex AI with separate storage and compute pools. + The ``compute_pool`` of ``vertex_ai_graph_store_config`` carries + ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` (the + same Vertex AI metaparams that single-pool reads off its own + ``VertexAiResourceConfig``); the launcher reads them directly off the + proto. 
+ Args: vertex_ai_graph_store_config: The Vertex AI graph store configuration job_name: Full name for the Vertex AI job @@ -131,6 +223,7 @@ def launch_graph_store_enabled_job( cpu_docker_uri: Docker image URI for CPU execution cuda_docker_uri: Docker image URI for GPU execution component: The GiGL component (Trainer or Inferencer) + tensorboard_logs_uri: Optional TensorBoard log URI for trainer jobs """ if component not in _LAUNCHABLE_COMPONENTS: raise ValueError( @@ -181,6 +274,7 @@ def launch_graph_store_enabled_job( vertex_ai_resource_config=compute_pool_config, env_vars=environment_variables, labels=labels, + tensorboard_logs_uri=tensorboard_logs_uri, ) # Create storage pool job config @@ -204,6 +298,8 @@ def launch_graph_store_enabled_job( else resource_config_wrapper.region ) + _maybe_log_tensorboard_url(compute_pool_config) + vertex_ai_service = VertexAIService( project=resource_config_wrapper.project, location=region, @@ -227,6 +323,7 @@ def _build_job_config( vertex_ai_resource_config: VertexAiResourceConfig, env_vars: list[env_var.EnvVar], labels: Optional[dict[str, str]] = None, + tensorboard_logs_uri: Optional[Uri] = None, ) -> VertexAiJobConfig: """Build a VertexAiJobConfig for training or inference jobs. @@ -234,6 +331,11 @@ def _build_job_config( jobs on Vertex AI. It assembles job arguments, sets appropriate job naming conventions, and configures resource specifications based on the provided parameters. + ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` come + from ``vertex_ai_resource_config`` directly — single-pool launches read + them off the trainer's ``VertexAiResourceConfig``; graph-store launches + pass ``compute_pool`` here, which carries the same fields. + Args: job_name (str): The base name for the job. Will be prefixed with "gigl_train_" or "gigl_infer_". is_inference (bool): Whether this is an inference job (True) or training job (False). 
@@ -247,6 +349,7 @@ def _build_job_config( machine type, GPU type, replica count, timeout, and scheduling strategy. env_vars (list[env_var.EnvVar]): Environment variables to set in the container. labels (Optional[dict[str, str]]): Labels to associate with the job. Defaults to None. + tensorboard_logs_uri (Optional[Uri]): TensorBoard log URI for trainer jobs. Returns: VertexAiJobConfig: A configuration object ready to be used with VertexAIService.launch_job(). @@ -262,13 +365,55 @@ def _build_job_config( ) command = command_str.strip().split(" ") + base_output_dir = ( + _get_base_output_dir_from_tensorboard_logs_uri( + tensorboard_logs_uri=tensorboard_logs_uri + ) + if tensorboard_logs_uri is not None + else None + ) + + # When the user opted into a stable Vertex AI TensorboardExperiment, inject + # env vars into the worker so the chief-rank trainer can stream events + # directly to that experiment via ``aiplatform.start_upload_tb_log``. + # Validation guarantees ``tensorboard_resource_name`` and + # ``tensorboard_experiment_name`` are set together. + # + # ``GIGL_TENSORBOARD_RUN_NAME`` carries a launch-unique, sanitized run + # name. The writer creates a subdirectory of ``AIP_TENSORBOARD_LOG_DIR`` + # with this name; the SDK ``LogdirLoader`` then surfaces it as a distinct + # ``TensorboardRun`` in the named experiment, so two jobs sharing the + # experiment name show up as two runs (instead of merging into one + # ``default`` run). 
+ # + # References: + # https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + # https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec + container_env_vars = list(env_vars) + if vertex_ai_resource_config.tensorboard_experiment_name: + container_env_vars.extend( + [ + env_var.EnvVar( + name="GIGL_TENSORBOARD_RESOURCE_NAME", + value=vertex_ai_resource_config.tensorboard_resource_name, + ), + env_var.EnvVar( + name="GIGL_TENSORBOARD_EXPERIMENT_NAME", + value=vertex_ai_resource_config.tensorboard_experiment_name, + ), + env_var.EnvVar( + name="GIGL_TENSORBOARD_RUN_NAME", + value=_build_unique_run_name(job_name), + ), + ] + ) job_config = VertexAiJobConfig( job_name=job_name, container_uri=container_uri, command=command, args=job_args, - environment_variables=env_vars, + environment_variables=container_env_vars, machine_type=vertex_ai_resource_config.machine_type, accelerator_type=vertex_ai_resource_config.gpu_type.upper().replace("-", "_"), accelerator_count=vertex_ai_resource_config.gpu_limit, @@ -291,10 +436,35 @@ def _build_job_config( reservation_affinity=_build_reservation_affinity( vertex_ai_resource_config.reservation_affinity ), + base_output_dir=base_output_dir, ) return job_config +def _get_base_output_dir_from_tensorboard_logs_uri( + tensorboard_logs_uri: Uri, +) -> str: + """Return the CustomJob base output directory for a TensorBoard log URI. + + Args: + tensorboard_logs_uri: GiGL TensorBoard log URI. This is expected to + point at the ``logs/`` directory underneath the trainer asset dir. + + Returns: + The parent directory to use as ``base_output_dir``. + + Raises: + ValueError: If the URI does not contain a parent directory. 
+ """ + normalized_tensorboard_logs_uri = tensorboard_logs_uri.uri.rstrip("/") + base_output_dir, separator, _ = normalized_tensorboard_logs_uri.rpartition("/") + if not separator or not base_output_dir: + raise ValueError( + f"TensorBoard logs URI must include a parent directory, got {tensorboard_logs_uri.uri!r}." + ) + return base_output_dir + + def _build_reservation_affinity( affinity: VertexAiReservationAffinity, ) -> Optional[ReservationAffinity]: diff --git a/gigl/src/training/v2/glt_trainer.py b/gigl/src/training/v2/glt_trainer.py index 2f8ecbbbe..4f2ecadd1 100644 --- a/gigl/src/training/v2/glt_trainer.py +++ b/gigl/src/training/v2/glt_trainer.py @@ -54,6 +54,12 @@ def __execute_VAI_training( training_process_runtime_args = ( gbml_config_pb_wrapper.trainer_config.trainer_args ) + raw_tensorboard_logs_uri = gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri + tensorboard_logs_uri = ( + UriFactory.create_uri(raw_tensorboard_logs_uri) + if raw_tensorboard_logs_uri + else None + ) job_name = f"gigl_train_{applied_task_identifier}" @@ -70,6 +76,7 @@ def __execute_VAI_training( cuda_docker_uri=cuda_docker_uri, component=GiGLComponents.Trainer, vertex_ai_region=resource_config.vertex_ai_trainer_region, + tensorboard_logs_uri=tensorboard_logs_uri, ) elif isinstance(resource_config.trainer_config, VertexAiGraphStoreConfig): launch_graph_store_enabled_job( @@ -85,6 +92,7 @@ def __execute_VAI_training( cpu_docker_uri=cpu_docker_uri, cuda_docker_uri=cuda_docker_uri, component=GiGLComponents.Trainer, + tensorboard_logs_uri=tensorboard_logs_uri, ) else: raise NotImplementedError( diff --git a/gigl/src/validation_check/config_validator.py b/gigl/src/validation_check/config_validator.py index ec0ca4caf..2c6fa3d14 100644 --- a/gigl/src/validation_check/config_validator.py +++ b/gigl/src/validation_check/config_validator.py @@ -18,6 +18,7 @@ from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import ( 
check_inferencer_graph_store_compatibility, check_trainer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ) from gigl.src.validation_check.libs.name_checks import ( check_if_kfp_pipeline_job_name_valid, @@ -202,22 +203,27 @@ GiGLComponents.ConfigPopulator.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.DataPreprocessor.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.SubgraphSampler.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.SplitGenerator.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.Trainer.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.Inferencer.value: [ check_inferencer_graph_store_compatibility, diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index fc12d1939..e79b2a2a5 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -5,7 +5,8 @@ If graph store mode is set up for trainer or inferencer in one config, it must be set up in the other. """ -from typing import Literal +import re +from typing import Final, Literal from google.protobuf.message import Message @@ -18,6 +19,12 @@ logger = Logger() +# Vertex AI Experiment IDs are MetadataStore Context IDs and must satisfy +# this regex. 
+_VERTEX_RESOURCE_ID_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^[a-z0-9][a-z0-9-]{0,127}$" +) + def _gbml_config_has_graph_store( gbml_config_pb_wrapper: GbmlConfigPbWrapper, @@ -102,6 +109,76 @@ def check_trainer_graph_store_compatibility( ) +def check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper: GbmlConfigPbWrapper, + resource_config_wrapper: GiglResourceConfigWrapper, +) -> None: + """Check that Vertex AI trainer TensorBoard config is complete. + + ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` must be + supplied together (or both unset). The trainer's chief-rank uploader needs + both to call ``aiplatform.start_upload_tb_log``; setting only one + produces no observable behavior. + + Args: + gbml_config_pb_wrapper: The GbmlConfig wrapper. + resource_config_wrapper: The GiglResourceConfig wrapper. + + Raises: + AssertionError: If exactly one of ``tensorboard_resource_name`` / + ``tensorboard_experiment_name`` is set, or if + ``tensorboard_experiment_name`` doesn't satisfy the Vertex AI + resource-ID format, or if ``should_log_to_tensorboard`` is set + without both TB fields. + """ + logger.info( + "Config validation check: Vertex AI trainer TensorBoard compatibility between template and resource configs." + ) + + trainer_resource_config = resource_config_wrapper.trainer_config + if isinstance( + trainer_resource_config, gigl_resource_config_pb2.VertexAiResourceConfig + ): + vertex_ai_config = trainer_resource_config + elif isinstance( + trainer_resource_config, gigl_resource_config_pb2.VertexAiGraphStoreConfig + ): + # Graph-store mode reads TB metaparams from the compute pool, the + # same way it reads other Vertex AI resource fields. 
+ vertex_ai_config = trainer_resource_config.compute_pool + else: + return + + has_resource_name = bool(vertex_ai_config.tensorboard_resource_name) + has_experiment_name = bool(vertex_ai_config.tensorboard_experiment_name) + if has_resource_name != has_experiment_name: + raise AssertionError( + "VertexAiResourceConfig.tensorboard_resource_name and " + "tensorboard_experiment_name must be set together. " + f"tensorboard_resource_name set: {has_resource_name}, " + f"tensorboard_experiment_name set: {has_experiment_name}." + ) + + if has_experiment_name and not _VERTEX_RESOURCE_ID_PATTERN.match( + vertex_ai_config.tensorboard_experiment_name + ): + raise AssertionError( + "VertexAiResourceConfig.tensorboard_experiment_name " + f"({vertex_ai_config.tensorboard_experiment_name!r}) is not a " + f"valid Vertex AI Experiment ID; it must match " + f"{_VERTEX_RESOURCE_ID_PATTERN.pattern}." + ) + + if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: + return + + assert has_resource_name, ( + "GbmlConfig.trainer_config.should_log_to_tensorboard is true, so a " + "Vertex AI TensorBoard resource name and experiment name must be " + "set in the trainer resource config." + ) + + def check_inferencer_graph_store_compatibility( gbml_config_pb_wrapper: GbmlConfigPbWrapper, resource_config_wrapper: GiglResourceConfigWrapper, diff --git a/gigl/utils/tensorboard_writer.py b/gigl/utils/tensorboard_writer.py new file mode 100644 index 000000000..ceb3e9719 --- /dev/null +++ b/gigl/utils/tensorboard_writer.py @@ -0,0 +1,246 @@ +"""TensorBoard writer for GiGL training entrypoints.""" + +import os +import re +from typing import Any, Final, Optional + +import tensorflow as tf +from google.cloud import aiplatform + +from gigl.common.logger import Logger + +logger = Logger() + +# Vertex AI sets this env var to ``/logs/`` (or +# ``//logs/`` for HyperparameterTuningJob trials) +# when ``CustomJobSpec.baseOutputDirectory`` is configured. 
GiGL's launcher +# derives ``baseOutputDirectory`` from the GbmlConfig's ``tensorboardLogsUri`` +# (see ``gigl/src/common/vertex_ai_launcher.py``), so within a GiGL-launched +# trainer this env var is the authoritative log directory. +# +# References: +# https://cloud.google.com/vertex-ai/docs/training/code-requirements +# https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#FIELDS.base_output_directory +_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY: Final[str] = "AIP_TENSORBOARD_LOG_DIR" + +# Set by GiGL's launcher (``gigl/src/common/vertex_ai_launcher.py``) when the +# user requested a stable Vertex AI ``TensorboardExperiment`` for cross-job +# comparison. When all three are set on the chief rank, the writer also +# starts a background uploader (``aiplatform.start_upload_tb_log``) that +# streams events from the parent log dir to that experiment under the +# configured ``Tensorboard`` instance, with the run-name subdir surfacing +# as a distinct ``TensorboardRun``. Without these, the writer just writes +# files to ``AIP_TENSORBOARD_LOG_DIR`` and only Vertex's built-in +# auto-uploader (gated on ``jobSpec.tensorboard``) ingests them. +_GIGL_TENSORBOARD_RESOURCE_NAME_ENV_KEY: Final[str] = "GIGL_TENSORBOARD_RESOURCE_NAME" +_GIGL_TENSORBOARD_EXPERIMENT_NAME_ENV_KEY: Final[str] = ( + "GIGL_TENSORBOARD_EXPERIMENT_NAME" +) +_GIGL_TENSORBOARD_RUN_NAME_ENV_KEY: Final[str] = "GIGL_TENSORBOARD_RUN_NAME" + +_TENSORBOARD_RESOURCE_NAME_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^projects/(?P[^/]+)" + r"/locations/(?P[^/]+)" + r"/tensorboards/(?P[^/]+)$" +) + + +class TensorBoardWriter: + """Writes scalar metrics to TensorBoard. + + No-ops when disabled, so callers never see ``Optional[TensorBoardWriter]`` + plumbing across chief / non-chief ranks. + + The writer flushes after every ``log()`` call so that Vertex's TensorBoard + UI sees events live as training progresses. + + Example: + >>> with TensorBoardWriter.from_env(enabled=is_chief_process) as tb: + ... 
tb.log({"Loss/train": loss, "Loss/val": vloss}, step=batch_idx) + """ + + def __init__( + self, + log_dir: Optional[str], + *, + upload_started: bool = False, + ) -> None: + """Initialize the writer. + + Args: + log_dir: Destination directory for TensorBoard events. When + ``None``, the writer is a no-op and allocates no TF resources. + upload_started: Whether ``aiplatform.start_upload_tb_log`` has + been called and needs a paired ``end_upload_tb_log`` on + ``close()``. + """ + self._writer: Optional[Any] = ( + tf.summary.create_file_writer(log_dir) if log_dir else None + ) + self._closed = False + self._upload_started = upload_started + + @classmethod + def from_env(cls, *, enabled: bool = True) -> "TensorBoardWriter": + """Build a writer from Vertex AI's ``AIP_TENSORBOARD_LOG_DIR`` env var. + + When ``enabled`` is ``False``, returns a no-op writer without reading + the environment. This is the path non-chief ranks take so they can + share the same call sites as the chief. + + When ``enabled`` is ``True``: + + - ``AIP_TENSORBOARD_LOG_DIR`` must be set; otherwise this raises + ``RuntimeError`` rather than silently no-op'ing. The env var is + populated by Vertex AI from ``CustomJobSpec.baseOutputDirectory`` + (see the references in this module's header). + - If ``GIGL_TENSORBOARD_RUN_NAME`` is set, events are written to + ``//`` so the SDK uploader's + ``LogdirLoader`` discovers the subdir as a distinct + ``TensorboardRun`` (instead of merging into the SDK's hardcoded + ``DEFAULT_RUN_NAME = "default"``). The launcher injects this env + var when the user opts into ``tensorboard_experiment_name``. + - If ``GIGL_TENSORBOARD_RESOURCE_NAME`` and + ``GIGL_TENSORBOARD_EXPERIMENT_NAME`` are also set, this also starts + a background ``aiplatform`` uploader that streams events from the + PARENT log dir (so the run-name subdir surfaces as a run) to the + named ``TensorboardExperiment`` under the configured + ``Tensorboard`` instance. 
The uploader is shut down on + :meth:`close`. + + Args: + enabled: Whether this caller is responsible for writing events. + Typically ``is_chief_process``. + + Returns: + A ``TensorBoardWriter`` instance — real if enabled, no-op otherwise. + + Raises: + RuntimeError: If ``enabled`` is True and ``AIP_TENSORBOARD_LOG_DIR`` + is not set in the environment. + ValueError: If ``GIGL_TENSORBOARD_RESOURCE_NAME`` is set but does + not match ``projects/.../locations/.../tensorboards/...``. + """ + if not enabled: + return cls(log_dir=None) + parent_log_dir = os.environ.get(_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY) + if not parent_log_dir: + raise RuntimeError( + f"{_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY} is not set. " + "TensorBoardWriter.from_env() requires the trainer to run as " + "a Vertex AI CustomJob with baseOutputDirectory configured. " + "See https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#FIELDS.base_output_directory." + ) + run_name = os.environ.get(_GIGL_TENSORBOARD_RUN_NAME_ENV_KEY) + effective_log_dir = ( + os.path.join(parent_log_dir, run_name) if run_name else parent_log_dir + ) + + # Construct the file writer FIRST. If TF construction fails we don't + # want a leaked uploader thread keeping the (non-daemon) process + # alive. See codex review round 2, issue 6. + instance = cls(log_dir=effective_log_dir, upload_started=False) + try: + if _maybe_start_uploader(parent_log_dir=parent_log_dir): + instance._upload_started = True + except BaseException: + instance.close() + raise + return instance + + def log(self, metrics: dict[str, float], step: int) -> None: + """Write each metric scalar at ``step`` and flush. + + No-ops when the writer is disabled or already closed. + + Args: + metrics: Mapping of TensorBoard tag to scalar value. All entries + are written at the same step. + step: TensorBoard step for the events. 
+ """ + if self._writer is None or self._closed: + return + with self._writer.as_default(): + for tag, value in metrics.items(): + tf.summary.scalar(tag, value, step=step) + self._writer.flush() + + def close(self) -> None: + """Close the underlying TF writer and stop the uploader if running. + + Idempotent; safe to call multiple times and on no-op writers. + """ + if self._closed: + return + if self._writer is not None: + self._writer.close() + if self._upload_started: + aiplatform.end_upload_tb_log() + self._closed = True + + def __enter__(self) -> "TensorBoardWriter": + return self + + def __exit__(self, *_exc: object) -> None: + self.close() + + +def _maybe_start_uploader(*, parent_log_dir: str) -> bool: + """Start the aiplatform TB uploader iff the GiGL env vars are present. + + Watches ``parent_log_dir`` (not the run-name subdir under it), so the + SDK's ``LogdirLoader`` discovers each run via + ``os.path.relpath(subdir, parent_log_dir)``. The Vertex AI TensorBoard + data model (``Tensorboard`` → ``TensorboardExperiment`` → ``TensorboardRun`` + → ``TensorboardTimeSeries``) is documented at + https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. + + Returns ``True`` if the uploader was started (caller must arrange for + ``aiplatform.end_upload_tb_log`` on shutdown), ``False`` otherwise. + + Args: + parent_log_dir: The ``AIP_TENSORBOARD_LOG_DIR`` value — i.e. the + directory whose children are run-name subdirectories. + + Raises: + ValueError: If ``GIGL_TENSORBOARD_RESOURCE_NAME`` is set but does not + match the expected resource-name format. 
+ """ + tb_resource_name = os.environ.get(_GIGL_TENSORBOARD_RESOURCE_NAME_ENV_KEY) + experiment_name = os.environ.get(_GIGL_TENSORBOARD_EXPERIMENT_NAME_ENV_KEY) + if not tb_resource_name or not experiment_name: + return False + + match = _TENSORBOARD_RESOURCE_NAME_PATTERN.match(tb_resource_name) + if not match: + raise ValueError( + f"{_GIGL_TENSORBOARD_RESOURCE_NAME_ENV_KEY}={tb_resource_name!r} " + "does not match projects/.../locations/.../tensorboards/...; " + "the GiGL launcher should set this to the same resource name " + "configured on GiglResourceConfig." + ) + + aiplatform.init( + project=match["project"], + location=match["location"], + ) + aiplatform.start_upload_tb_log( + tensorboard_id=match["tensorboard_id"], + tensorboard_experiment_name=experiment_name, + logdir=parent_log_dir, + ) + # Log the TB UI URL so engineers can find the named experiment without + # the Vertex AI job page's "Open TensorBoard" button (which is no longer + # rendered now that GiGL doesn't pass ``submit(tensorboard=...)``). + experiment_url = ( + f"https://{match['location']}.tensorboard.googleusercontent.com/experiment/" + f"projects+{match['project']}" + f"+locations+{match['location']}" + f"+tensorboards+{match['tensorboard_id']}" + f"+experiments+{experiment_name}" + ) + logger.info( + f"View TensorBoard (cross-job comparison, experiment={experiment_name!r}): " + f"{experiment_url}" + ) + return True diff --git a/proto/snapchat/research/gbml/gigl_resource_config.proto b/proto/snapchat/research/gbml/gigl_resource_config.proto index 0d930949b..292910a1c 100644 --- a/proto/snapchat/research/gbml/gigl_resource_config.proto +++ b/proto/snapchat/research/gbml/gigl_resource_config.proto @@ -130,6 +130,22 @@ message VertexAiResourceConfig { // Compute Engine reservation affinity for the job. 
// See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations VertexAiReservationAffinity reservation_affinity = 9; + + // Existing Vertex AI TensorBoard resource to attach to the job. + // Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + // See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + // for the Tensorboard data model. + string tensorboard_resource_name = 10; + + // Optional. When set, the trainer's chief rank streams events to a + // TensorboardExperiment with this name on the TB resource above, in + // addition to Vertex's per-job auto-upload. Multiple jobs that share this + // value land in the same TensorboardExperiment, so they appear as + // comparable runs on one TensorBoard page. Requires + // tensorboard_resource_name above to be set. Allowed characters: + // lowercase letters, digits, hyphens (Vertex AI Experiment ID rules). + // See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. + string tensorboard_experiment_name = 11; } // Configuration for KFP job resources diff --git a/proto/snapchat/research/gbml/trained_model_metadata.proto b/proto/snapchat/research/gbml/trained_model_metadata.proto index 341133b5a..7c02de4ac 100644 --- a/proto/snapchat/research/gbml/trained_model_metadata.proto +++ b/proto/snapchat/research/gbml/trained_model_metadata.proto @@ -9,6 +9,9 @@ message TrainedModelMetadata{ string scripted_model_uri = 2; // The path where evaluation metrics are stored string eval_metrics_uri = 3; - // Path where tensorboard logs will be stored + // Path where tensorboard logs will be stored. Vertex AI maps this URI to + // ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via + // ``CustomJobSpec.baseOutputDirectory``. See + // https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec. 
string tensorboard_logs_uri = 4; } diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala index 8363bdb1f..2198a2eb5 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala @@ -38,7 +38,7 @@ final case class DistributedInferencerConfig( __serializedSizeMemoized = __size } __size - 1 - + } def writeTo(`_output__`: _root_.com.google.protobuf.CodedOutputStream): _root_.scala.Unit = { trainerConfig.vertexAiInferencerConfig.foreach { __v => @@ -165,7 +165,7 @@ object DistributedInferencerConfig extends scalapb.GeneratedMessageCompanion[sna override def number: _root_.scala.Int = 0 override def value: _root_.scala.Nothing = throw new java.util.NoSuchElementException("Empty.value") } - + @SerialVersionUID(0L) final case class VertexAiInferencerConfig(value: snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig) extends snapchat.research.gbml.gigl_resource_config.DistributedInferencerConfig.TrainerConfig { type ValueType = snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala index a086f6113..da5ed6523 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala @@ -48,7 +48,7 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { 
XQSMwoMbnVtX3JlcGxpY2FzGAUgASgNQhDiPw0SC251bVJlcGxpY2FzUgtudW1SZXBsaWNhcyJGChJMb2NhbFRyYWluZXJDb25ma WcSMAoLbnVtX3dvcmtlcnMYASABKA1CD+I/DBIKbnVtV29ya2Vyc1IKbnVtV29ya2VycyKZAQobVmVydGV4QWlSZXNlcnZhdGlvb kFmZmluaXR5Eh0KBHR5cGUYASABKAlCCeI/BhIEdHlwZVIEdHlwZRJbChpyZXNlcnZhdGlvbl9yZXNvdXJjZV9uYW1lcxgCIAMoC - UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyLUBAoWVmVydGV4QWlSZXNvd + UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyKOBgoWVmVydGV4QWlSZXNvd XJjZUNvbmZpZxIzCgxtYWNoaW5lX3R5cGUYASABKAlCEOI/DRILbWFjaGluZVR5cGVSC21hY2hpbmVUeXBlEicKCGdwdV90eXBlG AIgASgJQgziPwkSB2dwdVR5cGVSB2dwdVR5cGUSKgoJZ3B1X2xpbWl0GAMgASgNQg3iPwoSCGdwdUxpbWl0UghncHVMaW1pdBIzC gxudW1fcmVwbGljYXMYBCABKA1CEOI/DRILbnVtUmVwbGljYXNSC251bVJlcGxpY2FzEiYKB3RpbWVvdXQYBSABKA1CDOI/CRIHd @@ -56,74 +56,77 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { Wdpb25PdmVycmlkZRJIChNzY2hlZHVsaW5nX3N0cmF0ZWd5GAcgASgJQhfiPxQSEnNjaGVkdWxpbmdTdHJhdGVneVISc2NoZWR1b GluZ1N0cmF0ZWd5Ej4KEWJvb3RfZGlza19zaXplX2diGAggASgNQhPiPxASDmJvb3REaXNrU2l6ZUdiUg5ib290RGlza1NpemVHY hKAAQoUcmVzZXJ2YXRpb25fYWZmaW5pdHkYCSABKAsyMy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpUmVzZXJ2YXRpb - 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5IooCChFLRlBSZXNvdXJjZUNvb - mZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1ZXN0UgpjcHVSZXF1ZXN0EjkKDm1lbW9yeV9yZXF1ZXN0GAIgA - SgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJwoIZ3B1X3R5cGUYAyABKAlCDOI/CRIHZ3B1VHlwZVIHZ3B1V - HlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSCGdwdUxpbWl0EjMKDG51bV9yZXBsaWNhcxgFIAEoDUIQ4j8NE - gtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvdXJjZUNvbmZpZxIwCgtudW1fd29ya2VycxgBIAEoDUIP4j8ME - gpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWcSbQoQZ3JhcGhfc3RvcmVfcG9vbBgBI - AEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0IT4j8QEg5ncmFwaFN0b3JlUG9vbFIOZ - 
3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291c - mNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vbBJpCiBjb21wdXRlX2NsdXN0ZXJfbG9jYWxfd29ybGRfc2l6Z - RgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplUhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplIp0DC - hhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9haV90cmFpbmVyX2NvbmZpZxgBIAEoCzItLnNuYXBjaGF0LnJlc - 2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyY - WluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMiguc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBUcmFpbmVyQ - 29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJhaW5lckNvbmZpZxJ3ChRsb2NhbF90cmFpbmVyX2NvbmZpZxgDI - AEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpbmVyQ29uZmlnQhfiPxQSEmxvY2FsVHJhaW5lckNvbmZpZ0gAU - hJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcixwQKFVRyYWluZXJSZXNvdXJjZUNvbmZpZxKFAQoYdmVydGV4X - 2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhriP - xcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcScAoSa2ZwX3RyYWluZXJfY29uZmlnGAIgA - SgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZUNvbmZpZ0IV4j8SEhBrZnBUcmFpbmVyQ29uZmlnSABSEGtmc - FRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYAyABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkxvY2FsU - mVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnSABSEmxvY2FsVHJhaW5lckNvbmZpZxKnAQokdmVydGV4X2FpX - 2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaUdyYXBoU3Rvc - mVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lckNvbmZpZ0gAUh92ZXJ0ZXhBaUdyYXBoU3RvcmVUcmFpbmVyQ - 29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyUmVzb3VyY2VDb25maWcSjgEKG3ZlcnRleF9haV9pbmZlcmVuY - 2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0Id4j8aEhh2ZXJ0Z - XhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY2VyQ29uZmlnEo0BChpkYXRhZmxvd19pbmZlcmVuY2VyX2Nvb - 
mZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0YWZsb3dSZXNvdXJjZUNvbmZpZ0Id4j8aEhhkYXRhZmxvd0luZ - mVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZmlnEoEBChdsb2NhbF9pbmZlcmVuY2VyX2NvbmZpZxgDIAEoC - zIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZUNvbmZpZ0Ia4j8XEhVsb2NhbEluZmVyZW5jZXJDb25maWdIA - FIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ3JhcGhfc3RvcmVfaW5mZXJlbmNlcl9jb25maWcYBCABKAsyM - C5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTdG9yZUNvbmZpZ0In4j8kEiJ2ZXJ0ZXhBaUdyYXBoU3RvcmVJb - mZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZmVyZW5jZXJDb25maWdCEwoRaW5mZXJlbmNlcl9jb25maWcil - wgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhYmVscxgBIAMoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU - 2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRyeUIT4j8QEg5yZXNvdXJjZUxhYmVsc1IOcmVzb3VyY2VMYWJlb - HMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb - 25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db21wdXRlQ29uZmlnUhNjb21tb25Db21wdXRlQ29uZmlnGpQFC - hNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCDOI/CRIHcHJvamVjdFIHcHJvamVjdBIjCgZyZWdpb24YAiABK - AlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfYnVja2V0GAMgASgJQhXiPxISEHRlbXBBc3NldHNCdWNrZXRSE - HRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3NldHNfYnVja2V0GAQgASgJQh3iPxoSGHRlbXBSZWdpb25hbEFzc - 2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKEnBlcm1fYXNzZXRzX2J1Y2tldBgFIAEoCUIV4j8SEhBwZXJtQ - XNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfYXNzZXRzX2JxX2RhdGFzZXRfbmFtZRgGIAEoCUIc4j8ZEhd0Z - W1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0YXNldE5hbWUSVgoZZW1iZWRkaW5nX2JxX2RhdGFzZXRfbmFtZ - RgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlbWJlZGRpbmdCcURhdGFzZXROYW1lElYKGWdjcF9zZXJ2aWNlX - 2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY291bnRFbWFpbFIWZ2NwU2VydmljZUFjY291bnRFbWFpbBI8C - g9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSdW5uZXJSDmRhdGFmbG93UnVubmVyGlcKE1Jlc291cmNlTGFiZ - 
WxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEi9 - wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY2VfY29uZmlnX3VyaRgBIAEoCUIc4j8ZEhdzaGFyZWRSZXNvd - XJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaRJ/ChZzaGFyZWRfcmVzb3VyY2VfY29uZmlnGAIgASgLMiwuc - 25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvbmZpZ0IZ4j8WEhRzaGFyZWRSZXNvdXJjZUNvbmZpZ0gAUhRza - GFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZmlnGAwgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EY - XRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3NvckNvbmZpZ1IScHJlcHJvY2Vzc29yQ29uZmlnEn8KF3N1YmdyY - XBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TcGFya1Jlc291cmNlQ29uZmlnQhriPxcSF - XN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ29uZmlnEnwKFnNwbGl0X2dlbmVyYXRvcl9jb25maWcYDiABK - AsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY2VDb25maWdCGeI/FhIUc3BsaXRHZW5lcmF0b3JDb25maWdSF - HNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnGA8gASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EaXN0c - mlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZmlnUg10cmFpbmVyQ29uZmlnEnQKEWluZmVyZW5jZXJfY29uZ - mlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZmxvd1Jlc291cmNlQ29uZmlnQhcYAeI/EhIQaW5mZXJlbmNlc - kNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZXNvdXJjZV9jb25maWcYESABKAsyLS5zbmFwY2hhdC5yZXNlY - XJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0cmFpbmVyUmVzb3VyY2VDb25maWdSFXRyYWluZXJSZXNvdXJjZ - UNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYEiABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkluZmVyZ - W5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb3VyY2VDb25maWdSGGluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ - 0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb21wb25lbnRfVW5rbm93bhAAGhbiPxMSEUNvbXBvbmVudF9Vb - mtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH+I/HBIaQ29tcG9uZW50X0NvbmZpZ19WYWxpZGF0b3ISPwoaQ - 29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb25lbnRfQ29uZmlnX1BvcHVsYXRvchJBChtDb21wb25lbnRfR - 
GF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3ISPwoaQ29tcG9uZW50X1N1YmdyYXBoX - 1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2FtcGxlchI9ChlDb21wb25lbnRfU3BsaXRfR2VuZXJhdG9yEAUaH - uI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb25lbnRfVHJhaW5lchAGGhbiPxMSEUNvbXBvbmVudF9UcmFpb - mVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tcG9uZW50X0luZmVyZW5jZXJiBnByb3RvMw==""" + 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5ElgKGXRlbnNvcmJvYXJkX3Jlc + 291cmNlX25hbWUYCiABKAlCHOI/GRIXdGVuc29yYm9hcmRSZXNvdXJjZU5hbWVSF3RlbnNvcmJvYXJkUmVzb3VyY2VOYW1lEl4KG + 3RlbnNvcmJvYXJkX2V4cGVyaW1lbnRfbmFtZRgLIAEoCUIe4j8bEhl0ZW5zb3Jib2FyZEV4cGVyaW1lbnROYW1lUhl0ZW5zb3Jib + 2FyZEV4cGVyaW1lbnROYW1lIooCChFLRlBSZXNvdXJjZUNvbmZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1Z + XN0UgpjcHVSZXF1ZXN0EjkKDm1lbW9yeV9yZXF1ZXN0GAIgASgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJ + woIZ3B1X3R5cGUYAyABKAlCDOI/CRIHZ3B1VHlwZVIHZ3B1VHlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSC + GdwdUxpbWl0EjMKDG51bV9yZXBsaWNhcxgFIAEoDUIQ4j8NEgtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvd + XJjZUNvbmZpZxIwCgtudW1fd29ya2VycxgBIAEoDUIP4j8MEgpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyY + XBoU3RvcmVDb25maWcSbQoQZ3JhcGhfc3RvcmVfcG9vbBgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZ + XNvdXJjZUNvbmZpZ0IT4j8QEg5ncmFwaFN0b3JlUG9vbFIOZ3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc + 25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vb + BJpCiBjb21wdXRlX2NsdXN0ZXJfbG9jYWxfd29ybGRfc2l6ZRgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTa + XplUhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplIp0DChhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9ha + V90cmFpbmVyX2NvbmZpZxgBIAEoCzItLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSF + XZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLM + 
iguc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBUcmFpbmVyQ29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJha + W5lckNvbmZpZxJ3ChRsb2NhbF90cmFpbmVyX2NvbmZpZxgDIAEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpb + mVyQ29uZmlnQhfiPxQSEmxvY2FsVHJhaW5lckNvbmZpZ0gAUhJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcix + wQKFVRyYWluZXJSZXNvdXJjZUNvbmZpZxKFAQoYdmVydGV4X2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyY + WluZXJDb25maWcScAoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZ + UNvbmZpZ0IV4j8SEhBrZnBUcmFpbmVyQ29uZmlnSABSEGtmcFRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYA + yABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkxvY2FsUmVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnS + ABSEmxvY2FsVHJhaW5lckNvbmZpZxKnAQokdmVydGV4X2FpX2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hc + GNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lc + kNvbmZpZ0gAUh92ZXJ0ZXhBaUdyYXBoU3RvcmVUcmFpbmVyQ29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyU + mVzb3VyY2VDb25maWcSjgEKG3ZlcnRleF9haV9pbmZlcmVuY2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdib + WwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0Id4j8aEhh2ZXJ0ZXhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY + 2VyQ29uZmlnEo0BChpkYXRhZmxvd19pbmZlcmVuY2VyX2NvbmZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0Y + WZsb3dSZXNvdXJjZUNvbmZpZ0Id4j8aEhhkYXRhZmxvd0luZmVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZ + mlnEoEBChdsb2NhbF9pbmZlcmVuY2VyX2NvbmZpZxgDIAEoCzIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZ + UNvbmZpZ0Ia4j8XEhVsb2NhbEluZmVyZW5jZXJDb25maWdIAFIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ + 3JhcGhfc3RvcmVfaW5mZXJlbmNlcl9jb25maWcYBCABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTd + G9yZUNvbmZpZ0In4j8kEiJ2ZXJ0ZXhBaUdyYXBoU3RvcmVJbmZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZ + 
mVyZW5jZXJDb25maWdCEwoRaW5mZXJlbmNlcl9jb25maWcilwgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhY + mVscxgBIAMoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRye + UIT4j8QEg5yZXNvdXJjZUxhYmVsc1IOcmVzb3VyY2VMYWJlbHMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuY + XBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db + 21wdXRlQ29uZmlnUhNjb21tb25Db21wdXRlQ29uZmlnGpQFChNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCD + OI/CRIHcHJvamVjdFIHcHJvamVjdBIjCgZyZWdpb24YAiABKAlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfY + nVja2V0GAMgASgJQhXiPxISEHRlbXBBc3NldHNCdWNrZXRSEHRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3Nld + HNfYnVja2V0GAQgASgJQh3iPxoSGHRlbXBSZWdpb25hbEFzc2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKE + nBlcm1fYXNzZXRzX2J1Y2tldBgFIAEoCUIV4j8SEhBwZXJtQXNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfY + XNzZXRzX2JxX2RhdGFzZXRfbmFtZRgGIAEoCUIc4j8ZEhd0ZW1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0Y + XNldE5hbWUSVgoZZW1iZWRkaW5nX2JxX2RhdGFzZXRfbmFtZRgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlb + WJlZGRpbmdCcURhdGFzZXROYW1lElYKGWdjcF9zZXJ2aWNlX2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY + 291bnRFbWFpbFIWZ2NwU2VydmljZUFjY291bnRFbWFpbBI8Cg9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSd + W5uZXJSDmRhdGFmbG93UnVubmVyGlcKE1Jlc291cmNlTGFiZWxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKB + XZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEi9wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY + 2VfY29uZmlnX3VyaRgBIAEoCUIc4j8ZEhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1Vya + RJ/ChZzaGFyZWRfcmVzb3VyY2VfY29uZmlnGAIgASgLMiwuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvb + mZpZ0IZ4j8WEhRzaGFyZWRSZXNvdXJjZUNvbmZpZ0gAUhRzaGFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZ + mlnGAwgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3Nvc + 
kNvbmZpZ1IScHJlcHJvY2Vzc29yQ29uZmlnEn8KF3N1YmdyYXBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5TcGFya1Jlc291cmNlQ29uZmlnQhriPxcSFXN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ + 29uZmlnEnwKFnNwbGl0X2dlbmVyYXRvcl9jb25maWcYDiABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY + 2VDb25maWdCGeI/FhIUc3BsaXRHZW5lcmF0b3JDb25maWdSFHNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnG + A8gASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EaXN0cmlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZ + mlnUg10cmFpbmVyQ29uZmlnEnQKEWluZmVyZW5jZXJfY29uZmlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZ + mxvd1Jlc291cmNlQ29uZmlnQhcYAeI/EhIQaW5mZXJlbmNlckNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZ + XNvdXJjZV9jb25maWcYESABKAsyLS5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0c + mFpbmVyUmVzb3VyY2VDb25maWdSFXRyYWluZXJSZXNvdXJjZUNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYE + iABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb + 3VyY2VDb25maWdSGGluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb + 21wb25lbnRfVW5rbm93bhAAGhbiPxMSEUNvbXBvbmVudF9Vbmtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH + +I/HBIaQ29tcG9uZW50X0NvbmZpZ19WYWxpZGF0b3ISPwoaQ29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb + 25lbnRfQ29uZmlnX1BvcHVsYXRvchJBChtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0Y + V9QcmVwcm9jZXNzb3ISPwoaQ29tcG9uZW50X1N1YmdyYXBoX1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2Ftc + GxlchI9ChlDb21wb25lbnRfU3BsaXRfR2VuZXJhdG9yEAUaHuI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb + 25lbnRfVHJhaW5lchAGGhbiPxMSEUNvbXBvbmVudF9UcmFpbmVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tc + G9uZW50X0luZmVyZW5jZXJiBnByb3RvMw==""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git 
a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala index 21f9ea1c2..d863014af 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala @@ -36,6 +36,20 @@ package snapchat.research.gbml.gigl_resource_config * @param reservationAffinity * Compute Engine reservation affinity for the job. * See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations + * @param tensorboardResourceName + * Existing Vertex AI TensorBoard resource to attach to the job. + * Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + * for the Tensorboard data model. + * @param tensorboardExperimentName + * Optional. When set, the trainer's chief rank streams events to a + * TensorboardExperiment with this name on the TB resource above, in + * addition to Vertex's per-job auto-upload. Multiple jobs that share this + * value land in the same TensorboardExperiment, so they appear as + * comparable runs on one TensorBoard page. Requires + * tensorboard_resource_name above to be set. Allowed characters: + * lowercase letters, digits, hyphens (Vertex AI Experiment ID rules). + * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. 
*/ @SerialVersionUID(0L) final case class VertexAiResourceConfig( @@ -48,6 +62,8 @@ final case class VertexAiResourceConfig( schedulingStrategy: _root_.scala.Predef.String = "", bootDiskSizeGb: _root_.scala.Int = 0, reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None, + tensorboardResourceName: _root_.scala.Predef.String = "", + tensorboardExperimentName: _root_.scala.Predef.String = "", unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[VertexAiResourceConfig] { @transient @@ -114,6 +130,20 @@ final case class VertexAiResourceConfig( val __value = reservationAffinity.get __size += 1 + _root_.com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(__value.serializedSize) + __value.serializedSize }; + + { + val __value = tensorboardResourceName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(10, __value) + } + }; + + { + val __value = tensorboardExperimentName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(11, __value) + } + }; __size += unknownFields.serializedSize __size } @@ -181,6 +211,18 @@ final case class VertexAiResourceConfig( _output__.writeUInt32NoTag(__m.serializedSize) __m.writeTo(_output__) }; + { + val __v = tensorboardResourceName + if (!__v.isEmpty) { + _output__.writeString(10, __v) + } + }; + { + val __v = tensorboardExperimentName + if (!__v.isEmpty) { + _output__.writeString(11, __v) + } + }; unknownFields.writeTo(_output__) } def withMachineType(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(machineType = __v) @@ -194,6 +236,8 @@ final case class VertexAiResourceConfig( def getReservationAffinity: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity = 
reservationAffinity.getOrElse(snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity.defaultInstance) def clearReservationAffinity: VertexAiResourceConfig = copy(reservationAffinity = _root_.scala.None) def withReservationAffinity(__v: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity): VertexAiResourceConfig = copy(reservationAffinity = Option(__v)) + def withTensorboardResourceName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardResourceName = __v) + def withTensorboardExperimentName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardExperimentName = __v) def withUnknownFields(__v: _root_.scalapb.UnknownFieldSet) = copy(unknownFields = __v) def discardUnknownFields = copy(unknownFields = _root_.scalapb.UnknownFieldSet.empty) def getFieldByNumber(__fieldNumber: _root_.scala.Int): _root_.scala.Any = { @@ -231,6 +275,14 @@ final case class VertexAiResourceConfig( if (__t != 0) __t else null } case 9 => reservationAffinity.orNull + case 10 => { + val __t = tensorboardResourceName + if (__t != "") __t else null + } + case 11 => { + val __t = tensorboardExperimentName + if (__t != "") __t else null + } } } def getField(__field: _root_.scalapb.descriptors.FieldDescriptor): _root_.scalapb.descriptors.PValue = { @@ -245,6 +297,8 @@ final case class VertexAiResourceConfig( case 7 => _root_.scalapb.descriptors.PString(schedulingStrategy) case 8 => _root_.scalapb.descriptors.PInt(bootDiskSizeGb) case 9 => reservationAffinity.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) + case 10 => _root_.scalapb.descriptors.PString(tensorboardResourceName) + case 11 => _root_.scalapb.descriptors.PString(tensorboardExperimentName) } } def toProtoString: _root_.scala.Predef.String = _root_.scalapb.TextFormat.printToUnicodeString(this) @@ -264,6 +318,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat var __schedulingStrategy: _root_.scala.Predef.String 
= "" var __bootDiskSizeGb: _root_.scala.Int = 0 var __reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None + var __tensorboardResourceName: _root_.scala.Predef.String = "" + var __tensorboardExperimentName: _root_.scala.Predef.String = "" var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null var _done__ = false while (!_done__) { @@ -288,6 +344,10 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat __bootDiskSizeGb = _input__.readUInt32() case 74 => __reservationAffinity = Option(__reservationAffinity.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) + case 82 => + __tensorboardResourceName = _input__.readStringRequireUtf8() + case 90 => + __tensorboardExperimentName = _input__.readStringRequireUtf8() case tag => if (_unknownFields__ == null) { _unknownFields__ = new _root_.scalapb.UnknownFieldSet.Builder() @@ -305,6 +365,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy = __schedulingStrategy, bootDiskSizeGb = __bootDiskSizeGb, reservationAffinity = __reservationAffinity, + tensorboardResourceName = __tensorboardResourceName, + tensorboardExperimentName = __tensorboardExperimentName, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() ) } @@ -320,7 +382,9 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride = __fieldsMap.get(scalaDescriptor.findFieldByNumber(6).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), schedulingStrategy = __fieldsMap.get(scalaDescriptor.findFieldByNumber(7).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), bootDiskSizeGb = 
__fieldsMap.get(scalaDescriptor.findFieldByNumber(8).get).map(_.as[_root_.scala.Int]).getOrElse(0), - reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]) + reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]), + tensorboardResourceName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(10).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), + tensorboardExperimentName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(11).get).map(_.as[_root_.scala.Predef.String]).getOrElse("") ) case _ => throw new RuntimeException("Expected PMessage") } @@ -344,7 +408,9 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride = "", schedulingStrategy = "", bootDiskSizeGb = 0, - reservationAffinity = _root_.scala.None + reservationAffinity = _root_.scala.None, + tensorboardResourceName = "", + tensorboardExperimentName = "" ) implicit class VertexAiResourceConfigLens[UpperPB](_l: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig]) extends _root_.scalapb.lenses.ObjectLens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig](_l) { def machineType: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.machineType)((c_, f_) => c_.copy(machineType = f_)) @@ -357,6 +423,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat def bootDiskSizeGb: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Int] = field(_.bootDiskSizeGb)((c_, f_) => c_.copy(bootDiskSizeGb = f_)) def reservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = field(_.getReservationAffinity)((c_, f_) => 
c_.copy(reservationAffinity = Option(f_))) def optionalReservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]] = field(_.reservationAffinity)((c_, f_) => c_.copy(reservationAffinity = f_)) + def tensorboardResourceName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardResourceName)((c_, f_) => c_.copy(tensorboardResourceName = f_)) + def tensorboardExperimentName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardExperimentName)((c_, f_) => c_.copy(tensorboardExperimentName = f_)) } final val MACHINE_TYPE_FIELD_NUMBER = 1 final val GPU_TYPE_FIELD_NUMBER = 2 @@ -367,6 +435,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat final val SCHEDULING_STRATEGY_FIELD_NUMBER = 7 final val BOOT_DISK_SIZE_GB_FIELD_NUMBER = 8 final val RESERVATION_AFFINITY_FIELD_NUMBER = 9 + final val TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER = 10 + final val TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER = 11 def of( machineType: _root_.scala.Predef.String, gpuType: _root_.scala.Predef.String, @@ -376,7 +446,9 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride: _root_.scala.Predef.String, schedulingStrategy: _root_.scala.Predef.String, bootDiskSizeGb: _root_.scala.Int, - reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] + reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity], + tensorboardResourceName: _root_.scala.Predef.String, + tensorboardExperimentName: _root_.scala.Predef.String ): _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig = _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig( machineType, gpuType, @@ -386,7 +458,9 @@ object VertexAiResourceConfig extends 
scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride, schedulingStrategy, bootDiskSizeGb, - reservationAffinity + reservationAffinity, + tensorboardResourceName, + tensorboardExperimentName ) // @@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.VertexAiResourceConfig]) } diff --git a/scala/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala b/scala/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala index bcf95c046..2ae44b3a5 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala @@ -12,7 +12,10 @@ package snapchat.research.gbml.trained_model_metadata * @param evalMetricsUri * The path where evaluation metrics are stored * @param tensorboardLogsUri - * Path where tensorboard logs will be stored + * Path where tensorboard logs will be stored. Vertex AI maps this URI to + * ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via + * ``CustomJobSpec.baseOutputDirectory``. See + * https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec. 
*/ @SerialVersionUID(0L) final case class TrainedModelMetadata( @@ -26,28 +29,28 @@ final case class TrainedModelMetadata( private[this] var __serializedSizeMemoized: _root_.scala.Int = 0 private[this] def __computeSerializedSize(): _root_.scala.Int = { var __size = 0 - + { val __value = trainedModelUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(1, __value) } }; - + { val __value = scriptedModelUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(2, __value) } }; - + { val __value = evalMetricsUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(3, __value) } }; - + { val __value = tensorboardLogsUri if (!__value.isEmpty) { @@ -64,7 +67,7 @@ final case class TrainedModelMetadata( __serializedSizeMemoized = __size } __size - 1 - + } def writeTo(`_output__`: _root_.com.google.protobuf.CodedOutputStream): _root_.scala.Unit = { { diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala index 8363bdb1f..2198a2eb5 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala @@ -38,7 +38,7 @@ final case class DistributedInferencerConfig( __serializedSizeMemoized = __size } __size - 1 - + } def writeTo(`_output__`: _root_.com.google.protobuf.CodedOutputStream): _root_.scala.Unit = { trainerConfig.vertexAiInferencerConfig.foreach { __v => @@ -165,7 +165,7 @@ object DistributedInferencerConfig extends scalapb.GeneratedMessageCompanion[sna override def number: _root_.scala.Int = 0 override def value: _root_.scala.Nothing = throw new 
java.util.NoSuchElementException("Empty.value") } - + @SerialVersionUID(0L) final case class VertexAiInferencerConfig(value: snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig) extends snapchat.research.gbml.gigl_resource_config.DistributedInferencerConfig.TrainerConfig { type ValueType = snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala index a086f6113..da5ed6523 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala @@ -48,7 +48,7 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { XQSMwoMbnVtX3JlcGxpY2FzGAUgASgNQhDiPw0SC251bVJlcGxpY2FzUgtudW1SZXBsaWNhcyJGChJMb2NhbFRyYWluZXJDb25ma WcSMAoLbnVtX3dvcmtlcnMYASABKA1CD+I/DBIKbnVtV29ya2Vyc1IKbnVtV29ya2VycyKZAQobVmVydGV4QWlSZXNlcnZhdGlvb kFmZmluaXR5Eh0KBHR5cGUYASABKAlCCeI/BhIEdHlwZVIEdHlwZRJbChpyZXNlcnZhdGlvbl9yZXNvdXJjZV9uYW1lcxgCIAMoC - UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyLUBAoWVmVydGV4QWlSZXNvd + UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyKOBgoWVmVydGV4QWlSZXNvd XJjZUNvbmZpZxIzCgxtYWNoaW5lX3R5cGUYASABKAlCEOI/DRILbWFjaGluZVR5cGVSC21hY2hpbmVUeXBlEicKCGdwdV90eXBlG AIgASgJQgziPwkSB2dwdVR5cGVSB2dwdVR5cGUSKgoJZ3B1X2xpbWl0GAMgASgNQg3iPwoSCGdwdUxpbWl0UghncHVMaW1pdBIzC gxudW1fcmVwbGljYXMYBCABKA1CEOI/DRILbnVtUmVwbGljYXNSC251bVJlcGxpY2FzEiYKB3RpbWVvdXQYBSABKA1CDOI/CRIHd @@ -56,74 +56,77 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { Wdpb25PdmVycmlkZRJIChNzY2hlZHVsaW5nX3N0cmF0ZWd5GAcgASgJQhfiPxQSEnNjaGVkdWxpbmdTdHJhdGVneVISc2NoZWR1b 
GluZ1N0cmF0ZWd5Ej4KEWJvb3RfZGlza19zaXplX2diGAggASgNQhPiPxASDmJvb3REaXNrU2l6ZUdiUg5ib290RGlza1NpemVHY hKAAQoUcmVzZXJ2YXRpb25fYWZmaW5pdHkYCSABKAsyMy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpUmVzZXJ2YXRpb - 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5IooCChFLRlBSZXNvdXJjZUNvb - mZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1ZXN0UgpjcHVSZXF1ZXN0EjkKDm1lbW9yeV9yZXF1ZXN0GAIgA - SgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJwoIZ3B1X3R5cGUYAyABKAlCDOI/CRIHZ3B1VHlwZVIHZ3B1V - HlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSCGdwdUxpbWl0EjMKDG51bV9yZXBsaWNhcxgFIAEoDUIQ4j8NE - gtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvdXJjZUNvbmZpZxIwCgtudW1fd29ya2VycxgBIAEoDUIP4j8ME - gpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWcSbQoQZ3JhcGhfc3RvcmVfcG9vbBgBI - AEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0IT4j8QEg5ncmFwaFN0b3JlUG9vbFIOZ - 3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291c - mNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vbBJpCiBjb21wdXRlX2NsdXN0ZXJfbG9jYWxfd29ybGRfc2l6Z - RgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplUhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplIp0DC - hhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9haV90cmFpbmVyX2NvbmZpZxgBIAEoCzItLnNuYXBjaGF0LnJlc - 2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyY - WluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMiguc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBUcmFpbmVyQ - 29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJhaW5lckNvbmZpZxJ3ChRsb2NhbF90cmFpbmVyX2NvbmZpZxgDI - AEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpbmVyQ29uZmlnQhfiPxQSEmxvY2FsVHJhaW5lckNvbmZpZ0gAU - hJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcixwQKFVRyYWluZXJSZXNvdXJjZUNvbmZpZxKFAQoYdmVydGV4X - 2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhriP - 
xcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcScAoSa2ZwX3RyYWluZXJfY29uZmlnGAIgA - SgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZUNvbmZpZ0IV4j8SEhBrZnBUcmFpbmVyQ29uZmlnSABSEGtmc - FRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYAyABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkxvY2FsU - mVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnSABSEmxvY2FsVHJhaW5lckNvbmZpZxKnAQokdmVydGV4X2FpX - 2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaUdyYXBoU3Rvc - mVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lckNvbmZpZ0gAUh92ZXJ0ZXhBaUdyYXBoU3RvcmVUcmFpbmVyQ - 29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyUmVzb3VyY2VDb25maWcSjgEKG3ZlcnRleF9haV9pbmZlcmVuY - 2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0Id4j8aEhh2ZXJ0Z - XhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY2VyQ29uZmlnEo0BChpkYXRhZmxvd19pbmZlcmVuY2VyX2Nvb - mZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0YWZsb3dSZXNvdXJjZUNvbmZpZ0Id4j8aEhhkYXRhZmxvd0luZ - mVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZmlnEoEBChdsb2NhbF9pbmZlcmVuY2VyX2NvbmZpZxgDIAEoC - zIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZUNvbmZpZ0Ia4j8XEhVsb2NhbEluZmVyZW5jZXJDb25maWdIA - FIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ3JhcGhfc3RvcmVfaW5mZXJlbmNlcl9jb25maWcYBCABKAsyM - C5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTdG9yZUNvbmZpZ0In4j8kEiJ2ZXJ0ZXhBaUdyYXBoU3RvcmVJb - mZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZmVyZW5jZXJDb25maWdCEwoRaW5mZXJlbmNlcl9jb25maWcil - wgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhYmVscxgBIAMoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU - 2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRyeUIT4j8QEg5yZXNvdXJjZUxhYmVsc1IOcmVzb3VyY2VMYWJlb - HMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb - 25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db21wdXRlQ29uZmlnUhNjb21tb25Db21wdXRlQ29uZmlnGpQFC - 
hNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCDOI/CRIHcHJvamVjdFIHcHJvamVjdBIjCgZyZWdpb24YAiABK - AlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfYnVja2V0GAMgASgJQhXiPxISEHRlbXBBc3NldHNCdWNrZXRSE - HRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3NldHNfYnVja2V0GAQgASgJQh3iPxoSGHRlbXBSZWdpb25hbEFzc - 2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKEnBlcm1fYXNzZXRzX2J1Y2tldBgFIAEoCUIV4j8SEhBwZXJtQ - XNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfYXNzZXRzX2JxX2RhdGFzZXRfbmFtZRgGIAEoCUIc4j8ZEhd0Z - W1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0YXNldE5hbWUSVgoZZW1iZWRkaW5nX2JxX2RhdGFzZXRfbmFtZ - RgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlbWJlZGRpbmdCcURhdGFzZXROYW1lElYKGWdjcF9zZXJ2aWNlX - 2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY291bnRFbWFpbFIWZ2NwU2VydmljZUFjY291bnRFbWFpbBI8C - g9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSdW5uZXJSDmRhdGFmbG93UnVubmVyGlcKE1Jlc291cmNlTGFiZ - WxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEi9 - wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY2VfY29uZmlnX3VyaRgBIAEoCUIc4j8ZEhdzaGFyZWRSZXNvd - XJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaRJ/ChZzaGFyZWRfcmVzb3VyY2VfY29uZmlnGAIgASgLMiwuc - 25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvbmZpZ0IZ4j8WEhRzaGFyZWRSZXNvdXJjZUNvbmZpZ0gAUhRza - GFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZmlnGAwgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EY - XRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3NvckNvbmZpZ1IScHJlcHJvY2Vzc29yQ29uZmlnEn8KF3N1YmdyY - XBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TcGFya1Jlc291cmNlQ29uZmlnQhriPxcSF - XN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ29uZmlnEnwKFnNwbGl0X2dlbmVyYXRvcl9jb25maWcYDiABK - AsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY2VDb25maWdCGeI/FhIUc3BsaXRHZW5lcmF0b3JDb25maWdSF - HNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnGA8gASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EaXN0c - 
mlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZmlnUg10cmFpbmVyQ29uZmlnEnQKEWluZmVyZW5jZXJfY29uZ - mlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZmxvd1Jlc291cmNlQ29uZmlnQhcYAeI/EhIQaW5mZXJlbmNlc - kNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZXNvdXJjZV9jb25maWcYESABKAsyLS5zbmFwY2hhdC5yZXNlY - XJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0cmFpbmVyUmVzb3VyY2VDb25maWdSFXRyYWluZXJSZXNvdXJjZ - UNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYEiABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkluZmVyZ - W5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb3VyY2VDb25maWdSGGluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ - 0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb21wb25lbnRfVW5rbm93bhAAGhbiPxMSEUNvbXBvbmVudF9Vb - mtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH+I/HBIaQ29tcG9uZW50X0NvbmZpZ19WYWxpZGF0b3ISPwoaQ - 29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb25lbnRfQ29uZmlnX1BvcHVsYXRvchJBChtDb21wb25lbnRfR - GF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3ISPwoaQ29tcG9uZW50X1N1YmdyYXBoX - 1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2FtcGxlchI9ChlDb21wb25lbnRfU3BsaXRfR2VuZXJhdG9yEAUaH - uI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb25lbnRfVHJhaW5lchAGGhbiPxMSEUNvbXBvbmVudF9UcmFpb - mVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tcG9uZW50X0luZmVyZW5jZXJiBnByb3RvMw==""" + 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5ElgKGXRlbnNvcmJvYXJkX3Jlc + 291cmNlX25hbWUYCiABKAlCHOI/GRIXdGVuc29yYm9hcmRSZXNvdXJjZU5hbWVSF3RlbnNvcmJvYXJkUmVzb3VyY2VOYW1lEl4KG + 3RlbnNvcmJvYXJkX2V4cGVyaW1lbnRfbmFtZRgLIAEoCUIe4j8bEhl0ZW5zb3Jib2FyZEV4cGVyaW1lbnROYW1lUhl0ZW5zb3Jib + 2FyZEV4cGVyaW1lbnROYW1lIooCChFLRlBSZXNvdXJjZUNvbmZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1Z + XN0UgpjcHVSZXF1ZXN0EjkKDm1lbW9yeV9yZXF1ZXN0GAIgASgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJ + woIZ3B1X3R5cGUYAyABKAlCDOI/CRIHZ3B1VHlwZVIHZ3B1VHlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSC + 
GdwdUxpbWl0EjMKDG51bV9yZXBsaWNhcxgFIAEoDUIQ4j8NEgtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvd + XJjZUNvbmZpZxIwCgtudW1fd29ya2VycxgBIAEoDUIP4j8MEgpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyY + XBoU3RvcmVDb25maWcSbQoQZ3JhcGhfc3RvcmVfcG9vbBgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZ + XNvdXJjZUNvbmZpZ0IT4j8QEg5ncmFwaFN0b3JlUG9vbFIOZ3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc + 25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vb + BJpCiBjb21wdXRlX2NsdXN0ZXJfbG9jYWxfd29ybGRfc2l6ZRgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTa + XplUhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplIp0DChhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9ha + V90cmFpbmVyX2NvbmZpZxgBIAEoCzItLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSF + XZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLM + iguc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBUcmFpbmVyQ29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJha + W5lckNvbmZpZxJ3ChRsb2NhbF90cmFpbmVyX2NvbmZpZxgDIAEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpb + mVyQ29uZmlnQhfiPxQSEmxvY2FsVHJhaW5lckNvbmZpZ0gAUhJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcix + wQKFVRyYWluZXJSZXNvdXJjZUNvbmZpZxKFAQoYdmVydGV4X2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyY + WluZXJDb25maWcScAoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZ + UNvbmZpZ0IV4j8SEhBrZnBUcmFpbmVyQ29uZmlnSABSEGtmcFRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYA + yABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkxvY2FsUmVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnS + ABSEmxvY2FsVHJhaW5lckNvbmZpZxKnAQokdmVydGV4X2FpX2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hc + GNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lc + 
kNvbmZpZ0gAUh92ZXJ0ZXhBaUdyYXBoU3RvcmVUcmFpbmVyQ29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyU + mVzb3VyY2VDb25maWcSjgEKG3ZlcnRleF9haV9pbmZlcmVuY2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdib + WwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0Id4j8aEhh2ZXJ0ZXhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY + 2VyQ29uZmlnEo0BChpkYXRhZmxvd19pbmZlcmVuY2VyX2NvbmZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0Y + WZsb3dSZXNvdXJjZUNvbmZpZ0Id4j8aEhhkYXRhZmxvd0luZmVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZ + mlnEoEBChdsb2NhbF9pbmZlcmVuY2VyX2NvbmZpZxgDIAEoCzIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZ + UNvbmZpZ0Ia4j8XEhVsb2NhbEluZmVyZW5jZXJDb25maWdIAFIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ + 3JhcGhfc3RvcmVfaW5mZXJlbmNlcl9jb25maWcYBCABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTd + G9yZUNvbmZpZ0In4j8kEiJ2ZXJ0ZXhBaUdyYXBoU3RvcmVJbmZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZ + mVyZW5jZXJDb25maWdCEwoRaW5mZXJlbmNlcl9jb25maWcilwgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhY + mVscxgBIAMoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRye + UIT4j8QEg5yZXNvdXJjZUxhYmVsc1IOcmVzb3VyY2VMYWJlbHMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuY + XBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db + 21wdXRlQ29uZmlnUhNjb21tb25Db21wdXRlQ29uZmlnGpQFChNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCD + OI/CRIHcHJvamVjdFIHcHJvamVjdBIjCgZyZWdpb24YAiABKAlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfY + nVja2V0GAMgASgJQhXiPxISEHRlbXBBc3NldHNCdWNrZXRSEHRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3Nld + HNfYnVja2V0GAQgASgJQh3iPxoSGHRlbXBSZWdpb25hbEFzc2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKE + nBlcm1fYXNzZXRzX2J1Y2tldBgFIAEoCUIV4j8SEhBwZXJtQXNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfY + XNzZXRzX2JxX2RhdGFzZXRfbmFtZRgGIAEoCUIc4j8ZEhd0ZW1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0Y + 
XNldE5hbWUSVgoZZW1iZWRkaW5nX2JxX2RhdGFzZXRfbmFtZRgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlb + WJlZGRpbmdCcURhdGFzZXROYW1lElYKGWdjcF9zZXJ2aWNlX2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY + 291bnRFbWFpbFIWZ2NwU2VydmljZUFjY291bnRFbWFpbBI8Cg9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSd + W5uZXJSDmRhdGFmbG93UnVubmVyGlcKE1Jlc291cmNlTGFiZWxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKB + XZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEi9wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY + 2VfY29uZmlnX3VyaRgBIAEoCUIc4j8ZEhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1Vya + RJ/ChZzaGFyZWRfcmVzb3VyY2VfY29uZmlnGAIgASgLMiwuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvb + mZpZ0IZ4j8WEhRzaGFyZWRSZXNvdXJjZUNvbmZpZ0gAUhRzaGFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZ + mlnGAwgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3Nvc + kNvbmZpZ1IScHJlcHJvY2Vzc29yQ29uZmlnEn8KF3N1YmdyYXBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5TcGFya1Jlc291cmNlQ29uZmlnQhriPxcSFXN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ + 29uZmlnEnwKFnNwbGl0X2dlbmVyYXRvcl9jb25maWcYDiABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY + 2VDb25maWdCGeI/FhIUc3BsaXRHZW5lcmF0b3JDb25maWdSFHNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnG + A8gASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EaXN0cmlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZ + mlnUg10cmFpbmVyQ29uZmlnEnQKEWluZmVyZW5jZXJfY29uZmlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZ + mxvd1Jlc291cmNlQ29uZmlnQhcYAeI/EhIQaW5mZXJlbmNlckNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZ + XNvdXJjZV9jb25maWcYESABKAsyLS5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0c + mFpbmVyUmVzb3VyY2VDb25maWdSFXRyYWluZXJSZXNvdXJjZUNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYE + iABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb + 
3VyY2VDb25maWdSGGluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb + 21wb25lbnRfVW5rbm93bhAAGhbiPxMSEUNvbXBvbmVudF9Vbmtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH + +I/HBIaQ29tcG9uZW50X0NvbmZpZ19WYWxpZGF0b3ISPwoaQ29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb + 25lbnRfQ29uZmlnX1BvcHVsYXRvchJBChtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0Y + V9QcmVwcm9jZXNzb3ISPwoaQ29tcG9uZW50X1N1YmdyYXBoX1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2Ftc + GxlchI9ChlDb21wb25lbnRfU3BsaXRfR2VuZXJhdG9yEAUaHuI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb + 25lbnRfVHJhaW5lchAGGhbiPxMSEUNvbXBvbmVudF9UcmFpbmVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tc + G9uZW50X0luZmVyZW5jZXJiBnByb3RvMw==""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala index 21f9ea1c2..d863014af 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala @@ -36,6 +36,20 @@ package snapchat.research.gbml.gigl_resource_config * @param reservationAffinity * Compute Engine reservation affinity for the job. * See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations + * @param tensorboardResourceName + * Existing Vertex AI TensorBoard resource to attach to the job. + * Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + * for the Tensorboard data model. 
+ * @param tensorboardExperimentName + * Optional. When set, the trainer's chief rank streams events to a + * TensorboardExperiment with this name on the TB resource above, in + * addition to Vertex's per-job auto-upload. Multiple jobs that share this + * value land in the same TensorboardExperiment, so they appear as + * comparable runs on one TensorBoard page. Requires + * tensorboard_resource_name above to be set. Allowed characters: + * lowercase letters, digits, hyphens (Vertex AI Experiment ID rules). + * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. */ @SerialVersionUID(0L) final case class VertexAiResourceConfig( @@ -48,6 +62,8 @@ final case class VertexAiResourceConfig( schedulingStrategy: _root_.scala.Predef.String = "", bootDiskSizeGb: _root_.scala.Int = 0, reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None, + tensorboardResourceName: _root_.scala.Predef.String = "", + tensorboardExperimentName: _root_.scala.Predef.String = "", unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[VertexAiResourceConfig] { @transient @@ -114,6 +130,20 @@ final case class VertexAiResourceConfig( val __value = reservationAffinity.get __size += 1 + _root_.com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(__value.serializedSize) + __value.serializedSize }; + + { + val __value = tensorboardResourceName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(10, __value) + } + }; + + { + val __value = tensorboardExperimentName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(11, __value) + } + }; __size += unknownFields.serializedSize __size } @@ -181,6 +211,18 @@ final case class VertexAiResourceConfig( _output__.writeUInt32NoTag(__m.serializedSize) 
__m.writeTo(_output__) }; + { + val __v = tensorboardResourceName + if (!__v.isEmpty) { + _output__.writeString(10, __v) + } + }; + { + val __v = tensorboardExperimentName + if (!__v.isEmpty) { + _output__.writeString(11, __v) + } + }; unknownFields.writeTo(_output__) } def withMachineType(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(machineType = __v) @@ -194,6 +236,8 @@ final case class VertexAiResourceConfig( def getReservationAffinity: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity = reservationAffinity.getOrElse(snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity.defaultInstance) def clearReservationAffinity: VertexAiResourceConfig = copy(reservationAffinity = _root_.scala.None) def withReservationAffinity(__v: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity): VertexAiResourceConfig = copy(reservationAffinity = Option(__v)) + def withTensorboardResourceName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardResourceName = __v) + def withTensorboardExperimentName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardExperimentName = __v) def withUnknownFields(__v: _root_.scalapb.UnknownFieldSet) = copy(unknownFields = __v) def discardUnknownFields = copy(unknownFields = _root_.scalapb.UnknownFieldSet.empty) def getFieldByNumber(__fieldNumber: _root_.scala.Int): _root_.scala.Any = { @@ -231,6 +275,14 @@ final case class VertexAiResourceConfig( if (__t != 0) __t else null } case 9 => reservationAffinity.orNull + case 10 => { + val __t = tensorboardResourceName + if (__t != "") __t else null + } + case 11 => { + val __t = tensorboardExperimentName + if (__t != "") __t else null + } } } def getField(__field: _root_.scalapb.descriptors.FieldDescriptor): _root_.scalapb.descriptors.PValue = { @@ -245,6 +297,8 @@ final case class VertexAiResourceConfig( case 7 => _root_.scalapb.descriptors.PString(schedulingStrategy) case 8 => 
_root_.scalapb.descriptors.PInt(bootDiskSizeGb) case 9 => reservationAffinity.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) + case 10 => _root_.scalapb.descriptors.PString(tensorboardResourceName) + case 11 => _root_.scalapb.descriptors.PString(tensorboardExperimentName) } } def toProtoString: _root_.scala.Predef.String = _root_.scalapb.TextFormat.printToUnicodeString(this) @@ -264,6 +318,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat var __schedulingStrategy: _root_.scala.Predef.String = "" var __bootDiskSizeGb: _root_.scala.Int = 0 var __reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None + var __tensorboardResourceName: _root_.scala.Predef.String = "" + var __tensorboardExperimentName: _root_.scala.Predef.String = "" var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null var _done__ = false while (!_done__) { @@ -288,6 +344,10 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat __bootDiskSizeGb = _input__.readUInt32() case 74 => __reservationAffinity = Option(__reservationAffinity.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) + case 82 => + __tensorboardResourceName = _input__.readStringRequireUtf8() + case 90 => + __tensorboardExperimentName = _input__.readStringRequireUtf8() case tag => if (_unknownFields__ == null) { _unknownFields__ = new _root_.scalapb.UnknownFieldSet.Builder() @@ -305,6 +365,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy = __schedulingStrategy, bootDiskSizeGb = __bootDiskSizeGb, reservationAffinity = __reservationAffinity, + tensorboardResourceName = __tensorboardResourceName, + tensorboardExperimentName = __tensorboardExperimentName, unknownFields = if 
(_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() ) } @@ -320,7 +382,9 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride = __fieldsMap.get(scalaDescriptor.findFieldByNumber(6).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), schedulingStrategy = __fieldsMap.get(scalaDescriptor.findFieldByNumber(7).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), bootDiskSizeGb = __fieldsMap.get(scalaDescriptor.findFieldByNumber(8).get).map(_.as[_root_.scala.Int]).getOrElse(0), - reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]) + reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]), + tensorboardResourceName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(10).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), + tensorboardExperimentName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(11).get).map(_.as[_root_.scala.Predef.String]).getOrElse("") ) case _ => throw new RuntimeException("Expected PMessage") } @@ -344,7 +408,9 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride = "", schedulingStrategy = "", bootDiskSizeGb = 0, - reservationAffinity = _root_.scala.None + reservationAffinity = _root_.scala.None, + tensorboardResourceName = "", + tensorboardExperimentName = "" ) implicit class VertexAiResourceConfigLens[UpperPB](_l: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig]) extends _root_.scalapb.lenses.ObjectLens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig](_l) { def machineType: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = 
field(_.machineType)((c_, f_) => c_.copy(machineType = f_)) @@ -357,6 +423,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat def bootDiskSizeGb: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Int] = field(_.bootDiskSizeGb)((c_, f_) => c_.copy(bootDiskSizeGb = f_)) def reservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = field(_.getReservationAffinity)((c_, f_) => c_.copy(reservationAffinity = Option(f_))) def optionalReservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]] = field(_.reservationAffinity)((c_, f_) => c_.copy(reservationAffinity = f_)) + def tensorboardResourceName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardResourceName)((c_, f_) => c_.copy(tensorboardResourceName = f_)) + def tensorboardExperimentName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardExperimentName)((c_, f_) => c_.copy(tensorboardExperimentName = f_)) } final val MACHINE_TYPE_FIELD_NUMBER = 1 final val GPU_TYPE_FIELD_NUMBER = 2 @@ -367,6 +435,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat final val SCHEDULING_STRATEGY_FIELD_NUMBER = 7 final val BOOT_DISK_SIZE_GB_FIELD_NUMBER = 8 final val RESERVATION_AFFINITY_FIELD_NUMBER = 9 + final val TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER = 10 + final val TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER = 11 def of( machineType: _root_.scala.Predef.String, gpuType: _root_.scala.Predef.String, @@ -376,7 +446,9 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride: _root_.scala.Predef.String, schedulingStrategy: _root_.scala.Predef.String, bootDiskSizeGb: _root_.scala.Int, - reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] 
+ reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity], + tensorboardResourceName: _root_.scala.Predef.String, + tensorboardExperimentName: _root_.scala.Predef.String ): _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig = _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig( machineType, gpuType, @@ -386,7 +458,9 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride, schedulingStrategy, bootDiskSizeGb, - reservationAffinity + reservationAffinity, + tensorboardResourceName, + tensorboardExperimentName ) // @@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.VertexAiResourceConfig]) } diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala index bcf95c046..2ae44b3a5 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala @@ -12,7 +12,10 @@ package snapchat.research.gbml.trained_model_metadata * @param evalMetricsUri * The path where evaluation metrics are stored * @param tensorboardLogsUri - * Path where tensorboard logs will be stored + * Path where tensorboard logs will be stored. Vertex AI maps this URI to + * ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via + * ``CustomJobSpec.baseOutputDirectory``. See + * https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec. 
*/ @SerialVersionUID(0L) final case class TrainedModelMetadata( @@ -26,28 +29,28 @@ final case class TrainedModelMetadata( private[this] var __serializedSizeMemoized: _root_.scala.Int = 0 private[this] def __computeSerializedSize(): _root_.scala.Int = { var __size = 0 - + { val __value = trainedModelUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(1, __value) } }; - + { val __value = scriptedModelUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(2, __value) } }; - + { val __value = evalMetricsUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(3, __value) } }; - + { val __value = tensorboardLogsUri if (!__value.isEmpty) { @@ -64,7 +67,7 @@ final case class TrainedModelMetadata( __serializedSizeMemoized = __size } __size - 1 - + } def writeTo(`_output__`: _root_.com.google.protobuf.CodedOutputStream): _root_.scala.Unit = { { diff --git a/snapchat/research/gbml/gigl_resource_config_pb2.py b/snapchat/research/gbml/gigl_resource_config_pb2.py index bbda8cf57..e701fd3ef 100644 --- a/snapchat/research/gbml/gigl_resource_config_pb2.py +++ b/snapchat/research/gbml/gigl_resource_config_pb2.py @@ -15,7 +15,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1snapchat/research/gbml/gigl_resource_config.proto\x12\x16snapchat.research.gbml\"Y\n\x13SparkResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x16\n\x0enum_local_ssds\x18\x02 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x03 \x01(\r\"\x83\x01\n\x16\x44\x61taflowResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\x12\x17\n\x0fmax_num_workers\x18\x02 \x01(\r\x12\x14\n\x0cmachine_type\x18\x03 \x01(\t\x12\x14\n\x0c\x64isk_size_gb\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\"\xbc\x01\n\x16\x44\x61taPreprocessorConfig\x12P\n\x18\x65\x64ge_preprocessor_config\x18\x01 
\x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\x12P\n\x18node_preprocessor_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\"h\n\x15VertexAiTrainerConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\"z\n\x10KFPTrainerConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\")\n\x12LocalTrainerConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"O\n\x1bVertexAiReservationAffinity\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\"\n\x1areservation_resource_names\x18\x02 \x03(\t\"\xa2\x02\n\x16VertexAiResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\x12\x1b\n\x13gcp_region_override\x18\x06 \x01(\t\x12\x1b\n\x13scheduling_strategy\x18\x07 \x01(\t\x12\x19\n\x11\x62oot_disk_size_gb\x18\x08 \x01(\r\x12Q\n\x14reservation_affinity\x18\t \x01(\x0b\x32\x33.snapchat.research.gbml.VertexAiReservationAffinity\"{\n\x11KFPResourceConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\"*\n\x13LocalResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"\xd4\x01\n\x18VertexAiGraphStoreConfig\x12H\n\x10graph_store_pool\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12\x44\n\x0c\x63ompute_pool\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12(\n compute_cluster_local_world_size\x18\x03 \x01(\x05\"\x93\x02\n\x18\x44istributedTrainerConfig\x12Q\n\x18vertex_ai_trainer_config\x18\x01 
\x01(\x0b\x32-.snapchat.research.gbml.VertexAiTrainerConfigH\x00\x12\x46\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32(.snapchat.research.gbml.KFPTrainerConfigH\x00\x12J\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32*.snapchat.research.gbml.LocalTrainerConfigH\x00\x42\x10\n\x0etrainer_config\"\xf5\x02\n\x15TrainerResourceConfig\x12R\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12G\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32).snapchat.research.gbml.KFPResourceConfigH\x00\x12K\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12`\n$vertex_ai_graph_store_trainer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x10\n\x0etrainer_config\"\x91\x03\n\x18InferencerResourceConfig\x12U\n\x1bvertex_ai_inferencer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12T\n\x1a\x64\x61taflow_inferencer_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigH\x00\x12N\n\x17local_inferencer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12\x63\n\'vertex_ai_graph_store_inferencer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x13\n\x11inferencer_config\"\xa3\x04\n\x14SharedResourceConfig\x12Y\n\x0fresource_labels\x18\x01 \x03(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.ResourceLabelsEntry\x12_\n\x15\x63ommon_compute_config\x18\x02 \x01(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.CommonComputeConfig\x1a\x97\x02\n\x13\x43ommonComputeConfig\x12\x0f\n\x07project\x18\x01 \x01(\t\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x1a\n\x12temp_assets_bucket\x18\x03 \x01(\t\x12#\n\x1btemp_regional_assets_bucket\x18\x04 \x01(\t\x12\x1a\n\x12perm_assets_bucket\x18\x05 \x01(\t\x12#\n\x1btemp_assets_bq_dataset_name\x18\x06 \x01(\t\x12!\n\x19\x65mbedding_bq_dataset_name\x18\x07 
\x01(\t\x12!\n\x19gcp_service_account_email\x18\x08 \x01(\t\x12\x17\n\x0f\x64\x61taflow_runner\x18\x0b \x01(\t\x1a\x35\n\x13ResourceLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xc8\x05\n\x12GiglResourceConfig\x12$\n\x1ashared_resource_config_uri\x18\x01 \x01(\tH\x00\x12N\n\x16shared_resource_config\x18\x02 \x01(\x0b\x32,.snapchat.research.gbml.SharedResourceConfigH\x00\x12K\n\x13preprocessor_config\x18\x0c \x01(\x0b\x32..snapchat.research.gbml.DataPreprocessorConfig\x12L\n\x17subgraph_sampler_config\x18\r \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12K\n\x16split_generator_config\x18\x0e \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12L\n\x0etrainer_config\x18\x0f \x01(\x0b\x32\x30.snapchat.research.gbml.DistributedTrainerConfigB\x02\x18\x01\x12M\n\x11inferencer_config\x18\x10 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigB\x02\x18\x01\x12N\n\x17trainer_resource_config\x18\x11 \x01(\x0b\x32-.snapchat.research.gbml.TrainerResourceConfig\x12T\n\x1ainferencer_resource_config\x18\x12 \x01(\x0b\x32\x30.snapchat.research.gbml.InferencerResourceConfigB\x11\n\x0fshared_resource*\xf3\x01\n\tComponent\x12\x15\n\x11\x43omponent_Unknown\x10\x00\x12\x1e\n\x1a\x43omponent_Config_Validator\x10\x01\x12\x1e\n\x1a\x43omponent_Config_Populator\x10\x02\x12\x1f\n\x1b\x43omponent_Data_Preprocessor\x10\x03\x12\x1e\n\x1a\x43omponent_Subgraph_Sampler\x10\x04\x12\x1d\n\x19\x43omponent_Split_Generator\x10\x05\x12\x15\n\x11\x43omponent_Trainer\x10\x06\x12\x18\n\x14\x43omponent_Inferencer\x10\x07\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1snapchat/research/gbml/gigl_resource_config.proto\x12\x16snapchat.research.gbml\"Y\n\x13SparkResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x16\n\x0enum_local_ssds\x18\x02 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x03 \x01(\r\"\x83\x01\n\x16\x44\x61taflowResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 
\x01(\r\x12\x17\n\x0fmax_num_workers\x18\x02 \x01(\r\x12\x14\n\x0cmachine_type\x18\x03 \x01(\t\x12\x14\n\x0c\x64isk_size_gb\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\"\xbc\x01\n\x16\x44\x61taPreprocessorConfig\x12P\n\x18\x65\x64ge_preprocessor_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\x12P\n\x18node_preprocessor_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\"h\n\x15VertexAiTrainerConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\"z\n\x10KFPTrainerConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\")\n\x12LocalTrainerConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"O\n\x1bVertexAiReservationAffinity\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\"\n\x1areservation_resource_names\x18\x02 \x03(\t\"\xea\x02\n\x16VertexAiResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\x12\x1b\n\x13gcp_region_override\x18\x06 \x01(\t\x12\x1b\n\x13scheduling_strategy\x18\x07 \x01(\t\x12\x19\n\x11\x62oot_disk_size_gb\x18\x08 \x01(\r\x12Q\n\x14reservation_affinity\x18\t \x01(\x0b\x32\x33.snapchat.research.gbml.VertexAiReservationAffinity\x12!\n\x19tensorboard_resource_name\x18\n \x01(\t\x12#\n\x1btensorboard_experiment_name\x18\x0b \x01(\t\"{\n\x11KFPResourceConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\"*\n\x13LocalResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 
\x01(\r\"\xd4\x01\n\x18VertexAiGraphStoreConfig\x12H\n\x10graph_store_pool\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12\x44\n\x0c\x63ompute_pool\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12(\n compute_cluster_local_world_size\x18\x03 \x01(\x05\"\x93\x02\n\x18\x44istributedTrainerConfig\x12Q\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32-.snapchat.research.gbml.VertexAiTrainerConfigH\x00\x12\x46\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32(.snapchat.research.gbml.KFPTrainerConfigH\x00\x12J\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32*.snapchat.research.gbml.LocalTrainerConfigH\x00\x42\x10\n\x0etrainer_config\"\xf5\x02\n\x15TrainerResourceConfig\x12R\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12G\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32).snapchat.research.gbml.KFPResourceConfigH\x00\x12K\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12`\n$vertex_ai_graph_store_trainer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x10\n\x0etrainer_config\"\x91\x03\n\x18InferencerResourceConfig\x12U\n\x1bvertex_ai_inferencer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12T\n\x1a\x64\x61taflow_inferencer_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigH\x00\x12N\n\x17local_inferencer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12\x63\n\'vertex_ai_graph_store_inferencer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x13\n\x11inferencer_config\"\xa3\x04\n\x14SharedResourceConfig\x12Y\n\x0fresource_labels\x18\x01 \x03(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.ResourceLabelsEntry\x12_\n\x15\x63ommon_compute_config\x18\x02 
\x01(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.CommonComputeConfig\x1a\x97\x02\n\x13\x43ommonComputeConfig\x12\x0f\n\x07project\x18\x01 \x01(\t\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x1a\n\x12temp_assets_bucket\x18\x03 \x01(\t\x12#\n\x1btemp_regional_assets_bucket\x18\x04 \x01(\t\x12\x1a\n\x12perm_assets_bucket\x18\x05 \x01(\t\x12#\n\x1btemp_assets_bq_dataset_name\x18\x06 \x01(\t\x12!\n\x19\x65mbedding_bq_dataset_name\x18\x07 \x01(\t\x12!\n\x19gcp_service_account_email\x18\x08 \x01(\t\x12\x17\n\x0f\x64\x61taflow_runner\x18\x0b \x01(\t\x1a\x35\n\x13ResourceLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xc8\x05\n\x12GiglResourceConfig\x12$\n\x1ashared_resource_config_uri\x18\x01 \x01(\tH\x00\x12N\n\x16shared_resource_config\x18\x02 \x01(\x0b\x32,.snapchat.research.gbml.SharedResourceConfigH\x00\x12K\n\x13preprocessor_config\x18\x0c \x01(\x0b\x32..snapchat.research.gbml.DataPreprocessorConfig\x12L\n\x17subgraph_sampler_config\x18\r \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12K\n\x16split_generator_config\x18\x0e \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12L\n\x0etrainer_config\x18\x0f \x01(\x0b\x32\x30.snapchat.research.gbml.DistributedTrainerConfigB\x02\x18\x01\x12M\n\x11inferencer_config\x18\x10 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigB\x02\x18\x01\x12N\n\x17trainer_resource_config\x18\x11 \x01(\x0b\x32-.snapchat.research.gbml.TrainerResourceConfig\x12T\n\x1ainferencer_resource_config\x18\x12 
\x01(\x0b\x32\x30.snapchat.research.gbml.InferencerResourceConfigB\x11\n\x0fshared_resource*\xf3\x01\n\tComponent\x12\x15\n\x11\x43omponent_Unknown\x10\x00\x12\x1e\n\x1a\x43omponent_Config_Validator\x10\x01\x12\x1e\n\x1a\x43omponent_Config_Populator\x10\x02\x12\x1f\n\x1b\x43omponent_Data_Preprocessor\x10\x03\x12\x1e\n\x1a\x43omponent_Subgraph_Sampler\x10\x04\x12\x1d\n\x19\x43omponent_Split_Generator\x10\x05\x12\x15\n\x11\x43omponent_Trainer\x10\x06\x12\x18\n\x14\x43omponent_Inferencer\x10\x07\x62\x06proto3') _COMPONENT = DESCRIPTOR.enum_types_by_name['Component'] Component = enum_type_wrapper.EnumTypeWrapper(_COMPONENT) @@ -184,8 +184,8 @@ _GIGLRESOURCECONFIG.fields_by_name['trainer_config']._serialized_options = b'\030\001' _GIGLRESOURCECONFIG.fields_by_name['inferencer_config']._options = None _GIGLRESOURCECONFIG.fields_by_name['inferencer_config']._serialized_options = b'\030\001' - _COMPONENT._serialized_start=3848 - _COMPONENT._serialized_end=4091 + _COMPONENT._serialized_start=3920 + _COMPONENT._serialized_end=4163 _SPARKRESOURCECONFIG._serialized_start=77 _SPARKRESOURCECONFIG._serialized_end=166 _DATAFLOWRESOURCECONFIG._serialized_start=169 @@ -201,25 +201,25 @@ _VERTEXAIRESERVATIONAFFINITY._serialized_start=766 _VERTEXAIRESERVATIONAFFINITY._serialized_end=845 _VERTEXAIRESOURCECONFIG._serialized_start=848 - _VERTEXAIRESOURCECONFIG._serialized_end=1138 - _KFPRESOURCECONFIG._serialized_start=1140 - _KFPRESOURCECONFIG._serialized_end=1263 - _LOCALRESOURCECONFIG._serialized_start=1265 - _LOCALRESOURCECONFIG._serialized_end=1307 - _VERTEXAIGRAPHSTORECONFIG._serialized_start=1310 - _VERTEXAIGRAPHSTORECONFIG._serialized_end=1522 - _DISTRIBUTEDTRAINERCONFIG._serialized_start=1525 - _DISTRIBUTEDTRAINERCONFIG._serialized_end=1800 - _TRAINERRESOURCECONFIG._serialized_start=1803 - _TRAINERRESOURCECONFIG._serialized_end=2176 - _INFERENCERRESOURCECONFIG._serialized_start=2179 - _INFERENCERRESOURCECONFIG._serialized_end=2580 - _SHAREDRESOURCECONFIG._serialized_start=2583 - 
_SHAREDRESOURCECONFIG._serialized_end=3130 - _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_start=2796 - _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_end=3075 - _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_start=3077 - _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_end=3130 - _GIGLRESOURCECONFIG._serialized_start=3133 - _GIGLRESOURCECONFIG._serialized_end=3845 + _VERTEXAIRESOURCECONFIG._serialized_end=1210 + _KFPRESOURCECONFIG._serialized_start=1212 + _KFPRESOURCECONFIG._serialized_end=1335 + _LOCALRESOURCECONFIG._serialized_start=1337 + _LOCALRESOURCECONFIG._serialized_end=1379 + _VERTEXAIGRAPHSTORECONFIG._serialized_start=1382 + _VERTEXAIGRAPHSTORECONFIG._serialized_end=1594 + _DISTRIBUTEDTRAINERCONFIG._serialized_start=1597 + _DISTRIBUTEDTRAINERCONFIG._serialized_end=1872 + _TRAINERRESOURCECONFIG._serialized_start=1875 + _TRAINERRESOURCECONFIG._serialized_end=2248 + _INFERENCERRESOURCECONFIG._serialized_start=2251 + _INFERENCERRESOURCECONFIG._serialized_end=2652 + _SHAREDRESOURCECONFIG._serialized_start=2655 + _SHAREDRESOURCECONFIG._serialized_end=3202 + _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_start=2868 + _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_end=3147 + _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_start=3149 + _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_end=3202 + _GIGLRESOURCECONFIG._serialized_start=3205 + _GIGLRESOURCECONFIG._serialized_end=3917 # @@protoc_insertion_point(module_scope) diff --git a/snapchat/research/gbml/gigl_resource_config_pb2.pyi b/snapchat/research/gbml/gigl_resource_config_pb2.pyi index 6198d1076..09ddb04c0 100644 --- a/snapchat/research/gbml/gigl_resource_config_pb2.pyi +++ b/snapchat/research/gbml/gigl_resource_config_pb2.pyi @@ -259,6 +259,8 @@ class VertexAiResourceConfig(google.protobuf.message.Message): SCHEDULING_STRATEGY_FIELD_NUMBER: builtins.int BOOT_DISK_SIZE_GB_FIELD_NUMBER: builtins.int RESERVATION_AFFINITY_FIELD_NUMBER: builtins.int + 
TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER: builtins.int + TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER: builtins.int machine_type: builtins.str """Machine type for job""" gpu_type: builtins.str @@ -294,6 +296,22 @@ class VertexAiResourceConfig(google.protobuf.message.Message): """Compute Engine reservation affinity for the job. See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations """ + tensorboard_resource_name: builtins.str + """Existing Vertex AI TensorBoard resource to attach to the job. + Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + for the Tensorboard data model. + """ + tensorboard_experiment_name: builtins.str + """Optional. When set, the trainer's chief rank streams events to a + TensorboardExperiment with this name on the TB resource above, in + addition to Vertex's per-job auto-upload. Multiple jobs that share this + value land in the same TensorboardExperiment, so they appear as + comparable runs on one TensorBoard page. Requires + tensorboard_resource_name above to be set. Allowed characters: + lowercase letters, digits, hyphens (Vertex AI Experiment ID rules). + See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. + """ def __init__( self, *, @@ -306,9 +324,11 @@ class VertexAiResourceConfig(google.protobuf.message.Message): scheduling_strategy: builtins.str = ..., boot_disk_size_gb: builtins.int = ..., reservation_affinity: global___VertexAiReservationAffinity | None = ..., + tensorboard_resource_name: builtins.str = ..., + tensorboard_experiment_name: builtins.str = ..., ) -> None: ... def HasField(self, field_name: typing_extensions.Literal["reservation_affinity", b"reservation_affinity"]) -> builtins.bool: ... 
- def ClearField(self, field_name: typing_extensions.Literal["boot_disk_size_gb", b"boot_disk_size_gb", "gcp_region_override", b"gcp_region_override", "gpu_limit", b"gpu_limit", "gpu_type", b"gpu_type", "machine_type", b"machine_type", "num_replicas", b"num_replicas", "reservation_affinity", b"reservation_affinity", "scheduling_strategy", b"scheduling_strategy", "timeout", b"timeout"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["boot_disk_size_gb", b"boot_disk_size_gb", "gcp_region_override", b"gcp_region_override", "gpu_limit", b"gpu_limit", "gpu_type", b"gpu_type", "machine_type", b"machine_type", "num_replicas", b"num_replicas", "reservation_affinity", b"reservation_affinity", "scheduling_strategy", b"scheduling_strategy", "tensorboard_experiment_name", b"tensorboard_experiment_name", "tensorboard_resource_name", b"tensorboard_resource_name", "timeout", b"timeout"]) -> None: ... global___VertexAiResourceConfig = VertexAiResourceConfig diff --git a/snapchat/research/gbml/trained_model_metadata_pb2.pyi b/snapchat/research/gbml/trained_model_metadata_pb2.pyi index 5bdb95d48..9fa9f7886 100644 --- a/snapchat/research/gbml/trained_model_metadata_pb2.pyi +++ b/snapchat/research/gbml/trained_model_metadata_pb2.pyi @@ -28,7 +28,11 @@ class TrainedModelMetadata(google.protobuf.message.Message): eval_metrics_uri: builtins.str """The path where evaluation metrics are stored""" tensorboard_logs_uri: builtins.str - """Path where tensorboard logs will be stored""" + """Path where tensorboard logs will be stored. Vertex AI maps this URI to + ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via + ``CustomJobSpec.baseOutputDirectory``. See + https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec. 
+ """ def __init__( self, *, diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index 8db251b44..b566782de 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -1,5 +1,6 @@ """Unit tests for vertex_ai_launcher module.""" +import time from unittest.mock import Mock, patch from absl.testing import absltest @@ -10,6 +11,7 @@ GiglResourceConfigWrapper, ) from gigl.src.common.vertex_ai_launcher import ( + _build_job_config, launch_graph_store_enabled_job, launch_single_pool_job, ) @@ -59,6 +61,7 @@ def _create_gigl_resource_config_with_graph_store( gcp_region_override="us-west1", timeout=10800, scheduling_strategy="STANDARD", + tensorboard_resource_name="projects/test-project/locations/us-west1/tensorboards/test-tensorboard", ) storage_pool = gigl_resource_config_pb2.VertexAiResourceConfig( machine_type="n1-highmem-32", @@ -92,6 +95,7 @@ def _create_gigl_resource_config_with_single_pool_inference( machine_type="n1-standard-8", num_replicas=1, timeout=7200, + tensorboard_resource_name="projects/test-project/locations/us-central1/tensorboards/should-not-attach", ) # Create InferencerResourceConfig with single pool vertex AI config @@ -152,6 +156,7 @@ def test_launch_training_graph_store_cuda(self, mock_vertex_ai_service_class): cpu_docker_uri=cpu_docker_uri, cuda_docker_uri=cuda_docker_uri, component=component, + tensorboard_logs_uri=Uri("gs://test-perm-bucket/job-name/trainer/logs/"), ) # Assert - verify VertexAIService was instantiated correctly @@ -192,6 +197,10 @@ def test_launch_training_graph_store_cuda(self, mock_vertex_ai_service_class): self.assertIn( f"--epochs={process_runtime_args['epochs']}", compute_job_config.args ) + self.assertEqual( + compute_job_config.base_output_dir, + "gs://test-perm-bucket/job-name/trainer", + ) # Verify storage pool config self.assertEqual(storage_job_config.machine_type, storage_pool.machine_type) @@ 
-199,6 +208,9 @@ def test_launch_training_graph_store_cuda(self, mock_vertex_ai_service_class): "gigl.distributed.graph_store.storage_main", " ".join(storage_job_config.command), ) + self.assertIsNotNone(storage_job_config.args) + assert storage_job_config.args is not None # Type narrowing for mypy + self.assertIsNone(storage_job_config.base_output_dir) # Verify environment variables compute_env_vars = { @@ -304,6 +316,7 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): self.assertIn( f"--output_path={process_runtime_args['output_path']}", job_config.args ) + self.assertIsNone(job_config.base_output_dir) # Verify resource labels expected_labels = { @@ -313,6 +326,203 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): } self.assertEqual(job_config.labels, expected_labels) + @patch("gigl.src.common.vertex_ai_launcher.VertexAIService") + def test_launch_single_pool_job_reads_experiment_name_from_resource_config( + self, mock_vertex_ai_service_class + ): + """tensorboard_experiment_name on the resource config flows to the VertexAiJobConfig.""" + experiment_name = "my-single-pool-experiment" + + gigl_resource_config_proto = ( + _create_gigl_resource_config_with_single_pool_inference( + cost_resource_group="gigl_train" + ) + ) + resource_config_wrapper = GiglResourceConfigWrapper( + resource_config=gigl_resource_config_proto + ) + vertex_ai_config = gigl_resource_config_proto.inferencer_resource_config.vertex_ai_inferencer_config + vertex_ai_config.tensorboard_experiment_name = experiment_name + + mock_service_instance = Mock() + mock_vertex_ai_service_class.return_value = mock_service_instance + + launch_single_pool_job( + vertex_ai_resource_config=vertex_ai_config, + job_name="test-single-pool-tb-exp", + task_config_uri=Uri("gs://bucket/task_config.yaml"), + resource_config_uri=Uri("gs://bucket/resource_config.yaml"), + process_command="python -m gigl.src.training.v2.glt_trainer", + process_runtime_args={}, 
+ resource_config_wrapper=resource_config_wrapper, + cpu_docker_uri="gcr.io/project/cpu-image:tag", + cuda_docker_uri="gcr.io/project/cuda-image:tag", + component=GiGLComponents.Trainer, + vertex_ai_region="us-central1", + tensorboard_logs_uri=Uri("gs://bucket/job/trainer/logs/"), + ) + + mock_service_instance.launch_job.assert_called_once() + call_args = mock_service_instance.launch_job.call_args + job_config = call_args.kwargs["job_config"] + env = {ev.name: ev.value for ev in job_config.environment_variables or []} + self.assertEqual(env.get("GIGL_TENSORBOARD_EXPERIMENT_NAME"), experiment_name) + + @patch("gigl.src.common.vertex_ai_launcher.VertexAIService") + def test_launch_graph_store_job_reads_experiment_name_from_compute_pool( + self, mock_vertex_ai_service_class + ): + """compute_pool.tensorboard_experiment_name flows to the compute pool's + VertexAiJobConfig; storage pool stays empty. + """ + experiment_name = "my-graph-store-experiment" + + gigl_resource_config_proto = _create_gigl_resource_config_with_graph_store( + cost_resource_group="gigl_train" + ) + resource_config_wrapper = GiglResourceConfigWrapper( + resource_config=gigl_resource_config_proto + ) + graph_store_config = gigl_resource_config_proto.trainer_resource_config.vertex_ai_graph_store_trainer_config + graph_store_config.compute_pool.tensorboard_experiment_name = experiment_name + + mock_service_instance = Mock() + mock_vertex_ai_service_class.return_value = mock_service_instance + + launch_graph_store_enabled_job( + vertex_ai_graph_store_config=graph_store_config, + job_name="test-graph-store-tb-exp", + task_config_uri=Uri("gs://bucket/task_config.yaml"), + resource_config_uri=Uri("gs://bucket/resource_config.yaml"), + compute_commmand="python -m gigl.src.training.v2.glt_trainer", + compute_runtime_args={}, + resource_config_wrapper=resource_config_wrapper, + storage_command="python -m gigl.distributed.graph_store.storage_main", + storage_args={}, + 
cpu_docker_uri="gcr.io/project/cpu-image:tag", + cuda_docker_uri="gcr.io/project/cuda-image:tag", + component=GiGLComponents.Trainer, + tensorboard_logs_uri=Uri("gs://bucket/job/trainer/logs/"), + ) + + mock_service_instance.launch_graph_store_job.assert_called_once() + call_args = mock_service_instance.launch_graph_store_job.call_args + compute_job_config = call_args.kwargs["compute_pool_job_config"] + storage_job_config = call_args.kwargs["storage_pool_job_config"] + + compute_env = { + ev.name: ev.value for ev in compute_job_config.environment_variables or [] + } + storage_env_names = { + ev.name for ev in storage_job_config.environment_variables or [] + } + self.assertEqual( + compute_env.get("GIGL_TENSORBOARD_EXPERIMENT_NAME"), experiment_name + ) + self.assertNotIn("GIGL_TENSORBOARD_EXPERIMENT_NAME", storage_env_names) + + def test_build_job_config_injects_gigl_tensorboard_env_vars(self) -> None: + """When tensorboard_experiment_name is set with a TB resource, the + launcher injects env vars so the trainer's chief-rank uploader can + find the destination experiment. 
+ """ + resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-4", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + tensorboard_experiment_name="my-comparison", + ) + cfg = _build_job_config( + job_name="gigl_train_some_task", + task_config_uri=Uri("gs://b/task.yaml"), + resource_config_uri=Uri("gs://b/resource.yaml"), + command_str="python -m gigl.src.training.v2.glt_trainer", + args={}, + use_cuda=False, + container_uri="gcr.io/p/img", + vertex_ai_resource_config=resource_config, + env_vars=[], + tensorboard_logs_uri=Uri("gs://b/run/logs/"), + ) + env = {ev.name: ev.value for ev in cfg.environment_variables or []} + self.assertEqual( + env["GIGL_TENSORBOARD_RESOURCE_NAME"], + "projects/p/locations/us/tensorboards/1", + ) + self.assertEqual(env["GIGL_TENSORBOARD_EXPERIMENT_NAME"], "my-comparison") + # GIGL_TENSORBOARD_RUN_NAME must be sanitized (underscores in the + # job_name become hyphens) and carry a launch-unique timestamp suffix. 
+ run_name = env["GIGL_TENSORBOARD_RUN_NAME"] + self.assertRegex(run_name, r"^gigl-train-some-task-\d{8}-\d{6}$") + + def test_build_job_config_run_name_is_unique_per_call(self) -> None: + """Two builds of the same job_name produce two distinct run names.""" + resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-4", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + tensorboard_experiment_name="my-comparison", + ) + kwargs = dict( + job_name="gigl_train_same_name", + task_config_uri=Uri("gs://b/task.yaml"), + resource_config_uri=Uri("gs://b/resource.yaml"), + command_str="python -m gigl.src.training.v2.glt_trainer", + args={}, + use_cuda=False, + container_uri="gcr.io/p/img", + vertex_ai_resource_config=resource_config, + env_vars=[], + tensorboard_logs_uri=Uri("gs://b/run/logs/"), + ) + first = _build_job_config(**kwargs) # type: ignore[arg-type] + # Sleep one second so the timestamp suffix changes deterministically. + time.sleep(1) + second = _build_job_config(**kwargs) # type: ignore[arg-type] + + def _run_name(cfg) -> str: + return next( + ev.value + for ev in cfg.environment_variables or [] + if ev.name == "GIGL_TENSORBOARD_RUN_NAME" + ) + + self.assertNotEqual(_run_name(first), _run_name(second)) + + def test_build_job_config_no_gigl_env_vars_when_experiment_name_unset( + self, + ) -> None: + """The GIGL_TENSORBOARD_* env vars are NOT injected on the legacy + ``submit(tensorboard=...)`` path. 
+ """ + resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-4", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + ) + cfg = _build_job_config( + job_name="job", + task_config_uri=Uri("gs://b/task.yaml"), + resource_config_uri=Uri("gs://b/resource.yaml"), + command_str="python -m gigl.src.training.v2.glt_trainer", + args={}, + use_cuda=False, + container_uri="gcr.io/p/img", + vertex_ai_resource_config=resource_config, + env_vars=[], + tensorboard_logs_uri=Uri("gs://b/run/logs/"), + ) + env_names = {ev.name for ev in cfg.environment_variables or []} + self.assertNotIn("GIGL_TENSORBOARD_RESOURCE_NAME", env_names) + self.assertNotIn("GIGL_TENSORBOARD_EXPERIMENT_NAME", env_names) + self.assertNotIn("GIGL_TENSORBOARD_RUN_NAME", env_names) + if __name__ == "__main__": absltest.main() diff --git a/tests/unit/src/config_populator/config_populator_functionality_test.py b/tests/unit/src/config_populator/config_populator_functionality_test.py index 440b4cc95..201dac5b8 100644 --- a/tests/unit/src/config_populator/config_populator_functionality_test.py +++ b/tests/unit/src/config_populator/config_populator_functionality_test.py @@ -101,6 +101,9 @@ def test_sgs_config_population_is_accurate( ) self.assertNotEqual(trained_model_metadata_pb.trained_model_uri, "") self.assertNotEqual(trained_model_metadata_pb.scripted_model_uri, "") + self.assertTrue( + trained_model_metadata_pb.tensorboard_logs_uri.endswith("/logs/") + ) # Assert inference metadata assets were set inference_metadata_pb: inference_metadata_pb2.InferenceMetadata = ( @@ -189,6 +192,9 @@ def test_glt_config_population_is_accurate( ) self.assertNotEqual(trained_model_metadata_pb.trained_model_uri, "") self.assertNotEqual(trained_model_metadata_pb.scripted_model_uri, "") + self.assertTrue( + trained_model_metadata_pb.tensorboard_logs_uri.endswith("/logs/") + ) # Assert inference 
metadata assets were set inference_metadata_pb: inference_metadata_pb2.InferenceMetadata = ( diff --git a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index c70450501..716a8e270 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -7,6 +7,7 @@ from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import ( check_inferencer_graph_store_compatibility, check_trainer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ) from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2 from tests.test_assets.test_case import TestCase @@ -94,6 +95,18 @@ def _create_gbml_config_without_graph_stores() -> GbmlConfigPbWrapper: return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) +def _create_gbml_config_with_tensorboard_enabled() -> GbmlConfigPbWrapper: + """Create a GbmlConfig with trainer TensorBoard logging enabled.""" + gbml_config = gbml_config_pb2.GbmlConfig() + gbml_config.trainer_config.should_log_to_tensorboard = True + return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) + + +def _create_empty_gbml_config() -> GbmlConfigPbWrapper: + """Create a minimal GbmlConfig (no flags set).""" + return GbmlConfigPbWrapper(gbml_config_pb=gbml_config_pb2.GbmlConfig()) + + def _create_resource_config_with_both_graph_stores() -> GiglResourceConfigWrapper: """Create a GiglResourceConfig with VertexAiGraphStoreConfig for both trainer and inferencer.""" config = gigl_resource_config_pb2.GiglResourceConfig() @@ -126,6 +139,65 @@ def _create_resource_config_without_graph_stores() -> GiglResourceConfigWrapper: return GiglResourceConfigWrapper(resource_config=config) +def _create_resource_config_with_trainer_tensorboard( + *, + tensorboard_resource_name: str, + 
tensorboard_experiment_name: str = "", + use_graph_store: bool = False, +) -> GiglResourceConfigWrapper: + """Create a GiglResourceConfig with a trainer TensorBoard resource.""" + config = gigl_resource_config_pb2.GiglResourceConfig() + _create_shared_resource_config(config) + + if use_graph_store: + graph_store_config = _create_vertex_ai_graph_store_config() + graph_store_config.compute_pool.tensorboard_resource_name = ( + tensorboard_resource_name + ) + graph_store_config.compute_pool.tensorboard_experiment_name = ( + tensorboard_experiment_name + ) + config.trainer_resource_config.vertex_ai_graph_store_trainer_config.CopyFrom( + graph_store_config + ) + else: + vertex_ai_resource_config = _create_vertex_ai_resource_config() + vertex_ai_resource_config.tensorboard_resource_name = tensorboard_resource_name + vertex_ai_resource_config.tensorboard_experiment_name = ( + tensorboard_experiment_name + ) + config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom( + vertex_ai_resource_config + ) + + return GiglResourceConfigWrapper(resource_config=config) + + +def _create_resource_config_with_experiment_name_only( + *, + experiment_name: str, + use_graph_store: bool = False, +) -> GiglResourceConfigWrapper: + """Create a GiglResourceConfig with experiment_name set but NO TB resource.""" + config = gigl_resource_config_pb2.GiglResourceConfig() + _create_shared_resource_config(config) + + if use_graph_store: + graph_store_config = _create_vertex_ai_graph_store_config() + graph_store_config.compute_pool.tensorboard_experiment_name = experiment_name + config.trainer_resource_config.vertex_ai_graph_store_trainer_config.CopyFrom( + graph_store_config + ) + else: + vertex_ai_resource_config = _create_vertex_ai_resource_config() + vertex_ai_resource_config.tensorboard_experiment_name = experiment_name + config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom( + vertex_ai_resource_config + ) + + return GiglResourceConfigWrapper(resource_config=config) + + class 
TestTrainerGraphStoreCompatibility(TestCase): """Test suite for trainer graph store compatibility checks.""" @@ -203,6 +275,82 @@ def test_template_has_inferencer_graph_store_resource_does_not(self): resource_config_wrapper=resource_config, ) + +class TestVertexAITrainerTensorboardCompatibility(TestCase): + """Test suite for Vertex AI trainer TensorBoard compatibility checks.""" + + def test_vertex_ai_trainer_tensorboard_config_present(self): + gbml_config = _create_gbml_config_with_tensorboard_enabled() + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ), + tensorboard_experiment_name="my-comparison", + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_graph_store_trainer_tensorboard_config_present(self): + gbml_config = _create_gbml_config_with_tensorboard_enabled() + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ), + tensorboard_experiment_name="my-comparison", + use_graph_store=True, + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_vertex_ai_trainer_tensorboard_missing_resource_name_raises(self): + gbml_config = _create_gbml_config_with_tensorboard_enabled() + resource_config = _create_resource_config_without_graph_stores() + + with self.assertRaises(AssertionError): + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_resource_name_set_without_experiment_name_raises(self): + """tensorboard_resource_name set without tensorboard_experiment_name → AssertionError.""" + gbml_config = _create_empty_gbml_config() + 
resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ) + ) + + with self.assertRaises(AssertionError) as ctx: + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("must be set together", str(ctx.exception)) + + def test_invalid_experiment_name_format_raises(self): + """tensorboard_experiment_name that violates the Vertex resource-ID regex raises.""" + gbml_config = _create_empty_gbml_config() + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ), + tensorboard_experiment_name="My_Invalid_Name", + ) + + with self.assertRaises(AssertionError) as ctx: + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("not a valid Vertex AI Experiment ID", str(ctx.exception)) + def test_resource_has_inferencer_graph_store_template_does_not(self): """Test that resource having graph store but template not raises an assertion error.""" gbml_config = _create_gbml_config_without_graph_stores() @@ -213,6 +361,53 @@ def test_resource_has_inferencer_graph_store_template_does_not(self): resource_config_wrapper=resource_config, ) + def test_experiment_name_set_without_tensorboard_resource_raises(self): + """tensorboard_experiment_name set without resource_name → AssertionError.""" + gbml_config = _create_empty_gbml_config() + resource_config = _create_resource_config_with_experiment_name_only( + experiment_name="my-comparison" + ) + + with self.assertRaises(AssertionError) as ctx: + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("must be set together", 
str(ctx.exception)) + + def test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): + """tensorboard_experiment_name set and TB resource present → no exception.""" + gbml_config = _create_empty_gbml_config() + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ), + tensorboard_experiment_name="my-comparison", + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_raise( + self, + ): + """tensorboard_experiment_name set and graph-store TB resource present → no exception.""" + gbml_config = _create_empty_gbml_config() + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ), + tensorboard_experiment_name="my-comparison", + use_graph_store=True, + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + if __name__ == "__main__": absltest.main() diff --git a/tests/unit/utils/tensorboard_writer_test.py b/tests/unit/utils/tensorboard_writer_test.py new file mode 100644 index 000000000..a95140692 --- /dev/null +++ b/tests/unit/utils/tensorboard_writer_test.py @@ -0,0 +1,288 @@ +"""Unit tests for gigl.utils.tensorboard_writer.""" + +import os +from unittest.mock import Mock, patch + +from absl.testing import absltest + +from gigl.utils import tensorboard_writer as tensorboard_writer_module +from gigl.utils.tensorboard_writer import TensorBoardWriter +from tests.test_assets.test_case import TestCase + + +class TestTensorBoardWriter(TestCase): + """Tests for the TensorBoardWriter class.""" + + def test_from_env_returns_noop_when_disabled(self) -> None: + # When disabled (e.g. 
        # non-chief rank), env var state is irrelevant
        # and no TF writer is constructed.
        with patch.dict(
            os.environ,
            {"AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs"},
            clear=True,
        ):
            with patch(
                "gigl.utils.tensorboard_writer.tf.summary.create_file_writer"
            ) as mock_create_file_writer:
                writer = TensorBoardWriter.from_env(enabled=False)
                writer.log({"Loss/train": 1.0}, step=0)
                writer.close()

        mock_create_file_writer.assert_not_called()

    def test_from_env_uses_parent_log_dir_when_no_run_name(self) -> None:
        """Without GIGL_TENSORBOARD_RUN_NAME, the TF writer targets
        AIP_TENSORBOARD_LOG_DIR itself (no run subdirectory).
        """
        with patch.dict(
            os.environ,
            {"AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs"},
            clear=True,
        ):
            with patch(
                "gigl.utils.tensorboard_writer.tf.summary.create_file_writer"
            ) as mock_create_file_writer:
                TensorBoardWriter.from_env()

        mock_create_file_writer.assert_called_once_with("gs://vertex-managed/logs")

    def test_from_env_uses_run_name_subdir_when_set(self) -> None:
        """Writer points TF at the subdir so the SDK uploader sees a distinct run."""
        with patch.dict(
            os.environ,
            {
                "AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs",
                "GIGL_TENSORBOARD_RUN_NAME": "my-run",
            },
            clear=True,
        ):
            with patch(
                "gigl.utils.tensorboard_writer.tf.summary.create_file_writer"
            ) as mock_create_file_writer:
                TensorBoardWriter.from_env()

        mock_create_file_writer.assert_called_once_with(
            "gs://vertex-managed/logs/my-run"
        )

    def test_from_env_raises_when_env_var_missing(self) -> None:
        """Missing AIP_TENSORBOARD_LOG_DIR -> RuntimeError; no TF writer is built."""
        with patch.dict(os.environ, {}, clear=True):
            with patch(
                "gigl.utils.tensorboard_writer.tf.summary.create_file_writer"
            ) as mock_create_file_writer:
                with self.assertRaises(RuntimeError):
                    TensorBoardWriter.from_env()

        mock_create_file_writer.assert_not_called()

    @patch("gigl.utils.tensorboard_writer.tf.summary.scalar")
    def test_log_writes_each_metric_at_step_and_flushes(
        self, mock_summary_scalar
    ) -> None:
        """log() emits one scalar summary per metric at the given step, then flushes."""
        underlying_writer = Mock()
        # Plain Mock: configure the context-manager protocol used by
        # `with writer.as_default():` explicitly (Mock assigns magic
        # methods on the instance's per-instance subclass).
        underlying_writer.as_default.return_value.__enter__ = Mock(return_value=None)
        underlying_writer.as_default.return_value.__exit__ = Mock(return_value=None)
        with patch(
            "gigl.utils.tensorboard_writer.tf.summary.create_file_writer",
            return_value=underlying_writer,
        ):
            writer = TensorBoardWriter(log_dir="gs://logs/")
            writer.log({"Loss/train": 1.5, "Loss/val": 2.0}, step=10)

        self.assertEqual(mock_summary_scalar.call_count, 2)
        mock_summary_scalar.assert_any_call("Loss/train", 1.5, step=10)
        mock_summary_scalar.assert_any_call("Loss/val", 2.0, step=10)
        underlying_writer.flush.assert_called_once()

    @patch("gigl.utils.tensorboard_writer.tf.summary.scalar")
    def test_log_is_noop_when_writer_disabled(self, mock_summary_scalar) -> None:
        """A writer constructed with log_dir=None never touches tf.summary."""
        with patch(
            "gigl.utils.tensorboard_writer.tf.summary.create_file_writer"
        ) as mock_create_file_writer:
            writer = TensorBoardWriter(log_dir=None)
            writer.log({"Loss/train": 1.0}, step=0)

        mock_create_file_writer.assert_not_called()
        mock_summary_scalar.assert_not_called()

    def test_context_manager_closes_writer(self) -> None:
        """Exiting the `with` block closes the underlying TF file writer."""
        underlying_writer = Mock()
        with patch(
            "gigl.utils.tensorboard_writer.tf.summary.create_file_writer",
            return_value=underlying_writer,
        ):
            with TensorBoardWriter(log_dir="gs://logs/"):
                pass

        underlying_writer.close.assert_called_once()

    def test_close_is_idempotent(self) -> None:
        """Calling close() twice closes the underlying writer exactly once."""
        underlying_writer = Mock()
        with patch(
            "gigl.utils.tensorboard_writer.tf.summary.create_file_writer",
            return_value=underlying_writer,
        ):
            writer = TensorBoardWriter(log_dir="gs://logs/")
            writer.close()
            writer.close()

        underlying_writer.close.assert_called_once()

    def test_close_on_noop_writer_does_not_raise(self) -> None:
        writer = TensorBoardWriter(log_dir=None)
        writer.close()
        writer.close()  # Idempotent on no-op writer.


class TestTensorBoardWriterUploader(TestCase):
    """Tests for the chief-rank ``aiplatform.start_upload_tb_log`` hook."""

    # Shared env-var fixtures for the uploader tests.
    _LOG_DIR = "gs://vertex-managed/logs"
    _TB_RESOURCE = "projects/my-project/locations/us-central1/tensorboards/42"
    _EXPERIMENT = "my-comparison"

    def test_uploader_starts_when_all_env_vars_present(self) -> None:
        """Uploader watches the parent log dir; writer points at the run-name subdir."""
        with patch.dict(
            os.environ,
            {
                "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR,
                "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE,
                "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT,
                "GIGL_TENSORBOARD_RUN_NAME": "my-run",
            },
            clear=True,
        ):
            with patch(
                "gigl.utils.tensorboard_writer.tf.summary.create_file_writer"
            ) as mock_create_file_writer:
                with (
                    patch("google.cloud.aiplatform.start_upload_tb_log") as mock_start,
                    patch("google.cloud.aiplatform.init") as mock_init,
                    patch("google.cloud.aiplatform.end_upload_tb_log") as mock_end,
                ):
                    writer = TensorBoardWriter.from_env()
                    writer.close()

        mock_create_file_writer.assert_called_once_with(f"{self._LOG_DIR}/my-run")
        # project/location are parsed from the TB resource name.
        mock_init.assert_called_once_with(project="my-project", location="us-central1")
        # Uploader watches the PARENT log dir so the run-name subdir is
        # discovered as a TensorboardRun via os.path.relpath.
        mock_start.assert_called_once_with(
            tensorboard_id="42",
            tensorboard_experiment_name=self._EXPERIMENT,
            logdir=self._LOG_DIR,
        )
        mock_end.assert_called_once()

    def test_uploader_does_not_start_when_only_log_dir_set(self) -> None:
        """Without the GIGL_TENSORBOARD_* env vars, no uploader is started or ended."""
        with patch.dict(
            os.environ,
            {"AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR},
            clear=True,
        ):
            with patch("gigl.utils.tensorboard_writer.tf.summary.create_file_writer"):
                with (
                    patch("google.cloud.aiplatform.start_upload_tb_log") as mock_start,
                    patch("google.cloud.aiplatform.end_upload_tb_log") as mock_end,
                ):
                    writer = TensorBoardWriter.from_env()
                    writer.close()

        mock_start.assert_not_called()
        mock_end.assert_not_called()

    def test_invalid_tb_resource_name_raises(self) -> None:
        """A malformed TB resource name -> ValueError naming the offending env var."""
        with patch.dict(
            os.environ,
            {
                "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR,
                "GIGL_TENSORBOARD_RESOURCE_NAME": "not-a-valid-resource-name",
                "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT,
            },
            clear=True,
        ):
            with patch("gigl.utils.tensorboard_writer.tf.summary.create_file_writer"):
                with self.assertRaises(ValueError) as ctx:
                    TensorBoardWriter.from_env()

        self.assertIn("GIGL_TENSORBOARD_RESOURCE_NAME", str(ctx.exception))

    def test_uploader_skipped_for_disabled_writer(self) -> None:
        """Non-chief ranks (enabled=False) skip both the writer and uploader."""
        with patch.dict(
            os.environ,
            {
                "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR,
                "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE,
                "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT,
            },
            clear=True,
        ):
            with patch("google.cloud.aiplatform.start_upload_tb_log") as mock_start:
                writer = TensorBoardWriter.from_env(enabled=False)
                writer.close()

        mock_start.assert_not_called()

    def test_uploader_logs_named_experiment_url_on_start(self) -> None:
        """The named-experiment URL is logged so engineers can find the TB
        page without the (now-absent) Vertex AI job-page button.
        """
        with patch.dict(
            os.environ,
            {
                "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR,
                "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE,
                "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT,
            },
            clear=True,
        ):
            with patch("gigl.utils.tensorboard_writer.tf.summary.create_file_writer"):
                with (
                    patch("google.cloud.aiplatform.start_upload_tb_log"),
                    patch("google.cloud.aiplatform.init"),
                    patch("google.cloud.aiplatform.end_upload_tb_log"),
                    patch.object(tensorboard_writer_module.logger, "info") as mock_info,
                ):
                    writer = TensorBoardWriter.from_env()
                    writer.close()

        # Find the single logger.info call that carries the TB URL, then
        # check it embeds experiment, TB id (URL-encoded with '+'), and region.
        info_calls = [call.args[0] for call in mock_info.call_args_list]
        url_log = next((msg for msg in info_calls if "View TensorBoard" in msg), None)
        self.assertIsNotNone(url_log)
        self.assertIn(self._EXPERIMENT, url_log)
        self.assertIn("tensorboards+42", url_log)
        self.assertIn("us-central1", url_log)

    def test_uploader_failure_after_writer_construction_closes_writer(self) -> None:
        """If start_upload_tb_log raises, the TF file writer is closed and
        the exception propagates — no leaked uploader thread, no half-built
        writer.
        """
        underlying_writer = Mock()
        with patch.dict(
            os.environ,
            {
                "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR,
                "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE,
                "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT,
            },
            clear=True,
        ):
            with patch(
                "gigl.utils.tensorboard_writer.tf.summary.create_file_writer",
                return_value=underlying_writer,
            ):
                with (
                    patch(
                        "google.cloud.aiplatform.start_upload_tb_log",
                        side_effect=RuntimeError("boom"),
                    ),
                    patch("google.cloud.aiplatform.init"),
                ):
                    with self.assertRaises(RuntimeError):
                        TensorBoardWriter.from_env()

        underlying_writer.close.assert_called_once()


if __name__ == "__main__":
    absltest.main()