From ba92e970005cd34c7131db229d2f71cfa0388b97 Mon Sep 17 00:00:00 2001 From: can-gaa-hou Date: Wed, 15 Apr 2026 11:20:04 +0800 Subject: [PATCH 1/4] Add HUD API integration with result callback functionality --- .github/workflows/crcr-deploy-prod.yml | 2 + .github/workflows/crcr-on-pr.yml | 4 ++ crcr/README.md | 10 ++++- crcr/Terrafile | 1 + crcr/aws/outputs.tf | 5 +++ crcr/aws/result.tf | 60 ++++++++++++++++++++++++++ crcr/aws/secrets.tf | 1 + crcr/aws/variables.tf | 11 +++++ 8 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 crcr/aws/result.tf diff --git a/.github/workflows/crcr-deploy-prod.yml b/.github/workflows/crcr-deploy-prod.yml index 40c6c7a2..a58e9b42 100644 --- a/.github/workflows/crcr-deploy-prod.yml +++ b/.github/workflows/crcr-deploy-prod.yml @@ -46,3 +46,5 @@ jobs: TF_VAR_github_app_id: ${{ secrets.CRCR_GITHUB_APP_ID }} TF_VAR_github_app_secret: ${{ secrets.CRCR_GITHUB_APP_SECRET }} TF_VAR_github_app_privatekey: ${{ secrets.CRCR_GITHUB_APP_PRIVATEKEY }} + TF_VAR_hud_api_url: ${{ secrets.CRCR_HUD_API_URL }} + TF_VAR_hud_bot_key: ${{ secrets.CRCR_HUD_BOT_KEY }} diff --git a/.github/workflows/crcr-on-pr.yml b/.github/workflows/crcr-on-pr.yml index 4c6f1d53..793ed006 100644 --- a/.github/workflows/crcr-on-pr.yml +++ b/.github/workflows/crcr-on-pr.yml @@ -52,6 +52,8 @@ jobs: TF_VAR_github_app_id: ${{ secrets.CRCR_GITHUB_APP_ID }} TF_VAR_github_app_secret: ${{ secrets.CRCR_GITHUB_APP_SECRET }} TF_VAR_github_app_privatekey: ${{ secrets.CRCR_GITHUB_APP_PRIVATEKEY }} + TF_VAR_hud_api_url: ${{ secrets.CRCR_HUD_API_URL }} + TF_VAR_hud_bot_key: ${{ secrets.CRCR_HUD_BOT_KEY }} - name: Make plan shell: bash @@ -63,3 +65,5 @@ jobs: TF_VAR_github_app_id: ${{ secrets.CRCR_GITHUB_APP_ID }} TF_VAR_github_app_secret: ${{ secrets.CRCR_GITHUB_APP_SECRET }} TF_VAR_github_app_privatekey: ${{ secrets.CRCR_GITHUB_APP_PRIVATEKEY }} + TF_VAR_hud_api_url: ${{ secrets.CRCR_HUD_API_URL }} + TF_VAR_hud_bot_key: ${{ secrets.CRCR_HUD_BOT_KEY }} diff --git 
a/crcr/README.md b/crcr/README.md index 270c707e..2eeb8d67 100644 --- a/crcr/README.md +++ b/crcr/README.md @@ -44,7 +44,8 @@ crcr/ ├── iam.tf # Lambda execution role and policies ├── secrets.tf # Secrets Manager secret and version ├── elasticache.tf # Redis replication group - └── webhook.tf # Lambda function and public function URL + ├── result.tf # Result callback lambda function and public function URL + └── webhook.tf # Webhook lambda function and public function URL ``` ## Prerequisites @@ -86,6 +87,9 @@ aws dynamodb create-table \ | `allowlist_ttl` | `1200` | Allowlist cache TTL in Redis (seconds) | | `vpc_cidr_block` | `10.0.0.0/16` | CIDR block for the VPC | | `availability_zone_suffixes` | `["a", "b"]` | Availability zone letter suffixes | +| `hud_api_url` | `N/A` | URL for sending callback data to HUD | +| `hud_bot_key` | `N/A` | Key used to access HUD (sensitive) | +| `oot_status_ttl` | `259200` | OOT workflow run status TTL in Redis (seconds) | **Note:** @@ -103,6 +107,8 @@ cd ci-infra/crcr/aws export TF_VAR_github_app_id=123456 export TF_VAR_github_app_secret= export TF_VAR_github_app_privatekey="$(cat path/to/key.pem)" +export TF_VAR_hud_api_url= +export TF_VAR_hud_bot_key= ``` #### Deploy prod @@ -133,6 +139,8 @@ The production deployment is handled via the `crcr-deploy-prod.yml` workflow (`w - `CRCR_GITHUB_APP_ID` - GitHub App ID - `CRCR_GITHUB_APP_SECRET` - GitHub App webhook secret - `CRCR_GITHUB_APP_PRIVATEKEY` - PEM-encoded GitHub App private key + - `CRCR_HUD_API_URL` - URL for sending callback data to HUD + - `CRCR_HUD_BOT_KEY` - Key used to access HUD 2. 
**Trigger the workflow** manually from workflow_dispatch: diff --git a/crcr/Terrafile b/crcr/Terrafile index 2ec93309..3f12f620 100644 --- a/crcr/Terrafile +++ b/crcr/Terrafile @@ -6,5 +6,6 @@ crcr: tag: "v20260408-150242" assets: - "cross-repo-ci-webhook.zip" + - "cross-repo-ci-result.zip" asset-folders: - assets/lambdas-download diff --git a/crcr/aws/outputs.tf b/crcr/aws/outputs.tf index ae703f2d..3554d4f5 100644 --- a/crcr/aws/outputs.tf +++ b/crcr/aws/outputs.tf @@ -3,6 +3,11 @@ output "webhook_function_url" { description = "GitHub App webhook URL; configure the GitHub App webhook as /github/webhook" } +output "result_function_url" { + value = aws_lambda_function_url.result.function_url + description = "Result callback URL; downstream workflows post results to /github/result" +} + output "redis_endpoint" { value = aws_elasticache_replication_group.redis.primary_endpoint_address description = "Redis primary endpoint" diff --git a/crcr/aws/result.tf b/crcr/aws/result.tf new file mode 100644 index 00000000..2d2f8880 --- /dev/null +++ b/crcr/aws/result.tf @@ -0,0 +1,60 @@ +locals { + result_zip = abspath("../assets/lambdas-download/cross-repo-ci-result.zip") +} + +resource "aws_lambda_function" "result" { + function_name = "crcr-result-${var.environment}" + role = aws_iam_role.lambda.arn + + runtime = "python3.13" + handler = "lambda_function.lambda_handler" + filename = local.result_zip + source_code_hash = filebase64sha256(local.result_zip) + + timeout = 60 + memory_size = 512 + reserved_concurrent_executions = 50 + tags = local.tags + + environment { + variables = { + GITHUB_APP_ID = var.github_app_id + REDIS_ENDPOINT = aws_elasticache_replication_group.redis.primary_endpoint_address + SECRET_STORE_ARN = local.secret_store_arn + UPSTREAM_REPO = var.upstream_repo + ALLOWLIST_URL = var.allowlist_url + ALLOWLIST_TTL_SECONDS = tostring(var.allowlist_ttl) + HUD_API_URL = var.hud_api_url + } + } + + vpc_config { + security_group_ids = [aws_security_group.lambda.id] + 
subnet_ids = module.crcr_vpc.private_subnets + } +} + +resource "aws_cloudwatch_log_group" "result" { + name = "/aws/lambda/${aws_lambda_function.result.function_name}" + retention_in_days = 90 + tags = local.tags +} + +resource "aws_lambda_function_url" "result" { + function_name = aws_lambda_function.result.function_name + authorization_type = "NONE" +} + +resource "aws_lambda_permission" "result_function_url_invoke" { + function_name = aws_lambda_function.result.function_name + action = "lambda:InvokeFunctionUrl" + principal = "*" + function_url_auth_type = "NONE" +} + +resource "aws_lambda_permission" "result_function_invoke" { + function_name = aws_lambda_function.result.function_name + action = "lambda:InvokeFunction" + principal = "*" + invoked_via_function_url = true +} diff --git a/crcr/aws/secrets.tf b/crcr/aws/secrets.tf index c0e5a840..2b3945d1 100644 --- a/crcr/aws/secrets.tf +++ b/crcr/aws/secrets.tf @@ -9,5 +9,6 @@ resource "aws_secretsmanager_secret_version" "main" { GITHUB_APP_SECRET = var.github_app_secret GITHUB_APP_PRIVATE_KEY = var.github_app_privatekey REDIS_LOGIN = random_password.redis_password.result + HUD_BOT_KEY = var.hud_bot_key }) } diff --git a/crcr/aws/variables.tf b/crcr/aws/variables.tf index 180c0046..9d14ce0a 100644 --- a/crcr/aws/variables.tf +++ b/crcr/aws/variables.tf @@ -49,3 +49,14 @@ variable "availability_zone_suffixes" { type = list(string) default = ["a", "b"] } + +variable "hud_api_url" { + description = "HUD API endpoint for posting downstream workflow results" + type = string +} + +variable "hud_bot_key" { + description = "Authorization key for the HUD API" + type = string + sensitive = true +} From f1a9cc0003812df42bfe68bb2577c3fa7c319e6a Mon Sep 17 00:00:00 2001 From: can-gaa-hou Date: Fri, 24 Apr 2026 10:17:07 +0800 Subject: [PATCH 2/4] Update handler name --- crcr/aws/result.tf | 2 +- crcr/aws/webhook.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crcr/aws/result.tf b/crcr/aws/result.tf 
index 2d2f8880..aa2722af 100644 --- a/crcr/aws/result.tf +++ b/crcr/aws/result.tf @@ -7,7 +7,7 @@ resource "aws_lambda_function" "result" { role = aws_iam_role.lambda.arn runtime = "python3.13" - handler = "lambda_function.lambda_handler" + handler = "callback.lambda_function.lambda_handler" filename = local.result_zip source_code_hash = filebase64sha256(local.result_zip) diff --git a/crcr/aws/webhook.tf b/crcr/aws/webhook.tf index 509bf456..d405e6c9 100644 --- a/crcr/aws/webhook.tf +++ b/crcr/aws/webhook.tf @@ -34,7 +34,7 @@ resource "aws_lambda_function" "webhook" { role = aws_iam_role.lambda.arn runtime = "python3.13" - handler = "lambda_function.lambda_handler" + handler = "webhook.lambda_function.lambda_handler" filename = local.webhook_zip source_code_hash = filebase64sha256(local.webhook_zip) From 0063256572d3ecb602bdb5232a64019f0c2dd17a Mon Sep 17 00:00:00 2001 From: can-gaa-hou Date: Mon, 11 May 2026 14:36:44 +0800 Subject: [PATCH 3/4] Refactor security group configuration and add OOT status TTL variable --- crcr/README.md | 4 ++-- crcr/aws/result.tf | 15 ++++++++------- crcr/aws/security.tf | 26 ++++++++++++++++++++++++++ crcr/aws/variables.tf | 6 ++++++ crcr/aws/webhook.tf | 27 --------------------------- 5 files changed, 42 insertions(+), 36 deletions(-) create mode 100644 crcr/aws/security.tf diff --git a/crcr/README.md b/crcr/README.md index 2eeb8d67..ea187e61 100644 --- a/crcr/README.md +++ b/crcr/README.md @@ -159,7 +159,7 @@ CRCR follows a four-level progression system. Each level adds more integration b | Level | Name | Status | Description | |---|---|---|---| -| **L1** | Events Only | **Current** | Webhook events are forwarded to downstream repos. No feedback to upstream PRs. Downstream repos receive `repository_dispatch` and run CI independently. | -| **L2** | HUD Visibility | developing | Downstream CI results are written to ClickHouse and displayed on a dedicated HUD page (`hud.pytorch.org/oot/[org]/[repo]`). 
Upstream PRs still show no check status. | +| **L1** | Events Only | running | Webhook events are forwarded to downstream repos. No feedback to upstream PRs. Downstream repos receive `repository_dispatch` and run CI independently. | +| **L2** | HUD Visibility | **Current** | Downstream CI results are written to ClickHouse and displayed on a dedicated HUD page (`hud.pytorch.org/oot/[org]/[repo]`). Upstream PRs still show no check status. | | **L3** | Label-Triggered PR Checks | developing | A non-blocking Check Run appears on upstream PRs when a `ciflow/oot/` label is added. This is the recommended long-term target for most downstream repos. | | **L4** | Always-On Blocking Checks | developing | Blocking Check Run auto-triggered for every PR. Reserved for critical accelerators only. Merge is blocked on failure. | diff --git a/crcr/aws/result.tf b/crcr/aws/result.tf index aa2722af..855ead56 100644 --- a/crcr/aws/result.tf +++ b/crcr/aws/result.tf @@ -18,13 +18,14 @@ resource "aws_lambda_function" "result" { environment { variables = { - GITHUB_APP_ID = var.github_app_id - REDIS_ENDPOINT = aws_elasticache_replication_group.redis.primary_endpoint_address - SECRET_STORE_ARN = local.secret_store_arn - UPSTREAM_REPO = var.upstream_repo - ALLOWLIST_URL = var.allowlist_url - ALLOWLIST_TTL_SECONDS = tostring(var.allowlist_ttl) - HUD_API_URL = var.hud_api_url + GITHUB_APP_ID = var.github_app_id + REDIS_ENDPOINT = aws_elasticache_replication_group.redis.primary_endpoint_address + SECRET_STORE_ARN = local.secret_store_arn + UPSTREAM_REPO = var.upstream_repo + ALLOWLIST_URL = var.allowlist_url + ALLOWLIST_TTL_SECONDS = tostring(var.allowlist_ttl) + HUD_API_URL = var.hud_api_url + OOT_STATUS_TTL = tostring(var.oot_status_ttl) } } diff --git a/crcr/aws/security.tf b/crcr/aws/security.tf new file mode 100644 index 00000000..ddf8284c --- /dev/null +++ b/crcr/aws/security.tf @@ -0,0 +1,26 @@ +resource "aws_security_group" "lambda" { + name = "crcr-lambda-sg-${var.environment}" + 
description = "Security group for Lambda function" + vpc_id = module.crcr_vpc.vpc_id + tags = local.tags +} + +resource "aws_security_group_rule" "lambda_to_redis" { + type = "egress" + from_port = 6379 + to_port = 6379 + protocol = "tcp" + security_group_id = aws_security_group.lambda.id + source_security_group_id = aws_security_group.redis.id + description = "Allow Redis access" +} + +resource "aws_security_group_rule" "lambda_to_https" { + type = "egress" + from_port = 443 + to_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.lambda.id + cidr_blocks = ["0.0.0.0/0"] + description = "Allow HTTPS for Secrets Manager and GitHub API" +} diff --git a/crcr/aws/variables.tf b/crcr/aws/variables.tf index 9d14ce0a..4e5405e7 100644 --- a/crcr/aws/variables.tf +++ b/crcr/aws/variables.tf @@ -60,3 +60,9 @@ variable "hud_bot_key" { type = string sensitive = true } + +variable "oot_status_ttl" { + description = "OOT workflow run status TTL in Redis (seconds)" + type = number + default = 259200 +} diff --git a/crcr/aws/webhook.tf b/crcr/aws/webhook.tf index d405e6c9..120d3010 100644 --- a/crcr/aws/webhook.tf +++ b/crcr/aws/webhook.tf @@ -2,33 +2,6 @@ locals { webhook_zip = abspath("../assets/lambdas-download/cross-repo-ci-webhook.zip") } -resource "aws_security_group" "lambda" { - name = "crcr-lambda-sg-${var.environment}" - description = "Security group for Lambda function" - vpc_id = module.crcr_vpc.vpc_id - tags = local.tags -} - -resource "aws_security_group_rule" "lambda_to_redis" { - type = "egress" - from_port = 6379 - to_port = 6379 - protocol = "tcp" - security_group_id = aws_security_group.lambda.id - source_security_group_id = aws_security_group.redis.id - description = "Allow Redis access" -} - -resource "aws_security_group_rule" "lambda_to_https" { - type = "egress" - from_port = 443 - to_port = 443 - protocol = "tcp" - security_group_id = aws_security_group.lambda.id - cidr_blocks = ["0.0.0.0/0"] - description = "Allow HTTPS for Secrets 
Manager and GitHub API" -} - resource "aws_lambda_function" "webhook" { function_name = "crcr-webhook-${var.environment}" role = aws_iam_role.lambda.arn From 9064ceaab472121049444c3e3e0852219db3186a Mon Sep 17 00:00:00 2001 From: can-gaa-hou Date: Fri, 22 May 2026 10:18:11 +0800 Subject: [PATCH 4/4] change all "result" to "callback" --- crcr/README.md | 2 +- crcr/Terrafile | 2 +- crcr/aws/{result.tf => callback.tf} | 26 +++++++++++++------------- crcr/aws/outputs.tf | 6 +++--- 4 files changed, 18 insertions(+), 18 deletions(-) rename crcr/aws/{result.tf => callback.tf} (62%) diff --git a/crcr/README.md b/crcr/README.md index ea187e61..43df8197 100644 --- a/crcr/README.md +++ b/crcr/README.md @@ -44,7 +44,7 @@ crcr/ ├── iam.tf # Lambda execution role and policies ├── secrets.tf # Secrets Manager secret and version ├── elasticache.tf # Redis replication group - ├── result.tf # Result callback lambda function and public function URL + ├── callback.tf # Result callback lambda function and public function URL └── webhook.tf # Webhook lambda function and public function URL ``` diff --git a/crcr/Terrafile b/crcr/Terrafile index 3f12f620..eccd0f4f 100644 --- a/crcr/Terrafile +++ b/crcr/Terrafile @@ -6,6 +6,6 @@ crcr: tag: "v20260408-150242" assets: - "cross-repo-ci-webhook.zip" - - "cross-repo-ci-result.zip" + - "cross-repo-ci-callback.zip" asset-folders: - assets/lambdas-download diff --git a/crcr/aws/result.tf b/crcr/aws/callback.tf similarity index 62% rename from crcr/aws/result.tf rename to crcr/aws/callback.tf index 855ead56..47d4894e 100644 --- a/crcr/aws/result.tf +++ b/crcr/aws/callback.tf @@ -1,15 +1,15 @@ locals { - result_zip = abspath("../assets/lambdas-download/cross-repo-ci-result.zip") + callback_zip = abspath("../assets/lambdas-download/cross-repo-ci-callback.zip") } -resource "aws_lambda_function" "result" { - function_name = "crcr-result-${var.environment}" +resource "aws_lambda_function" "callback" { + function_name = "crcr-callback-${var.environment}" 
role = aws_iam_role.lambda.arn runtime = "python3.13" handler = "callback.lambda_function.lambda_handler" - filename = local.result_zip - source_code_hash = filebase64sha256(local.result_zip) + filename = local.callback_zip + source_code_hash = filebase64sha256(local.callback_zip) timeout = 60 memory_size = 512 @@ -35,26 +35,26 @@ resource "aws_lambda_function" "result" { } } -resource "aws_cloudwatch_log_group" "result" { - name = "/aws/lambda/${aws_lambda_function.result.function_name}" +resource "aws_cloudwatch_log_group" "callback" { + name = "/aws/lambda/${aws_lambda_function.callback.function_name}" retention_in_days = 90 tags = local.tags } -resource "aws_lambda_function_url" "result" { - function_name = aws_lambda_function.result.function_name +resource "aws_lambda_function_url" "callback" { + function_name = aws_lambda_function.callback.function_name authorization_type = "NONE" } -resource "aws_lambda_permission" "result_function_url_invoke" { - function_name = aws_lambda_function.result.function_name +resource "aws_lambda_permission" "callback_function_url_invoke" { + function_name = aws_lambda_function.callback.function_name action = "lambda:InvokeFunctionUrl" principal = "*" function_url_auth_type = "NONE" } -resource "aws_lambda_permission" "result_function_invoke" { - function_name = aws_lambda_function.result.function_name +resource "aws_lambda_permission" "callback_function_invoke" { + function_name = aws_lambda_function.callback.function_name action = "lambda:InvokeFunction" principal = "*" invoked_via_function_url = true diff --git a/crcr/aws/outputs.tf b/crcr/aws/outputs.tf index 3554d4f5..82c5962b 100644 --- a/crcr/aws/outputs.tf +++ b/crcr/aws/outputs.tf @@ -3,9 +3,9 @@ output "webhook_function_url" { description = "GitHub App webhook URL; configure the GitHub App webhook as /github/webhook" } -output "result_function_url" { - value = aws_lambda_function_url.result.function_url - description = "Result callback URL; downstream workflows post 
results to /github/result" +output "callback_function_url" { + value = aws_lambda_function_url.callback.function_url + description = "Result callback URL; downstream workflows post results to /github/callback" } output "redis_endpoint" {