Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/crcr-deploy-prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,5 @@ jobs:
TF_VAR_github_app_id: ${{ secrets.CRCR_GITHUB_APP_ID }}
TF_VAR_github_app_secret: ${{ secrets.CRCR_GITHUB_APP_SECRET }}
TF_VAR_github_app_privatekey: ${{ secrets.CRCR_GITHUB_APP_PRIVATEKEY }}
TF_VAR_hud_api_url: ${{ secrets.CRCR_HUD_API_URL }}
TF_VAR_hud_bot_key: ${{ secrets.CRCR_HUD_BOT_KEY }}
4 changes: 4 additions & 0 deletions .github/workflows/crcr-on-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ jobs:
TF_VAR_github_app_id: ${{ secrets.CRCR_GITHUB_APP_ID }}
TF_VAR_github_app_secret: ${{ secrets.CRCR_GITHUB_APP_SECRET }}
TF_VAR_github_app_privatekey: ${{ secrets.CRCR_GITHUB_APP_PRIVATEKEY }}
TF_VAR_hud_api_url: ${{ secrets.CRCR_HUD_API_URL }}
TF_VAR_hud_bot_key: ${{ secrets.CRCR_HUD_BOT_KEY }}

- name: Make plan
shell: bash
Expand All @@ -63,3 +65,5 @@ jobs:
TF_VAR_github_app_id: ${{ secrets.CRCR_GITHUB_APP_ID }}
TF_VAR_github_app_secret: ${{ secrets.CRCR_GITHUB_APP_SECRET }}
TF_VAR_github_app_privatekey: ${{ secrets.CRCR_GITHUB_APP_PRIVATEKEY }}
TF_VAR_hud_api_url: ${{ secrets.CRCR_HUD_API_URL }}
TF_VAR_hud_bot_key: ${{ secrets.CRCR_HUD_BOT_KEY }}
14 changes: 11 additions & 3 deletions crcr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ crcr/
├── iam.tf # Lambda execution role and policies
├── secrets.tf # Secrets Manager secret and version
├── elasticache.tf # Redis replication group
└── webhook.tf # Lambda function and public function URL
├── callback.tf # Result callback lambda function and public function URL
└── webhook.tf # Webhook lambda function and public function URL
```

## Prerequisites
Expand Down Expand Up @@ -86,6 +87,9 @@ aws dynamodb create-table \
| `allowlist_ttl` | `1200` | Allowlist cache TTL in Redis (seconds) |
| `vpc_cidr_block` | `10.0.0.0/16` | CIDR block for the VPC |
| `availability_zone_suffixes` | `["a", "b"]` | Availability zone letter suffixes |
| `hud_api_url` | `N/A` | URL for sending callback data to HUD |
| `hud_bot_key` | `N/A` | Key used to access HUD (sensitive) |
| `oot_status_ttl` | `259200` | OOT workflow run status TTL in Redis (seconds) |

**Note:**

Expand All @@ -103,6 +107,8 @@ cd ci-infra/crcr/aws
export TF_VAR_github_app_id=123456
export TF_VAR_github_app_secret=<webhook_secret>
export TF_VAR_github_app_privatekey="$(cat path/to/key.pem)"
export TF_VAR_hud_api_url=<hud_api_url>
export TF_VAR_hud_bot_key=<hud_bot_key>
```

#### Deploy prod
Expand Down Expand Up @@ -133,6 +139,8 @@ The production deployment is handled via the `crcr-deploy-prod.yml` workflow (`w
- `CRCR_GITHUB_APP_ID` - GitHub App ID
- `CRCR_GITHUB_APP_SECRET` - GitHub App webhook secret
- `CRCR_GITHUB_APP_PRIVATEKEY` - PEM-encoded GitHub App private key
- `CRCR_HUD_API_URL` - URL for sending callback data to HUD
- `CRCR_HUD_BOT_KEY` - Key used to access HUD

2. **Trigger the workflow** manually from workflow_dispatch:

Expand All @@ -151,7 +159,7 @@ CRCR follows a four-level progression system. Each level adds more integration b

| Level | Name | Status | Description |
|---|---|---|---|
| **L1** | Events Only | **Current** | Webhook events are forwarded to downstream repos. No feedback to upstream PRs. Downstream repos receive `repository_dispatch` and run CI independently. |
| **L2** | HUD Visibility | developing | Downstream CI results are written to ClickHouse and displayed on a dedicated HUD page (`hud.pytorch.org/oot/[org]/[repo]`). Upstream PRs still show no check status. |
| **L1** | Events Only | running | Webhook events are forwarded to downstream repos. No feedback to upstream PRs. Downstream repos receive `repository_dispatch` and run CI independently. |
| **L2** | HUD Visibility | **Current** | Downstream CI results are written to ClickHouse and displayed on a dedicated HUD page (`hud.pytorch.org/oot/[org]/[repo]`). Upstream PRs still show no check status. |
| **L3** | Label-Triggered PR Checks | developing | A non-blocking Check Run appears on upstream PRs when a `ciflow/oot/<name>` label is added. This is the recommended long-term target for most downstream repos. |
| **L4** | Always-On Blocking Checks | developing | Blocking Check Run auto-triggered for every PR. Reserved for critical accelerators only. Merge is blocked on failure. |
3 changes: 2 additions & 1 deletion crcr/Terrafile
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ terraform-aws-vpc:
rev: "3ffbd46fb1c7733e1b34d8666893280454e27436"
crcr:
source: "pytorch/test-infra"
tag: "v20260408-150242"
tag: "v20260522-142343"
assets:
- "cross-repo-ci-webhook.zip"
- "cross-repo-ci-callback.zip"
asset-folders:
- assets/lambdas-download
61 changes: 61 additions & 0 deletions crcr/aws/callback.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Absolute path to the callback Lambda deployment package. The path is
# relative to the directory Terraform runs from and is resolved with abspath().
locals {
  callback_zip = abspath(
    "../assets/lambdas-download/cross-repo-ci-callback.zip"
  )
}

# Result-callback Lambda, deployed once per environment from the pre-built
# zip referenced by local.callback_zip. It shares the execution role, VPC
# subnets and security group defined elsewhere in this module.
resource "aws_lambda_function" "callback" {
  function_name = "crcr-callback-${var.environment}"
  role          = aws_iam_role.lambda.arn
  tags          = local.tags

  # Deployment package. source_code_hash is derived from the zip contents, so
  # a new artifact produces a different hash and a function update.
  filename         = local.callback_zip
  source_code_hash = filebase64sha256(local.callback_zip)
  runtime          = "python3.13"
  handler          = "callback.lambda_function.lambda_handler"

  # Execution limits.
  memory_size                    = 512
  timeout                        = 60
  reserved_concurrent_executions = 50

  # Runs in the private subnets of the CRCR VPC with the shared Lambda
  # security group.
  vpc_config {
    subnet_ids         = module.crcr_vpc.private_subnets
    security_group_ids = [aws_security_group.lambda.id]
  }

  # Runtime configuration. Lambda environment values must be strings, hence
  # tostring() on the numeric TTL variables.
  # NOTE(review): no HUD key is passed here; presumably the handler reads it
  # from the secret behind SECRET_STORE_ARN - confirm against the handler.
  environment {
    variables = {
      GITHUB_APP_ID         = var.github_app_id
      UPSTREAM_REPO         = var.upstream_repo
      SECRET_STORE_ARN      = local.secret_store_arn
      REDIS_ENDPOINT        = aws_elasticache_replication_group.redis.primary_endpoint_address
      ALLOWLIST_URL         = var.allowlist_url
      ALLOWLIST_TTL_SECONDS = tostring(var.allowlist_ttl)
      OOT_STATUS_TTL        = tostring(var.oot_status_ttl)
      HUD_API_URL           = var.hud_api_url
    }
  }
}

# CloudWatch log group for the callback Lambda. The name is built from the
# function name under the /aws/lambda/ prefix, and retention is set to 90 days.
resource "aws_cloudwatch_log_group" "callback" {
  name              = "/aws/lambda/${aws_lambda_function.callback.function_name}"
  tags              = local.tags
  retention_in_days = 90
}

# Function URL for the callback Lambda. authorization_type "NONE" means the
# URL itself performs no IAM authentication.
# NOTE(review): any caller verification therefore has to happen inside the
# handler - confirm it does.
resource "aws_lambda_function_url" "callback" {
  authorization_type = "NONE"
  function_name      = aws_lambda_function.callback.function_name
}

# Grants every principal permission to call the callback Lambda through its
# function URL; the auth type here matches the URL's "NONE" setting.
resource "aws_lambda_permission" "callback_function_url_invoke" {
  action                 = "lambda:InvokeFunctionUrl"
  principal              = "*"
  function_url_auth_type = "NONE"
  function_name          = aws_lambda_function.callback.function_name
}

# Companion lambda:InvokeFunction grant for the callback Lambda, for every
# principal, with invoked_via_function_url set.
# NOTE(review): per the AWS provider docs this flag should limit the grant to
# invocations arriving via the function URL - confirm for the pinned provider
# version.
resource "aws_lambda_permission" "callback_function_invoke" {
  action                   = "lambda:InvokeFunction"
  principal                = "*"
  invoked_via_function_url = true
  function_name            = aws_lambda_function.callback.function_name
}
5 changes: 5 additions & 0 deletions crcr/aws/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ output "webhook_function_url" {
description = "GitHub App webhook URL; configure the GitHub App webhook as <url>/github/webhook"
}

# Exposes the callback Lambda's function URL as a module output.
output "callback_function_url" {
  description = "Result callback URL; downstream workflows post results to <url>/github/callback"
  value       = aws_lambda_function_url.callback.function_url
}

output "redis_endpoint" {
value = aws_elasticache_replication_group.redis.primary_endpoint_address
description = "Redis primary endpoint"
Expand Down
1 change: 1 addition & 0 deletions crcr/aws/secrets.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ resource "aws_secretsmanager_secret_version" "main" {
GITHUB_APP_SECRET = var.github_app_secret
GITHUB_APP_PRIVATE_KEY = var.github_app_privatekey
REDIS_LOGIN = random_password.redis_password.result
HUD_BOT_KEY = var.hud_bot_key
})
}
26 changes: 26 additions & 0 deletions crcr/aws/security.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Security group attached to Lambda functions in the CRCR VPC. It declares no
# inline rules; rules are attached through separate aws_security_group_rule
# resources that reference this group's id.
resource "aws_security_group" "lambda" {
  vpc_id      = module.crcr_vpc.vpc_id
  name        = "crcr-lambda-sg-${var.environment}"
  description = "Security group for Lambda function"
  tags        = local.tags
}

# Egress rule: lets the Lambda security group open TCP connections on port
# 6379 to members of the Redis security group. On an egress rule,
# source_security_group_id names the destination group.
resource "aws_security_group_rule" "lambda_to_redis" {
  description = "Allow Redis access"
  type        = "egress"

  security_group_id        = aws_security_group.lambda.id
  source_security_group_id = aws_security_group.redis.id

  protocol  = "tcp"
  from_port = 6379
  to_port   = 6379
}

# Egress rule: lets the Lambda security group open TCP connections on port
# 443 to any IPv4 address.
# NOTE(review): the callback Lambda also receives a HUD_API_URL; if that
# endpoint is HTTPS it relies on this rule too, so the description could
# mention HUD - confirm before changing it.
resource "aws_security_group_rule" "lambda_to_https" {
  description = "Allow HTTPS for Secrets Manager and GitHub API"
  type        = "egress"

  security_group_id = aws_security_group.lambda.id
  cidr_blocks       = ["0.0.0.0/0"]

  protocol  = "tcp"
  from_port = 443
  to_port   = 443
}
17 changes: 17 additions & 0 deletions crcr/aws/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,20 @@ variable "availability_zone_suffixes" {
type = list(string)
default = ["a", "b"]
}

# Required input (no default): must be supplied by the caller, e.g. through
# the TF_VAR_hud_api_url environment variable.
variable "hud_api_url" {
  type        = string
  description = "HUD API endpoint for posting downstream workflow results"
}

# Required input (no default). Marked sensitive so Terraform redacts the
# value in plan and apply output.
variable "hud_bot_key" {
  type        = string
  sensitive   = true
  description = "Authorization key for the HUD API"
}

# Optional input; the default of 259200 seconds equals 3 days.
variable "oot_status_ttl" {
  type        = number
  default     = 259200
  description = "OOT workflow run status TTL in Redis (seconds)"
}
29 changes: 1 addition & 28 deletions crcr/aws/webhook.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,12 @@ locals {
webhook_zip = abspath("../assets/lambdas-download/cross-repo-ci-webhook.zip")
}

# Security group for Lambda functions in the CRCR VPC. It declares no inline
# rules; rules are attached through separate aws_security_group_rule resources.
resource "aws_security_group" "lambda" {
  vpc_id      = module.crcr_vpc.vpc_id
  name        = "crcr-lambda-sg-${var.environment}"
  description = "Security group for Lambda function"
  tags        = local.tags
}

# Egress rule: TCP port 6379 from the Lambda security group to the Redis
# security group. On an egress rule, source_security_group_id names the
# destination group.
resource "aws_security_group_rule" "lambda_to_redis" {
  description = "Allow Redis access"
  type        = "egress"

  security_group_id        = aws_security_group.lambda.id
  source_security_group_id = aws_security_group.redis.id

  protocol  = "tcp"
  from_port = 6379
  to_port   = 6379
}

# Egress rule: TCP port 443 from the Lambda security group to any IPv4
# address.
resource "aws_security_group_rule" "lambda_to_https" {
  description = "Allow HTTPS for Secrets Manager and GitHub API"
  type        = "egress"

  security_group_id = aws_security_group.lambda.id
  cidr_blocks       = ["0.0.0.0/0"]

  protocol  = "tcp"
  from_port = 443
  to_port   = 443
}

resource "aws_lambda_function" "webhook" {
function_name = "crcr-webhook-${var.environment}"
role = aws_iam_role.lambda.arn

runtime = "python3.13"
handler = "lambda_function.lambda_handler"
handler = "webhook.lambda_function.lambda_handler"
filename = local.webhook_zip
source_code_hash = filebase64sha256(local.webhook_zip)

Expand Down
Loading