From 1a2bf8ce2e99c172963fd04786051242ffa4db91 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Wed, 17 Jun 2026 02:35:58 -0400 Subject: [PATCH] fix: drive hotfix finalize via trigger-capable state token merge The generated hotfix workflow merged the clean-cherry-pick resolution PR with gh pr merge --auto under the default GITHUB_TOKEN. GitHub completes auto-merge as github-actions[bot], and merges authored by that token do not emit pull_request events, so the pull_request(closed) finalize chain never ran and the target environment state was never recorded after an automated hotfix. Merge the clean resolution PR in a dedicated step authenticated with the configured state token (the same trigger-capable token used for state writes). The step polls PR mergeability before merging, so a protected env branch with a required status check still gates the merge until the check is green, and an unprotected branch merges on the first poll. A state-token-authored merge emits pull_request(closed), so context, build, deploy, and finalize run. Signed-off-by: Joshua Temple --- docs/src/content/docs/workflows.md | 6 +- internal/generate/hotfix.go | 92 ++++++++++++++++++++++++++---- internal/generate/hotfix_test.go | 61 ++++++++++++++------ 3 files changed, 127 insertions(+), 32 deletions(-) diff --git a/docs/src/content/docs/workflows.md b/docs/src/content/docs/workflows.md index f3ac631..3ef84be 100644 --- a/docs/src/content/docs/workflows.md +++ b/docs/src/content/docs/workflows.md @@ -259,7 +259,7 @@ flowchart TD RF["Roll forward first (default)
fix merged to trunk; refused if not an ancestor of trunk tip"] RF -- "env must run base + fix only" --> IB["env/<env> integration branch
created on demand at recorded state SHA"] IB --> CP["cherry-pick onto hotfix/<env>/<short-sha>"] - CP -- "clean" --> PRclean["resolution PR · cascade-hotfix
auto-merge, gated by env checks"] + CP -- "clean" --> PRclean["resolution PR · cascade-hotfix
state_token merge, gated by env checks"] CP -- "conflict" --> PRconf["resolution PR · cascade-hotfix-conflict
markers committed; human force-pushes head"] PRclean --> MERGE["on merge"] PRconf --> MERGE @@ -294,7 +294,7 @@ state: ### Cherry-pick and resolution pull request -A clean cherry-pick opens a pull request labeled `cascade-hotfix` with auto-merge enabled. The required checks configured on `env/` gate the merge, and the pull request is the audit record even when no human touches it. +A clean cherry-pick opens a pull request labeled `cascade-hotfix` and merges it as the configured `state_token`. The apply job polls the pull request until it is mergeable, so the required checks configured on `env/` still gate the merge, and the pull request is the audit record even when no human touches it. The merge runs as `state_token` rather than the default `GITHUB_TOKEN` on purpose: a merge authored by `GITHUB_TOKEN` does not emit the `pull_request` close event, so the build, deploy, and finalize stages would never run and the diverged state would never be recorded. Configure `state_token` with a trigger-capable token (the same one used for state writes) to get the post-merge stages after an automated hotfix. On conflict, the conflicted tree is committed with its conflict markers intact, the branch is pushed, and the pull request is opened labeled `cascade-hotfix-conflict`. Committing the markers makes the resolution pull request a real, checkout-able branch: the diff shows exactly where the conflict is, and a human resolves it locally by force-pushing the head branch. @@ -332,7 +332,7 @@ Its jobs: | Job | Trigger | Role | | --- | --- | --- | | plan | dispatch | Fetch env branches and tags, run `cascade hotfix plan`, surface branch-protection suggestions as `::notice::` lines | -| apply | dispatch (not dry-run) | Cherry-pick onto `hotfix//`, open the resolution pull request (clean auto-merges; conflict opens the labeled resolution pull request) | +| apply | dispatch (not dry-run) | Cherry-pick onto `hotfix//`, open the resolution pull request (clean polls until mergeable then merges as `state_token`; conflict opens the labeled resolution pull request) | | check | open pull request to `env/*` | Validate the manifest while the hotfix pull request is open | | build | merged hotfix | Build the merge SHA, since a cherry-picked commit has no prebuilt artifact | | deploy | merged hotfix | Deploy to the target environment, paired with a rollback job mirroring the promote workflow | diff --git a/internal/generate/hotfix.go b/internal/generate/hotfix.go index 5cc5b38..3f63b37 100644 --- a/internal/generate/hotfix.go +++ b/internal/generate/hotfix.go @@ -30,6 +30,9 @@ const hotfixConflictLabel = "cascade-hotfix-conflict" // The workflow carries two triggers in one file: a workflow_dispatch entry that // plans and applies the cherry-pick, and a pull_request (closed) entry that runs // the build, deploy, and finalize stages when the resolution pull request merges. +// The clean-path merge runs as the configured state token so the merge is +// trigger capable and the pull_request (closed) chain actually fires; a merge +// authored by the default GITHUB_TOKEN would not emit that event. // // This generator is gated on the configured environment count: it emits only // when two or more environments are declared, because a single-environment @@ -48,6 +51,18 @@ func NewHotfixGenerator(cfg *config.TrunkConfig, baseDir string) *HotfixGenerato } } +// getStateTokenRef returns the token expression used to merge the clean-path +// resolution PR. It mirrors the release and promote generators: users configure +// the full expression via the state_token config option, and it defaults to the +// GITHUB_TOKEN expression when unset. The clean-path merge must run as this +// actor so the merge emits a pull_request(closed) event and reaches the build, +// deploy, and finalize chain; the default GITHUB_TOKEN does not emit that event, +// so a hotfix into a repo with no configured state token records no state until +// the operator supplies a trigger-capable token here. +func (g *HotfixGenerator) getStateTokenRef() string { + return g.config.GetStateToken() +} + // Enabled reports whether the hotfix workflow should be emitted. The workflow is // emitted only when the manifest declares two or more environments, since the // first environment is the build target and at least one further environment is @@ -238,9 +253,11 @@ func (g *HotfixGenerator) writePlanJob(sb *strings.Builder) { } // writeApplyJob emits the apply job, run on dispatch when not a dry-run. It -// cherry-picks the commit onto a hotfix branch and opens a resolution PR. Clean -// cherry-picks auto-merge; conflicting ones open a labeled PR for local -// resolution and do not auto-merge. +// cherry-picks the commit onto a hotfix branch and opens a resolution PR. A +// clean cherry-pick is merged by the dedicated merge step as the configured +// state token, which polls until the PR is mergeable so a protected env branch +// with a required check still gates the merge. A conflicting cherry-pick opens a +// labeled PR for local resolution and is merged by a human via the UI. func (g *HotfixGenerator) writeApplyJob(sb *strings.Builder) { sb.WriteString(" apply:\n") sb.WriteString(" name: Apply Hotfix Cherry-Pick\n") @@ -316,15 +333,15 @@ func (g *HotfixGenerator) writeApplyJob(sb *strings.Builder) { fmt.Fprintf(sb, " --label %s \\\n", hotfixLabel) sb.WriteString(" --title \"hotfix(${TARGET_ENV}): cherry-pick ${SHORT_SHA}\" \\\n") sb.WriteString(" --body \"$BODY\"\n") - // Prefer auto-merge so required checks gate the merge on protected - // branches. GitHub rejects enablePullRequestAutoMerge when the target - // branch has no protection rule, so fall back to an immediate squash - // merge. The `if !` guard keeps the failing attempt from tripping - // `set -e` and aborting the step. - sb.WriteString(" if ! gh pr merge --auto --squash \"$BRANCH\"; then\n") - sb.WriteString(" echo \"::notice::auto-merge unavailable (branch likely unprotected); merging directly\"\n") - sb.WriteString(" gh pr merge --squash --delete-branch \"$BRANCH\"\n") - sb.WriteString(" fi\n") + // Hand the resolution branch to the dedicated merge step. The merge runs + // as the configured state token (a trigger-capable actor), which the + // job-level GH_TOKEN is not, so it has to be a separate step with its own + // env. The clean path is the only one that auto-merges; the conflict path + // leaves the merge to a human via the UI. + sb.WriteString(" {\n") + sb.WriteString(" echo \"HOTFIX_BRANCH=$BRANCH\"\n") + sb.WriteString(" echo \"HOTFIX_CLEAN_MERGE=true\"\n") + sb.WriteString(" } >> \"$GITHUB_ENV\"\n") sb.WriteString(" else\n") sb.WriteString(" echo \"::warning::Cherry-pick conflicted; opening resolution PR for manual resolve\"\n") sb.WriteString(" CONFLICTS=$(git diff --name-only --diff-filter=U)\n") @@ -339,6 +356,57 @@ func (g *HotfixGenerator) writeApplyJob(sb *strings.Builder) { sb.WriteString(" --title \"hotfix(${TARGET_ENV}): cherry-pick ${SHORT_SHA} (conflicts)\" \\\n") sb.WriteString(" --body \"$CONFLICT_BODY\"\n") sb.WriteString(" fi\n") + + g.writeCleanMergeStep(sb) +} + +// writeCleanMergeStep emits the clean-path merge step. It runs only after a +// clean cherry-pick (signalled by HOTFIX_CLEAN_MERGE) and merges the resolution +// PR as the configured state token. The merge must be authored by a +// trigger-capable actor: a merge authored by the default GITHUB_TOKEN does not +// emit a pull_request(closed) event, so the build, deploy, and finalize chain +// would never run and the target environment's state would never be recorded. +// +// The step polls PR mergeability before merging, so a protected env branch with +// a required status check still gates the merge until that check is green. An +// unprotected branch reports mergeable on the first poll, so the same loop +// merges immediately. On timeout it fails loudly so the operator sees the stuck +// resolution PR rather than a silently skipped finalize. +func (g *HotfixGenerator) writeCleanMergeStep(sb *strings.Builder) { + sb.WriteString(" - name: Merge clean resolution PR\n") + sb.WriteString(" if: env.HOTFIX_CLEAN_MERGE == 'true'\n") + sb.WriteString(" env:\n") + // Merge as the configured state token so the merge is trigger capable and + // reaches finalize. Defaults to GITHUB_TOKEN when unset; that default does + // not emit the pull_request event, so operators that need finalize after a + // hotfix must configure a trigger-capable state_token. + fmt.Fprintf(sb, " GH_TOKEN: %s\n", g.getStateTokenRef()) + sb.WriteString(" BRANCH: ${{ env.HOTFIX_BRANCH }}\n") + sb.WriteString(" run: |\n") + // Poll mergeability for up to ~5 minutes (20 attempts, 15s apart) so a + // required status check on a protected env branch has time to report. The + // `if` guard keeps an individual non-fatal gh call from aborting the loop; + // only the timeout path exits non-zero. + sb.WriteString(" ATTEMPTS=20\n") + sb.WriteString(" SLEEP=15\n") + sb.WriteString(" MERGED=false\n") + sb.WriteString(" for i in $(seq 1 \"$ATTEMPTS\"); do\n") + sb.WriteString(" STATE=$(gh pr view \"$BRANCH\" --json mergeable,mergeStateStatus -q '.mergeable + \" \" + .mergeStateStatus' 2>/dev/null || echo \"UNKNOWN UNKNOWN\")\n") + sb.WriteString(" MERGEABLE=$(echo \"$STATE\" | cut -d' ' -f1)\n") + sb.WriteString(" STATUS=$(echo \"$STATE\" | cut -d' ' -f2)\n") + sb.WriteString(" echo \"::notice::resolution PR mergeable=$MERGEABLE state=$STATUS (attempt $i/$ATTEMPTS)\"\n") + sb.WriteString(" if [ \"$MERGEABLE\" = \"MERGEABLE\" ] && [ \"$STATUS\" != \"BLOCKED\" ]; then\n") + sb.WriteString(" if gh pr merge --squash --delete-branch \"$BRANCH\"; then\n") + sb.WriteString(" MERGED=true\n") + sb.WriteString(" break\n") + sb.WriteString(" fi\n") + sb.WriteString(" fi\n") + sb.WriteString(" sleep \"$SLEEP\"\n") + sb.WriteString(" done\n") + sb.WriteString(" if [ \"$MERGED\" != \"true\" ]; then\n") + sb.WriteString(" echo \"::error::Resolution PR for $BRANCH did not become mergeable within the timeout; merge it manually to run the hotfix finalize chain\"\n") + sb.WriteString(" exit 1\n") + sb.WriteString(" fi\n") } // writeCheckJob emits the parse-config validity gate that runs while a hotfix PR diff --git a/internal/generate/hotfix_test.go b/internal/generate/hotfix_test.go index 03352f3..d1fe6ae 100644 --- a/internal/generate/hotfix_test.go +++ b/internal/generate/hotfix_test.go @@ -150,27 +150,54 @@ func TestHotfixGenerator_CleanPath(t *testing.T) { require.NoError(t, err) assert.Contains(t, content, "--label cascade-hotfix") - assert.Contains(t, content, "gh pr merge --auto") -} - -// TestHotfixGenerator_CleanPathMergeFallback guards the regression where the -// clean cherry-pick path merged with `gh pr merge --auto`, which calls GitHub's -// enablePullRequestAutoMerge mutation. GitHub rejects that mutation on a branch -// with no protection rule, so the hotfix step exited non-zero under `set -e` and -// the PR was left open on unprotected env branches. The clean path must prefer -// auto-merge but fall back to an immediate squash merge when auto-merge cannot -// be enabled, and the `--auto` attempt must be guarded so its failure does not -// abort the step. -func TestHotfixGenerator_CleanPathMergeFallback(t *testing.T) { + assert.Contains(t, content, "gh pr merge --squash --delete-branch \"$BRANCH\"") +} + +// TestHotfixGenerator_CleanPathPATMerge guards the structural fix where the +// clean cherry-pick path merged with `gh pr merge --auto`. GitHub auto-merge +// completes the merge as github-actions[bot] under GITHUB_TOKEN, and merges +// authored by GITHUB_TOKEN do not emit pull_request events, so the +// pull_request(closed) finalize chain never fired and state was never recorded +// after a hotfix. The clean path must instead poll until the resolution PR is +// mergeable (so a protected env branch with a required check still gates the +// merge) and then merge with the configured state token, which is trigger +// capable and reaches the finalize chain. +func TestHotfixGenerator_CleanPathPATMerge(t *testing.T) { + cfg := threeEnvHotfixConfig() + cfg.StateToken = "${{ secrets.CASCADE_BOT_TOKEN }}" + gen := NewHotfixGenerator(cfg, "") + content, err := gen.Generate() + require.NoError(t, err) + + // The clean path no longer relies on auto-merge, which suppresses the + // finalize trigger. + assert.NotContains(t, content, "gh pr merge --auto", + "clean path must not use auto-merge; it suppresses the pull_request finalize trigger") + + // The merge step authenticates with the configured state token so the + // merge actor is trigger capable and finalize is reachable. + assert.Contains(t, content, "GH_TOKEN: ${{ secrets.CASCADE_BOT_TOKEN }}", + "the clean-path merge step must authenticate with the configured state token") + + // A poll-until-mergeable construct gates the merge so a protected env + // branch with a required check still blocks until the check is green. + assert.Contains(t, content, "gh pr view \"$BRANCH\" --json mergeable", + "the clean path must poll PR mergeability before merging") + assert.Contains(t, content, "gh pr merge --squash --delete-branch \"$BRANCH\"", + "the clean path must squash-merge once the PR is mergeable") +} + +// TestHotfixGenerator_CleanPathMergeDefaultsToGitHubToken confirms that when no +// state token is configured the merge step degrades to the default +// GITHUB_TOKEN expression, matching the state-write token plumbing used by the +// release and promote generators. +func TestHotfixGenerator_CleanPathMergeDefaultsToGitHubToken(t *testing.T) { gen := NewHotfixGenerator(threeEnvHotfixConfig(), "") content, err := gen.Generate() require.NoError(t, err) - // The auto-merge attempt is guarded by `if !` so a failure on an - // unprotected branch does not trip `set -e` and abort the step. - assert.Contains(t, content, "if ! gh pr merge --auto --squash \"$BRANCH\"; then") - // The fallback merges immediately when auto-merge is unavailable. - assert.Contains(t, content, "gh pr merge --squash --delete-branch \"$BRANCH\"") + assert.NotContains(t, content, "gh pr merge --auto") + assert.Contains(t, content, "GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}") } // TestHotfixGenerator_SeedsLabels guards the regression where the apply job ran