Skip to content

Commit 41e662e

Browse files
authored
Merge pull request #333 from ably/refactor/dependabot-workflow-rewrite
refactor(ci): rewrite fix-dependabot to capture all CI failures
2 parents 9270f64 + 62c46e7 commit 41e662e

File tree

1 file changed

+201
-69
lines changed

1 file changed

+201
-69
lines changed

.github/workflows/dependabot-lockfile.yml

Lines changed: 201 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,21 @@ on:
66

77
permissions:
88
actions: read
9+
checks: read
910
contents: write
1011
pull-requests: write
1112

13+
concurrency:
14+
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
15+
cancel-in-progress: true
16+
1217
jobs:
13-
fix-dependabot:
18+
regen-lockfile:
1419
runs-on: ubuntu-latest
15-
timeout-minutes: 30
20+
timeout-minutes: 10
21+
outputs:
22+
skip: ${{ steps.guard.outputs.skip }}
23+
head_sha: ${{ steps.get-sha.outputs.sha }}
1624

1725
steps:
1826
- name: Check if Dependabot PR
@@ -89,99 +97,223 @@ jobs:
8997
echo "changed=true" >> "$GITHUB_OUTPUT"
9098
fi
9199
92-
- name: Try building
100+
- name: Get HEAD SHA
93101
if: steps.guard.outputs.skip != 'true'
94-
id: build
95-
continue-on-error: true
96-
run: |
97-
set -o pipefail
98-
pnpm install --frozen-lockfile
99-
pnpm run build 2>&1 | tee /tmp/build-output.txt
100-
101-
- name: Try linting
102-
if: steps.guard.outputs.skip != 'true' && steps.build.outcome == 'success'
103-
id: lint
104-
continue-on-error: true
105-
run: |
106-
set -o pipefail
107-
pnpm exec eslint . 2>&1 | tee /tmp/lint-output.txt
102+
id: get-sha
103+
run: echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
108104

109-
- name: Try testing
110-
if: steps.guard.outputs.skip != 'true' && steps.build.outcome == 'success'
111-
id: test
112-
continue-on-error: true
113-
run: |
114-
set -o pipefail
115-
failed=0
116-
pnpm test:unit 2>&1 | tee /tmp/test-output.txt || failed=1
117-
pnpm --filter @ably/react-web-cli test 2>&1 | tee -a /tmp/test-output.txt || failed=1
118-
exit $failed
105+
fix-failures:
106+
needs: regen-lockfile
107+
if: needs.regen-lockfile.outputs.skip != 'true'
108+
runs-on: ubuntu-latest
109+
timeout-minutes: 45
119110

120-
- name: Check if fixes needed
121-
if: steps.guard.outputs.skip != 'true'
122-
id: needs-fix
123-
run: |
124-
if [[ "${{ steps.build.outcome }}" == "failure" || "${{ steps.lint.outcome }}" == "failure" || "${{ steps.test.outcome }}" == "failure" ]]; then
125-
echo "needed=true" >> "$GITHUB_OUTPUT"
126-
else
127-
echo "needed=false" >> "$GITHUB_OUTPUT"
128-
fi
111+
steps:
112+
- name: Generate App Token
113+
id: generate-token
114+
uses: actions/create-github-app-token@v3
115+
with:
116+
app-id: ${{ secrets.CI_APP_ID }}
117+
private-key: ${{ secrets.CI_APP_PRIVATE_KEY }}
129118

130-
- name: Capture error output
131-
if: steps.needs-fix.outputs.needed == 'true'
132-
id: errors
119+
- name: Wait for CI checks to complete
120+
id: wait-for-checks
121+
env:
122+
GH_TOKEN: ${{ github.token }}
123+
HEAD_SHA: ${{ needs.regen-lockfile.outputs.head_sha }}
124+
REPO: ${{ github.repository }}
133125
run: |
134-
{
135-
echo "build_output<<ENDOFOUTPUT"
136-
if [ -f /tmp/build-output.txt ]; then
137-
tail -n 200 /tmp/build-output.txt
138-
else
139-
echo "No build output captured"
126+
# Default output so downstream steps have a defined value even if this step fails
127+
echo "failed_count=0" >> "$GITHUB_OUTPUT"
128+
129+
POLL_INTERVAL=30
130+
MAX_POLL_TIME=1500 # 25 minutes
131+
INITIAL_WAIT=60
132+
133+
# Checks to skip: our own workflow jobs, Vercel (prefix match), PR tooling
134+
# NOTE: keep in sync — if you rename jobs in other workflows, update here
135+
SKIP_PATTERN="^(regen-lockfile|fix-failures|Vercel.*|claude-review|generate-overview|Generate PR Overview)$"
136+
137+
# Expected CI checks and their source workflows:
138+
# test -> test.yml (unit + lint + integration)
139+
# e2e-cli -> e2e-tests.yml (CLI E2E)
140+
# audit -> audit.yml (security audit)
141+
# setup -> e2e-web-cli-parallel.yml (Web CLI E2E build prep)
142+
EXPECTED_CHECKS=("test" "e2e-cli" "setup" "audit")
143+
MIN_EXPECTED=3
144+
145+
echo "Waiting for CI checks on SHA: $HEAD_SHA"
146+
echo "SHA source: regen-lockfile job output (may be a new commit if lockfile was pushed)"
147+
echo "Initial wait of ${INITIAL_WAIT}s for checks to be queued..."
148+
sleep "$INITIAL_WAIT"
149+
150+
start_time=$(date +%s)
151+
152+
while true; do
153+
elapsed=$(( $(date +%s) - start_time ))
154+
if [[ $elapsed -ge $MAX_POLL_TIME ]]; then
155+
echo "::warning::Timed out after ${MAX_POLL_TIME}s waiting for checks"
156+
if [[ -n "$ci_checks" ]]; then
157+
still_pending=$(echo "$ci_checks" | jq -c 'select(.status != "completed")' | jq -r '.name' 2>/dev/null || true)
158+
if [[ -n "$still_pending" ]]; then
159+
echo "::warning::Still pending at timeout: ${still_pending}"
160+
fi
161+
fi
162+
break
140163
fi
141-
echo "ENDOFOUTPUT"
142-
echo "lint_output<<ENDOFOUTPUT"
143-
if [ -f /tmp/lint-output.txt ]; then
144-
tail -n 200 /tmp/lint-output.txt
145-
else
146-
echo "Lint was not run"
164+
165+
# Fetch all check runs for this SHA (handles pagination)
166+
all_checks=$(gh api "repos/${REPO}/commits/${HEAD_SHA}/check-runs" \
167+
--paginate \
168+
--jq '.check_runs[] | {name: .name, status: .status, conclusion: .conclusion, details_url: .details_url}' \
169+
2>/dev/null) || {
170+
echo "::warning::API call failed (elapsed: ${elapsed}s), retrying in 10s..."
171+
sleep 10
172+
continue
173+
}
174+
175+
# Filter out non-CI checks
176+
ci_checks=$(echo "$all_checks" | jq -c "select(.name | test(\"${SKIP_PATTERN}\") | not)" 2>/dev/null)
177+
178+
if [[ -z "$ci_checks" ]]; then
179+
echo "No CI checks found yet (elapsed: ${elapsed}s), waiting..."
180+
sleep "$POLL_INTERVAL"
181+
continue
182+
fi
183+
184+
# Count how many expected checks have appeared
185+
appeared=0
186+
for check_name in "${EXPECTED_CHECKS[@]}"; do
187+
if echo "$ci_checks" | jq -e "select(.name == \"${check_name}\")" > /dev/null 2>&1; then
188+
appeared=$((appeared + 1))
189+
fi
190+
done
191+
192+
# Keep polling until at least MIN_EXPECTED of the EXPECTED_CHECKS have appeared,
# but only during the first 300 seconds of polling; after that this gate no
# longer applies and the loop falls through to the completion check below.
if [[ $appeared -lt $MIN_EXPECTED && $elapsed -lt 300 ]]; then
193+
echo "Only ${appeared}/${MIN_EXPECTED} expected checks appeared (elapsed: ${elapsed}s), waiting..."
194+
sleep "$POLL_INTERVAL"
195+
continue
147196
fi
148-
echo "ENDOFOUTPUT"
149-
echo "test_output<<ENDOFOUTPUT"
150-
if [ -f /tmp/test-output.txt ]; then
151-
tail -n 200 /tmp/test-output.txt
152-
else
153-
echo "Tests were not run"
197+
198+
# Check if all CI checks are completed
199+
total=$(echo "$ci_checks" | jq -s 'length')
200+
pending=$(echo "$ci_checks" | jq -c 'select(.status != "completed")' | jq -s 'length')
201+
202+
echo "Check status: $((total - pending))/${total} completed (elapsed: ${elapsed}s)"
203+
204+
if [[ "$pending" -eq 0 && "$total" -gt 0 ]]; then
205+
echo "All CI checks completed."
206+
break
154207
fi
155-
echo "ENDOFOUTPUT"
208+
209+
sleep "$POLL_INTERVAL"
210+
done
211+
212+
# Fail explicitly if we timed out without ever receiving check data
213+
if [[ $elapsed -ge $MAX_POLL_TIME && -z "$ci_checks" ]]; then
214+
echo "::error::Timed out waiting for CI checks — no check data received"
215+
exit 1
216+
fi
217+
218+
# Collect failures (include cancelled — usually means an upstream job failed)
219+
failed_checks=$(echo "$ci_checks" | jq -c 'select(.conclusion == "failure" or .conclusion == "cancelled")' 2>/dev/null)
220+
failed_count=0
221+
if [[ -n "$failed_checks" ]]; then
222+
failed_count=$(echo "$failed_checks" | jq -s 'length')
223+
fi
224+
225+
echo "failed_count=${failed_count}" >> "$GITHUB_OUTPUT"
226+
227+
if [[ "$failed_count" -eq 0 ]]; then
228+
echo "All checks passed! Nothing to fix."
229+
exit 0
230+
fi
231+
232+
echo "Found ${failed_count} failed check(s)"
233+
234+
# List failed check names
235+
failed_names=$(echo "$failed_checks" | jq -r '.name' | sort)
236+
echo "Failed: ${failed_names}"
237+
238+
# Extract unique workflow run IDs from details_url
239+
# URL format: https://github.com/{owner}/{repo}/actions/runs/{run_id}/job/{job_id}
240+
run_ids=$(echo "$failed_checks" | jq -r '.details_url' | sed -n 's|.*/runs/\([0-9]*\)/.*|\1|p' | sort -u)
241+
242+
# Fetch failed logs for each workflow run
243+
failure_logs=""
244+
for run_id in $run_ids; do
245+
run_name=$(gh api "repos/${REPO}/actions/runs/${run_id}" --jq '.name' 2>/dev/null || echo "unknown")
246+
run_url="https://github.com/${REPO}/actions/runs/${run_id}"
247+
echo "Fetching failed logs for: ${run_name} (run ${run_id})..."
248+
# Test gh's own exit status. In a `gh ... | tail` pipeline the status is
# tail's unless pipefail is enabled, so a `|| fallback` after the pipeline
# would never run when gh fails.
if raw_logs=$(gh run view "$run_id" --repo "$REPO" --log-failed 2>&1); then
  # Keep only the last 500 lines to bound the size of the step output.
  logs=$(printf '%s\n' "$raw_logs" | tail -n 500)
else
  logs="Failed to fetch logs. View manually: ${run_url}"
fi
249+
250+
failure_logs="${failure_logs}
251+
=== Failed workflow: ${run_name} (run ${run_id}) ===
252+
URL: ${run_url}
253+
${logs}
254+
255+
"
256+
done
257+
258+
# Write outputs using randomised delimiters to avoid collision with log content
259+
delim_summary="EOF_$(openssl rand -hex 16)"
260+
delim_logs="EOF_$(openssl rand -hex 16)"
261+
{
262+
echo "failure_summary<<${delim_summary}"
263+
# Join the newline-separated check names as "a, b, c". tr translates one
# character to one character, so it cannot emit the two-character ", "
# separator and leaves a trailing comma; paste joins the lines with ","
# and sed then adds the space after each comma.
echo "Failed checks: $(echo "$failed_names" | paste -sd ',' - | sed 's/,/, /g')"
264+
echo "${delim_summary}"
265+
echo "failure_logs<<${delim_logs}"
266+
echo "$failure_logs"
267+
echo "${delim_logs}"
156268
} >> "$GITHUB_OUTPUT"
157269
158-
- name: Fix issues with Claude
159-
if: steps.needs-fix.outputs.needed == 'true'
270+
- name: Checkout Dependabot branch
271+
if: steps.wait-for-checks.outputs.failed_count > 0
272+
uses: actions/checkout@v6
273+
with:
274+
ref: ${{ github.event.pull_request.head.ref }}
275+
token: ${{ steps.generate-token.outputs.token }}
276+
277+
- name: Set up pnpm
278+
if: steps.wait-for-checks.outputs.failed_count > 0
279+
uses: pnpm/action-setup@v5
280+
with:
281+
version: 10
282+
283+
- name: Set up Node.js
284+
if: steps.wait-for-checks.outputs.failed_count > 0
285+
uses: actions/setup-node@v6
286+
with:
287+
node-version: "22.x"
288+
289+
- name: Install dependencies
290+
if: steps.wait-for-checks.outputs.failed_count > 0
291+
run: pnpm install --frozen-lockfile
292+
293+
- name: Fix failures with Claude
294+
if: steps.wait-for-checks.outputs.failed_count > 0
160295
uses: anthropics/claude-code-action@v1
161296
with:
162297
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
163298
github_token: ${{ steps.generate-token.outputs.token }}
164299
allowed_bots: "dependabot[bot]"
165300
prompt: |
166301
This is a Dependabot PR that bumps dependencies. The lockfile has been
167-
regenerated but the build, lint, or tests are failing.
302+
regenerated, but CI checks are failing.
168303
169304
Read .claude/CLAUDE.md for project context.
170305
171-
## Errors
306+
## Failed Checks
172307
173-
Build output (if failed):
174-
${{ steps.errors.outputs.build_output }}
308+
${{ steps.wait-for-checks.outputs.failure_summary }}
175309
176-
Lint output (if failed):
177-
${{ steps.errors.outputs.lint_output }}
310+
## Failure Logs
178311
179-
Test output (if failed):
180-
${{ steps.errors.outputs.test_output }}
312+
${{ steps.wait-for-checks.outputs.failure_logs }}
181313
182314
## Instructions
183315
184-
1. Diagnose why the build/lint/tests fail after the dependency bump
316+
1. Analyze ALL the failure logs above to understand what broke
185317
2. Make the MINIMUM changes needed to fix it — do not refactor unrelated code
186318
3. Run `pnpm run build`, `pnpm exec eslint .`, `pnpm test:unit`, and `pnpm --filter @ably/react-web-cli test` to verify your fixes
187319
4. Commit your changes with a descriptive message

0 commit comments

Comments
 (0)