Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions ruby/lib/ci/queue/configuration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ def lazy_load_test_helper_paths
@lazy_load_test_helpers.split(',').map(&:strip)
end

def retry?
ENV.fetch("BUILDKITE_RETRY_COUNT", "0").to_i > 0 ||
ENV["SEMAPHORE_PIPELINE_RERUN"] == "true"
end

def queue_init_timeout
@queue_init_timeout || timeout
end
Expand Down
17 changes: 17 additions & 0 deletions ruby/lib/ci/queue/redis/supervisor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,23 @@ def wait_for_workers
yield if block_given?
end

# On retry runs (BUILDKITE_RETRY_COUNT > 0), the main queue is already
# exhausted from the original run. A retry worker may have found unresolved
# failures via the error-reports fallback and be running them via the Retry
# queue — but those tests are NOT in the Redis running set so active_workers?
# returns false and the loop above exits immediately.
#
# Wait up to inactive_workers_timeout for retry workers to clear error-reports.
# This prevents the summary from canceling retry workers before they finish.
if exhausted? && config.retry? && !rescue_connection_errors { build.failed_tests }.empty?
@time_left_with_no_workers = config.inactive_workers_timeout
until rescue_connection_errors { build.failed_tests }.empty? ||
@time_left_with_no_workers <= 0
sleep 1
@time_left_with_no_workers -= 1
end
end

exhausted?
rescue CI::Queue::Redis::LostMaster
false
Expand Down
12 changes: 10 additions & 2 deletions ruby/lib/minitest/queue/build_status_reporter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,16 @@ def all_workers_died?
end

def aggregates
success = failures.zero? && errors.zero?
failures_count = "#{failures} failures, #{errors} errors,"
# error-reports is authoritative when workers die before flushing per-test stats.
# Floor the displayed count so the summary line is never misleadingly green.
known_error_count = error_reports.size
effective_total = [failures + errors, known_error_count].max
success = effective_total.zero?
failures_count = if failures + errors >= known_error_count
"#{failures} failures, #{errors} errors,"
else
"#{effective_total} failures,"
end

step([
'Ran %d tests, %d assertions,' % [progress, assertions],
Expand Down
61 changes: 59 additions & 2 deletions ruby/test/ci/queue/redis_supervisor_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,50 @@ def test_wait_for_workers
assert_equal true, workers_done
end

def test_wait_for_workers_waits_for_retry_workers_to_clear_failures
# Simulate a rebuild: queue is already exhausted from the original run
poll(worker(1))

# Inject an unresolved failure into error-reports (as if the original run
# recorded a failure but the retry worker hasn't re-run it yet)
entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
@redis.hset("build:42:error-reports", entry, "{}")

sup = supervisor(timeout: 2, inactive_workers_timeout: 2)

with_retry_env do
# Simulate a retry worker clearing the failure after a short delay
thread = Thread.start do
sleep 0.5
@redis.hdel("build:42:error-reports", entry)
end

result = sup.wait_for_workers
thread.join

assert_equal true, result
assert @redis.hkeys("build:42:error-reports").empty?,
"error-reports should be empty after retry worker cleared the failure"
end
end

def test_wait_for_workers_does_not_wait_on_non_retry
# Same setup as above but WITHOUT retry env set
poll(worker(1))

entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
@redis.hset("build:42:error-reports", entry, "{}")

sup = supervisor(timeout: 30, inactive_workers_timeout: 30)

started_at = CI::Queue.time_now
result = sup.wait_for_workers
elapsed = CI::Queue.time_now - started_at

assert_equal true, result
assert_operator elapsed, :<, 2.0, "should return immediately without the retry wait"
end

def test_num_workers
assert_equal 0, @supervisor.workers_count
worker(1)
Expand All @@ -58,14 +102,27 @@ def worker(id)
).populate(SharedQueueAssertions::TEST_LIST)
end

def supervisor(timeout: 30, queue_init_timeout: nil)
def supervisor(timeout: 30, queue_init_timeout: nil, inactive_workers_timeout: nil)
CI::Queue::Redis::Supervisor.new(
@redis_url,
CI::Queue::Configuration.new(
build_id: '42',
timeout: timeout,
queue_init_timeout: queue_init_timeout
queue_init_timeout: queue_init_timeout,
inactive_workers_timeout: inactive_workers_timeout,
),
)
end

def with_retry_env
original = ENV['BUILDKITE_RETRY_COUNT']
ENV['BUILDKITE_RETRY_COUNT'] = '1'
yield
ensure
if original.nil?
ENV.delete('BUILDKITE_RETRY_COUNT')
else
ENV['BUILDKITE_RETRY_COUNT'] = original
end
end
end
68 changes: 68 additions & 0 deletions ruby/test/integration/minitest_redis_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,74 @@ def test_rebuild_different_worker_with_no_failures_exits_cleanly
assert_match(/All tests were ran already/, out)
end

def test_report_waits_for_retry_worker_to_clear_failures
# Simulates the race condition seen in build 900737:
# - Report step starts (BUILDKITE_RETRY_COUNT=1), sees queue exhausted immediately,
# but error-reports still has a failure from the original run.
# - A retry worker is concurrently running the failed test.
# - Without the fix, report exits immediately and cancels the retry worker.
# - With the fix, report waits up to inactive_workers_timeout for
# retry workers to clear error-reports before reporting.

# First run: worker 1 fails a test
out, err = capture_subprocess_io do
system(
@exe, 'run',
'--queue', @redis_url,
'--seed', 'foobar',
'--build', '1',
'--worker', '1',
'--timeout', '1',
'-Itest',
'test/flaky_test.rb',
chdir: 'test/fixtures/',
)
end
assert_empty filter_deprecation_warnings(err)
assert_match(/1 failures/, normalize(out))

# Start the report concurrently — it should block waiting for retry workers
report_out = nil
report_err = nil
report_thread = Thread.new do
report_out, report_err = capture_subprocess_io do
system(
{ 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'manual' },
@exe, 'report',
'--queue', @redis_url,
'--build', '1',
'--timeout', '1',
'--inactive-workers-timeout', '10',
chdir: 'test/fixtures/',
)
end
end

# Give the report a moment to start, then run the retry worker which
# re-runs the failed test and clears error-reports
sleep 0.3
out, err = capture_subprocess_io do
system(
{ 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'manual', 'FLAKY_TEST_PASS' => '1' },
@exe, 'run',
'--queue', @redis_url,
'--seed', 'foobar',
'--build', '1',
'--worker', '2',
'--timeout', '1',
'-Itest',
'test/flaky_test.rb',
chdir: 'test/fixtures/',
)
end
assert_empty filter_deprecation_warnings(err)
assert_match(/Retrying failed tests/, out)

report_thread.join(15)
assert_empty filter_deprecation_warnings(report_err || '')
assert_match(/0 failures/, normalize(report_out || ''))
end

def test_retry_fails_when_test_run_is_expired
out, err = capture_subprocess_io do
system(
Expand Down
11 changes: 0 additions & 11 deletions ruby/test/minitest/queue/build_status_recorder_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,6 @@
require 'concurrent/set'

module Minitest::Queue
# Lightweight stand-in for a test object in unit tests that don't run real tests.
# Holds test_id and file_path directly so no source_location lookup is needed.
FakeEntry = Struct.new(:id, :queue_entry, :method_name)

def self.fake_entry(method_name)
test_id = "Minitest::Test##{method_name}"
# Use the same file_path as ReporterTestHelper#result so entries match across reserve/record calls
file_path = "#{Minitest::Queue.project_root}/test/my_test.rb"
FakeEntry.new(test_id, CI::Queue::QueueEntry.format(test_id, file_path), method_name)
end

class BuildStatusRecorderTest < Minitest::Test
include ReporterTestHelper

Expand Down
28 changes: 28 additions & 0 deletions ruby/test/minitest/queue/build_status_reporter_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,34 @@ def test_all_workers_died
assert_includes out, "All workers died."
end

def test_aggregate_floors_failures_to_error_reports_when_stats_are_zero
# Simulate a worker that recorded a test failure in error-reports but died
# before flushing its per-worker stats (stats show 0 failures).
queue = worker(1)
queue.poll { |_test| queue.shutdown! }

entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
payload = Minitest::Queue::ErrorReport.new(
test_name: "test_failure",
test_suite: "FakeTest",
test_file: "/tmp/fake_test.rb",
test_line: 1,
error_class: "RuntimeError",
output: "FakeTest#test_failure: failed",
).dump
@redis.hset("build:42:error-reports", entry, payload)

@supervisor.instance_variable_set(:@time_left, 1)
@supervisor.instance_variable_set(:@time_left_with_no_workers, 0)

out, _ = capture_subprocess_io do
@reporter.report
end

assert_includes out, "1 failures,", "should floor failure count to error_reports.size"
refute_includes out, "0 failures, 0 errors,", "should not show misleadingly green zero-failure line"
end

private

def worker(id)
Expand Down
15 changes: 15 additions & 0 deletions ruby/test/support/fake_test_entry.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# frozen_string_literal: true
module Minitest
module Queue
# Lightweight stand-in for a test object in unit tests that don't run real tests.
# Holds test_id and file_path directly so no source_location lookup is needed.
FakeEntry = Struct.new(:id, :queue_entry, :method_name)

def self.fake_entry(method_name)
test_id = "Minitest::Test##{method_name}"
# Use the same file_path as ReporterTestHelper#result so entries match across reserve/record calls
file_path = "#{Minitest::Queue.project_root}/test/my_test.rb"
FakeEntry.new(test_id, CI::Queue::QueueEntry.format(test_id, file_path), method_name)
end
end
end
Loading