Shopify · ianks · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/ruby/lib/ci/queue/configuration.rb b/ruby/lib/ci/queue/configuration.rb
@@ -99,6 +99,11 @@ def lazy_load_test_helper_paths
         @lazy_load_test_helpers.split(',').map(&:strip)
       end
 
+      def retry?
+        ENV.fetch("BUILDKITE_RETRY_COUNT", "0").to_i > 0 ||
+          ENV["SEMAPHORE_PIPELINE_RERUN"] == "true"
+      end
+
       def queue_init_timeout
         @queue_init_timeout || timeout
       end

diff --git a/ruby/lib/ci/queue/redis/supervisor.rb b/ruby/lib/ci/queue/redis/supervisor.rb
@@ -39,6 +39,23 @@ def wait_for_workers
             yield if block_given?
           end
 
+          # On retry runs (BUILDKITE_RETRY_COUNT > 0), the main queue is already
+          # exhausted from the original run. A retry worker may have found unresolved
+          # failures via the error-reports fallback and be running them via the Retry
+          # queue — but those tests are NOT in the Redis running set so active_workers?
+          # returns false and the loop above exits immediately.
+          #
+          # Wait up to inactive_workers_timeout for retry workers to clear error-reports.
+          # This prevents the summary from canceling retry workers before they finish.
+          if exhausted? && config.retry? && !rescue_connection_errors { build.failed_tests }.empty?
+            @time_left_with_no_workers = config.inactive_workers_timeout
+            until rescue_connection_errors { build.failed_tests }.empty? ||
+                @time_left_with_no_workers <= 0
+              sleep 1
+              @time_left_with_no_workers -= 1
+            end
+          end
+
           exhausted?
         rescue CI::Queue::Redis::LostMaster
           false

diff --git a/ruby/lib/minitest/queue/build_status_reporter.rb b/ruby/lib/minitest/queue/build_status_reporter.rb
@@ -267,8 +267,16 @@ def all_workers_died?
       end
 
       def aggregates
-        success = failures.zero? && errors.zero?
-        failures_count = "#{failures} failures, #{errors} errors,"
+        # error-reports is authoritative when workers die before flushing per-test stats.
+        # Floor the displayed count so the summary line is never misleadingly green.
+        known_error_count = error_reports.size
+        effective_total = [failures + errors, known_error_count].max
+        success = effective_total.zero?
+        failures_count = if failures + errors >= known_error_count
+          "#{failures} failures, #{errors} errors,"
+        else
+          "#{effective_total} failures,"
+        end
 
         step([
           'Ran %d tests, %d assertions,' % [progress, assertions],

diff --git a/ruby/test/ci/queue/redis_supervisor_test.rb b/ruby/test/ci/queue/redis_supervisor_test.rb
@@ -39,6 +39,50 @@ def test_wait_for_workers
     assert_equal true, workers_done
   end
 
+  def test_wait_for_workers_waits_for_retry_workers_to_clear_failures
+    # Simulate a rebuild: queue is already exhausted from the original run
+    poll(worker(1))
+
+    # Inject an unresolved failure into error-reports (as if the original run
+    # recorded a failure but the retry worker hasn't re-run it yet)
+    entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
+    @redis.hset("build:42:error-reports", entry, "{}")
+
+    sup = supervisor(timeout: 2, inactive_workers_timeout: 2)
+
+    with_retry_env do
+      # Simulate a retry worker clearing the failure after a short delay
+      thread = Thread.start do
+        sleep 0.5
+        @redis.hdel("build:42:error-reports", entry)
+      end
+
+      result = sup.wait_for_workers
+      thread.join
+
+      assert_equal true, result
+      assert @redis.hkeys("build:42:error-reports").empty?,
+        "error-reports should be empty after retry worker cleared the failure"
+    end
+  end
+
+  def test_wait_for_workers_does_not_wait_on_non_retry
+    # Same setup as above but WITHOUT retry env set
+    poll(worker(1))
+
+    entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
+    @redis.hset("build:42:error-reports", entry, "{}")
+
+    sup = supervisor(timeout: 30, inactive_workers_timeout: 30)
+
+    started_at = CI::Queue.time_now
+    result = sup.wait_for_workers
+    elapsed = CI::Queue.time_now - started_at
+
+    assert_equal true, result
+    assert_operator elapsed, :<, 2.0, "should return immediately without the retry wait"
+  end
+
   def test_num_workers
     assert_equal 0, @supervisor.workers_count
     worker(1)
@@ -58,14 +102,27 @@ def worker(id)
     ).populate(SharedQueueAssertions::TEST_LIST)
   end
 
-  def supervisor(timeout: 30, queue_init_timeout: nil)
+  def supervisor(timeout: 30, queue_init_timeout: nil, inactive_workers_timeout: nil)
     CI::Queue::Redis::Supervisor.new(
       @redis_url,
       CI::Queue::Configuration.new(
         build_id: '42',
         timeout: timeout,
-        queue_init_timeout: queue_init_timeout
+        queue_init_timeout: queue_init_timeout,
+        inactive_workers_timeout: inactive_workers_timeout,
       ),
     )
   end
+
+  def with_retry_env
+    original = ENV['BUILDKITE_RETRY_COUNT']
+    ENV['BUILDKITE_RETRY_COUNT'] = '1'
+    yield
+  ensure
+    if original.nil?
+      ENV.delete('BUILDKITE_RETRY_COUNT')
+    else
+      ENV['BUILDKITE_RETRY_COUNT'] = original
+    end
+  end
 end
diff --git a/ruby/test/integration/minitest_redis_test.rb b/ruby/test/integration/minitest_redis_test.rb
@@ -977,6 +977,74 @@ def test_rebuild_different_worker_with_no_failures_exits_cleanly
       assert_match(/All tests were ran already/, out)
     end
 
+    def test_report_waits_for_retry_worker_to_clear_failures
+      # Simulates the race condition seen in build 900737:
+      # - Report step starts (BUILDKITE_RETRY_COUNT=1), sees queue exhausted immediately,
+      #   but error-reports still has a failure from the original run.
+      # - A retry worker is concurrently running the failed test.
+      # - Without the fix, report exits immediately and cancels the retry worker.
+      # - With the fix, report waits up to inactive_workers_timeout for
+      #   retry workers to clear error-reports before reporting.
+
+      # First run: worker 1 fails a test
+      out, err = capture_subprocess_io do
+        system(
+          @exe, 'run',
+          '--queue', @redis_url,
+          '--seed', 'foobar',
+          '--build', '1',
+          '--worker', '1',
+          '--timeout', '1',
+          '-Itest',
+          'test/flaky_test.rb',
+          chdir: 'test/fixtures/',
+        )
+      end
+      assert_empty filter_deprecation_warnings(err)
+      assert_match(/1 failures/, normalize(out))
+
+      # Start the report concurrently — it should block waiting for retry workers
+      report_out = nil
+      report_err = nil
+      report_thread = Thread.new do
+        report_out, report_err = capture_subprocess_io do
+          system(
+            { 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'manual' },
+            @exe, 'report',
+            '--queue', @redis_url,
+            '--build', '1',
+            '--timeout', '1',
+            '--inactive-workers-timeout', '10',
+            chdir: 'test/fixtures/',
+          )
+        end
+      end
+
+      # Give the report a moment to start, then run the retry worker which
+      # re-runs the failed test and clears error-reports
+      sleep 0.3
+      out, err = capture_subprocess_io do
+        system(
+          { 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'manual', 'FLAKY_TEST_PASS' => '1' },
+          @exe, 'run',
+          '--queue', @redis_url,
+          '--seed', 'foobar',
+          '--build', '1',
+          '--worker', '2',
+          '--timeout', '1',
+          '-Itest',
+          'test/flaky_test.rb',
+          chdir: 'test/fixtures/',
+        )
+      end
+      assert_empty filter_deprecation_warnings(err)
+      assert_match(/Retrying failed tests/, out)
+
+      report_thread.join(15)
+      assert_empty filter_deprecation_warnings(report_err || '')
+      assert_match(/0 failures/, normalize(report_out || ''))
+    end
+
     def test_retry_fails_when_test_run_is_expired
       out, err = capture_subprocess_io do
         system(

diff --git a/ruby/test/minitest/queue/build_status_recorder_test.rb b/ruby/test/minitest/queue/build_status_recorder_test.rb
@@ -3,17 +3,6 @@
 require 'concurrent/set'
 
 module Minitest::Queue
-  # Lightweight stand-in for a test object in unit tests that don't run real tests.
-  # Holds test_id and file_path directly so no source_location lookup is needed.
-  FakeEntry = Struct.new(:id, :queue_entry, :method_name)
-
-  def self.fake_entry(method_name)
-    test_id = "Minitest::Test##{method_name}"
-    # Use the same file_path as ReporterTestHelper#result so entries match across reserve/record calls
-    file_path = "#{Minitest::Queue.project_root}/test/my_test.rb"
-    FakeEntry.new(test_id, CI::Queue::QueueEntry.format(test_id, file_path), method_name)
-  end
-
   class BuildStatusRecorderTest < Minitest::Test
     include ReporterTestHelper
 

diff --git a/ruby/test/minitest/queue/build_status_reporter_test.rb b/ruby/test/minitest/queue/build_status_reporter_test.rb
@@ -52,6 +52,34 @@ def test_all_workers_died
       assert_includes out, "All workers died."
     end
 
+    def test_aggregate_floors_failures_to_error_reports_when_stats_are_zero
+      # Simulate a worker that recorded a test failure in error-reports but died
+      # before flushing its per-worker stats (stats show 0 failures).
+      queue = worker(1)
+      queue.poll { |_test| queue.shutdown! }
+
+      entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
+      payload = Minitest::Queue::ErrorReport.new(
+        test_name: "test_failure",
+        test_suite: "FakeTest",
+        test_file: "/tmp/fake_test.rb",
+        test_line: 1,
+        error_class: "RuntimeError",
+        output: "FakeTest#test_failure: failed",
+      ).dump
+      @redis.hset("build:42:error-reports", entry, payload)
+
+      @supervisor.instance_variable_set(:@time_left, 1)
+      @supervisor.instance_variable_set(:@time_left_with_no_workers, 0)
+
+      out, _ = capture_subprocess_io do
+        @reporter.report
+      end
+
+      assert_includes out, "1 failures,", "should floor failure count to error_reports.size"
+      refute_includes out, "0 failures, 0 errors,", "should not show misleadingly green zero-failure line"
+    end
+
     private
 
     def worker(id)

diff --git a/ruby/test/support/fake_test_entry.rb b/ruby/test/support/fake_test_entry.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+module Minitest
+  module Queue
+    # Lightweight stand-in for a test object in unit tests that don't run real tests.
+    # Holds test_id and file_path directly so no source_location lookup is needed.
+    FakeEntry = Struct.new(:id, :queue_entry, :method_name)
+
+    def self.fake_entry(method_name)
+      test_id = "Minitest::Test##{method_name}"
+      # Use the same file_path as ReporterTestHelper#result so entries match across reserve/record calls
+      file_path = "#{Minitest::Queue.project_root}/test/my_test.rb"
+      FakeEntry.new(test_id, CI::Queue::QueueEntry.format(test_id, file_path), method_name)
+    end
+  end
+end