Don't count requeued test failures toward circuit breaker

ianks · ChrisBr · commit c459a9ee0598 · 2026-04-09T17:36:30.000+01:00
report_failure! was called before the requeue check, so successfully
requeued tests incremented the consecutive failure counter. With
max_consecutive_failures=N and N+ deterministic failures that are all
requeued, the circuit breaker fired prematurely and the worker exited,
stranding requeued tests in the queue with no worker to process them.

Now report_failure!/report_success! is only called when a test genuinely
fails or passes. Requeued tests are invisible to the circuit breaker —
they don't increment or reset the consecutive failure counter.
diff --git a/ruby/lib/minitest/queue.rb b/ruby/lib/minitest/queue.rb
@@ -180,14 +180,14 @@ def handle_test_result(reporter, example, result)
           # When we do a bisect, we don't care about the result other than the test we're running the bisect on
           result.mark_as_flaked!
           failed = false
-        elsif failed
-          queue.report_failure!
-        else
-          queue.report_success!
         end
 
         if failed && CI::Queue.requeueable?(result) && queue.requeue(example)
           result.requeue!
+        elsif failed
+          queue.report_failure!
+        else
+          queue.report_success!
         end
         reporter.record(result)
       end
diff --git a/ruby/test/ci/queue/redis_test.rb b/ruby/test/ci/queue/redis_test.rb
@@ -270,6 +270,27 @@ def test_initialise_from_rediss_uri
     assert_instance_of CI::Queue::Redis::Worker, queue
   end
 
+  def test_circuit_breaker_does_not_count_requeued_failures
+    # Bug: report_failure! was called before the requeue check, so successfully
+    # requeued tests incremented the consecutive failure counter. With
+    # max_consecutive_failures=3 and 3+ deterministic failures that are all
+    # requeued, the circuit breaker fired prematurely and the worker exited,
+    # stranding requeued tests in the queue with no worker to process them.
+    queue = worker(1, max_requeues: 5, requeue_tolerance: 1.0, max_consecutive_failures: 3)
+
+    # All tests fail (deterministic failures that get requeued)
+    tests_processed = poll(queue, false)
+
+    # With 4 tests and max_requeues=5, all tests should be processed multiple
+    # times (requeued after each failure). The circuit breaker should NOT fire
+    # because requeued failures are transient, not consecutive "real" failures.
+    assert tests_processed.size > TEST_LIST.size,
+      "Expected tests to be requeued and re-processed, but only #{tests_processed.size} " \
+      "tests were processed (circuit breaker likely fired prematurely). " \
+      "Circuit breaker open? #{queue.config.circuit_breakers.any?(&:open?)}"
+  end
+
+
   private
 
   def shuffled_test_list
diff --git a/ruby/test/integration/minitest_redis_test.rb b/ruby/test/integration/minitest_redis_test.rb
@@ -296,9 +296,35 @@ def test_circuit_breaker
         )
       end
 
+      # Requeued failures don't count toward the circuit breaker. The worker
+      # processes more tests than the old behavior (which stopped at 3).
+      # Exact count depends on queue ordering, but must be > 3.
+      output = normalize(out.lines.last.strip)
+      ran_count = output.match(/Ran (\d+) tests/)[1].to_i
+      assert ran_count > 3,
+        "Expected more than 3 tests to run (requeues shouldn't trip breaker), got: #{output}"
+    end
+
+    def test_circuit_breaker_without_requeues
+      out, err = capture_subprocess_io do
+        system(
+          @exe, 'run',
+          '--queue', @redis_url,
+          '--seed', 'foobar',
+          '--build', '1',
+          '--worker', '1',
+          '--timeout', '1',
+          '--max-requeues', '0',
+          '--max-consecutive-failures', '3',
+          '-Itest',
+          'test/failing_test.rb',
+          chdir: 'test/fixtures/',
+        )
+      end
+
       assert_equal "This worker is exiting early because it encountered too many consecutive test failures, probably because of some corrupted state.\n", err
       output = normalize(out.lines.last.strip)
-      assert_equal 'Ran 3 tests, 3 assertions, 0 failures, 0 errors, 0 skips, 3 requeues in X.XXs', output
+      assert_equal 'Ran 3 tests, 3 assertions, 3 failures, 0 errors, 0 skips, 0 requeues in X.XXs', output
     end
 
     def test_redis_runner
diff --git a/ruby/test/support/queue_helpers.rb b/ruby/test/support/queue_helpers.rb
@@ -9,8 +9,12 @@ def poll(queue, success = true)
       test_order << test
       failed = !(success.respond_to?(:call) ? success.call(test) : success)
       if failed
-        queue.report_failure!
-        queue.requeue(test) || queue.acknowledge(test.id)
+        if queue.requeue(test)
+          # Requeued — don't report to circuit breaker
+        else
+          queue.report_failure!
+          queue.acknowledge(test.id)
+        end
       else
         queue.report_success!
         queue.acknowledge(test.id)