From c459a9ee0598a1797a8cc72a7bacc53f0288ba14 Mon Sep 17 00:00:00 2001
From: Ian Ker-Seymer <ian.kerseymer@shopify.com>
Date: Tue, 24 Mar 2026 23:23:03 -0400
Subject: [PATCH] Don't count requeued test failures toward circuit breaker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

report_failure! was called before the requeue check, so successfully
requeued tests incremented the consecutive failure counter. With
max_consecutive_failures=N and N+ deterministic failures that are all
requeued, the circuit breaker fired prematurely and the worker exited,
stranding requeued tests in the queue with no worker to process them.

Now report_failure!/report_success! is only called when a test genuinely
fails or passes. Requeued tests are invisible to the circuit breaker —
they don't increment or reset the consecutive failure counter.
---
 ruby/lib/minitest/queue.rb                   |  8 +++---
 ruby/test/ci/queue/redis_test.rb             | 21 +++++++++++++++
 ruby/test/integration/minitest_redis_test.rb | 28 +++++++++++++++++++-
 ruby/test/support/queue_helpers.rb           |  8 ++++--
 4 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/ruby/lib/minitest/queue.rb b/ruby/lib/minitest/queue.rb
index 09849cb7..6eb7a054 100644
--- a/ruby/lib/minitest/queue.rb
+++ b/ruby/lib/minitest/queue.rb
@@ -180,14 +180,14 @@ def handle_test_result(reporter, example, result)
           # When we do a bisect, we don't care about the result other than the test we're running the bisect on
           result.mark_as_flaked!
           failed = false
-        elsif failed
-          queue.report_failure!
-        else
-          queue.report_success!
         end
 
         if failed && CI::Queue.requeueable?(result) && queue.requeue(example)
           result.requeue!
+        elsif failed
+          queue.report_failure!
+        else
+          queue.report_success!
         end
         reporter.record(result)
       end
diff --git a/ruby/test/ci/queue/redis_test.rb b/ruby/test/ci/queue/redis_test.rb
index e518f538..29c94120 100644
--- a/ruby/test/ci/queue/redis_test.rb
+++ b/ruby/test/ci/queue/redis_test.rb
@@ -270,6 +270,27 @@ def test_initialise_from_rediss_uri
     assert_instance_of CI::Queue::Redis::Worker, queue
   end
 
+  def test_circuit_breaker_does_not_count_requeued_failures
+    # Bug: report_failure! was called before the requeue check, so successfully
+    # requeued tests incremented the consecutive failure counter. With
+    # max_consecutive_failures=3 and 3+ deterministic failures that are all
+    # requeued, the circuit breaker fired prematurely and the worker exited,
+    # stranding requeued tests in the queue with no worker to process them.
+    queue = worker(1, max_requeues: 5, requeue_tolerance: 1.0, max_consecutive_failures: 3)
+
+    # All tests fail (deterministic failures that get requeued)
+    tests_processed = poll(queue, false)
+
+    # With 4 tests and max_requeues=5, all tests should be processed multiple
+    # times (requeued after each failure). The circuit breaker should NOT fire
+    # because requeued failures are transient, not consecutive "real" failures.
+    assert tests_processed.size > TEST_LIST.size,
+      "Expected tests to be requeued and re-processed, but only #{tests_processed.size} " \
+      "tests were processed (circuit breaker likely fired prematurely). " \
+      "Circuit breaker open? #{queue.config.circuit_breakers.any?(&:open?)}"
+  end
+
+
   private
 
   def shuffled_test_list
diff --git a/ruby/test/integration/minitest_redis_test.rb b/ruby/test/integration/minitest_redis_test.rb
index cdce6803..322dfa93 100644
--- a/ruby/test/integration/minitest_redis_test.rb
+++ b/ruby/test/integration/minitest_redis_test.rb
@@ -296,9 +296,35 @@ def test_circuit_breaker
         )
       end
 
+      # Requeued failures don't count toward the circuit breaker. The worker
+      # processes more tests than the old behavior (which stopped at 3).
+      # Exact count depends on queue ordering, but must be > 3.
+      output = normalize(out.lines.last.strip)
+      ran_count = output.match(/Ran (\d+) tests/)[1].to_i
+      assert ran_count > 3,
+        "Expected more than 3 tests to run (requeues shouldn't trip breaker), got: #{output}"
+    end
+
+    def test_circuit_breaker_without_requeues
+      out, err = capture_subprocess_io do
+        system(
+          @exe, 'run',
+          '--queue', @redis_url,
+          '--seed', 'foobar',
+          '--build', '1',
+          '--worker', '1',
+          '--timeout', '1',
+          '--max-requeues', '0',
+          '--max-consecutive-failures', '3',
+          '-Itest',
+          'test/failing_test.rb',
+          chdir: 'test/fixtures/',
+        )
+      end
+
       assert_equal "This worker is exiting early because it encountered too many consecutive test failures, probably because of some corrupted state.\n", err
       output = normalize(out.lines.last.strip)
-      assert_equal 'Ran 3 tests, 3 assertions, 0 failures, 0 errors, 0 skips, 3 requeues in X.XXs', output
+      assert_equal 'Ran 3 tests, 3 assertions, 3 failures, 0 errors, 0 skips, 0 requeues in X.XXs', output
     end
 
     def test_redis_runner
diff --git a/ruby/test/support/queue_helpers.rb b/ruby/test/support/queue_helpers.rb
index 2db7549f..0e2fd3bc 100644
--- a/ruby/test/support/queue_helpers.rb
+++ b/ruby/test/support/queue_helpers.rb
@@ -9,8 +9,12 @@ def poll(queue, success = true)
       test_order << test
       failed = !(success.respond_to?(:call) ? success.call(test) : success)
       if failed
-        queue.report_failure!
-        queue.requeue(test) || queue.acknowledge(test.id)
+        if queue.requeue(test)
+          # Requeued — don't report to circuit breaker
+        else
+          queue.report_failure!
+          queue.acknowledge(test.id)
+        end
       else
         queue.report_success!
         queue.acknowledge(test.id)