From c459a9ee0598a1797a8cc72a7bacc53f0288ba14 Mon Sep 17 00:00:00 2001 From: Ian Ker-Seymer Date: Tue, 24 Mar 2026 23:23:03 -0400 Subject: [PATCH] Don't count requeued test failures toward circuit breaker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit report_failure! was called before the requeue check, so successfully requeued tests incremented the consecutive failure counter. With max_consecutive_failures=N and N+ deterministic failures that are all requeued, the circuit breaker fired prematurely and the worker exited, stranding requeued tests in the queue with no worker to process them. Now report_failure!/report_success! is only called when a test genuinely fails or passes. Requeued tests are invisible to the circuit breaker — they don't increment or reset the consecutive failure counter. --- ruby/lib/minitest/queue.rb | 8 +++--- ruby/test/ci/queue/redis_test.rb | 21 +++++++++++++++ ruby/test/integration/minitest_redis_test.rb | 28 +++++++++++++++++++- ruby/test/support/queue_helpers.rb | 8 ++++-- 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/ruby/lib/minitest/queue.rb b/ruby/lib/minitest/queue.rb index 09849cb7..6eb7a054 100644 --- a/ruby/lib/minitest/queue.rb +++ b/ruby/lib/minitest/queue.rb @@ -180,14 +180,14 @@ def handle_test_result(reporter, example, result) # When we do a bisect, we don't care about the result other than the test we're running the bisect on result.mark_as_flaked! failed = false - elsif failed - queue.report_failure! - else - queue.report_success! end if failed && CI::Queue.requeueable?(result) && queue.requeue(example) result.requeue! + elsif failed + queue.report_failure! + else + queue.report_success! end reporter.record(result) end diff --git a/ruby/test/ci/queue/redis_test.rb b/ruby/test/ci/queue/redis_test.rb index e518f538..29c94120 100644 --- a/ruby/test/ci/queue/redis_test.rb +++ b/ruby/test/ci/queue/redis_test.rb @@ -270,6 +270,27 @@ def test_initialise_from_rediss_uri assert_instance_of CI::Queue::Redis::Worker, queue end + def test_circuit_breaker_does_not_count_requeued_failures + # Bug: report_failure! was called before the requeue check, so successfully + # requeued tests incremented the consecutive failure counter. With + # max_consecutive_failures=3 and 3+ deterministic failures that are all + # requeued, the circuit breaker fired prematurely and the worker exited, + # stranding requeued tests in the queue with no worker to process them. + queue = worker(1, max_requeues: 5, requeue_tolerance: 1.0, max_consecutive_failures: 3) + + # All tests fail (deterministic failures that get requeued) + tests_processed = poll(queue, false) + + # With 4 tests and max_requeues=5, all tests should be processed multiple + # times (requeued after each failure). The circuit breaker should NOT fire + # because requeued failures are transient, not consecutive "real" failures. + assert tests_processed.size > TEST_LIST.size, + "Expected tests to be requeued and re-processed, but only #{tests_processed.size} " \ + "tests were processed (circuit breaker likely fired prematurely). " \ + "Circuit breaker open? #{queue.config.circuit_breakers.any?(&:open?)}" + end + + private def shuffled_test_list diff --git a/ruby/test/integration/minitest_redis_test.rb b/ruby/test/integration/minitest_redis_test.rb index cdce6803..322dfa93 100644 --- a/ruby/test/integration/minitest_redis_test.rb +++ b/ruby/test/integration/minitest_redis_test.rb @@ -296,9 +296,35 @@ def test_circuit_breaker ) end + # Requeued failures don't count toward the circuit breaker. The worker + # processes more tests than the old behavior (which stopped at 3). + # Exact count depends on queue ordering, but must be > 3. + output = normalize(out.lines.last.strip) + ran_count = output.match(/Ran (\d+) tests/)[1].to_i + assert ran_count > 3, + "Expected more than 3 tests to run (requeues shouldn't trip breaker), got: #{output}" + end + + def test_circuit_breaker_without_requeues + out, err = capture_subprocess_io do + system( + @exe, 'run', + '--queue', @redis_url, + '--seed', 'foobar', + '--build', '1', + '--worker', '1', + '--timeout', '1', + '--max-requeues', '0', + '--max-consecutive-failures', '3', + '-Itest', + 'test/failing_test.rb', + chdir: 'test/fixtures/', + ) + end + assert_equal "This worker is exiting early because it encountered too many consecutive test failures, probably because of some corrupted state.\n", err output = normalize(out.lines.last.strip) - assert_equal 'Ran 3 tests, 3 assertions, 0 failures, 0 errors, 0 skips, 3 requeues in X.XXs', output + assert_equal 'Ran 3 tests, 3 assertions, 3 failures, 0 errors, 0 skips, 0 requeues in X.XXs', output end def test_redis_runner diff --git a/ruby/test/support/queue_helpers.rb b/ruby/test/support/queue_helpers.rb index 2db7549f..0e2fd3bc 100644 --- a/ruby/test/support/queue_helpers.rb +++ b/ruby/test/support/queue_helpers.rb @@ -9,8 +9,12 @@ def poll(queue, success = true) test_order << test failed = !(success.respond_to?(:call) ? success.call(test) : success) if failed - queue.report_failure! - queue.requeue(test) || queue.acknowledge(test.id) + if queue.requeue(test) + # Requeued — don't report to circuit breaker + else + queue.report_failure! + queue.acknowledge(test.id) + end else queue.report_success! queue.acknowledge(test.id)