Skip to content

Commit 879bc71

Browse files
committed
only add semaphore to waitingList once per version
TrackLastVersion is called once per store during IN_PROGRESS. Each call creates a new semaphore and overwrites lastVersionTransactionsDone, orphaning the previous one in the waitingList. DecrementActiveTransactions only releases the last one. ProcessWaitingListAsync blocks forever on the orphaned semaphore. Since both stores share the same transaction counter, we only need one semaphore per version. If TrackLastVersion has already been called for a given version, subsequent calls return immediately. Includes a regression test that fails without the fix.
1 parent 3610cd6 commit 879bc71

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

libs/storage/Tsavorite/cs/src/core/Index/Checkpointing/StateMachineDriver.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,12 @@ void DecrementActiveTransactions(long txnVersion)
6666

6767
internal void TrackLastVersion(long version)
6868
{
69+
// Only create and enqueue one semaphore per version, if we create a
70+
// new one on each call, the earlier semaphore is orphaned in the waitingList
71+
// and never released, and we permanently block ProcessWaitingListAsync.
72+
if (lastVersion == version)
73+
return;
74+
6975
if (GetNumActiveTransactions(version) > 0)
7076
{
7177
// Set version number first, then create semaphore

libs/storage/Tsavorite/cs/test/StateMachineDriverTests.cs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,4 +401,49 @@ public async ValueTask GrowIndexVersionSwitchTxnTest(
401401
[Values] bool useTimingFuzzing)
402402
=> await DoGrowIndexVersionSwitchEquivalenceCheck(indexSize, useTimingFuzzing).ConfigureAwait(false);
403403
}
404+
405+
/// <summary>
406+
/// Regression test for checkpoint deadlock with two-store checkpoints.
407+
///
408+
/// TrackLastVersion is called once per store during the IN_PROGRESS phase.
409+
/// Without the fix, the second call overwrites lastVersionTransactionsDone,
410+
/// orphaning the first semaphore in the waitingList. ProcessWaitingListAsync
411+
/// then waits on it forever.
412+
/// </summary>
413+
[TestFixture]
414+
public class TrackLastVersionTwoStoreDeadlock
415+
{
416+
[Test]
417+
public async Task TrackLastVersionCalledTwiceDoesNotDeadlock()
418+
{
419+
var epoch = new LightEpoch();
420+
var driver = new StateMachineDriver(epoch);
421+
422+
// Simulate an active transaction (e.g. Lua script touching both stores)
423+
var txnVersion = driver.AcquireTransactionVersion();
424+
425+
// GlobalAfterEnteringState calls TrackLastVersion once per store
426+
driver.TrackLastVersion(txnVersion); // MainStore
427+
driver.TrackLastVersion(txnVersion); // ObjectStore
428+
429+
// Transaction completes
430+
driver.EndTransaction(txnVersion);
431+
432+
// Verify all waitingList semaphores are released (not orphaned)
433+
var waitingList = (System.Collections.Generic.List<System.Threading.SemaphoreSlim>)
434+
typeof(StateMachineDriver)
435+
.GetField("waitingList", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)
436+
.GetValue(driver);
437+
438+
ClassicAssert.AreEqual(1, waitingList.Count,
439+
"Expected 1 semaphore in waitingList, not 2. " +
440+
"Two means the second TrackLastVersion call created a new semaphore " +
441+
"that overwrote the first, orphaning it.");
442+
443+
var acquired = await waitingList[0].WaitAsync(System.TimeSpan.FromSeconds(5));
444+
ClassicAssert.IsTrue(acquired,
445+
"Semaphore was not released after EndTransaction. " +
446+
"This causes ProcessWaitingListAsync to deadlock permanently.");
447+
}
448+
}
404449
}

0 commit comments

Comments
 (0)