From 962a814b849f160ad6bdd924bbfd535fd7d6a080 Mon Sep 17 00:00:00 2001
From: Zachary Levy <zack@bfpower.io>
Date: Sat, 6 Jun 2026 02:38:08 +0000
Subject: [PATCH] Spinlock features (#33)

Co-authored-by: Zachary Levy <zachary@sunforge.is>
Reviewed-on: https://git.bfpower.io/BFPOWER/levlib/pulls/33
---
 levsync/levsync.odin | 189 ++++++++++++++++++++++++++++++-------------
 1 file changed, 132 insertions(+), 57 deletions(-)

diff --git a/levsync/levsync.odin b/levsync/levsync.odin
index e9bbd94..f8c1d44 100644
--- a/levsync/levsync.odin
+++ b/levsync/levsync.odin
@@ -120,10 +120,52 @@ spinlock_try_lock :: #force_inline proc "contextless" (lock: ^Spinlock) -> bool
 	return lock_acquired
 }
 
+// Spins until the lock is acquired, relaxing the CPU between attempts.
+spinlock_lock :: #force_inline proc "contextless" (lock: ^Spinlock) {
+	for !spinlock_try_lock(lock) {
+		intrinsics.cpu_relax()
+	}
+}
+
 spinlock_unlock :: #force_inline proc "contextless" (lock: ^Spinlock) {
 	intrinsics.atomic_store_explicit(lock, false, .Release)
 }
 
+// Spins until the lock is acquired, then unlocks at the end of the calling scope. Always returns
+// true so it can guard a critical section from within an `if`:
+//
+//	if spinlock_guard(&lock) {
+//		// critical section
+//	}
+@(deferred_in = spinlock_unlock)
+spinlock_guard :: #force_inline proc "contextless" (lock: ^Spinlock) -> bool {
+	spinlock_lock(lock)
+	return true
+}
+
+// Tries to acquire the lock once without spinning. Returns true and unlocks at the end of the
+// calling scope if acquired, otherwise returns false and does nothing:
+//
+//	if spinlock_tryguard(&lock) {
+//		// critical section, entered only if the lock was acquired
+//	}
+@(deferred_in_out = spinlock_tryguard_unlock)
+spinlock_tryguard :: #force_inline proc "contextless" (lock: ^Spinlock) -> bool {
+	return spinlock_try_lock(lock)
+}
+
+// Deferred companion of `spinlock_tryguard`; unlocks only when the lock was actually acquired.
+@(private)
+spinlock_tryguard_unlock :: #force_inline proc "contextless" (lock: ^Spinlock, locked: bool) {
+	if locked {
+		spinlock_unlock(lock)
+	}
+}
+
+lock :: proc {
+	spinlock_lock,
+}
+
 try_lock :: proc {
 	spinlock_try_lock,
 }
@@ -132,6 +174,14 @@ unlock :: proc {
 	spinlock_unlock,
 }
 
+guard :: proc {
+	spinlock_guard,
+}
+
+tryguard :: proc {
+	spinlock_tryguard,
+}
+
 // ---------------------------------------------------------------------------------------------------------------------
 // ----- Tests ------------------------
 // ---------------------------------------------------------------------------------------------------------------------
@@ -139,10 +189,10 @@ import "core:sync"
 import "core:testing"
 import "core:thread"
 
+// Multiple threads will each add 1.0 to a shared value ITERATIONS_PER_THREAD times.
+// If any updates are lost due to race conditions, the final sum will be wrong.
 @(test)
 test_concurrent_atomic_add_no_lost_updates :: proc(t: ^testing.T) {
-	// Multiple threads will each add 1.0 this many times.
-	// If any updates are lost due to race conditions, the final sum will be wrong.
 	NUM_THREADS :: 8
 	ITERATIONS_PER_THREAD :: 10_000
 
@@ -184,10 +234,10 @@ test_concurrent_atomic_add_no_lost_updates :: proc(t: ^testing.T) {
 	testing.expect_value(t, shared_value, expected)
 }
 
+// Start with a known value and have multiple threads subtract from it.
+// If any updates are lost due to race conditions, the final result will be wrong.
 @(test)
 test_concurrent_atomic_sub_no_lost_updates :: proc(t: ^testing.T) {
-	// Start with a known value, multiple threads subtract.
-	// If any updates are lost due to race conditions, the final result will be wrong.
 	NUM_THREADS :: 8
 	ITERATIONS_PER_THREAD :: 10_000
 
@@ -228,11 +278,11 @@ test_concurrent_atomic_sub_no_lost_updates :: proc(t: ^testing.T) {
 	testing.expect_value(t, shared_value, 0.0)
 }
 
+// Each thread multiplies by 2.0 then divides by 2.0.
+// Since these are inverses, the final value should equal the starting value
+// regardless of how operations interleave.
 @(test)
 test_concurrent_atomic_mul_div_round_trip :: proc(t: ^testing.T) {
-	// Each thread multiplies by 2.0 then divides by 2.0.
-	// Since these are inverses, the final value should equal the starting value
-	// regardless of how operations interleave.
 	NUM_THREADS :: 8
 	ITERATIONS_PER_THREAD :: 10_000
 
@@ -274,10 +324,10 @@ test_concurrent_atomic_mul_div_round_trip :: proc(t: ^testing.T) {
 	testing.expect_value(t, shared_value, 1000.0)
 }
 
+// Verify the f32 type dispatch works correctly under contention.
+// Same approach as the f64 add test but with f32.
 @(test)
 test_atomic_add_with_f32 :: proc(t: ^testing.T) {
-	// Verify the f32 type dispatch works correctly under contention.
-	// Same approach as the f64 add test but with f32.
 	NUM_THREADS :: 8
 	ITERATIONS_PER_THREAD :: 10_000
 
@@ -319,17 +369,17 @@ test_atomic_add_with_f32 :: proc(t: ^testing.T) {
 	testing.expect_value(t, shared_value, expected)
 }
 
+// Tests that the memory order passed to atomic_float_op's CAS success condition
+// provides full ordering guarantees for the entire float operation.
+//
+// Both sides use atomic_add_float (not raw intrinsics) to verify:
+// - Release on CAS success publishes prior non-atomic writes
+// - Acquire on CAS success makes those writes visible to the reader
+//
+// NOTE: This test may pass even with Relaxed ordering on x86 due to its strong memory model.
+// On ARM or other weak-memory architectures, using Relaxed here would likely cause failures.
 @(test)
 test_atomic_release_acquire_publish_visibility :: proc(t: ^testing.T) {
-	// Tests that the memory order passed to atomic_float_op's CAS success condition
-	// provides full ordering guarantees for the entire float operation.
-	//
-	// Both sides use atomic_add_float (not raw intrinsics) to verify:
-	// - Release on CAS success publishes prior non-atomic writes
-	// - Acquire on CAS success makes those writes visible to the reader
-	//
-	// NOTE: This test may pass even with Relaxed ordering on x86 due to its strong memory model.
-	// On ARM or other weak-memory architectures, using Relaxed here would likely cause failures.
 	NUM_READERS :: 4
 
 	Shared_State :: struct {
@@ -426,17 +476,20 @@ test_atomic_release_acquire_publish_visibility :: proc(t: ^testing.T) {
 	}
 }
 
+// Stress test for every spinlock acquisition variant: N threads contend on a
+// single lock and perform a deliberate non-atomic read-modify-write on shared
+// data. Each iteration rotates through spinlock_try_lock, spinlock_lock,
+// spinlock_guard, and spinlock_tryguard so every variant runs concurrently and
+// must uphold mutual exclusion on the same lock.
+//
+// If mutual exclusion holds:
+//   - `counter` ends at exactly NUM_THREADS * ITERATIONS_PER_THREAD
+//   - `concurrent_holders` never exceeds 1
+//
+// A multi-step RMW (read → relax → write) widens the critical section so
+// any failure to exclude is virtually guaranteed to corrupt the counter.
 @(test)
-test_spinlock_try_lock_mutual_exclusion :: proc(t: ^testing.T) {
-	// Stress test for spinlock_try_lock: N threads spin-acquire the lock and
-	// perform a deliberate non-atomic read-modify-write on shared data.
-	//
-	// If mutual exclusion holds:
-	//   - `counter` ends at exactly NUM_THREADS * ITERATIONS_PER_THREAD
-	//   - `concurrent_holders` never exceeds 1
-	//
-	// A multi-step RMW (read → relax → write) widens the critical section so
-	// any failure to exclude is virtually guaranteed to corrupt the counter.
+test_spinlock_mutual_exclusion :: proc(t: ^testing.T) {
 	NUM_THREADS :: 8
 	ITERATIONS_PER_THREAD :: 50_000
 
@@ -461,6 +514,29 @@ test_spinlock_try_lock_mutual_exclusion :: proc(t: ^testing.T) {
 	barrier: sync.Barrier
 	sync.barrier_init(&barrier, NUM_THREADS)
 
+	// The single critical section every acquisition variant must protect. Sharing
+	// it guarantees they all stress the exact same non-atomic read-modify-write.
+	critical_section :: proc(s: ^Shared) {
+		// Atomically bump the holder count so we can detect overlapping holders.
+		holders := intrinsics.atomic_add_explicit(&s.concurrent_holders, 1, .Relaxed)
+
+		// Track the maximum we ever observed. A plain (non-atomic) update is fine:
+		// it is purely diagnostic and is only written while holding the spinlock.
+		if holders + 1 > s.max_holders {
+			s.max_holders = holders + 1
+		}
+
+		// Non-atomic RMW: read, spin a tiny bit, then write.
+		// This deliberately creates a wide window where a second holder
+		// would cause a lost update.
+		val := s.counter
+		intrinsics.cpu_relax()
+		intrinsics.cpu_relax()
+		s.counter = val + 1
+
+		intrinsics.atomic_sub_explicit(&s.concurrent_holders, 1, .Relaxed)
+	}
+
 	thread_proc :: proc(th: ^thread.Thread) {
 		ctx := cast(^Thread_Data)th.data
 		s := ctx.shared
@@ -468,36 +544,35 @@ test_spinlock_try_lock_mutual_exclusion :: proc(t: ^testing.T) {
 		// All threads rendezvous here for maximum contention.
 		sync.barrier_wait(ctx.barrier)
 
-		for _ in 0 ..< ITERATIONS_PER_THREAD {
-			// Spin on try_lock until we acquire it.
-			for !spinlock_try_lock(&s.lock) {
-				intrinsics.cpu_relax()
+		for i in 0 ..< ITERATIONS_PER_THREAD {
+			// Rotate through every acquisition variant so they all contend on the
+			// same lock simultaneously and must each uphold mutual exclusion.
+			switch i & 3 {
+			case 0:
+				// Manual spin on try_lock until we acquire it.
+				for !spinlock_try_lock(&s.lock) {
+					intrinsics.cpu_relax()
+				}
+				critical_section(s)
+				spinlock_unlock(&s.lock)
+			case 1:
+				// Blocking lock that loops internally until acquired.
+				spinlock_lock(&s.lock)
+				critical_section(s)
+				spinlock_unlock(&s.lock)
+			case 2: // Scoped guard: unlocks automatically at the end of the block.
+					if spinlock_guard(&s.lock) {
+						critical_section(s)
+					}
+			case 3: // Scoped try-guard: retry until acquired, auto-unlocks on success.
+					for {
+						if spinlock_tryguard(&s.lock) {
+							critical_section(s)
+							break
+						}
+						intrinsics.cpu_relax()
+					}
 			}
-
-			// --- critical section start ---
-
-			// Atomically bump the holder count so we can detect overlapping holders.
-			holders := intrinsics.atomic_add_explicit(&s.concurrent_holders, 1, .Relaxed)
-
-			// Track the maximum we ever observed (relaxed is fine, this is
-			// purely diagnostic and protected by the spinlock for writes).
-			if holders + 1 > s.max_holders {
-				s.max_holders = holders + 1
-			}
-
-			// Non-atomic RMW: read, spin a tiny bit, then write.
-			// This deliberately creates a wide window where a second holder
-			// would cause a lost update.
-			val := s.counter
-			intrinsics.cpu_relax()
-			intrinsics.cpu_relax()
-			s.counter = val + 1
-
-			intrinsics.atomic_sub_explicit(&s.concurrent_holders, 1, .Relaxed)
-
-			// --- critical section end ---
-
-			spinlock_unlock(&s.lock)
 		}
 	}