2
0
mirror of https://github.com/esiur/esiur-dotnet.git synced 2026-06-13 14:38:43 +00:00

Deadlock tests

This commit is contained in:
2026-06-03 13:02:56 +03:00
parent 3dc36149b7
commit 2431166f25
25 changed files with 2160 additions and 157 deletions
@@ -0,0 +1,14 @@
<!--
  Console executable for the distributed deadlock test. Targets net10.0 with implicit global usings
  and nullable reference types enabled.
-->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<!--
  References the Esiur library project. OutputItemType="Analyzer" also hands the project output to the
  compiler as an analyzer. NOTE(review): presumably this is what lets the Esiur source generator run
  in this project; confirm against Esiur.csproj.
-->
<ProjectReference Include="..\..\..\..\Libraries\Esiur\Esiur.csproj" OutputItemType="Analyzer"/>
</ItemGroup>
</Project>
@@ -0,0 +1,160 @@
// ============================================================
// Distributed deadlock test — CLIENT NODE
// Connects to the server, fetches the resource graph concurrently, and classifies each run as
// COMPLETED, DEADLOCKED, or SLOW using a progress (stall) detector — deadlock is detected as the
// absence of attachment progress while requests are still pending, NOT as a blunt timeout, so it is
// distinguished from slow WAN processing. Reports completion-time distribution, cycle-break and
// unnecessary-placeholder counts, and the published-state of delivered resources.
//
// Usage:
// dotnet run -- --host SERVER_IP --port 10950 --nodes 8 --mode WaitWithCycleDetection --iterations 20
// dotnet run -- --host SERVER_IP --port 10950 --nodes 4 --roots 0 --mode WaitWithCycleDetection (single-root cycle)
// dotnet run -- --host SERVER_IP --port 10950 --nodes 8 --mode NaiveWait (control: deadlocks)
// Modes: WaitWithCycleDetection (default) | NaiveWait | LegacyCrossChainPlaceholder
// ============================================================
using System.Collections;
using System.Diagnostics;
using Esiur.Protocol;
using Esiur.Resource;
// ---- command-line options ---------------------------------------------------------------------
var host = GetArg(args, "--host", "127.0.0.1");
var port = int.Parse(GetArg(args, "--port", "10950"));
var nodeCount = int.Parse(GetArg(args, "--nodes", "100"));
// The documented default is the production resolver; NaiveWait is the control that is expected to deadlock.
var modeArg = GetArg(args, "--mode", "WaitWithCycleDetection");
var iterations = int.Parse(GetArg(args, "--iterations", "20"));
var stallMs = int.Parse(GetArg(args, "--stall-ms", "5000"));
var hardMs = int.Parse(GetArg(args, "--hard-ms", "60000"));
var rootsArg = GetArg(args, "--roots", "all");

if (!Enum.TryParse<DeadlockResolutionMode>(modeArg, ignoreCase: true, out var mode))
{
    Console.WriteLine($"Unknown --mode '{modeArg}'. Use WaitWithCycleDetection | NaiveWait | LegacyCrossChainPlaceholder.");
    return;
}

// "all" fetches sys/n0 .. sys/n{nodeCount-1}; otherwise --roots is a comma-separated list of node indices.
var roots = rootsArg.Equals("all", StringComparison.OrdinalIgnoreCase)
    ? Enumerable.Range(0, nodeCount).Select(i => $"sys/n{i}").ToArray()
    : rootsArg.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
        .Select(s => $"sys/n{int.Parse(s)}").ToArray();

Console.WriteLine($"[Client] {host}:{port} nodes={nodeCount} mode={mode} roots={roots.Length} " +
    $"iterations={iterations} stallMs={stallMs} hardMs={hardMs}");
Console.WriteLine($"[Client] {"iter",-5}{"outcome",-14}{"ms",10}{"breaks",10}{"unnec",8}{"unpublished",13}");

var rows = new List<(int iter, string outcome, double ms, long breaks, long unnec, int unpublished)>();
for (var it = 0; it < iterations; it++)
{
    // Fresh warehouse + connection per iteration so the per-connection counters start at 0.
    var wh = new Warehouse();
    EpConnection con;
    try { con = await wh.Get<EpConnection>($"ep://{host}:{port}"); }
    catch (Exception ex) { Console.WriteLine($"[Client] connect failed: {ex.Message}"); return; }

    con.DeadlockResolution = mode;
    Console.WriteLine($"[Client] iter {it + 1}: connected, fetching {roots.Length} roots...");

    var (outcome, ms, results) = await Classify(con, roots, stallMs, hardMs);
    // results is null for every outcome other than "Completed"; -1 marks "not measured".
    var unpublished = results == null ? -1 : CountUnpublished(results);

    // Snapshot the counters once so the recorded row and the printed line cannot disagree if the
    // connection is still doing work (e.g. after a Deadlocked / SlowTimeout classification).
    long breaks = con.CycleBreakCount;
    long unnec = con.UnnecessaryPlaceholderCount;
    rows.Add((it + 1, outcome, ms, breaks, unnec, unpublished));
    Console.WriteLine($"[Client] {it + 1,-5}{outcome,-14}{ms,10:F1}{breaks,10}{unnec,8}{unpublished,13}");

    // Best-effort teardown: a failure here must not abort the remaining iterations, but it is reported.
    try { con.Destroy(); }
    catch (Exception ex) { Console.WriteLine($"[Client] destroy failed: {ex.Message}"); }
}

// ---- summary ----------------------------------------------------------------------------------
var completed = rows.Where(r => r.outcome == "Completed").ToList();
var times = completed.Select(r => r.ms).OrderBy(x => x).ToList();
// Percentile over the sorted completion times; the index is clamped to the last element.
double Pct(double p) => times.Count == 0 ? 0 : times[(int)Math.Min(times.Count - 1, p * times.Count)];

Console.WriteLine();
Console.WriteLine($"[Client] === summary ({mode}) ===");
Console.WriteLine($" completed={completed.Count} deadlocked={rows.Count(r => r.outcome == "Deadlocked")} " +
    $"slow={rows.Count(r => r.outcome == "SlowTimeout")} faulted={rows.Count(r => r.outcome == "Faulted")}");
Console.WriteLine($" completion ms: median={Pct(0.5):F1} p99={Pct(0.99):F1} max={(times.Count > 0 ? times[^1] : 0):F1}");
Console.WriteLine($" cycle-breaks total={rows.Sum(r => r.breaks)} unnecessary-placeholders total={rows.Sum(r => r.unnec)}");
Console.WriteLine($" partial deliveries (unpublished>0) in {rows.Count(r => r.unpublished > 0)}/{rows.Count} runs");

// CSV rows are formatted with the invariant culture: a comma decimal separator in "ms" would otherwise
// add a column on machines whose current culture uses ',' for decimals.
var csv = "iteration,outcome,ms,cycle_breaks,unnecessary_placeholders,unpublished\n" +
    string.Join("\n", rows.Select(r =>
        FormattableString.Invariant($"{r.iter},{r.outcome},{r.ms:F1},{r.breaks},{r.unnec},{r.unpublished}")));
var outFile = $"deadlock_{mode}_{host}_{port}.csv";
await File.WriteAllTextAsync(outFile, csv);
Console.WriteLine($"[Client] results written to {outFile}");
Console.ReadLine();
// ---- stall-based classification ---------------------------------------------------------------
// Fires fetches for all roots and classifies the run. The returned outcome is one of:
//   "Completed"   - every fetch resolved; results holds the roots that were delivered as EpResource.
//   "Faulted"     - all fetches finished and at least one of them reported an error.
//   "Deadlocked"  - fetches are still pending but con.AttachedResourceCount has not changed for stallMs.
//   "SlowTimeout" - the run was still making progress when hardMs elapsed.
// ms is the elapsed time at the moment of classification; results is null unless the outcome is "Completed".
// Progress is sampled roughly every 25 ms. The stall check runs before the hard-limit check.
static async Task<(string outcome, double ms, EpResource[]? results)> Classify(
    EpConnection con, string[] roots, int stallMs, int hardMs)
{
    var tasks = roots.Select(p =>
    {
        // Bridge the Then/Error callbacks to a Task. RunContinuationsAsynchronously keeps the code that
        // awaits these tasks (this method and, after it returns, the caller) from being resumed inline
        // on whichever thread invokes the callback.
        var tcs = new TaskCompletionSource<IResource?>(TaskCreationOptions.RunContinuationsAsynchronously);
        con.Get(p)
            .Then(r => tcs.TrySetResult(r as IResource))
            .Error(ex => { Console.WriteLine($"[Client] Get({p}) error: {ex.Message}"); tcs.TrySetException((Exception)ex); });
        return tcs.Task;
    }).ToArray();

    var all = Task.WhenAll(tasks);
    var sw = Stopwatch.StartNew();
    var lastProgress = con.AttachedResourceCount;
    var lastProgressMs = 0.0;

    while (true)
    {
        // Wake up when everything finished or after one polling interval, whichever comes first.
        await Task.WhenAny(all, Task.Delay(25));

        if (all.IsCompletedSuccessfully)
        {
            sw.Stop();
            return ("Completed", sw.Elapsed.TotalMilliseconds, all.Result.OfType<EpResource>().ToArray());
        }
        if (all.IsFaulted)
        {
            sw.Stop();
            return ("Faulted", sw.Elapsed.TotalMilliseconds, null);
        }

        // Any change in the attached-resource counter counts as progress and restarts the stall window.
        var progress = con.AttachedResourceCount;
        if (progress != lastProgress)
        {
            lastProgress = progress;
            lastProgressMs = sw.Elapsed.TotalMilliseconds;
        }

        if (sw.Elapsed.TotalMilliseconds - lastProgressMs >= stallMs)
        {
            sw.Stop();
            return ("Deadlocked", sw.Elapsed.TotalMilliseconds, null);
        }
        if (sw.Elapsed.TotalMilliseconds >= hardMs)
        {
            sw.Stop();
            return ("SlowTimeout", sw.Elapsed.TotalMilliseconds, null);
        }
    }
}
// Breadth-first walk from the delivered roots that tallies every distinct resource (keyed by
// ResourceInstanceId) whose Status is not Published, i.e. one that reached the application before its
// dependency graph was fully attached. Children are read from property index 1 (Links) and only
// followed for resources whose Status is Attached.
static int CountUnpublished(EpResource[] roots)
{
    var visited = new HashSet<uint>();
    var pending = new Queue<EpResource>(roots);
    var count = 0;

    while (pending.TryDequeue(out var current))
    {
        // Skip null entries and anything already counted.
        if (current == null || !visited.Add(current.ResourceInstanceId))
            continue;

        if (current.Status != ResourceStatus.Published)
            count++;

        if (current.Status != ResourceStatus.Attached)
            continue;
        if (!current.TryGetPropertyValue((byte)1, out var linksObj) || linksObj is not IEnumerable links)
            continue;

        foreach (var linked in links.OfType<EpResource>())
            pending.Enqueue(linked);
    }

    return count;
}
// Returns the argument that follows the first occurrence of key, or def when key is absent or is the
// last argument (so it has no value after it).
static string GetArg(string[] args, string key, string def)
{
    var keyIndex = Array.IndexOf(args, key);
    if (keyIndex < 0)
        return def;

    var valueIndex = keyIndex + 1;
    return valueIndex < args.Length ? args[valueIndex] : def;
}
+86
View File
@@ -0,0 +1,86 @@
# Distributed deadlock test (two nodes / WAN)
Two console apps that evaluate the recursive-attachment deadlock-prevention algorithm over a real
TCP connection between two machines:
- **Server** (`Server/`) hosts a configurable graph of `Node` resources whose references may form
cycles, and prints the *cycle census* of the deployed graph (so the experiment can state that
circular dependencies were actually generated).
- **Client** (`Client/`) connects, fetches the graph concurrently, and classifies each run as
**Completed / Deadlocked / Slow** using a *stall detector* — a deadlock is detected as the absence
of attachment progress while requests are still pending, which distinguishes it from slow WAN
processing rather than relying on a blunt timeout.
Authentication is disabled (`AllowUnauthorizedAccess`, anonymous `None` mode), so no credentials are
needed.
## Build
```
dotnet build Tests/Distribution/Deadlock/Server/Esiur.Tests.Deadlock.Server.csproj -c Release
dotnet build Tests/Distribution/Deadlock/Client/Esiur.Tests.Deadlock.Client.csproj -c Release
```
## Run
**On node A (server):**
```
dotnet run --project Tests/Distribution/Deadlock/Server -c Release -- \
--port 10950 --topology ring --nodes 8
```
It prints, e.g.: `topology=ring nodes=8 edges=8 cyclic=True backEdges=1` and the node count to pass
to the client. Leave it running (Ctrl+C to stop).
Topologies (`--topology`):
| name | cyclic | description |
|------|:------:|-------------|
| `ring` | yes | `i → (i+1) mod n`; every node fetched as an independent request (cross-chain cycles) |
| `cycle` | yes | single-root cycle `0→1→…→n-1→0` (fetch only `--roots 0`) |
| `complete` | yes | every ordered pair `i → j` |
| `staggered` | no | two roots share a deep dependency reached at different depths (stresses non-cyclic contention; `--nodes` is derived) |
| `random` | usually | Erdős–Rényi directed graph (`--nodes`, `--seed`, `--edge-prob`) |
| `chain` | no | acyclic control `0→1→…→n-1` |
| `diamond` | no | acyclic control |
**On node B (client):**
```
dotnet run --project Tests/Distribution/Deadlock/Client -c Release -- \
--host <NODE_A_IP> --port 10950 --nodes 8 \
--mode WaitWithCycleDetection --iterations 20 --stall-ms 5000 --hard-ms 60000
```
Modes (`--mode`):
- `WaitWithCycleDetection` (default, the production algorithm) — completes; breaks only genuine cycles.
- `NaiveWait` (control) — no cycle handling; **deadlocks** on any cyclic graph (detected via the stall window).
- `LegacyCrossChainPlaceholder` — for reference only.
Other client options: `--roots all|0,1,2` (which nodes to fetch; default all `n0..n{N-1}`),
`--stall-ms` (no-progress window ⇒ deadlock; set comfortably above your WAN round-trip × graph depth),
`--hard-ms` (progress-but-unfinished ⇒ slow).
## Output
The client prints per-iteration rows and a summary, and writes `deadlock_<mode>_<host>_<port>.csv`:
```
iteration,outcome,ms,cycle_breaks,unnecessary_placeholders,unpublished
```
- `outcome` — `Completed` / `Deadlocked` / `SlowTimeout` / `Faulted` (a fetch returned an error).
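- `ms` — elapsed fetch time (for deadlocked rows: the time of the last observed progress plus the stall window, i.e. roughly the stall window when no progress was made at all).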
- `cycle_breaks` — placeholders returned to break a cycle on this connection.
- `unnecessary_placeholders` — placeholders returned where no genuine cycle existed (always 0 for the
production resolver; non-zero only for the legacy reference mode).
- `unpublished` — resources delivered to the application whose dependency graph was not fully attached
at delivery (`-1` for any run that did not complete: deadlocked, slow-timeout or faulted).
## Suggested WAN runs for the paper
1. **Detection works and cycles exist.** Server `--topology ring --nodes 8`; client
`--mode WaitWithCycleDetection` (expect all *Completed*, `cycle_breaks > 0`) and then
`--mode NaiveWait` (expect *Deadlocked* — validates the detector on the same cyclic graph).
2. **Random pool census.** Server `--topology random --nodes 12 --seed 20260603`; the server prints
whether the deployed graph is cyclic; run the client in `WaitWithCycleDetection`.
3. **Threshold justification.** Compare the client's reported completion `ms` (median/p99) against
`--stall-ms`; the stall window should be orders of magnitude larger.
@@ -0,0 +1,14 @@
<!--
  Console executable for the distributed deadlock test. Targets net10.0 with implicit global usings
  and nullable reference types enabled.
-->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<!--
  References the Esiur library project. OutputItemType="Analyzer" also hands the project output to the
  compiler as an analyzer. NOTE(review): presumably this is what lets the Esiur source generator process
  the [Resource] partial classes of this project; confirm against Esiur.csproj.
-->
<ProjectReference Include="..\..\..\..\Libraries\Esiur\Esiur.csproj" OutputItemType="Analyzer"/>
</ItemGroup>
</Project>
@@ -0,0 +1,19 @@
using Esiur.Resource;
using Esiur.Tests.Deadlock.Server;
/// <summary>
/// Resource used to build reference topologies (cycles, cross-references) for the distributed
/// deadlock test. <see cref="Links"/> holds references to other nodes; fetching a node transitively
/// fetches its links, which is what exercises EpConnection.FetchResource cycle handling.
/// <see cref="Resources1"/> and <see cref="Resources2"/> hold additional references to auxiliary
/// resources so that a fetch has more to attach than the node links alone.
/// Property indices are stable: Id = 0, Links = 1.
/// </summary>
[Resource]
public partial class Node
{
    /// <summary>Identifier of the node; the server sets it to the node's index in the graph.</summary>
    [Export] public int Id { get; set; }

    /// <summary>Outgoing edges of this node; <c>null</c> when the node has none.</summary>
    [Export] public Node[]? Links { get; set; }

    /// <summary>
    /// Auxiliary <see cref="Resource1"/> references. Starts empty rather than null so the non-nullable
    /// declaration holds even before the server assigns it.
    /// </summary>
    [Export] public Resource1[] Resources1 { get; set; } = Array.Empty<Resource1>();

    /// <summary>
    /// Auxiliary <see cref="Resource2"/> references. Starts empty rather than null so the non-nullable
    /// declaration holds even before the server assigns it.
    /// </summary>
    [Export] public Resource2[] Resources2 { get; set; } = Array.Empty<Resource2>();
}
@@ -0,0 +1,189 @@
// ============================================================
// Distributed deadlock test — SERVER NODE
// Hosts a configurable graph of Node resources (sys/n0 .. sys/n{N-1}) whose references can form
// cycles. A client on another node fetches the graph and measures whether the recursive-attachment
// resolver completes or deadlocks. The server prints the cycle census of the deployed graph so the
// experiment can state, for the record, that circular dependencies were actually generated.
//
// Usage:
// dotnet run -- --port 10950 --topology ring --nodes 8
// dotnet run -- --port 10950 --topology random --nodes 12 --seed 20260603 --edge-prob 0.22
// dotnet run -- --port 10950 --topology staggered
// Topologies: ring | cycle | chain | diamond | complete | staggered | random
// ============================================================
using Esiur.Protocol;
using Esiur.Resource;
using Esiur.Stores;
using Esiur.Tests.Deadlock.Server;
// ---- command-line options ---------------------------------------------------------------------
var port = int.Parse(GetArg(args, "--port", "10950"));
var topology = GetArg(args, "--topology", "ring").ToLowerInvariant();
var nodeCount = int.Parse(GetArg(args, "--nodes", "100"));
var res1Count = int.Parse(GetArg(args, "--res1", "100"));
var res2Count = int.Parse(GetArg(args, "--res2", "100"));
var seed = int.Parse(GetArg(args, "--seed", "20260603"));
// Parsed with the invariant culture so "0.22" means 0.22 regardless of the machine's decimal separator.
var edgeProb = double.Parse(GetArg(args, "--edge-prob", "0.22"), System.Globalization.CultureInfo.InvariantCulture);

// BuildTopology may change nodeCount (diamond raises it to at least 4, staggered derives it).
var edges = BuildTopology(topology, ref nodeCount, seed, edgeProb);
var (hasCycle, backEdges) = CycleCensus(nodeCount, edges);
Console.WriteLine($"[Server] topology={topology} nodes={nodeCount} edges={edges.Count} " +
    $"cyclic={hasCycle} backEdges={backEdges} port={port}");

var wh = new Warehouse();
await wh.Put("sys", new MemoryStore());
// AllowUnauthorizedAccess enables anonymous (None-mode) connections so the test needs no
// credentials — the deadlock behaviour under study is independent of authentication.
var server = await wh.Put("sys/server", new EpServer { Port = (ushort)port, AllowUnauthorizedAccess = true });

// ---- create and register the resources ----------------------------------------------------------
var nodes = new Node[nodeCount];
var resources1 = new Resource1[res1Count];
var resources2 = new Resource2[res2Count];
for (var i = 0; i < nodeCount; i++)
{
    nodes[i] = new Node { Id = i };
    await wh.Put($"sys/n{i}", nodes[i]);
}
for (var i = 0; i < res1Count; i++)
{
    resources1[i] = new Resource1();
    await wh.Put($"sys/r1_{i}", resources1[i]);
}
for (var i = 0; i < res2Count; i++)
{
    resources2[i] = new Resource2();
    await wh.Put($"sys/r2_{i}", resources2[i]);
}

// ---- wire the auxiliary references --------------------------------------------------------------
// A single seeded generator drives all of the wiring below. Creating a new Random(seed) inside each
// loop iteration would restart the same sequence every time and hand every node / resource the
// exact same "random" picks. The result is still reproducible for a given --seed.
var rng = new Random(seed);

// Randomly assign some resources to each node so the fetches do some work beyond just traversing the
// links. GetItems samples with replacement and rejects an empty source, hence the guards.
for (var i = 0; i < nodeCount; i++)
{
    nodes[i].Resources1 = res1Count > 0 ? rng.GetItems(resources1, res1Count / 2) : Array.Empty<Resource1>();
    nodes[i].Resources2 = res2Count > 0 ? rng.GetItems(resources2, res2Count / 2) : Array.Empty<Resource2>();
}

// Give every auxiliary resource one random Resource1 and one random Resource2 reference; a reference
// is left unset when the corresponding pool is empty.
for (var i = 0; i < res1Count; i++)
{
    resources1[i].res1 = resources1[rng.Next(res1Count)];
    if (res2Count > 0) resources1[i].res2 = resources2[rng.Next(res2Count)];
}
for (var i = 0; i < res2Count; i++)
{
    if (res1Count > 0) resources2[i].res1 = resources1[rng.Next(res1Count)];
    resources2[i].res2 = resources2[rng.Next(res2Count)];
}

// Node links follow the generated topology; nodes without outgoing edges keep Links unset.
foreach (var grp in edges.GroupBy(e => e.from))
    nodes[grp.Key].Links = grp.Select(e => nodes[e.to]).ToArray();

await wh.Open();
Console.WriteLine($"[Server] Listening on port {port}. Hosting {nodeCount} nodes: sys/n0 .. sys/n{nodeCount - 1}.");
Console.WriteLine($"[Server] The deployed request graph {(hasCycle ? "CONTAINS circular dependencies" : "is acyclic")} " +
    $"({backEdges} cycle-closing edge(s)).");
Console.WriteLine($"[Server] Point the client at this host:port with --nodes {nodeCount}. Press Ctrl+C to stop.");

// Stay up until Ctrl+C (works whether or not stdin is interactive / redirected).
var stop = new TaskCompletionSource();
Console.CancelKeyPress += (_, e) => { e.Cancel = true; stop.TrySetResult(); };
await stop.Task;
await wh.Close();
// ---- topology + cycle census -------------------------------------------------------------
// Builds the directed edge list (from -> to, as node indices) for the named topology.
//   topo     - ring | cycle | chain | diamond | complete | staggered | random (already lower-cased by the caller).
//   n        - requested node count; updated in place for "diamond" (raised to at least 4) and
//              "staggered" (replaced by the number of nodes the construction uses).
//   seed     - seed of the generator used by "random".
//   edgeProb - probability of each ordered pair i -> j (i != j) being an edge in "random".
// Throws ArgumentException for an unknown topology name.
static List<(int from, int to)> BuildTopology(string topo, ref int n, int seed, double edgeProb)
{
    var edges = new List<(int, int)>();
    switch (topo)
    {
        case "ring": // i -> (i+1) mod n; every node a root
            for (var i = 0; i < n; i++) edges.Add((i, (i + 1) % n));
            break;

        case "cycle": // single-root cycle 0->1->..->n-1->0
            for (var i = 0; i < n - 1; i++) edges.Add((i, i + 1));
            // Close the cycle only when there is a node to close it from: for n <= 0 the source index
            // n - 1 would be negative and is not a valid node. This matches "ring", which yields no
            // edges for an empty graph.
            if (n > 0) edges.Add((n - 1, 0));
            break;

        case "chain": // acyclic control
            for (var i = 0; i < n - 1; i++) edges.Add((i, i + 1));
            break;

        case "diamond": // acyclic control: 0->1,0->2,1->3,2->3
            n = Math.Max(n, 4);
            edges.AddRange(new[] { (0, 1), (0, 2), (1, 3), (2, 3) });
            break;

        case "complete": // every ordered pair
            for (var i = 0; i < n; i++)
                for (var j = 0; j < n; j++)
                    if (i != j) edges.Add((i, j));
            break;

        case "staggered": // X (0) and Y (1) share S; Y reaches S late; no cycle
        {
            var next = 2; // next unused node index; 0 and 1 are the two roots

            // Appends a chain of `depth` fresh nodes after `start` and returns the last node of the chain
            // (`start` itself when depth is 0).
            int Chain(int start, int depth)
            {
                for (var d = 0; d < depth; d++)
                {
                    edges.Add((start, next));
                    start = next;
                    next++;
                }
                return start;
            }

            var xTail = Chain(0, 0); // X reaches S immediately
            var yTail = Chain(1, 3); // Y reaches S through a 3-hop chain
            var shared = next++;
            edges.Add((xTail, shared));
            edges.Add((yTail, shared));
            Chain(shared, 3); // S has its own deep chain
            n = next;
            break;
        }

        case "random": // Erdos-Renyi directed graph, fixed seed
        {
            var rng = new Random(seed);
            for (var i = 0; i < n; i++)
                for (var j = 0; j < n; j++)
                    if (i != j && rng.NextDouble() < edgeProb) edges.Add((i, j));
            break;
        }

        default:
            throw new ArgumentException($"Unknown topology '{topo}'. Use ring|cycle|chain|diamond|complete|staggered|random.");
    }
    return edges;
}
// Iterative depth-first search with three node states (unvisited / on the current path / finished).
// Returns the number of back edges found — edges that point at a node still on the current path —
// plus every self loop, together with a flag telling whether that number is non-zero.
static (bool hasCycle, int backEdges) CycleCensus(int n, IReadOnlyList<(int from, int to)> edges)
{
    const byte Unvisited = 0, OnPath = 1, Finished = 2;

    var successors = new List<int>[n];
    for (var v = 0; v < n; v++) successors[v] = new List<int>();

    // Self loops are counted immediately and kept out of the adjacency lists.
    var backEdgeCount = 0;
    foreach (var (source, target) in edges)
    {
        if (source == target) backEdgeCount++;
        else successors[source].Add(target);
    }

    var state = new byte[n];
    var cursor = new int[n]; // per node: index of the next successor still to be examined
    var path = new Stack<int>();

    for (var root = 0; root < n; root++)
    {
        if (state[root] != Unvisited) continue;

        state[root] = OnPath;
        path.Push(root);
        while (path.Count > 0)
        {
            var current = path.Peek();
            if (cursor[current] >= successors[current].Count)
            {
                // All successors handled: the node leaves the current path.
                state[current] = Finished;
                path.Pop();
                continue;
            }

            var nextNode = successors[current][cursor[current]++];
            if (state[nextNode] == OnPath)
            {
                backEdgeCount++;
            }
            else if (state[nextNode] == Unvisited)
            {
                state[nextNode] = OnPath;
                path.Push(nextNode);
            }
        }
    }

    return (backEdgeCount > 0, backEdgeCount);
}
// Looks up the value that follows the first occurrence of key on the command line; def is returned
// when key is not present or when nothing follows it.
static string GetArg(string[] args, string key, string def)
{
    // IndexOf yields -1 for a missing key, so valueAt is 0 in that case and the check below fails.
    var valueAt = Array.IndexOf(args, key) + 1;
    return valueAt > 0 && valueAt < args.Length ? args[valueAt] : def;
}
@@ -0,0 +1,15 @@
using System;
using System.Collections.Generic;
using System.Text;
using Esiur.Protocol;
using Esiur.Resource;
namespace Esiur.Tests.Deadlock.Server
{
/// <summary>
/// Auxiliary resource for the distributed deadlock test. It carries one exported reference to a
/// <see cref="Resource1"/> (its own type) and one to a <see cref="Resource2"/>, so instances can
/// reference each other and themselves.
/// </summary>
[Resource]
public partial class Resource1
{
// Exported reference to another (or the same) Resource1 instance.
[Export] public Resource1 res1;
// Exported reference to a Resource2 instance.
[Export] public Resource2 res2;
}
}
@@ -0,0 +1,14 @@
using Esiur.Resource;
using System;
using System.Collections.Generic;
using System.Text;
namespace Esiur.Tests.Deadlock.Server
{
/// <summary>
/// Auxiliary resource for the distributed deadlock test. It carries one exported reference to a
/// <see cref="Resource1"/> and one to a <see cref="Resource2"/> (its own type), so instances can
/// reference each other and themselves.
/// </summary>
[Resource]
public partial class Resource2
{
// Exported reference to a Resource1 instance.
[Export] public Resource1 res1;
// Exported reference to another (or the same) Resource2 instance.
[Export] public Resource2 res2;
}
}