Rewrite findloop.c for a simpler variant of Tarjan's algorithm.

Thanks to Amir Livne Bar-on, who implemented this variant from a description posted in the Mastodon thread following up my recent blog post about loop-finding. The revised algorithm has the same asymptotic complexity as the one I already had. But it has better constant factors, and less code: it can run in a single depth-first pass over the graph instead of three separate passes, and it needs to store fewer variables per vertex. This version relies on the insight that if you DFS over an undirected graph and imagine it constructing a rooted spanning forest with each component's tree rooted at whatever vertex you started that component from, then every edge that the DFS visits without making it part of the spanning forest must join a vertex to one of its direct ancestors in that component's tree. (Because the other options are that it joins the DFS's current vertex to one the search hasn't visited at all yet – in which case the DFS _would_ follow it, and make it a forest edge after all. Or else it joins this vertex to a cousin in an earlier finished subtree – but then when the DFS processed that subtree, it would have explored the same edge in the other direction, and added our current vertex to that subtree, which by assumption it didn't.) Hence, instead of assigning every vertex a distinct integer label and calculating the min/max label reachable from each subtree, we can instead assign each vertex its tree depth, and simply calculate the minimum _depth_ of vertex reachable from each subtree: if a subtree starting at depth D can reach a vertex at depth <D, it's because there's one of those non-tree edges to a vertex outside the subtree, so the tree edge entering the subtree isn't a bridge. And since every non-tree edge must point to a vertex we've already seen (and hence assigned a depth to), this can be done in the same pass as calculating the depths in the first place - and we don't even need to _store_ the spanning forest we generate.
2025-04-19 23:21:31 -07:00 · 2024-09-12 20:38:50 +01:00
parent 5044e00588
commit 3dbd97b60f
1 changed files with 140 additions and 296 deletions
--- a/findloop.c
+++ b/findloop.c
@ -13,28 +13,22 @@
 * For some fun background reading about all the _wrong_ ways the
 * Puzzles code base has tried to solve this problem in the past:
 * https://www.chiark.greenend.org.uk/~sgtatham/quasiblog/findloop/
+ *
+ * The specific variant of Tarjan's algorithm we use is the one from
+ * https://mathstodon.xyz/@abacabadabacaba@infosec.exchange/113113280480134188
 */

 #include "puzzles.h"

 struct findloopstate {
-    int parent, child, sibling, component_root;
-    bool visited;
-    int index, minindex, maxindex;
-    int minreachable, maxreachable;
-    int bridge;
+    int depth, shallowest_reachable, subtree_size;
+    int parent, component_root;
+    int prev, next;
 };

 struct findloopstate *findloop_new_state(int nvertices)
 {
-    /*
-     * Allocate a findloopstate structure for each vertex, and one
-     * extra one at the end which will be the overall root of a
-     * 'super-tree', which links the whole graph together to make it
-     * as easy as possible to iterate over all the connected
-     * components.
-     */
-    return snewn(nvertices + 1, struct findloopstate);
+    return snewn(nvertices, struct findloopstate);
 }

 void findloop_free_state(struct findloopstate *state)
@ -45,38 +39,34 @@ void findloop_free_state(struct findloopstate *state)
 bool findloop_is_loop_edge(struct findloopstate *pv, int u, int v)
 {
    /*
-     * Since the algorithm is intended for finding bridges, and a
-     * bridge must be part of any spanning tree, it follows that there
-     * is at most one bridge per vertex.
+     * In the DFS-built forest, all edges are either are from parent
+     * to child or from child to ancestor.
     *
-     * Furthermore, by finding a _rooted_ spanning tree (so that each
-     * bridge is a parent->child link), you can find an injection from
-     * bridges to vertices (namely, map each bridge to the vertex at
-     * its child end).
-     *
-     * So if the u-v edge is a bridge, then either v was u's parent
-     * when the algorithm ran and we set pv[u].bridge = v, or vice
-     * versa.
+     * Back-edges to ancestors must be parts of loops. In order to
+     * detect whether a parent-to-child edge is part of a loop, we
+     * check if any ancestor is reachable from that child's subtree.
     */
-    return !(pv[u].bridge == v || pv[v].bridge == u);
+    if (pv[u].parent == v && pv[u].shallowest_reachable >= pv[u].depth)
+	return false;
+    if (pv[v].parent == u && pv[v].shallowest_reachable >= pv[v].depth)
+	return false;
+    return true;
 }

 static bool findloop_is_bridge_oneway(
    struct findloopstate *pv, int u, int v, int *u_vertices, int *v_vertices)
 {
-    int r, total, below;
-
-    if (pv[u].bridge != v)
-        return false;
-
-    r = pv[u].component_root;
-    total = pv[r].maxindex - pv[r].minindex + 1;
-    below = pv[u].maxindex - pv[u].minindex + 1;
+    if (pv[u].parent != v)
+	return false;
+    if (pv[u].shallowest_reachable < pv[u].depth)
+	return false;

    if (u_vertices)
-        *u_vertices = below;
-    if (v_vertices)
-        *v_vertices = total - below;
+	*u_vertices = pv[u].subtree_size;
+    if (v_vertices) {
+	int r = pv[u].component_root;
+	*v_vertices = pv[r].subtree_size - pv[u].subtree_size;
+    }

    return true;
 }
@ -85,278 +75,132 @@ bool findloop_is_bridge(
    struct findloopstate *pv, int u, int v, int *u_vertices, int *v_vertices)
 {
    return (findloop_is_bridge_oneway(pv, u, v, u_vertices, v_vertices) ||
-            findloop_is_bridge_oneway(pv, v, u, v_vertices, u_vertices));
+	    findloop_is_bridge_oneway(pv, v, u, v_vertices, u_vertices));
 }

 bool findloop_run(struct findloopstate *pv, int nvertices,
-                  neighbour_fn_t neighbour, void *ctx)
+		  neighbour_fn_t neighbour, void *ctx)
 {
-    int u, v, w, root, index;
-    int nbridges, nedges;
-
-    root = nvertices;
+    int u, v, w;
+    bool any_loop = false;

    /*
-     * First pass: organise the graph into a rooted spanning forest.
-     * That is, a tree structure with a clear up/down orientation -
-     * every node has exactly one parent (which may be 'root') and
-     * zero or more children, and every parent-child link corresponds
-     * to a graph edge.
+     * We run a DFS to split the graph into disjoint spanning trees.
+     * This construction guarantees that every edge either descends
+     * to a new child never visited before or goes to an ancestor.
+     * Tracking which ancestors are linked from which subtrees lets
+     * us detect all loops efficiently.
     *
-     * (A side effect of this is to find all the connected components,
-     * which of course we could do less confusingly with a dsf - but
-     * then we'd have to do that *and* build the tree, so it's less
-     * effort to do it all at once.)
+     * Here we don't use recursion, instead holding the entire DSF
+     * state in the findloopstate struct. The loop below visits each
+     * node exactly twice: before and after visiting its subtree.
+     *
+     * The first time we visit a node, we take care of marking its
+     * children with their position in the tree and when they're
+     * scheduled to be visited. The second time we update the parent
+     * with statistics about the subtree.
+     *
+     * The order of nodes is managed using a doubly-linked list.
+     * The first time a node is visited, we add its children before it
+     * in the list and set the pointer to go through them first. The
+     * second time we move to the next node in the list, which is a
+     * sibling or a parent, or if we're at the root of the connected
+     * component will be a node in the next component. (All the nodes
+     * of the graph always appear in the list)
+     * A linked list is used to handle the case where the same child
+     * appears in two levels, to allow us to efficiently remove it from
+     * its previous position.
+     *
+     * The algorithm tracks whether we're at the first or the second
+     * visit at a node using the depth property. It's set to a negative
+     * value on initialization, and to the depth in the connected
+     * component's tree on the first visit.
+     * It detects a new connected component using the parent pointer,
+     * it's always set to a real node in the search, and is negative for
+     * new trees.
+     *
+     * In the first visit, we go over the node's children, moving them
+     * in the list and setting their parent pointer. Edges going to
+     * ancestors are noted in the shallowest_reachable field.
+     * In the second visit, we adjust the subtree_size and
+     * shallowest_reachable fields of the parent.
+     *
+     * Variables:
+     *   u = the current node under examination
+     *   v = the node to go to in the next iteration
+     *   w = neighbour iterator
     */
-    for (v = 0; v <= nvertices; v++) {
-        pv[v].parent = root;
-        pv[v].child = -2;
-        pv[v].sibling = -1;
-        pv[v].visited = false;
+
+    for (u = 0; u < nvertices; u++) {
+	pv[u].depth = -1;
+	pv[u].shallowest_reachable = nvertices;
+	pv[u].subtree_size = 1;
+	pv[u].parent = -1;
+	pv[u].component_root = u;
+	pv[u].prev = u - 1;
+	pv[u].next = (u == nvertices - 1) ? -1 : u + 1;
    }
-    pv[root].child = -1;
-    nedges = 0;
+
    debug(("------------- new find_loops, nvertices=%d\n", nvertices));
-    for (v = 0; v < nvertices; v++) {
-        if (pv[v].parent == root) {
-            /*
-             * Found a new connected component. Enumerate and treeify
-             * it.
-             */
-            pv[v].sibling = pv[root].child;
-            pv[root].child = v;
-            pv[v].component_root = v;
-            debug(("%d is new child of root\n", v));

-            u = v;
-            while (1) {
-                if (!pv[u].visited) {
-                    pv[u].visited = true;
+    v = 0;
+    while (v != -1) {
+	u = v;
+	if (pv[u].depth < 0) {
+	    /* Our first visit to the node (on the way down the search) */
+	    if (pv[u].parent < 0) {
+		debug(("    new component: processing %d\n", u));
+		pv[u].depth = 0;
+		pv[u].component_root = u;
+	    } else {
+		debug(("    processing %d\n", u));
+		pv[u].depth = pv[pv[u].parent].depth + 1;
+		pv[u].component_root = pv[pv[u].parent].component_root;
+	    }

-                    /*
-                     * Enumerate the neighbours of u, and any that are
-                     * as yet not in the tree structure (indicated by
-                     * child==-2, and distinct from the 'visited'
-                     * flag) become children of u.
-                     */
-                    debug(("  component pass: processing %d\n", u));
-                    for (w = neighbour(u, ctx); w >= 0;
-                         w = neighbour(-1, ctx)) {
-                        debug(("    edge %d-%d\n", u, w));
-                        if (pv[w].child == -2) {
-                            debug(("      -> new child\n"));
-                            pv[w].child = -1;
-                            pv[w].sibling = pv[u].child;
-                            pv[w].parent = u;
-                            pv[w].component_root = pv[u].component_root;
-                            pv[u].child = w;
-                        }
-
-                        /* While we're here, count the edges in the whole
-                         * graph, so that we can easily check at the end
-                         * whether all of them are bridges, i.e. whether
-                         * no loop exists at all. */
-                        if (w > u) /* count each edge only in one direction */
-                            nedges++;
-                    }
-
-                    /*
-                     * Now descend in depth-first search.
-                     */
-                    if (pv[u].child >= 0) {
-                        u = pv[u].child;
-                        debug(("    descending to %d\n", u));
-                        continue;
-                    }
-                }
-
-                if (u == v) {
-                    debug(("      back at %d, done this component\n", u));
-                    break;
-                } else if (pv[u].sibling >= 0) {
-                    u = pv[u].sibling;
-                    debug(("    sideways to %d\n", u));
-                } else {
-                    u = pv[u].parent;
-                    debug(("    ascending to %d\n", u));
-                }
-            }
-        }
+	    /* Schedule visits to the neighbors, and then back here */
+	    v = u;
+	    for (w = neighbour(u, ctx); w >= 0; w = neighbour(-1, ctx)) {
+		if (w == pv[u].parent)
+		    continue;
+		if (pv[w].depth < 0) {
+		    debug(("    adding edge %d-%d to tree\n", u, w));
+		    pv[w].parent = u;
+		    /* Remove the neighbour from the linked list */
+		    if (pv[w].prev >= 0)
+			pv[pv[w].prev].next = pv[w].next;
+		    if (pv[w].next >= 0)
+			pv[pv[w].next].prev = pv[w].prev;
+		    /* Add it to the start of the list */
+		    pv[w].prev = pv[v].prev;
+		    pv[w].next = v;
+		    if (pv[v].prev >= 0)
+			pv[pv[v].prev].next = w;
+		    pv[v].prev = w;
+		    /* Mark this as the next node to visit */
+		    v = w;
+		} else {
+		    debug(("    found back-edge %d-%d\n", u, w));
+		    pv[u].shallowest_reachable =
+			min(pv[u].shallowest_reachable, pv[w].depth);
+		    any_loop = true;
+		}
+	    }
+	} else {
+	    debug(("    wrapping up %d. |subtree| = %d, min(reachable) = %d\n",
+		   u, pv[u].subtree_size, pv[u].shallowest_reachable));
+	    if (pv[u].parent >= 0) {
+		if (pv[u].shallowest_reachable >= pv[u].depth) {
+		    debug(("    bridge: %d-%d\n", u, pv[u].parent));
+		}
+		pv[pv[u].parent].subtree_size += pv[u].subtree_size;
+		pv[pv[u].parent].shallowest_reachable =
+		    min(pv[pv[u].parent].shallowest_reachable,
+			pv[u].shallowest_reachable);
+	    }
+	    v = pv[u].next;
+	}
    }

-    /*
-     * Second pass: index all the vertices in such a way that every
-     * subtree has a contiguous range of indices. (Easily enough done,
-     * by iterating through the tree structure we just built and
-     * numbering its elements as if they were those of a sorted list.)
-     *
-     * For each vertex, we compute the min and max index of the
-     * subtree starting there.
-     *
-     * (We index the vertices in preorder, per Tarjan's original
-     * description, so that each vertex's min subtree index is its own
-     * index; but that doesn't actually matter; either way round would
-     * do. The important thing is that we have a simple arithmetic
-     * criterion that tells us whether a vertex is in a given subtree
-     * or not.)
-     */
-    debug(("--- begin indexing pass\n"));
-    index = 0;
-    for (v = 0; v < nvertices; v++)
-        pv[v].visited = false;
-    pv[root].visited = true;
-    u = pv[root].child;
-    while (1) {
-        if (!pv[u].visited) {
-            pv[u].visited = true;
-
-            /*
-             * Index this node.
-             */
-            pv[u].minindex = pv[u].index = index;
-            debug(("  vertex %d <- index %d\n", u, index));
-            index++;
-
-            /*
-             * Now descend in depth-first search.
-             */
-            if (pv[u].child >= 0) {
-                u = pv[u].child;
-                debug(("    descending to %d\n", u));
-                continue;
-            }
-        }
-
-        if (u == root) {
-            debug(("      back at %d, done indexing\n", u));
-            break;
-        }
-
-        /*
-         * As we re-ascend to here from its children (or find that we
-         * had no children to descend to in the first place), fill in
-         * its maxindex field.
-         */
-        pv[u].maxindex = index-1;
-        debug(("  vertex %d <- maxindex %d\n", u, pv[u].maxindex));
-
-        if (pv[u].sibling >= 0) {
-            u = pv[u].sibling;
-            debug(("    sideways to %d\n", u));
-        } else {
-            u = pv[u].parent;
-            debug(("    ascending to %d\n", u));
-        }
-    }
-
-    /*
-     * We're ready to generate output now, so initialise the output
-     * fields.
-     */
-    for (v = 0; v < nvertices; v++)
-        pv[v].bridge = -1;
-
-    /*
-     * Final pass: determine the min and max index of the vertices
-     * reachable from every subtree, not counting the link back to
-     * each vertex's parent. Then our criterion is: given a vertex u,
-     * defining a subtree consisting of u and all its descendants, we
-     * compare the range of vertex indices _in_ that subtree (which is
-     * just the minindex and maxindex of u) with the range of vertex
-     * indices in the _neighbourhood_ of the subtree (computed in this
-     * final pass, and not counting u's own edge to its parent), and
-     * if the latter includes anything outside the former, then there
-     * must be some path from u to outside its subtree which does not
-     * go through the parent edge - i.e. the edge from u to its parent
-     * is part of a loop.
-     */
-    debug(("--- begin min-max pass\n"));
-    nbridges = 0;
-    for (v = 0; v < nvertices; v++)
-        pv[v].visited = false;
-    u = pv[root].child;
-    pv[root].visited = true;
-    while (1) {
-        if (!pv[u].visited) {
-            pv[u].visited = true;
-
-            /*
-             * Look for vertices reachable directly from u, including
-             * u itself.
-             */
-            debug(("  processing vertex %d\n", u));
-            pv[u].minreachable = pv[u].maxreachable = pv[u].minindex;
-            for (w = neighbour(u, ctx); w >= 0; w = neighbour(-1, ctx)) {
-                debug(("    edge %d-%d\n", u, w));
-                if (w != pv[u].parent) {
-                    int i = pv[w].index;
-                    if (pv[u].minreachable > i)
-                        pv[u].minreachable = i;
-                    if (pv[u].maxreachable < i)
-                        pv[u].maxreachable = i;
-                }
-            }
-            debug(("    initial min=%d max=%d\n",
-                   pv[u].minreachable, pv[u].maxreachable));
-
-            /*
-             * Now descend in depth-first search.
-             */
-            if (pv[u].child >= 0) {
-                u = pv[u].child;
-                debug(("    descending to %d\n", u));
-                continue;
-            }
-        }
-
-        if (u == root) {
-            debug(("      back at %d, done min-maxing\n", u));
-            break;
-        }
-
-        /*
-         * As we re-ascend to this vertex, go back through its
-         * immediate children and do a post-update of its min/max.
-         */
-        for (v = pv[u].child; v >= 0; v = pv[v].sibling) {
-            if (pv[u].minreachable > pv[v].minreachable)
-                pv[u].minreachable = pv[v].minreachable;
-            if (pv[u].maxreachable < pv[v].maxreachable)
-                pv[u].maxreachable = pv[v].maxreachable;
-        }
-
-        debug(("  postorder update of %d: min=%d max=%d (indices %d-%d)\n", u,
-               pv[u].minreachable, pv[u].maxreachable,
-               pv[u].minindex, pv[u].maxindex));
-
-        /*
-         * And now we know whether each to our own parent is a bridge.
-         */
-        if ((v = pv[u].parent) != root) {
-            if (pv[u].minreachable >= pv[u].minindex &&
-                pv[u].maxreachable <= pv[u].maxindex) {
-                /* Yes, it's a bridge. */
-                pv[u].bridge = v;
-                nbridges++;
-                debug(("  %d-%d is a bridge\n", v, u));
-            } else {
-                debug(("  %d-%d is not a bridge\n", v, u));
-            }
-        }
-
-        if (pv[u].sibling >= 0) {
-            u = pv[u].sibling;
-            debug(("    sideways to %d\n", u));
-        } else {
-            u = pv[u].parent;
-            debug(("    ascending to %d\n", u));
-        }
-    }
-
-    debug(("finished, nedges=%d nbridges=%d\n", nedges, nbridges));
-
-    /*
-     * Done.
-     */
-    return nbridges < nedges;
+    return any_loop;
 }