From 2e2dfd5c4ac2d697c65719e499992faffdce4d84 Mon Sep 17 00:00:00 2001
From: "d.grigonis" <d.grigonis@me.com>
Date: Thu, 22 Jan 2026 21:44:51 +0200
Subject: [PATCH 1/7] POC

---
 Lib/difflib.py | 214 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 167 insertions(+), 47 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 7c7e233b013a76..0b5bc0ce48502c 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -32,11 +32,110 @@
 
 from _colorize import can_colorize, get_theme
 from heapq import nlargest as _nlargest
-from collections import namedtuple as _namedtuple
+from collections import Counter as _Counter, namedtuple as _namedtuple
 from types import GenericAlias
 
 Match = _namedtuple('Match', 'a b size')
 
+_LENGTH = 0
+_LINK = 1
+_NEXT = 2
+_POS = 3
+
+
+class _LCSUBAutomaton:
+    """Suffix Automaton for finding longest common substring."""
+
+    def __init__(self, s2, start2=0, stop2=None, *, junk=()):
+        if stop2 is None:
+            stop2 = len(s2)
+
+        self.start2 = start2
+        self.stop2 = stop2
+        self.junk = frozenset(junk)
+        self.root = root = [0, None, {}, -1]  # [length, link, next, end_pos]
+
+        last_len = 0
+        last = root
+        for j in range(start2, stop2):
+            c = s2[j]
+            if c in junk:
+                last_len = 0
+                last = root
+            else:
+                last_len += 1
+                curr = [last_len, None, {}, j]
+
+                p = last
+                p_next = p[_NEXT]
+                while c not in p_next:
+                    p_next[c] = curr
+                    if p is root:
+                        curr[_LINK] = root
+                        break
+                    p = p[_LINK]
+                    p_next = p[_NEXT]
+                else:
+                    q = p_next[c]
+                    p_length_p1 = p[_LENGTH] + 1
+                    if p_length_p1 == q[_LENGTH]:
+                        curr[_LINK] = q
+                    else:
+                        clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]]
+                        while (p_next := p[_NEXT]).get(c) is q:
+                            p_next[c] = clone
+                            if p is root:
+                                break
+                            p = p[_LINK]
+
+                        q[_LINK] = curr[_LINK] = clone
+
+                last = curr
+
+    def find(self, s1, start1=0, stop1=None):
+        if stop1 is None:
+            stop1 = len(s1)
+        root = self.root
+        junk = self.junk
+        v = root
+        l = 0
+        best_len = 0
+        best_state = None
+        best_pos = 0
+
+        for i in range(start1, stop1):
+            c = s1[i]
+            if c in junk:
+                v = root
+                l = 0
+            else:
+                while v is not root and c not in v[_NEXT]:
+                    v = v[_LINK]
+                    l = v[_LENGTH]
+
+                v_next = v[_NEXT]
+                if c in v_next:
+                    v = v_next[c]
+                    l += 1
+                    if l > best_len:
+                        best_len = l
+                        best_state = v
+                        best_pos = i
+
+        if not best_len:
+            return (start1, self.start2, 0)
+
+        start_in_s1 = best_pos + 1 - best_len
+        end_in_s2 = best_state[_POS]
+        start_in_s2 = end_in_s2 + 1 - best_len
+        return (start_in_s1, start_in_s2, best_len)
+
+
+def longest_common_substring(s1, s2, start1=0, stop1=None, start2=0, stop2=None,
+                             *, junk=()):
+    return _LCSUBAutomaton(s2, start2, stop2, junk=junk).find(s1, start1, stop1)
+
+
 def _calculate_ratio(matches, length):
     if length:
         return 2.0 * matches / length
@@ -276,32 +375,42 @@ def __chain_b(self):
         # out the junk later is much cheaper than building b2j "right"
         # from the start.
         b = self.b
-        self.b2j = b2j = {}
-
-        for i, elt in enumerate(b):
-            indices = b2j.setdefault(elt, [])
-            indices.append(i)
-
-        # Purge junk elements
-        self.bjunk = junk = set()
         isjunk = self.isjunk
-        if isjunk:
-            for elt in b2j.keys():
-                if isjunk(elt):
-                    junk.add(elt)
-            for elt in junk: # separate loop avoids separate list of keys
-                del b2j[elt]
-
-        # Purge popular elements that are not junk
+        self.bjunk = junk = set()
+        autojunk = self.autojunk
         self.bpopular = popular = set()
-        n = len(b)
-        if self.autojunk and n >= 200:
-            ntest = n // 100 + 1
-            for elt, idxs in b2j.items():
-                if len(idxs) > ntest:
-                    popular.add(elt)
-            for elt in popular: # ditto; as fast for 1% deletion
-                del b2j[elt]
+        self.b2j = b2j = {}
+        if autojunk:
+            for i, elt in enumerate(b):
+                indices = b2j.setdefault(elt, [])
+                indices.append(i)
+
+            # Purge junk elements
+            if isjunk:
+                for elt in b2j.keys():
+                    if isjunk(elt):
+                        junk.add(elt)
+                for elt in junk: # separate loop avoids separate list of keys
+                    del b2j[elt]
+
+            # Purge popular elements that are not junk
+            n = len(b)
+            if autojunk and n >= 200:
+                ntest = n // 100 + 1
+                for elt, idxs in b2j.items():
+                    if len(idxs) > ntest:
+                        popular.add(elt)
+                for elt in popular: # ditto; as fast for 1% deletion
+                    del b2j[elt]
+        else:
+            # Prepare LCSUB Automaton
+            if isjunk:
+                bcounts = _Counter(b)
+                junk.update(filter(isjunk, bcounts))
+                for elt in junk:
+                    del bcounts[elt]
+            self.aut_cache = (None, None)     # Cache last automaton
+            self.all_junk = junk | popular
 
     def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -361,32 +470,43 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
         # Windiff ends up at the same place as diff, but by pairing up
         # the unique 'b's and then matching the first two 'a's.
 
-        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
+        a, b, isbjunk = self.a, self.b, self.bjunk.__contains__
         if ahi is None:
             ahi = len(a)
         if bhi is None:
             bhi = len(b)
-        besti, bestj, bestsize = alo, blo, 0
-        # find longest junk-free match
-        # during an iteration of the loop, j2len[j] = length of longest
-        # junk-free match ending with a[i-1] and b[j]
-        j2len = {}
-        nothing = []
-        for i in range(alo, ahi):
-            # look at all instances of a[i] in b; note that because
-            # b2j has no junk keys, the loop is skipped if a[i] is junk
-            j2lenget = j2len.get
-            newj2len = {}
-            for j in b2j.get(a[i], nothing):
-                # a[i] matches b[j]
-                if j < blo:
-                    continue
-                if j >= bhi:
-                    break
-                k = newj2len[j] = j2lenget(j-1, 0) + 1
-                if k > bestsize:
-                    besti, bestj, bestsize = i-k+1, j-k+1, k
-            j2len = newj2len
+        if alo >= ahi:
+            besti, bestj, bestsize = alo, blo, 0
+        elif self.autojunk:
+            b2j = self.b2j
+            besti, bestj, bestsize = alo, blo, 0
+            # find longest junk-free match
+            # during an iteration of the loop, j2len[j] = length of longest
+            # junk-free match ending with a[i-1] and b[j]
+            j2len = {}
+            nothing = []
+            for i in range(alo, ahi):
+                # look at all instances of a[i] in b; note that because
+                # b2j has no junk keys, the loop is skipped if a[i] is junk
+                j2lenget = j2len.get
+                newj2len = {}
+                for j in b2j.get(a[i], nothing):
+                    # a[i] matches b[j]
+                    if j < blo:
+                        continue
+                    if j >= bhi:
+                        break
+                    k = newj2len[j] = j2lenget(j-1, 0) + 1
+                    if k > bestsize:
+                        besti, bestj, bestsize = i-k+1, j-k+1, k
+                j2len = newj2len
+        else:
+            # Without autojunk, run LCSUB Automaton
+            blo_bhi, aut = self.aut_cache
+            if aut is None or blo_bhi != (blo, bhi):
+                aut = _LCSUBAutomaton(b, blo, bhi, junk=self.all_junk)
+                self.aut_cache = ((blo, bhi), aut)
+            besti, bestj, bestsize = aut.find(a, alo, ahi)
 
         # Extend the best by non-junk elements on each end.  In particular,
         # "popular" non-junk elements aren't in b2j, which greatly speeds

From 07f3db053cb6edd77017cfb121045d328e67a22c Mon Sep 17 00:00:00 2001
From: "d.grigonis" <d.grigonis@me.com>
Date: Fri, 23 Jan 2026 02:23:05 +0200
Subject: [PATCH 2/7] initial version

---
 Lib/difflib.py | 296 +++++++++++++++++++++++++++++++------------------
 1 file changed, 187 insertions(+), 109 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 0b5bc0ce48502c..bcde4a0aa56ea9 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -37,6 +37,65 @@
 
 Match = _namedtuple('Match', 'a b size')
 
+
+class _LCSUBDict:
+    """Dict method for finding longest common substring.
+
+    Complexity:
+        T: O(n1 + n2) best, O(n1 × n2) worst
+        S: O(n2)
+
+    Members:
+        pos2    for x in seq2, pos2[x] is a list of the indices (into seq2)
+                at which x appears; junk elements do not appear
+    """
+
+    def __init__(self, seq2, junk=()):
+        if not isinstance(junk, frozenset):
+            junk = frozenset(junk)
+        self.seq2 = seq2
+        self.junk = junk
+        self.pos2 = pos2 = {}   # positions of each element in seq2
+        for i, elt in enumerate(seq2):
+            indices = pos2.setdefault(elt, [])
+            indices.append(i)
+        if junk:
+            for elt in junk:
+                del pos2[elt]
+
+    def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
+        if stop1 is None:
+            stop1 = len(seq1)
+        if stop2 is None:
+            stop2 = len(self.seq2)
+        pos2 = self.pos2
+        j2len = {}
+        nothing = []
+        besti, bestj, bestsize = start1, start2, 0
+        # find longest junk-free match
+        # during an iteration of the loop, j2len[j] = length of longest
+        # junk-free match ending with seq1[i-1] and seq2[j]
+        for i in range(start1, stop1):
+            # look at all instances of seq1[i] in seq2; note that because
+            # pos2 has no junk keys, the loop is skipped if seq1[i] is junk
+            j2lenget = j2len.get
+            newj2len = {}
+            for j in pos2.get(seq1[i], nothing):
+                # seq1[i] matches seq2[j]
+                if j < start2:
+                    continue
+                if j >= stop2:
+                    break
+                k = newj2len[j] = j2lenget(j-1, 0) + 1
+                if k > bestsize:
+                    besti = i - k + 1
+                    bestj = j - k + 1
+                    bestsize = k
+            j2len = newj2len
+
+        return besti, bestj, bestsize
+
+
 _LENGTH = 0
 _LINK = 1
 _NEXT = 2
@@ -44,21 +103,43 @@
 
 
 class _LCSUBAutomaton:
-    """Suffix Automaton for finding longest common substring."""
+    """Suffix Automaton for finding longest common substring.
+
+    Complexity:
+        T: O(n1 + n2)   - roughly 2 * n1 + 6 * n2
+        S: O(n2)        - maximum nodes: 2 * n2 + 1
+
+    Node spec:
+        node: list = [length: int, link: list, next: dict, end_pos: int]
+            length  - match length when the node is reached
+            link    - reference to a node to fall back to
+            next    - map to nodes to go to when matched
+            end_pos - end position of first occurrence (used for result)
+    """
 
-    def __init__(self, s2, start2=0, stop2=None, *, junk=()):
-        if stop2 is None:
-            stop2 = len(s2)
+    def __init__(self, seq2, junk=()):
+        if not isinstance(junk, frozenset):
+            junk = frozenset(junk)
+        self.seq2 = seq2
+        self.junk = junk
+        self.root = None
+        self.cache = (None, None)
 
-        self.start2 = start2
-        self.stop2 = stop2
-        self.junk = frozenset(junk)
-        self.root = root = [0, None, {}, -1]  # [length, link, next, end_pos]
+    def _build(self, start2, stop2):
+        """
+        Automaton needs to rebuild for every (start2, stop2)
+        This is made to cache the last one and only rebuild on new values
+        """
+        if self.root is not None and self.cache == (start2, stop2):
+            return
 
+        self.root = root = [0, None, {}, -1]
+        seq2 = self.seq2
+        junk = self.junk
         last_len = 0
         last = root
         for j in range(start2, stop2):
-            c = s2[j]
+            c = seq2[j]
             if c in junk:
                 last_len = 0
                 last = root
@@ -81,6 +162,7 @@ def __init__(self, s2, start2=0, stop2=None, *, junk=()):
                     if p_length_p1 == q[_LENGTH]:
                         curr[_LINK] = q
                     else:
+                        # Copy `q[_POS]` to ensure leftmost match in seq2
                         clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]]
                         while (p_next := p[_NEXT]).get(c) is q:
                             p_next[c] = clone
@@ -92,9 +174,16 @@ def __init__(self, s2, start2=0, stop2=None, *, junk=()):
 
                 last = curr
 
-    def find(self, s1, start1=0, stop1=None):
-        if stop1 is None:
-            stop1 = len(s1)
+        self.cache = (start2, stop2)
+
+    def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
+        size1 = len(seq1)
+        size2 = len(self.seq2)
+        if stop1 is None or stop1 > size1:
+            stop1 = size1
+        if stop2 is None or stop2 > size2:
+            stop2 = size2
+        self._build(start2, stop2)
         root = self.root
         junk = self.junk
         v = root
@@ -104,7 +193,7 @@ def find(self, s1, start1=0, stop1=None):
         best_pos = 0
 
         for i in range(start1, stop1):
-            c = s1[i]
+            c = seq1[i]
             if c in junk:
                 v = root
                 l = 0
@@ -123,7 +212,7 @@ def find(self, s1, start1=0, stop1=None):
                         best_pos = i
 
         if not best_len:
-            return (start1, self.start2, 0)
+            return (start1, start2, 0)
 
         start_in_s1 = best_pos + 1 - best_len
         end_in_s2 = best_state[_POS]
@@ -131,16 +220,12 @@ def find(self, s1, start1=0, stop1=None):
         return (start_in_s1, start_in_s2, best_len)
 
 
-def longest_common_substring(s1, s2, start1=0, stop1=None, start2=0, stop2=None,
-                             *, junk=()):
-    return _LCSUBAutomaton(s2, start2, stop2, junk=junk).find(s1, start1, stop1)
-
-
 def _calculate_ratio(matches, length):
     if length:
         return 2.0 * matches / length
     return 1.0
 
+
 class SequenceMatcher:
 
     """
@@ -379,38 +464,40 @@ def __chain_b(self):
         self.bjunk = junk = set()
         autojunk = self.autojunk
         self.bpopular = popular = set()
-        self.b2j = b2j = {}
-        if autojunk:
-            for i, elt in enumerate(b):
-                indices = b2j.setdefault(elt, [])
-                indices.append(i)
-
-            # Purge junk elements
-            if isjunk:
-                for elt in b2j.keys():
-                    if isjunk(elt):
-                        junk.add(elt)
-                for elt in junk: # separate loop avoids separate list of keys
-                    del b2j[elt]
-
-            # Purge popular elements that are not junk
-            n = len(b)
-            if autojunk and n >= 200:
-                ntest = n // 100 + 1
-                for elt, idxs in b2j.items():
-                    if len(idxs) > ntest:
-                        popular.add(elt)
-                for elt in popular: # ditto; as fast for 1% deletion
-                    del b2j[elt]
+        self._bcounts = bcounts = dict(_Counter(b))
+        if isjunk:
+            junk.update(filter(isjunk, bcounts))
+            for elt in junk:
+                del bcounts[elt]
+
+        n = len(b)
+        if autojunk and n >= 200:
+            ntest = n // 100 + 1
+            for elt, num in bcounts.items():
+                if num > ntest:
+                    popular.add(elt)
+            for elt in popular: # ditto; as fast for 1% deletion
+                del bcounts[elt]
+
+        self._max_bcount = max(bcounts.values()) if bcounts else 0
+        self._all_junk = frozenset(junk | popular)
+        self._lcsub_aut = None       # _LCSUBAutomaton instance
+        self._lcsub_dict = None      # _LCSUBDict instanct
+
+    def _get_lcsub_calculator(self, automaton=False):
+        if automaton:
+            if self._lcsub_aut is None:
+                self._lcsub_aut = _LCSUBAutomaton(self.b, self._all_junk)
+            return self._lcsub_aut
         else:
-            # Prepare LCSUB Automaton
-            if isjunk:
-                bcounts = _Counter(b)
-                junk.update(filter(isjunk, bcounts))
-                for elt in junk:
-                    del bcounts[elt]
-            self.aut_cache = (None, None)     # Cache last automaton
-            self.all_junk = junk | popular
+            if self._lcsub_dict is None:
+                self._lcsub_dict = _LCSUBDict(self.b, self._all_junk)
+            return self._lcsub_dict
+
+    @property
+    def b2j(self):
+        # NOTE: For backwards compatibility
+        return self._get_lcsub_calculator(automaton=False).pos2
 
     def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -475,67 +562,58 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
             ahi = len(a)
         if bhi is None:
             bhi = len(b)
-        if alo >= ahi:
-            besti, bestj, bestsize = alo, blo, 0
-        elif self.autojunk:
-            b2j = self.b2j
+        asize = ahi - alo
+        bsize = bhi - blo
+
+        if asize <= 0 and bsize <= 0:
             besti, bestj, bestsize = alo, blo, 0
-            # find longest junk-free match
-            # during an iteration of the loop, j2len[j] = length of longest
-            # junk-free match ending with a[i-1] and b[j]
-            j2len = {}
-            nothing = []
-            for i in range(alo, ahi):
-                # look at all instances of a[i] in b; note that because
-                # b2j has no junk keys, the loop is skipped if a[i] is junk
-                j2lenget = j2len.get
-                newj2len = {}
-                for j in b2j.get(a[i], nothing):
-                    # a[i] matches b[j]
-                    if j < blo:
-                        continue
-                    if j >= bhi:
-                        break
-                    k = newj2len[j] = j2lenget(j-1, 0) + 1
-                    if k > bestsize:
-                        besti, bestj, bestsize = i-k+1, j-k+1, k
-                j2len = newj2len
         else:
-            # Without autojunk, run LCSUB Automaton
-            blo_bhi, aut = self.aut_cache
-            if aut is None or blo_bhi != (blo, bhi):
-                aut = _LCSUBAutomaton(b, blo, bhi, junk=self.all_junk)
-                self.aut_cache = ((blo, bhi), aut)
-            besti, bestj, bestsize = aut.find(a, alo, ahi)
-
-        # Extend the best by non-junk elements on each end.  In particular,
-        # "popular" non-junk elements aren't in b2j, which greatly speeds
-        # the inner loop above, but also means "the best" match so far
-        # doesn't contain any junk *or* popular non-junk elements.
-        while besti > alo and bestj > blo and \
-              not isbjunk(b[bestj-1]) and \
-              a[besti-1] == b[bestj-1]:
-            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
-        while besti+bestsize < ahi and bestj+bestsize < bhi and \
-              not isbjunk(b[bestj+bestsize]) and \
-              a[besti+bestsize] == b[bestj+bestsize]:
-            bestsize += 1
-
-        # Now that we have a wholly interesting match (albeit possibly
-        # empty!), we may as well suck up the matching junk on each
-        # side of it too.  Can't think of a good reason not to, and it
-        # saves post-processing the (possibly considerable) expense of
-        # figuring out what to do with it.  In the case of an empty
-        # interesting match, this is clearly the right thing to do,
-        # because no other kind of match is possible in the regions.
-        while besti > alo and bestj > blo and \
-              isbjunk(b[bestj-1]) and \
-              a[besti-1] == b[bestj-1]:
-            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
-        while besti+bestsize < ahi and bestj+bestsize < bhi and \
-              isbjunk(b[bestj+bestsize]) and \
-              a[besti+bestsize] == b[bestj+bestsize]:
-            bestsize = bestsize + 1
+            # Constant to contruct automaton is roughly 6.
+            # Constant to run automaton is roughly 2.
+            # This has been tested on a range of data sets.
+            # For that specific set it gave selection accuracy of 95%.
+            # Weak spot in this is cases with little or no element overlap at all.
+            # However, such check would have more cost than benefit.
+            use_automaton = self._max_bcount * asize > bsize * 6 + asize * 2
+            calc = self._get_lcsub_calculator(use_automaton)
+            besti, bestj, bestsize = calc.find(a, alo, ahi, blo, bhi)
+
+        if self.bpopular:
+            # Extend the best by non-junk elements on each end.  In particular,
+            # "popular" non-junk elements aren't in b2j, which greatly speeds
+            # the inner loop above, but also means "the best" match so far
+            # doesn't contain any junk *or* popular non-junk elements.
+            while besti > alo and bestj > blo and \
+                  not isbjunk(b[bestj-1]) and \
+                  a[besti-1] == b[bestj-1]:
+                besti -= 1
+                bestj -= 1
+                bestsize += 1
+
+            while besti+bestsize < ahi and bestj+bestsize < bhi and \
+                  not isbjunk(b[bestj+bestsize]) and \
+                  a[besti+bestsize] == b[bestj+bestsize]:
+                bestsize += 1
+
+        if self.bjunk:
+            # Now that we have a wholly interesting match (albeit possibly
+            # empty!), we may as well suck up the matching junk on each
+            # side of it too.  Can't think of a good reason not to, and it
+            # saves post-processing the (possibly considerable) expense of
+            # figuring out what to do with it.  In the case of an empty
+            # interesting match, this is clearly the right thing to do,
+            # because no other kind of match is possible in the regions.
+            while besti > alo and bestj > blo and \
+                  isbjunk(b[bestj-1]) and \
+                  a[besti-1] == b[bestj-1]:
+                besti -= 1
+                bestj -= 1
+                bestsize += 1
+
+            while besti+bestsize < ahi and bestj+bestsize < bhi and \
+                  isbjunk(b[bestj+bestsize]) and \
+                  a[besti+bestsize] == b[bestj+bestsize]:
+                bestsize = bestsize + 1
 
         return Match(besti, bestj, bestsize)
 

From c1470ad7fe42396391572a61cd45c6f055cf3c6a Mon Sep 17 00:00:00 2001
From: "d.grigonis" <d.grigonis@me.com>
Date: Fri, 23 Jan 2026 02:45:08 +0200
Subject: [PATCH 3/7] minor changes

---
 Lib/difflib.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index bcde4a0aa56ea9..6459ce45bffa62 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -38,8 +38,8 @@
 Match = _namedtuple('Match', 'a b size')
 
 
-class _LCSUBDict:
-    """Dict method for finding longest common substring.
+class _LCSUBSimple:
+    """Simple dict method for finding longest common substring.
 
     Complexity:
         T: O(n1 + n2) best, O(n1 × n2) worst
@@ -481,18 +481,18 @@ def __chain_b(self):
 
         self._max_bcount = max(bcounts.values()) if bcounts else 0
         self._all_junk = frozenset(junk | popular)
-        self._lcsub_aut = None       # _LCSUBAutomaton instance
-        self._lcsub_dict = None      # _LCSUBDict instanct
+        self._lcsub_automaton = None    # _LCSUBAutomaton instance
+        self._lcsub_simple = None       # _LCSUBSimple instanct
 
     def _get_lcsub_calculator(self, automaton=False):
         if automaton:
-            if self._lcsub_aut is None:
-                self._lcsub_aut = _LCSUBAutomaton(self.b, self._all_junk)
-            return self._lcsub_aut
+            if self._lcsub_automaton is None:
+                self._lcsub_automaton = _LCSUBAutomaton(self.b, self._all_junk)
+            return self._lcsub_automaton
         else:
-            if self._lcsub_dict is None:
-                self._lcsub_dict = _LCSUBDict(self.b, self._all_junk)
-            return self._lcsub_dict
+            if self._lcsub_simple is None:
+                self._lcsub_simple = _LCSUBSimple(self.b, self._all_junk)
+            return self._lcsub_simple
 
     @property
     def b2j(self):
@@ -574,7 +574,9 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
             # For that specific set it gave selection accuracy of 95%.
             # Weak spot in this is cases with little or no element overlap at all.
             # However, such check would have more cost than benefit.
-            use_automaton = self._max_bcount * asize > bsize * 6 + asize * 2
+            automaton_cost = bsize * 6 + asize * 2
+            simple_cost = self._max_bcount * asize
+            use_automaton = simple_cost > automaton_cost
             calc = self._get_lcsub_calculator(use_automaton)
             besti, bestj, bestsize = calc.find(a, alo, ahi, blo, bhi)
 

From 8fb5f47fdfb37185fc882a6d3ce4c9a7f028c4df Mon Sep 17 00:00:00 2001
From: "d.grigonis" <d.grigonis@me.com>
Date: Fri, 23 Jan 2026 06:03:50 +0200
Subject: [PATCH 4/7] initial trimming of a and test fix

---
 Lib/difflib.py          | 106 ++++++++++++++++++++++++++++------------
 Lib/test/test_pyclbr.py |   2 +-
 2 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 6459ce45bffa62..b2cacf8a08141b 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -55,19 +55,25 @@ def __init__(self, seq2, junk=()):
             junk = frozenset(junk)
         self.seq2 = seq2
         self.junk = junk
-        self.pos2 = pos2 = {}   # positions of each element in seq2
-        for i, elt in enumerate(seq2):
-            indices = pos2.setdefault(elt, [])
-            indices.append(i)
-        if junk:
-            for elt in junk:
-                del pos2[elt]
+        self.pos2 = None
+
+    def _build(self):
+        if self.pos2 is None:
+            self.pos2 = pos2 = {}   # positions of each element in seq2
+            for i, elt in enumerate(self.seq2):
+                indices = pos2.setdefault(elt, [])
+                indices.append(i)
+            junk = self.junk
+            if junk:
+                for elt in junk:
+                    del pos2[elt]
 
     def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
         if stop1 is None:
             stop1 = len(seq1)
         if stop2 is None:
             stop2 = len(self.seq2)
+        self._build()
         pos2 = self.pos2
         j2len = {}
         nothing = []
@@ -129,6 +135,12 @@ def _build(self, start2, stop2):
         """
         Automaton needs to rebuild for every (start2, stop2)
         This is made to cache the last one and only rebuild on new values
+
+        Note that to construct Automaton that can be queried for any
+            (start2, stop2), each node would need to store a set of
+            indices. And this is prone to O(n^2) memory explosion.
+            Current approach maintains reasonable memory guarantees
+            and is also much simpler in comparison.
         """
         if self.root is not None and self.cache == (start2, stop2):
             return
@@ -480,24 +492,17 @@ def __chain_b(self):
                 del bcounts[elt]
 
         self._max_bcount = max(bcounts.values()) if bcounts else 0
-        self._all_junk = frozenset(junk | popular)
-        self._lcsub_automaton = None    # _LCSUBAutomaton instance
-        self._lcsub_simple = None       # _LCSUBSimple instanct
-
-    def _get_lcsub_calculator(self, automaton=False):
-        if automaton:
-            if self._lcsub_automaton is None:
-                self._lcsub_automaton = _LCSUBAutomaton(self.b, self._all_junk)
-            return self._lcsub_automaton
-        else:
-            if self._lcsub_simple is None:
-                self._lcsub_simple = _LCSUBSimple(self.b, self._all_junk)
-            return self._lcsub_simple
+        self._all_junk = all_junk = frozenset(junk | popular)
+        self._lcsub_simple = _LCSUBSimple(b, all_junk)
+        self._lcsub_automaton = _LCSUBAutomaton(b, all_junk)
 
     @property
     def b2j(self):
         # NOTE: For backwards compatibility
-        return self._get_lcsub_calculator(automaton=False).pos2
+        simple_calc = self._lcsub_simple
+        if simple_calc.pos2 is None:
+            simple_calc._build()
+        return simple_calc.pos2
 
     def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -568,17 +573,54 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
         if asize <= 0 and bsize <= 0:
             besti, bestj, bestsize = alo, blo, 0
         else:
-            # Constant to contruct automaton is roughly 6.
-            # Constant to run automaton is roughly 2.
-            # This has been tested on a range of data sets.
-            # For that specific set it gave selection accuracy of 95%.
-            # Weak spot in this is cases with little or no element overlap at all.
-            # However, such check would have more cost than benefit.
-            automaton_cost = bsize * 6 + asize * 2
-            simple_cost = self._max_bcount * asize
-            use_automaton = simple_cost > automaton_cost
-            calc = self._get_lcsub_calculator(use_automaton)
-            besti, bestj, bestsize = calc.find(a, alo, ahi, blo, bhi)
+            # Can trim a from both ends while characters are not in b
+            # This is cheap and we have bcounts at all times
+            bcounts = self._bcounts
+            tmp_alo = alo
+            tmp_ahi = ahi
+            while tmp_alo < tmp_ahi and a[tmp_alo] not in bcounts:
+                tmp_alo += 1
+            while tmp_alo < tmp_ahi and a[tmp_ahi - 1] not in bcounts:
+                tmp_ahi -= 1
+            tmp_asize = tmp_ahi - tmp_alo
+            if tmp_asize <= 0:
+                besti, bestj, bestsize = alo, blo, 0
+            else:
+                # Constant to construct automaton is roughly - 6.
+                # Constant to run automaton is roughly - 1.
+                # This has been tested on a range of data sets.
+                # It gave selection accuracy of ~95%.
+                # Weak spot is cases with little or no element overlap at all.
+                # However, such check would likely have more cost than benefit.
+                simple_calc = self._lcsub_simple
+                automaton = self._lcsub_automaton
+
+                automaton_cost = tmp_asize
+                if automaton.cache != (blo, bhi):
+                    automaton_cost += bsize * 6
+                simple_cost = self._max_bcount * tmp_asize
+                if simple_calc.pos2 is None:
+                    simple_cost += bsize
+                if simple_cost < automaton_cost:
+                    calc = simple_calc
+                else:
+                    calc = automaton
+                besti, bestj, bestsize = calc.find(a, tmp_alo, tmp_ahi, blo, bhi)
+
+        # NOTE: Doing it at the same time results in bigger matches!
+        # # If bjunk or bpopular were omitted in matching (performance reasons)
+        # # We now extend the match to capture as much as we can
+        # if self.bjunk or self.bpopular:
+        #     while besti > alo and bestj > blo and a[besti-1] == b[bestj-1]:
+        #         besti -= 1
+        #         bestj -= 1
+        #         bestsize += 1
+        #     lasti = besti + bestsize
+        #     lastj = bestj + bestsize
+        #     while lasti < ahi and lastj < bhi and a[lasti] == b[lastj]:
+        #         lasti += 1
+        #         lastj += 1
+        #         bestsize += 1
 
         if self.bpopular:
             # Extend the best by non-junk elements on each end.  In particular,
diff --git a/Lib/test/test_pyclbr.py b/Lib/test/test_pyclbr.py
index 79ef178f3807f4..f709cdb9522055 100644
--- a/Lib/test/test_pyclbr.py
+++ b/Lib/test/test_pyclbr.py
@@ -172,7 +172,7 @@ def test_easy(self):
         with temporary_main_spec():
             self.checkModule('doctest', ignore=("TestResults", "_SpoofOut",
                                                 "DocTestCase", '_DocTestSuite'))
-        self.checkModule('difflib', ignore=("Match",))
+        self.checkModule('difflib', ignore=("Match", "b2j"))
 
     def test_cases(self):
         # see test.pyclbr_input for the rationale behind the ignored symbols

From 2a8e6a440608fb25597ea9cd9dbfcb199264fac9 Mon Sep 17 00:00:00 2001
From: "d.grigonis" <d.grigonis@me.com>
Date: Fri, 23 Jan 2026 19:31:03 +0200
Subject: [PATCH 5/7] minor changes

---
 Lib/difflib.py | 84 +++++++++++++++++++-------------------------------
 1 file changed, 31 insertions(+), 53 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index b2cacf8a08141b..51bef0d81858ca 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -142,7 +142,8 @@ def _build(self, start2, stop2):
             Current approach maintains reasonable memory guarantees
             and is also much simpler in comparison.
         """
-        if self.root is not None and self.cache == (start2, stop2):
+        key = (start2, stop2)
+        if self.root is not None and self.cache == key:
             return
 
         self.root = root = [0, None, {}, -1]
@@ -186,7 +187,7 @@ def _build(self, start2, stop2):
 
                 last = curr
 
-        self.cache = (start2, stop2)
+        self.cache = key
 
     def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
         size1 = len(seq1)
@@ -607,57 +608,34 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
                     calc = automaton
                 besti, bestj, bestsize = calc.find(a, tmp_alo, tmp_ahi, blo, bhi)
 
-        # NOTE: Doing it at the same time results in bigger matches!
-        # # If bjunk or bpopular were omitted in matching (performance reasons)
-        # # We now extend the match to capture as much as we can
-        # if self.bjunk or self.bpopular:
-        #     while besti > alo and bestj > blo and a[besti-1] == b[bestj-1]:
-        #         besti -= 1
-        #         bestj -= 1
-        #         bestsize += 1
-        #     lasti = besti + bestsize
-        #     lastj = bestj + bestsize
-        #     while lasti < ahi and lastj < bhi and a[lasti] == b[lastj]:
-        #         lasti += 1
-        #         lastj += 1
-        #         bestsize += 1
-
-        if self.bpopular:
-            # Extend the best by non-junk elements on each end.  In particular,
-            # "popular" non-junk elements aren't in b2j, which greatly speeds
-            # the inner loop above, but also means "the best" match so far
-            # doesn't contain any junk *or* popular non-junk elements.
-            while besti > alo and bestj > blo and \
-                  not isbjunk(b[bestj-1]) and \
-                  a[besti-1] == b[bestj-1]:
-                besti -= 1
-                bestj -= 1
-                bestsize += 1
-
-            while besti+bestsize < ahi and bestj+bestsize < bhi and \
-                  not isbjunk(b[bestj+bestsize]) and \
-                  a[besti+bestsize] == b[bestj+bestsize]:
-                bestsize += 1
-
-        if self.bjunk:
-            # Now that we have a wholly interesting match (albeit possibly
-            # empty!), we may as well suck up the matching junk on each
-            # side of it too.  Can't think of a good reason not to, and it
-            # saves post-processing the (possibly considerable) expense of
-            # figuring out what to do with it.  In the case of an empty
-            # interesting match, this is clearly the right thing to do,
-            # because no other kind of match is possible in the regions.
-            while besti > alo and bestj > blo and \
-                  isbjunk(b[bestj-1]) and \
-                  a[besti-1] == b[bestj-1]:
-                besti -= 1
-                bestj -= 1
-                bestsize += 1
-
-            while besti+bestsize < ahi and bestj+bestsize < bhi and \
-                  isbjunk(b[bestj+bestsize]) and \
-                  a[besti+bestsize] == b[bestj+bestsize]:
-                bestsize = bestsize + 1
+        # Extend the best by non-junk elements on each end.  In particular,
+        # "popular" non-junk elements aren't in b2j, which greatly speeds
+        # the inner loop above, but also means "the best" match so far
+        # doesn't contain any junk *or* popular non-junk elements.
+        while besti > alo and bestj > blo and \
+              not isbjunk(b[bestj-1]) and \
+              a[besti-1] == b[bestj-1]:
+            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+        while besti+bestsize < ahi and bestj+bestsize < bhi and \
+              not isbjunk(b[bestj+bestsize]) and \
+              a[besti+bestsize] == b[bestj+bestsize]:
+            bestsize += 1
+
+        # Now that we have a wholly interesting match (albeit possibly
+        # empty!), we may as well suck up the matching junk on each
+        # side of it too.  Can't think of a good reason not to, and it
+        # saves post-processing the (possibly considerable) expense of
+        # figuring out what to do with it.  In the case of an empty
+        # interesting match, this is clearly the right thing to do,
+        # because no other kind of match is possible in the regions.
+        while besti > alo and bestj > blo and \
+              isbjunk(b[bestj-1]) and \
+              a[besti-1] == b[bestj-1]:
+            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+        while besti+bestsize < ahi and bestj+bestsize < bhi and \
+              isbjunk(b[bestj+bestsize]) and \
+              a[besti+bestsize] == b[bestj+bestsize]:
+            bestsize = bestsize + 1
 
         return Match(besti, bestj, bestsize)
 

From 49b69ddf7c0efef6b71e800af2a01eb427533c98 Mon Sep 17 00:00:00 2001
From: "d.grigonis" <d.grigonis@me.com>
Date: Fri, 23 Jan 2026 22:13:22 +0200
Subject: [PATCH 6/7] new threshold and consistency edits

---
 Lib/difflib.py | 230 ++++++++++++++++++++++++++++---------------------
 1 file changed, 130 insertions(+), 100 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 51bef0d81858ca..5e6a61e407814b 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -38,6 +38,14 @@
 Match = _namedtuple('Match', 'a b size')
 
 
+def _adjust_indices(seq, start, stop):
+    assert start >= 0
+    size = len(seq)
+    if stop is None or stop > size:
+        stop = size
+    return start, stop
+
+
 class _LCSUBSimple:
     """Simple dict method for finding longest common substring.
 
@@ -46,51 +54,61 @@ class _LCSUBSimple:
         S: O(n2)
 
     Members:
-        pos2    for x in seq2, pos2[x] is a list of the indices (into seq2)
+        b2j    for x in b, b2j[x] is a list of the indices (into b)
                 at which x appears; junk elements do not appear
     """
 
-    def __init__(self, seq2, junk=()):
+    def __init__(self, b, junk=()):
         if not isinstance(junk, frozenset):
             junk = frozenset(junk)
-        self.seq2 = seq2
+        self.b = b
         self.junk = junk
-        self.pos2 = None
-
-    def _build(self):
-        if self.pos2 is None:
-            self.pos2 = pos2 = {}   # positions of each element in seq2
-            for i, elt in enumerate(self.seq2):
-                indices = pos2.setdefault(elt, [])
+        self._b2j = None
+
+    def isbuilt(self, blo, bhi):
+        blo, bhi = _adjust_indices(self.b, blo, bhi)
+        if blo >= bhi:
+            return True
+        return self._b2j is not None
+
+    def _get_b2j(self):
+        b2j = self._b2j
+        if b2j is None:
+            b2j = {}   # positions of each element in b
+            for i, elt in enumerate(self.b):
+                indices = b2j.setdefault(elt, [])
                 indices.append(i)
             junk = self.junk
             if junk:
                 for elt in junk:
-                    del pos2[elt]
-
-    def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
-        if stop1 is None:
-            stop1 = len(seq1)
-        if stop2 is None:
-            stop2 = len(self.seq2)
-        self._build()
-        pos2 = self.pos2
+                    del b2j[elt]
+            self._b2j = b2j
+
+        return b2j
+
+    def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
+        alo, ahi = _adjust_indices(a, alo, ahi)
+        blo, bhi = _adjust_indices(self.b, blo, bhi)
+        if alo >= ahi or blo >= bhi:
+            return (alo, blo, 0)
+
+        b2j = self._get_b2j()
         j2len = {}
         nothing = []
-        besti, bestj, bestsize = start1, start2, 0
+        besti, bestj, bestsize = alo, blo, 0
         # find longest junk-free match
         # during an iteration of the loop, j2len[j] = length of longest
-        # junk-free match ending with seq1[i-1] and seq2[j]
-        for i in range(start1, stop1):
-            # look at all instances of seq1[i] in seq2; note that because
-            # pos2 has no junk keys, the loop is skipped if seq1[i] is junk
+        # junk-free match ending with a[i-1] and b[j]
+        for i in range(alo, ahi):
+            # look at all instances of a[i] in b; note that because
+            # b2j has no junk keys, the loop is skipped if a[i] is junk
             j2lenget = j2len.get
             newj2len = {}
-            for j in pos2.get(seq1[i], nothing):
-                # seq1[i] matches seq2[j]
-                if j < start2:
+            for j in b2j.get(a[i], nothing):
+                # a[i] matches b[j]
+                if j < blo:
                     continue
-                if j >= stop2:
+                if j >= bhi:
                     break
                 k = newj2len[j] = j2lenget(j-1, 0) + 1
                 if k > bestsize:
@@ -123,81 +141,87 @@ class _LCSUBAutomaton:
             end_pos - end position of first occurrence (used for result)
     """
 
-    def __init__(self, seq2, junk=()):
+    def __init__(self, b, junk=()):
         if not isinstance(junk, frozenset):
             junk = frozenset(junk)
-        self.seq2 = seq2
+        self.b = b
         self.junk = junk
-        self.root = None
-        self.cache = (None, None)
+        self._root = None
+        self._cache = (None, None)
+
+    def isbuilt(self, blo, bhi):
+        blo, bhi = _adjust_indices(self.b, blo, bhi)
+        if blo >= bhi:
+            return True
+        return self._root is not None and self._cache == (blo, bhi)
 
-    def _build(self, start2, stop2):
+    def _get_root(self, blo, bhi):
         """
-        Automaton needs to rebuild for every (start2, stop2)
+        Automaton needs to rebuild for every (blo, bhi)
         This is made to cache the last one and only rebuild on new values
 
         Note that to construct Automaton that can be queried for any
-            (start2, stop2), each node would need to store a store a set of
+            (blo, bhi), each node would need to store a store a set of
             indices. And this is prone to O(n^2) memory explosion.
             Current approach maintains reasonable memory guarantees
             and is also much simpler in comparison.
         """
-        key = (start2, stop2)
-        if self.root is not None and self.cache == key:
-            return
+        key = (blo, bhi)
+        root = self._root
+        if root is None or self._cache != key:
+            root = [0, None, {}, -1]
+            b = self.b
+            junk = self.junk
+            last_len = 0
+            last = root
+            for j in range(blo, bhi):
+                c = b[j]
+                if c in junk:
+                    last_len = 0
+                    last = root
+                else:
+                    last_len += 1
+                    curr = [last_len, None, {}, j]
 
-        self.root = root = [0, None, {}, -1]
-        seq2 = self.seq2
-        junk = self.junk
-        last_len = 0
-        last = root
-        for j in range(start2, stop2):
-            c = seq2[j]
-            if c in junk:
-                last_len = 0
-                last = root
-            else:
-                last_len += 1
-                curr = [last_len, None, {}, j]
-
-                p = last
-                p_next = p[_NEXT]
-                while c not in p_next:
-                    p_next[c] = curr
-                    if p is root:
-                        curr[_LINK] = root
-                        break
-                    p = p[_LINK]
+                    p = last
                     p_next = p[_NEXT]
-                else:
-                    q = p_next[c]
-                    p_length_p1 = p[_LENGTH] + 1
-                    if p_length_p1 == q[_LENGTH]:
-                        curr[_LINK] = q
+                    while c not in p_next:
+                        p_next[c] = curr
+                        if p is root:
+                            curr[_LINK] = root
+                            break
+                        p = p[_LINK]
+                        p_next = p[_NEXT]
                     else:
-                        # Copy `q[_POS]` to ensure leftmost match in seq2
-                        clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]]
-                        while (p_next := p[_NEXT]).get(c) is q:
-                            p_next[c] = clone
-                            if p is root:
-                                break
-                            p = p[_LINK]
-
-                        q[_LINK] = curr[_LINK] = clone
-
-                last = curr
-
-        self.cache = key
-
-    def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
-        size1 = len(seq1)
-        size2 = len(self.seq2)
-        if stop1 is None or stop1 > size1:
-            stop1 = size1
-        if stop2 is None or stop2 > size2:
-            stop2 = size2
-        self._build(start2, stop2)
-        root = self.root
+                        q = p_next[c]
+                        p_length_p1 = p[_LENGTH] + 1
+                        if p_length_p1 == q[_LENGTH]:
+                            curr[_LINK] = q
+                        else:
+                            # Copy `q[_POS]` to ensure leftmost match in b
+                            clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]]
+                            while (p_next := p[_NEXT]).get(c) is q:
+                                p_next[c] = clone
+                                if p is root:
+                                    break
+                                p = p[_LINK]
+
+                            q[_LINK] = curr[_LINK] = clone
+
+                    last = curr
+
+            self._root = root
+            self._cache = key
+
+        return root
+
+    def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
+        alo, ahi = _adjust_indices(a, alo, ahi)
+        blo, bhi = _adjust_indices(self.b, blo, bhi)
+        if alo >= ahi or blo >= bhi:
+            return (alo, blo, 0)
+
+        root = self._get_root(blo, bhi)
         junk = self.junk
         v = root
         l = 0
@@ -205,8 +229,8 @@ def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
         best_state = None
         best_pos = 0
 
-        for i in range(start1, stop1):
-            c = seq1[i]
+        for i in range(alo, ahi):
+            c = a[i]
             if c in junk:
                 v = root
                 l = 0
@@ -225,7 +249,7 @@ def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
                         best_pos = i
 
         if not best_len:
-            return (start1, start2, 0)
+            return (alo, blo, 0)
 
         start_in_s1 = best_pos + 1 - best_len
         end_in_s2 = best_state[_POS]
@@ -492,7 +516,14 @@ def __chain_b(self):
             for elt in popular: # ditto; as fast for 1% deletion
                 del bcounts[elt]
 
-        self._max_bcount = max(bcounts.values()) if bcounts else 0
+        if not bcounts:
+            self._bcount_thres = 0
+        else:
+            sum_bcount = sum(bcounts.values())
+            avg_bcount = sum(c * c for c in bcounts.values()) / sum_bcount
+            max_bcount = max(bcounts.values())
+            self._bcount_thres = avg_bcount * 0.8 + max_bcount * 0.2
+
         self._all_junk = all_junk = frozenset(junk | popular)
         self._lcsub_simple = _LCSUBSimple(b, all_junk)
         self._lcsub_automaton = _LCSUBAutomaton(b, all_junk)
@@ -500,10 +531,7 @@ def __chain_b(self):
     @property
     def b2j(self):
         # NOTE: For backwards compatibility
-        simple_calc = self._lcsub_simple
-        if simple_calc.pos2 is None:
-            simple_calc._build()
-        return simple_calc.pos2
+        return self._lcsub_simple._get_b2j()
 
     def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -596,12 +624,14 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
                 simple_calc = self._lcsub_simple
                 automaton = self._lcsub_automaton
 
+                simple_cost = self._bcount_thres * tmp_asize
+                if not simple_calc.isbuilt(blo, bhi):
+                    simple_cost += bsize
+
                 automaton_cost = tmp_asize
-                if automaton.cache != (blo, bhi):
+                if not automaton.isbuilt(blo, bhi):
                     automaton_cost += bsize * 6
-                simple_cost = self._max_bcount * tmp_asize
-                if simple_calc.pos2 is None:
-                    simple_cost += bsize
+
                 if simple_cost < automaton_cost:
                     calc = simple_calc
                 else:

From e5a51241d8d5c424b3385d5cb6704edf28a7c863 Mon Sep 17 00:00:00 2001
From: "d.grigonis" <d.grigonis@me.com>
Date: Sat, 24 Jan 2026 02:20:24 +0200
Subject: [PATCH 7/7] minor minor

---
 Lib/difflib.py | 142 +++++++++++++++++++++++--------------------------
 1 file changed, 68 insertions(+), 74 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 5e6a61e407814b..3a2f2ba0785ee3 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -39,7 +39,8 @@
 
 
 def _adjust_indices(seq, start, stop):
-    assert start >= 0
+    if start < 0:
+        raise ValueError('Starting index can not be negative')
     size = len(seq)
     if stop is None or stop > size:
         stop = size
@@ -52,9 +53,10 @@ class _LCSUBSimple:
     Complexity:
         T: O(n1 + n2) best, O(n1 × n2) worst
         S: O(n2)
+            where n1 = len(a), n2 = len(b)
 
     Members:
-        b2j    for x in b, b2j[x] is a list of the indices (into b)
+        _b2j    for x in b, _b2j[x] is a list of the indices (into b)
                 at which x appears; junk elements do not appear
     """
 
@@ -73,17 +75,18 @@ def isbuilt(self, blo, bhi):
 
     def _get_b2j(self):
         b2j = self._b2j
-        if b2j is None:
-            b2j = {}   # positions of each element in b
-            for i, elt in enumerate(self.b):
-                indices = b2j.setdefault(elt, [])
-                indices.append(i)
-            junk = self.junk
-            if junk:
-                for elt in junk:
-                    del b2j[elt]
-            self._b2j = b2j
+        if b2j is not None:
+            return b2j
 
+        b2j = {}   # positions of each element in b
+        for i, elt in enumerate(self.b):
+            indices = b2j.setdefault(elt, [])
+            indices.append(i)
+        junk = self.junk
+        if junk:
+            for elt in junk:
+                del b2j[elt]
+        self._b2j = b2j
         return b2j
 
     def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
@@ -120,18 +123,13 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
         return besti, bestj, bestsize
 
 
-_LENGTH = 0
-_LINK = 1
-_NEXT = 2
-_POS = 3
-
-
 class _LCSUBAutomaton:
     """Suffix Automaton for finding longest common substring.
 
     Complexity:
         T: O(n1 + n2)   - roughly 2 * n1 + 6 * n2
         S: O(n2)        - maximum nodes: 2 * n2 + 1
+            where n1 = len(a), n2 = len(b)
 
     Node spec:
         node: list = [length: int, link: list, next: dict, end_pos: int]
@@ -157,62 +155,58 @@ def isbuilt(self, blo, bhi):
 
     def _get_root(self, blo, bhi):
         """
-        Automaton needs to rebuild for every (blo, bhi)
-        This is made to cache the last one and only rebuild on new values
-
-        Note that to construct Automaton that can be queried for any
-            (blo, bhi), each node would need to store a store a set of
-            indices. And this is prone to O(n^2) memory explosion.
-            Current approach maintains reasonable memory guarantees
-            and is also much simpler in comparison.
+        The automaton must be rebuilt for every distinct (blo, bhi) range.
+        Only the most recent build is cached; a new range triggers a rebuild.
         """
         key = (blo, bhi)
         root = self._root
-        if root is None or self._cache != key:
-            root = [0, None, {}, -1]
-            b = self.b
-            junk = self.junk
-            last_len = 0
-            last = root
-            for j in range(blo, bhi):
-                c = b[j]
-                if c in junk:
-                    last_len = 0
-                    last = root
+        if root is not None and self._cache == key:
+            return root
+
+        LEN, LINK, NEXT, EPOS = 0, 1, 2, 3
+        root = [0, None, {}, -1]
+        b = self.b
+        junk = self.junk
+        last_len = 0
+        last = root
+        for j in range(blo, bhi):
+            c = b[j]
+            if c in junk:
+                last_len = 0
+                last = root
+            else:
+                last_len += 1
+                curr = [last_len, None, {}, j]
+
+                p = last
+                p_next = p[NEXT]
+                while c not in p_next:
+                    p_next[c] = curr
+                    if p is root:
+                        curr[LINK] = root
+                        break
+                    p = p[LINK]
+                    p_next = p[NEXT]
                 else:
-                    last_len += 1
-                    curr = [last_len, None, {}, j]
-
-                    p = last
-                    p_next = p[_NEXT]
-                    while c not in p_next:
-                        p_next[c] = curr
-                        if p is root:
-                            curr[_LINK] = root
-                            break
-                        p = p[_LINK]
-                        p_next = p[_NEXT]
+                    q = p_next[c]
+                    p_len_p1 = p[LEN] + 1
+                    if p_len_p1 == q[LEN]:
+                        curr[LINK] = q
                     else:
-                        q = p_next[c]
-                        p_length_p1 = p[_LENGTH] + 1
-                        if p_length_p1 == q[_LENGTH]:
-                            curr[_LINK] = q
-                        else:
-                            # Copy `q[_POS]` to ensure leftmost match in b
-                            clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]]
-                            while (p_next := p[_NEXT]).get(c) is q:
-                                p_next[c] = clone
-                                if p is root:
-                                    break
-                                p = p[_LINK]
-
-                            q[_LINK] = curr[_LINK] = clone
-
-                    last = curr
-
-            self._root = root
-            self._cache = key
+                        # Copy `q[EPOS]` to ensure leftmost match in b
+                        clone = [p_len_p1, q[LINK], q[NEXT].copy(), q[EPOS]]
+                        while (p_next := p[NEXT]).get(c) is q:
+                            p_next[c] = clone
+                            if p is root:
+                                break
+                            p = p[LINK]
+
+                        q[LINK] = curr[LINK] = clone
+
+                last = curr
 
+        self._root = root
+        self._cache = key
         return root
 
     def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
@@ -221,6 +215,7 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
         if alo >= ahi or blo >= bhi:
             return (alo, blo, 0)
 
+        LEN, LINK, NEXT, EPOS = 0, 1, 2, 3
         root = self._get_root(blo, bhi)
         junk = self.junk
         v = root
@@ -235,11 +230,11 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
                 v = root
                 l = 0
             else:
-                while v is not root and c not in v[_NEXT]:
-                    v = v[_LINK]
-                    l = v[_LENGTH]
+                while v is not root and c not in v[NEXT]:
+                    v = v[LINK]
+                    l = v[LEN]
 
-                v_next = v[_NEXT]
+                v_next = v[NEXT]
                 if c in v_next:
                     v = v_next[c]
                     l += 1
@@ -252,8 +247,7 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
             return (alo, blo, 0)
 
         start_in_s1 = best_pos + 1 - best_len
-        end_in_s2 = best_state[_POS]
-        start_in_s2 = end_in_s2 + 1 - best_len
+        start_in_s2 = best_state[EPOS] + 1 - best_len
         return (start_in_s1, start_in_s2, best_len)