From 2e2dfd5c4ac2d697c65719e499992faffdce4d84 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 22 Jan 2026 21:44:51 +0200 Subject: [PATCH 1/7] POC --- Lib/difflib.py | 214 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 167 insertions(+), 47 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 7c7e233b013a76..0b5bc0ce48502c 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -32,11 +32,110 @@ from _colorize import can_colorize, get_theme from heapq import nlargest as _nlargest -from collections import namedtuple as _namedtuple +from collections import Counter as _Counter, namedtuple as _namedtuple from types import GenericAlias Match = _namedtuple('Match', 'a b size') +_LENGTH = 0 +_LINK = 1 +_NEXT = 2 +_POS = 3 + + +class _LCSUBAutomaton: + """Suffix Automaton for finding longest common substring.""" + + def __init__(self, s2, start2=0, stop2=None, *, junk=()): + if stop2 is None: + stop2 = len(s2) + + self.start2 = start2 + self.stop2 = stop2 + self.junk = frozenset(junk) + self.root = root = [0, None, {}, -1] # [length, link, next, end_pos] + + last_len = 0 + last = root + for j in range(start2, stop2): + c = s2[j] + if c in junk: + last_len = 0 + last = root + else: + last_len += 1 + curr = [last_len, None, {}, j] + + p = last + p_next = p[_NEXT] + while c not in p_next: + p_next[c] = curr + if p is root: + curr[_LINK] = root + break + p = p[_LINK] + p_next = p[_NEXT] + else: + q = p_next[c] + p_length_p1 = p[_LENGTH] + 1 + if p_length_p1 == q[_LENGTH]: + curr[_LINK] = q + else: + clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]] + while (p_next := p[_NEXT]).get(c) is q: + p_next[c] = clone + if p is root: + break + p = p[_LINK] + + q[_LINK] = curr[_LINK] = clone + + last = curr + + def find(self, s1, start1=0, stop1=None): + if stop1 is None: + stop1 = len(s1) + root = self.root + junk = self.junk + v = root + l = 0 + best_len = 0 + best_state = None + best_pos = 0 + + for i in range(start1, stop1): + c = s1[i] + if c in junk: + v = root + l = 0 + else: + while v is not root and c not in v[_NEXT]: + v = v[_LINK] + l = v[_LENGTH] + + v_next = v[_NEXT] + if c in v_next: + v = v_next[c] + l += 1 + if l > best_len: + best_len = l + best_state = v + best_pos = i + + if not best_len: + return (start1, self.start2, 0) + + start_in_s1 = best_pos + 1 - best_len + end_in_s2 = best_state[_POS] + start_in_s2 = end_in_s2 + 1 - best_len + return (start_in_s1, start_in_s2, best_len) + + +def longest_common_substring(s1, s2, start1=0, stop1=None, start2=0, stop2=None, + *, junk=()): + return _LCSUBAutomaton(s2, start2, stop2, junk=junk).find(s1, start1, stop1) + + def _calculate_ratio(matches, length): if length: return 2.0 * matches / length @@ -276,32 +375,42 @@ def __chain_b(self): # out the junk later is much cheaper than building b2j "right" # from the start. b = self.b - self.b2j = b2j = {} - - for i, elt in enumerate(b): - indices = b2j.setdefault(elt, []) - indices.append(i) - - # Purge junk elements - self.bjunk = junk = set() isjunk = self.isjunk - if isjunk: - for elt in b2j.keys(): - if isjunk(elt): - junk.add(elt) - for elt in junk: # separate loop avoids separate list of keys - del b2j[elt] - - # Purge popular elements that are not junk + self.bjunk = junk = set() + autojunk = self.autojunk self.bpopular = popular = set() - n = len(b) - if self.autojunk and n >= 200: - ntest = n // 100 + 1 - for elt, idxs in b2j.items(): - if len(idxs) > ntest: - popular.add(elt) - for elt in popular: # ditto; as fast for 1% deletion - del b2j[elt] + self.b2j = b2j = {} + if autojunk: + for i, elt in enumerate(b): + indices = b2j.setdefault(elt, []) + indices.append(i) + + # Purge junk elements + if isjunk: + for elt in b2j.keys(): + if isjunk(elt): + junk.add(elt) + for elt in junk: # separate loop avoids separate list of keys + del b2j[elt] + + # Purge popular elements that are not junk + n = len(b) + if autojunk and n >= 200: + ntest = n // 100 + 1 + for elt, idxs in b2j.items(): + if len(idxs) > ntest: + popular.add(elt) + for elt in popular: # ditto; as fast for 1% deletion + del b2j[elt] + else: + # Prepare LCSUB Automaton + if isjunk: + bcounts = _Counter(b) + junk.update(filter(isjunk, bcounts)) + for elt in junk: + del bcounts[elt] + self.aut_cache = (None, None) # Cache last automaton + self.all_junk = junk | popular def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. @@ -361,32 +470,43 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): # Windiff ends up at the same place as diff, but by pairing up # the unique 'b's and then matching the first two 'a's. - a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ + a, b, isbjunk = self.a, self.b, self.bjunk.__contains__ if ahi is None: ahi = len(a) if bhi is None: bhi = len(b) - besti, bestj, bestsize = alo, blo, 0 - # find longest junk-free match - # during an iteration of the loop, j2len[j] = length of longest - # junk-free match ending with a[i-1] and b[j] - j2len = {} - nothing = [] - for i in range(alo, ahi): - # look at all instances of a[i] in b; note that because - # b2j has no junk keys, the loop is skipped if a[i] is junk - j2lenget = j2len.get - newj2len = {} - for j in b2j.get(a[i], nothing): - # a[i] matches b[j] - if j < blo: - continue - if j >= bhi: - break - k = newj2len[j] = j2lenget(j-1, 0) + 1 - if k > bestsize: - besti, bestj, bestsize = i-k+1, j-k+1, k - j2len = newj2len + if alo >= ahi: + besti, bestj, bestsize = alo, blo, 0 + elif self.autojunk: + b2j = self.b2j + besti, bestj, bestsize = alo, blo, 0 + # find longest junk-free match + # during an iteration of the loop, j2len[j] = length of longest + # junk-free match ending with a[i-1] and b[j] + j2len = {} + nothing = [] + for i in range(alo, ahi): + # look at all instances of a[i] in b; note that because + # b2j has no junk keys, the loop is skipped if a[i] is junk + j2lenget = j2len.get + newj2len = {} + for j in b2j.get(a[i], nothing): + # a[i] matches b[j] + if j < blo: + continue + if j >= bhi: + break + k = newj2len[j] = j2lenget(j-1, 0) + 1 + if k > bestsize: + besti, bestj, bestsize = i-k+1, j-k+1, k + j2len = newj2len + else: + # Without autojunk, run LCSUB Automaton + blo_bhi, aut = self.aut_cache + if aut is None or blo_bhi != (blo, bhi): + aut = _LCSUBAutomaton(b, blo, bhi, junk=self.all_junk) + self.aut_cache = ((blo, bhi), aut) + besti, bestj, bestsize = aut.find(a, alo, ahi) # Extend the best by non-junk elements on each end. In particular, # "popular" non-junk elements aren't in b2j, which greatly speeds From 07f3db053cb6edd77017cfb121045d328e67a22c Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 23 Jan 2026 02:23:05 +0200 Subject: [PATCH 2/7] initial version --- Lib/difflib.py | 296 +++++++++++++++++++++++++++++++------------------ 1 file changed, 187 insertions(+), 109 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 0b5bc0ce48502c..bcde4a0aa56ea9 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -37,6 +37,65 @@ Match = _namedtuple('Match', 'a b size') + +class _LCSUBDict: + """Dict method for finding longest common substring. + + Complexity: + T: O(n1 + n2) best, O(n1 × n2) worst + S: O(n2) + + Members: + pos2 for x in seq2, pos2[x] is a list of the indices (into seq2) + at which x appears; junk elements do not appear + """ + + def __init__(self, seq2, junk=()): + if not isinstance(junk, frozenset): + junk = frozenset(junk) + self.seq2 = seq2 + self.junk = junk + self.pos2 = pos2 = {} # positions of each element in seq2 + for i, elt in enumerate(seq2): + indices = pos2.setdefault(elt, []) + indices.append(i) + if junk: + for elt in junk: + del pos2[elt] + + def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): + if stop1 is None: + stop1 = len(seq1) + if stop2 is None: + stop2 = len(self.seq2) + pos2 = self.pos2 + j2len = {} + nothing = [] + besti, bestj, bestsize = start1, start2, 0 + # find longest junk-free match + # during an iteration of the loop, j2len[j] = length of longest + # junk-free match ending with seq1[i-1] and seq2[j] + for i in range(start1, stop1): + # look at all instances of seq1[i] in seq2; note that because + # pos2 has no junk keys, the loop is skipped if seq1[i] is junk + j2lenget = j2len.get + newj2len = {} + for j in pos2.get(seq1[i], nothing): + # seq1[i] matches seq2[j] + if j < start2: + continue + if j >= stop2: + break + k = newj2len[j] = j2lenget(j-1, 0) + 1 + if k > bestsize: + besti = i - k + 1 + bestj = j - k + 1 + bestsize = k + j2len = newj2len + + return besti, bestj, bestsize + + _LENGTH = 0 _LINK = 1 _NEXT = 2 @@ -44,21 +103,43 @@ class _LCSUBAutomaton: - """Suffix Automaton for finding longest common substring.""" + """Suffix Automaton for finding longest common substring. + + Complexity: + T: O(n1 + n2) - roughly 2 * n1 + 6 * n2 + S: O(n2) - maximum nodes: 2 * n2 + 1 + + Node spec: + node: list = [length: int, link: list, next: dict, end_pos: int] + length - match length when the node is reached + link - reference to a node to fall back to + next - map to nodes to go to when matched + end_pos - end position of first occurrence (used for result) + """ - def __init__(self, s2, start2=0, stop2=None, *, junk=()): - if stop2 is None: - stop2 = len(s2) + def __init__(self, seq2, junk=()): + if not isinstance(junk, frozenset): + junk = frozenset(junk) + self.seq2 = seq2 + self.junk = junk + self.root = None + self.cache = (None, None) - self.start2 = start2 - self.stop2 = stop2 - self.junk = frozenset(junk) - self.root = root = [0, None, {}, -1] # [length, link, next, end_pos] + def _build(self, start2, stop2): + """ + Automaton needs to rebuild for every (start2, stop2) + This is made to cache the last one and only rebuild on new values + """ + if self.root is not None and self.cache == (start2, stop2): + return + self.root = root = [0, None, {}, -1] + seq2 = self.seq2 + junk = self.junk last_len = 0 last = root for j in range(start2, stop2): - c = s2[j] + c = seq2[j] if c in junk: last_len = 0 last = root @@ -81,6 +162,7 @@ def __init__(self, s2, start2=0, stop2=None, *, junk=()): if p_length_p1 == q[_LENGTH]: curr[_LINK] = q else: + # Copy `q[_POS]` to ensure leftmost match in seq2 clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]] while (p_next := p[_NEXT]).get(c) is q: p_next[c] = clone @@ -92,9 +174,16 @@ def __init__(self, s2, start2=0, stop2=None, *, junk=()): last = curr - def find(self, s1, start1=0, stop1=None): - if stop1 is None: - stop1 = len(s1) + self.cache = (start2, stop2) + + def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): + size1 = len(seq1) + size2 = len(self.seq2) + if stop1 is None or stop1 > size1: + stop1 = size1 + if stop2 is None or stop2 > size2: + stop2 = size2 + self._build(start2, stop2) root = self.root junk = self.junk v = root @@ -104,7 +193,7 @@ def find(self, s1, start1=0, stop1=None): best_pos = 0 for i in range(start1, stop1): - c = s1[i] + c = seq1[i] if c in junk: v = root l = 0 @@ -123,7 +212,7 @@ def find(self, s1, start1=0, stop1=None): best_pos = i if not best_len: - return (start1, self.start2, 0) + return (start1, start2, 0) start_in_s1 = best_pos + 1 - best_len end_in_s2 = best_state[_POS] @@ -131,16 +220,12 @@ def find(self, s1, start1=0, stop1=None): return (start_in_s1, start_in_s2, best_len) -def longest_common_substring(s1, s2, start1=0, stop1=None, start2=0, stop2=None, - *, junk=()): - return _LCSUBAutomaton(s2, start2, stop2, junk=junk).find(s1, start1, stop1) - - def _calculate_ratio(matches, length): if length: return 2.0 * matches / length return 1.0 + class SequenceMatcher: """ @@ -379,38 +464,40 @@ def __chain_b(self): self.bjunk = junk = set() autojunk = self.autojunk self.bpopular = popular = set() - self.b2j = b2j = {} - if autojunk: - for i, elt in enumerate(b): - indices = b2j.setdefault(elt, []) - indices.append(i) - - # Purge junk elements - if isjunk: - for elt in b2j.keys(): - if isjunk(elt): - junk.add(elt) - for elt in junk: # separate loop avoids separate list of keys - del b2j[elt] - - # Purge popular elements that are not junk - n = len(b) - if autojunk and n >= 200: - ntest = n // 100 + 1 - for elt, idxs in b2j.items(): - if len(idxs) > ntest: - popular.add(elt) - for elt in popular: # ditto; as fast for 1% deletion - del b2j[elt] + self._bcounts = bcounts = dict(_Counter(b)) + if isjunk: + junk.update(filter(isjunk, bcounts)) + for elt in junk: + del bcounts[elt] + + n = len(b) + if autojunk and n >= 200: + ntest = n // 100 + 1 + for elt, num in bcounts.items(): + if num > ntest: + popular.add(elt) + for elt in popular: # ditto; as fast for 1% deletion + del bcounts[elt] + + self._max_bcount = max(bcounts.values()) if bcounts else 0 + self._all_junk = frozenset(junk | popular) + self._lcsub_aut = None # _LCSUBAutomaton instance + self._lcsub_dict = None # _LCSUBDict instanct + + def _get_lcsub_calculator(self, automaton=False): + if automaton: + if self._lcsub_aut is None: + self._lcsub_aut = _LCSUBAutomaton(self.b, self._all_junk) + return self._lcsub_aut else: - # Prepare LCSUB Automaton - if isjunk: - bcounts = _Counter(b) - junk.update(filter(isjunk, bcounts)) - for elt in junk: - del bcounts[elt] - self.aut_cache = (None, None) # Cache last automaton - self.all_junk = junk | popular + if self._lcsub_dict is None: + self._lcsub_dict = _LCSUBDict(self.b, self._all_junk) + return self._lcsub_dict + + @property + def b2j(self): + # NOTE: For backwards compatibility + return self._get_lcsub_calculator(automaton=False).pos2 def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. @@ -475,67 +562,58 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): ahi = len(a) if bhi is None: bhi = len(b) - if alo >= ahi: - besti, bestj, bestsize = alo, blo, 0 - elif self.autojunk: - b2j = self.b2j + asize = ahi - alo + bsize = bhi - blo + + if asize <= 0 and bsize <= 0: besti, bestj, bestsize = alo, blo, 0 - # find longest junk-free match - # during an iteration of the loop, j2len[j] = length of longest - # junk-free match ending with a[i-1] and b[j] - j2len = {} - nothing = [] - for i in range(alo, ahi): - # look at all instances of a[i] in b; note that because - # b2j has no junk keys, the loop is skipped if a[i] is junk - j2lenget = j2len.get - newj2len = {} - for j in b2j.get(a[i], nothing): - # a[i] matches b[j] - if j < blo: - continue - if j >= bhi: - break - k = newj2len[j] = j2lenget(j-1, 0) + 1 - if k > bestsize: - besti, bestj, bestsize = i-k+1, j-k+1, k - j2len = newj2len else: - # Without autojunk, run LCSUB Automaton - blo_bhi, aut = self.aut_cache - if aut is None or blo_bhi != (blo, bhi): - aut = _LCSUBAutomaton(b, blo, bhi, junk=self.all_junk) - self.aut_cache = ((blo, bhi), aut) - besti, bestj, bestsize = aut.find(a, alo, ahi) - - # Extend the best by non-junk elements on each end. In particular, - # "popular" non-junk elements aren't in b2j, which greatly speeds - # the inner loop above, but also means "the best" match so far - # doesn't contain any junk *or* popular non-junk elements. - while besti > alo and bestj > blo and \ - not isbjunk(b[bestj-1]) and \ - a[besti-1] == b[bestj-1]: - besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 - while besti+bestsize < ahi and bestj+bestsize < bhi and \ - not isbjunk(b[bestj+bestsize]) and \ - a[besti+bestsize] == b[bestj+bestsize]: - bestsize += 1 - - # Now that we have a wholly interesting match (albeit possibly - # empty!), we may as well suck up the matching junk on each - # side of it too. Can't think of a good reason not to, and it - # saves post-processing the (possibly considerable) expense of - # figuring out what to do with it. In the case of an empty - # interesting match, this is clearly the right thing to do, - # because no other kind of match is possible in the regions. - while besti > alo and bestj > blo and \ - isbjunk(b[bestj-1]) and \ - a[besti-1] == b[bestj-1]: - besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 - while besti+bestsize < ahi and bestj+bestsize < bhi and \ - isbjunk(b[bestj+bestsize]) and \ - a[besti+bestsize] == b[bestj+bestsize]: - bestsize = bestsize + 1 + # Constant to contruct automaton is roughly 6. + # Constant to run automaton is roughly 2. + # This has been tested on a range of data sets. + # For that specific set it gave selection accuracy of 95%. + # Weak spot in this is cases with little or no element overlap at all. + # However, such check would have more cost than benefit. + use_automaton = self._max_bcount * asize > bsize * 6 + asize * 2 + calc = self._get_lcsub_calculator(use_automaton) + besti, bestj, bestsize = calc.find(a, alo, ahi, blo, bhi) + + if self.bpopular: + # Extend the best by non-junk elements on each end. In particular, + # "popular" non-junk elements aren't in b2j, which greatly speeds + # the inner loop above, but also means "the best" match so far + # doesn't contain any junk *or* popular non-junk elements. + while besti > alo and bestj > blo and \ + not isbjunk(b[bestj-1]) and \ + a[besti-1] == b[bestj-1]: + besti -= 1 + bestj -= 1 + bestsize += 1 + + while besti+bestsize < ahi and bestj+bestsize < bhi and \ + not isbjunk(b[bestj+bestsize]) and \ + a[besti+bestsize] == b[bestj+bestsize]: + bestsize += 1 + + if self.bjunk: + # Now that we have a wholly interesting match (albeit possibly + # empty!), we may as well suck up the matching junk on each + # side of it too. Can't think of a good reason not to, and it + # saves post-processing the (possibly considerable) expense of + # figuring out what to do with it. In the case of an empty + # interesting match, this is clearly the right thing to do, + # because no other kind of match is possible in the regions. + while besti > alo and bestj > blo and \ + isbjunk(b[bestj-1]) and \ + a[besti-1] == b[bestj-1]: + besti -= 1 + bestj -= 1 + bestsize += 1 + + while besti+bestsize < ahi and bestj+bestsize < bhi and \ + isbjunk(b[bestj+bestsize]) and \ + a[besti+bestsize] == b[bestj+bestsize]: + bestsize = bestsize + 1 return Match(besti, bestj, bestsize) From c1470ad7fe42396391572a61cd45c6f055cf3c6a Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 23 Jan 2026 02:45:08 +0200 Subject: [PATCH 3/7] minor changes --- Lib/difflib.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index bcde4a0aa56ea9..6459ce45bffa62 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -38,8 +38,8 @@ Match = _namedtuple('Match', 'a b size') -class _LCSUBDict: - """Dict method for finding longest common substring. +class _LCSUBSimple: + """Simple dict method for finding longest common substring. Complexity: T: O(n1 + n2) best, O(n1 × n2) worst @@ -481,18 +481,18 @@ def __chain_b(self): self._max_bcount = max(bcounts.values()) if bcounts else 0 self._all_junk = frozenset(junk | popular) - self._lcsub_aut = None # _LCSUBAutomaton instance - self._lcsub_dict = None # _LCSUBDict instanct + self._lcsub_automaton = None # _LCSUBAutomaton instance + self._lcsub_simple = None # _LCSUBSimple instanct def _get_lcsub_calculator(self, automaton=False): if automaton: - if self._lcsub_aut is None: - self._lcsub_aut = _LCSUBAutomaton(self.b, self._all_junk) - return self._lcsub_aut + if self._lcsub_automaton is None: + self._lcsub_automaton = _LCSUBAutomaton(self.b, self._all_junk) + return self._lcsub_automaton else: - if self._lcsub_dict is None: - self._lcsub_dict = _LCSUBDict(self.b, self._all_junk) - return self._lcsub_dict + if self._lcsub_simple is None: + self._lcsub_simple = _LCSUBSimple(self.b, self._all_junk) + return self._lcsub_simple @property def b2j(self): @@ -574,7 +574,9 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): # For that specific set it gave selection accuracy of 95%. # Weak spot in this is cases with little or no element overlap at all. # However, such check would have more cost than benefit. - use_automaton = self._max_bcount * asize > bsize * 6 + asize * 2 + automaton_cost = bsize * 6 + asize * 2 + simple_cost = self._max_bcount * asize + use_automaton = simple_cost > automaton_cost calc = self._get_lcsub_calculator(use_automaton) besti, bestj, bestsize = calc.find(a, alo, ahi, blo, bhi) From 8fb5f47fdfb37185fc882a6d3ce4c9a7f028c4df Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 23 Jan 2026 06:03:50 +0200 Subject: [PATCH 4/7] initial trimming of a and test fix --- Lib/difflib.py | 106 ++++++++++++++++++++++++++++------------ Lib/test/test_pyclbr.py | 2 +- 2 files changed, 75 insertions(+), 33 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 6459ce45bffa62..b2cacf8a08141b 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -55,19 +55,25 @@ def __init__(self, seq2, junk=()): junk = frozenset(junk) self.seq2 = seq2 self.junk = junk - self.pos2 = pos2 = {} # positions of each element in seq2 - for i, elt in enumerate(seq2): - indices = pos2.setdefault(elt, []) - indices.append(i) - if junk: - for elt in junk: - del pos2[elt] + self.pos2 = None + + def _build(self): + if self.pos2 is None: + self.pos2 = pos2 = {} # positions of each element in seq2 + for i, elt in enumerate(self.seq2): + indices = pos2.setdefault(elt, []) + indices.append(i) + junk = self.junk + if junk: + for elt in junk: + del pos2[elt] def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): if stop1 is None: stop1 = len(seq1) if stop2 is None: stop2 = len(self.seq2) + self._build() pos2 = self.pos2 j2len = {} nothing = [] @@ -129,6 +135,12 @@ def _build(self, start2, stop2): """ Automaton needs to rebuild for every (start2, stop2) This is made to cache the last one and only rebuild on new values + + Note that to construct Automaton that can be queried for any + (start2, stop2), each node would need to store a store a set of + indices. And this is prone to O(n^2) memory explosion. + Current approach maintains reasonable memory guarantees + and is also much simpler in comparison. """ if self.root is not None and self.cache == (start2, stop2): return @@ -480,24 +492,17 @@ def __chain_b(self): del bcounts[elt] self._max_bcount = max(bcounts.values()) if bcounts else 0 - self._all_junk = frozenset(junk | popular) - self._lcsub_automaton = None # _LCSUBAutomaton instance - self._lcsub_simple = None # _LCSUBSimple instanct - - def _get_lcsub_calculator(self, automaton=False): - if automaton: - if self._lcsub_automaton is None: - self._lcsub_automaton = _LCSUBAutomaton(self.b, self._all_junk) - return self._lcsub_automaton - else: - if self._lcsub_simple is None: - self._lcsub_simple = _LCSUBSimple(self.b, self._all_junk) - return self._lcsub_simple + self._all_junk = all_junk = frozenset(junk | popular) + self._lcsub_simple = _LCSUBSimple(b, all_junk) + self._lcsub_automaton = _LCSUBAutomaton(b, all_junk) @property def b2j(self): # NOTE: For backwards compatibility - return self._get_lcsub_calculator(automaton=False).pos2 + simple_calc = self._lcsub_simple + if simple_calc.pos2 is None: + simple_calc._build() + return simple_calc.pos2 def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. @@ -568,17 +573,54 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): if asize <= 0 and bsize <= 0: besti, bestj, bestsize = alo, blo, 0 else: - # Constant to contruct automaton is roughly 6. - # Constant to run automaton is roughly 2. - # This has been tested on a range of data sets. - # For that specific set it gave selection accuracy of 95%. - # Weak spot in this is cases with little or no element overlap at all. - # However, such check would have more cost than benefit. - automaton_cost = bsize * 6 + asize * 2 - simple_cost = self._max_bcount * asize - use_automaton = simple_cost > automaton_cost - calc = self._get_lcsub_calculator(use_automaton) - besti, bestj, bestsize = calc.find(a, alo, ahi, blo, bhi) + # Can trim a from both ends while characters are not in b + # This is cheap and we have bcounts at all times + bcounts = self._bcounts + tmp_alo = alo + tmp_ahi = ahi + while tmp_alo < tmp_ahi and a[tmp_alo] not in bcounts: + tmp_alo += 1 + while tmp_alo < tmp_ahi and a[tmp_ahi - 1] not in bcounts: + tmp_ahi -= 1 + tmp_asize = tmp_ahi - tmp_alo + if tmp_asize <= 0: + besti, bestj, bestsize = alo, blo, 0 + else: + # Constant to contruct automaton is roughly - 6. + # Constant to run automaton is roughly - 1. + # This has been tested on a range of data sets. + # It gave selection accuracy of ~95%. + # Weak spot is cases with little or no element overlap at all. + # However, such check would likely have more cost than benefit. + simple_calc = self._lcsub_simple + automaton = self._lcsub_automaton + + automaton_cost = tmp_asize + if automaton.cache != (blo, bhi): + automaton_cost += bsize * 6 + simple_cost = self._max_bcount * tmp_asize + if simple_calc.pos2 is None: + simple_cost += bsize + if simple_cost < automaton_cost: + calc = simple_calc + else: + calc = automaton + besti, bestj, bestsize = calc.find(a, tmp_alo, tmp_ahi, blo, bhi) + + # NOTE: Doing it at the same time results in bigger matches! + # # If bjunk or bpopular were omitted in matching (performance reasons) + # # We now extend the match to capture as much as we can + # if self.bjunk or self.bpopular: + # while besti > alo and bestj > blo and a[besti-1] == b[bestj-1]: + # besti -= 1 + # bestj -= 1 + # bestsize += 1 + # lasti = besti + bestsize + # lastj = bestj + bestsize + # while lasti < ahi and lastj < bhi and a[lasti] == b[lastj]: + # lasti += 1 + # lastj += 1 + # bestsize += 1 if self.bpopular: # Extend the best by non-junk elements on each end. In particular, diff --git a/Lib/test/test_pyclbr.py b/Lib/test/test_pyclbr.py index 79ef178f3807f4..f709cdb9522055 100644 --- a/Lib/test/test_pyclbr.py +++ b/Lib/test/test_pyclbr.py @@ -172,7 +172,7 @@ def test_easy(self): with temporary_main_spec(): self.checkModule('doctest', ignore=("TestResults", "_SpoofOut", "DocTestCase", '_DocTestSuite')) - self.checkModule('difflib', ignore=("Match",)) + self.checkModule('difflib', ignore=("Match", "b2j")) def test_cases(self): # see test.pyclbr_input for the rationale behind the ignored symbols From 2a8e6a440608fb25597ea9cd9dbfcb199264fac9 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 23 Jan 2026 19:31:03 +0200 Subject: [PATCH 5/7] minor changes --- Lib/difflib.py | 84 +++++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 53 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index b2cacf8a08141b..51bef0d81858ca 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -142,7 +142,8 @@ def _build(self, start2, stop2): Current approach maintains reasonable memory guarantees and is also much simpler in comparison. """ - if self.root is not None and self.cache == (start2, stop2): + key = (start2, stop2) + if self.root is not None and self.cache == key: return self.root = root = [0, None, {}, -1] @@ -186,7 +187,7 @@ def _build(self, start2, stop2): last = curr - self.cache = (start2, stop2) + self.cache = key def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): size1 = len(seq1) @@ -607,57 +608,34 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): calc = automaton besti, bestj, bestsize = calc.find(a, tmp_alo, tmp_ahi, blo, bhi) - # NOTE: Doing it at the same time results in bigger matches! - # # If bjunk or bpopular were omitted in matching (performance reasons) - # # We now extend the match to capture as much as we can - # if self.bjunk or self.bpopular: - # while besti > alo and bestj > blo and a[besti-1] == b[bestj-1]: - # besti -= 1 - # bestj -= 1 - # bestsize += 1 - # lasti = besti + bestsize - # lastj = bestj + bestsize - # while lasti < ahi and lastj < bhi and a[lasti] == b[lastj]: - # lasti += 1 - # lastj += 1 - # bestsize += 1 - - if self.bpopular: - # Extend the best by non-junk elements on each end. In particular, - # "popular" non-junk elements aren't in b2j, which greatly speeds - # the inner loop above, but also means "the best" match so far - # doesn't contain any junk *or* popular non-junk elements. - while besti > alo and bestj > blo and \ - not isbjunk(b[bestj-1]) and \ - a[besti-1] == b[bestj-1]: - besti -= 1 - bestj -= 1 - bestsize += 1 - - while besti+bestsize < ahi and bestj+bestsize < bhi and \ - not isbjunk(b[bestj+bestsize]) and \ - a[besti+bestsize] == b[bestj+bestsize]: - bestsize += 1 - - if self.bjunk: - # Now that we have a wholly interesting match (albeit possibly - # empty!), we may as well suck up the matching junk on each - # side of it too. Can't think of a good reason not to, and it - # saves post-processing the (possibly considerable) expense of - # figuring out what to do with it. In the case of an empty - # interesting match, this is clearly the right thing to do, - # because no other kind of match is possible in the regions. - while besti > alo and bestj > blo and \ - isbjunk(b[bestj-1]) and \ - a[besti-1] == b[bestj-1]: - besti -= 1 - bestj -= 1 - bestsize += 1 - - while besti+bestsize < ahi and bestj+bestsize < bhi and \ - isbjunk(b[bestj+bestsize]) and \ - a[besti+bestsize] == b[bestj+bestsize]: - bestsize = bestsize + 1 + # Extend the best by non-junk elements on each end. In particular, + # "popular" non-junk elements aren't in b2j, which greatly speeds + # the inner loop above, but also means "the best" match so far + # doesn't contain any junk *or* popular non-junk elements. + while besti > alo and bestj > blo and \ + not isbjunk(b[bestj-1]) and \ + a[besti-1] == b[bestj-1]: + besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 + while besti+bestsize < ahi and bestj+bestsize < bhi and \ + not isbjunk(b[bestj+bestsize]) and \ + a[besti+bestsize] == b[bestj+bestsize]: + bestsize += 1 + + # Now that we have a wholly interesting match (albeit possibly + # empty!), we may as well suck up the matching junk on each + # side of it too. Can't think of a good reason not to, and it + # saves post-processing the (possibly considerable) expense of + # figuring out what to do with it. In the case of an empty + # interesting match, this is clearly the right thing to do, + # because no other kind of match is possible in the regions. + while besti > alo and bestj > blo and \ + isbjunk(b[bestj-1]) and \ + a[besti-1] == b[bestj-1]: + besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 + while besti+bestsize < ahi and bestj+bestsize < bhi and \ + isbjunk(b[bestj+bestsize]) and \ + a[besti+bestsize] == b[bestj+bestsize]: + bestsize = bestsize + 1 return Match(besti, bestj, bestsize) From 49b69ddf7c0efef6b71e800af2a01eb427533c98 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 23 Jan 2026 22:13:22 +0200 Subject: [PATCH 6/7] new threshold and consitency edits --- Lib/difflib.py | 230 ++++++++++++++++++++++++++++--------------------- 1 file changed, 130 insertions(+), 100 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 51bef0d81858ca..5e6a61e407814b 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -38,6 +38,14 @@ Match = _namedtuple('Match', 'a b size') +def _adjust_indices(seq, start, stop): + assert start >= 0 + size = len(seq) + if stop is None or stop > size: + stop = size + return start, stop + + class _LCSUBSimple: """Simple dict method for finding longest common substring. @@ -46,51 +54,61 @@ class _LCSUBSimple: S: O(n2) Members: - pos2 for x in seq2, pos2[x] is a list of the indices (into seq2) + b2j for x in b, b2j[x] is a list of the indices (into b) at which x appears; junk elements do not appear """ - def __init__(self, seq2, junk=()): + def __init__(self, b, junk=()): if not isinstance(junk, frozenset): junk = frozenset(junk) - self.seq2 = seq2 + self.b = b self.junk = junk - self.pos2 = None - - def _build(self): - if self.pos2 is None: - self.pos2 = pos2 = {} # positions of each element in seq2 - for i, elt in enumerate(self.seq2): - indices = pos2.setdefault(elt, []) + self._b2j = None + + def isbuilt(self, blo, bhi): + blo, bhi = _adjust_indices(self.b, blo, bhi) + if blo >= bhi: + return True + return self._b2j is not None + + def _get_b2j(self): + b2j = self._b2j + if b2j is None: + b2j = {} # positions of each element in b + for i, elt in enumerate(self.b): + indices = b2j.setdefault(elt, []) indices.append(i) junk = self.junk if junk: for elt in junk: - del pos2[elt] - - def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): - if stop1 is None: - stop1 = len(seq1) - if stop2 is None: - stop2 = len(self.seq2) - self._build() - pos2 = self.pos2 + del b2j[elt] + self._b2j = b2j + + return b2j + + def find(self, a, alo=0, ahi=None, blo=0, bhi=None): + alo, ahi = _adjust_indices(a, alo, ahi) + blo, bhi = _adjust_indices(self.b, blo, bhi) + if alo >= ahi or blo >= bhi: + return (alo, blo, 0) + + b2j = self._get_b2j() j2len = {} nothing = [] - besti, bestj, bestsize = start1, start2, 0 + besti, bestj, bestsize = alo, blo, 0 # find longest junk-free match # during an iteration of the loop, j2len[j] = length of longest - # junk-free match ending with seq1[i-1] and seq2[j] - for i in range(start1, stop1): - # look at all instances of seq1[i] in seq2; note that because - # pos2 has no junk keys, the loop is skipped if seq1[i] is junk + # junk-free match ending with a[i-1] and b[j] + for i in range(alo, ahi): + # look at all instances of a[i] in b; note that because + # b2j has no junk keys, the loop is skipped if a[i] is junk j2lenget = j2len.get newj2len = {} - for j in pos2.get(seq1[i], nothing): - # seq1[i] matches seq2[j] - if j < start2: + for j in b2j.get(a[i], nothing): + # a[i] matches b[j] + if j < blo: continue - if j >= stop2: + if j >= bhi: break k = newj2len[j] = j2lenget(j-1, 0) + 1 if k > bestsize: @@ -123,81 +141,87 @@ class _LCSUBAutomaton: end_pos - end position of first occurrence (used for result) """ - def __init__(self, seq2, junk=()): + def __init__(self, b, junk=()): if not isinstance(junk, frozenset): junk = frozenset(junk) - self.seq2 = seq2 + self.b = b self.junk = junk - self.root = None - self.cache = (None, None) + self._root = None + self._cache = (None, None) + + def isbuilt(self, blo, bhi): + blo, bhi = _adjust_indices(self.b, blo, bhi) + if blo >= bhi: + return True + return self._root is not None and self._cache == (blo, bhi) - def _build(self, start2, stop2): + def _get_root(self, blo, bhi): """ - Automaton needs to rebuild for every (start2, stop2) + Automaton needs to rebuild for every (blo, bhi) This is made to cache the last one and only rebuild on new values Note that to construct Automaton that can be queried for any - (start2, stop2), each node would need to store a store a set of + (blo, bhi), each node would need to store a store a set of indices. And this is prone to O(n^2) memory explosion. Current approach maintains reasonable memory guarantees and is also much simpler in comparison. """ - key = (start2, stop2) - if self.root is not None and self.cache == key: - return + key = (blo, bhi) + root = self._root + if root is None or self._cache != key: + root = [0, None, {}, -1] + b = self.b + junk = self.junk + last_len = 0 + last = root + for j in range(blo, bhi): + c = b[j] + if c in junk: + last_len = 0 + last = root + else: + last_len += 1 + curr = [last_len, None, {}, j] - self.root = root = [0, None, {}, -1] - seq2 = self.seq2 - junk = self.junk - last_len = 0 - last = root - for j in range(start2, stop2): - c = seq2[j] - if c in junk: - last_len = 0 - last = root - else: - last_len += 1 - curr = [last_len, None, {}, j] - - p = last - p_next = p[_NEXT] - while c not in p_next: - p_next[c] = curr - if p is root: - curr[_LINK] = root - break - p = p[_LINK] + p = last p_next = p[_NEXT] - else: - q = p_next[c] - p_length_p1 = p[_LENGTH] + 1 - if p_length_p1 == q[_LENGTH]: - curr[_LINK] = q + while c not in p_next: + p_next[c] = curr + if p is root: + curr[_LINK] = root + break + p = p[_LINK] + p_next = p[_NEXT] else: - # Copy `q[_POS]` to ensure leftmost match in seq2 - clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]] - while (p_next := p[_NEXT]).get(c) is q: - p_next[c] = clone - if p is root: - break - p = p[_LINK] - - q[_LINK] = curr[_LINK] = clone - - last = curr - - self.cache = key - - def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): - size1 = len(seq1) - size2 = len(self.seq2) - if stop1 is None or stop1 > size1: - stop1 = size1 - if stop2 is None or stop2 > size2: - stop2 = size2 - self._build(start2, stop2) - root = self.root + q = p_next[c] + p_length_p1 = p[_LENGTH] + 1 + if p_length_p1 == q[_LENGTH]: + curr[_LINK] = q + else: + # Copy `q[_POS]` to ensure leftmost match in b + clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]] + while (p_next := p[_NEXT]).get(c) is q: + p_next[c] = clone + if p is root: + break + p = p[_LINK] + + q[_LINK] = curr[_LINK] = clone + + last = curr + + self._root = root + self._cache = key + + return root + + def find(self, a, alo=0, ahi=None, blo=0, bhi=None): + alo, ahi = _adjust_indices(a, alo, ahi) + blo, bhi = _adjust_indices(self.b, blo, bhi) + if alo >= ahi or blo >= bhi: + return (alo, blo, 0) + + root = self._get_root(blo, bhi) junk = self.junk v = root l = 0 @@ -205,8 +229,8 @@ def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): best_state = None best_pos = 0 - for i in range(start1, stop1): - c = seq1[i] + for i in range(alo, ahi): + c = a[i] if c in junk: v = root l = 0 @@ -225,7 +249,7 @@ def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): best_pos = i if not best_len: - return (start1, start2, 0) + return (alo, blo, 0) start_in_s1 = best_pos + 1 - best_len end_in_s2 = best_state[_POS] @@ -492,7 +516,14 @@ def __chain_b(self): for elt in popular: # ditto; as fast for 1% deletion del bcounts[elt] - self._max_bcount = max(bcounts.values()) if bcounts else 0 + if not bcounts: + self._bcount_thres = 0 + else: + sum_bcount = sum(bcounts.values()) + avg_bcount = sum(c * c for c in bcounts.values()) / sum_bcount + max_bcount = max(bcounts.values()) + self._bcount_thres = avg_bcount * 0.8 + max_bcount * 0.2 + self._all_junk = all_junk = frozenset(junk | popular) self._lcsub_simple = _LCSUBSimple(b, all_junk) self._lcsub_automaton = _LCSUBAutomaton(b, all_junk) @@ -500,10 +531,7 @@ def __chain_b(self): @property def b2j(self): # NOTE: For backwards compatibility - simple_calc = self._lcsub_simple - if simple_calc.pos2 is None: - simple_calc._build() - return simple_calc.pos2 + return self._lcsub_simple._get_b2j() def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. @@ -596,12 +624,14 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): simple_calc = self._lcsub_simple automaton = self._lcsub_automaton + simple_cost = self._bcount_thres * tmp_asize + if not simple_calc.isbuilt(blo, bhi): + simple_cost += bsize + automaton_cost = tmp_asize - if automaton.cache != (blo, bhi): + if not automaton.isbuilt(blo, bhi): automaton_cost += bsize * 6 - simple_cost = self._max_bcount * tmp_asize - if simple_calc.pos2 is None: - simple_cost += bsize + if simple_cost < automaton_cost: calc = simple_calc else: From e5a51241d8d5c424b3385d5cb6704edf28a7c863 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sat, 24 Jan 2026 02:20:24 +0200 Subject: [PATCH 7/7] minor minor --- Lib/difflib.py | 142 +++++++++++++++++++++++-------------------------- 1 file changed, 68 insertions(+), 74 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 5e6a61e407814b..3a2f2ba0785ee3 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -39,7 +39,8 @@ def _adjust_indices(seq, start, stop): - assert start >= 0 + if start < 0: + raise ValueError('Starting index can not be negative') size = len(seq) if stop is None or stop > size: stop = size @@ -52,9 +53,10 @@ class _LCSUBSimple: Complexity: T: O(n1 + n2) best, O(n1 × n2) worst S: O(n2) + , where n1 = len(a), n2 = len(b) Members: - b2j for x in b, b2j[x] is a list of the indices (into b) + _b2j for x in b, b2j[x] is a list of the indices (into b) at which x appears; junk elements do not appear """ @@ -73,17 +75,18 @@ def isbuilt(self, blo, bhi): def _get_b2j(self): b2j = self._b2j - if b2j is None: - b2j = {} # positions of each element in b - for i, elt in enumerate(self.b): - indices = b2j.setdefault(elt, []) - indices.append(i) - junk = self.junk - if junk: - for elt in junk: - del b2j[elt] - self._b2j = b2j + if b2j is not None: + return b2j + b2j = {} # positions of each element in b + for i, elt in enumerate(self.b): + indices = b2j.setdefault(elt, []) + indices.append(i) + junk = self.junk + if junk: + for elt in junk: + del b2j[elt] + self._b2j = b2j return b2j def find(self, a, alo=0, ahi=None, blo=0, bhi=None): @@ -120,18 +123,13 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None): return besti, bestj, bestsize -_LENGTH = 0 -_LINK = 1 -_NEXT = 2 -_POS = 3 - - class _LCSUBAutomaton: """Suffix Automaton for finding longest common substring. Complexity: T: O(n1 + n2) - roughly 2 * n1 + 6 * n2 S: O(n2) - maximum nodes: 2 * n2 + 1 + , where n1 = len(a), n2 = len(b) Node spec: node: list = [length: int, link: list, next: dict, end_pos: int] @@ -157,62 +155,58 @@ def isbuilt(self, blo, bhi): def _get_root(self, blo, bhi): """ - Automaton needs to rebuild for every (blo, bhi) - This is made to cache the last one and only rebuild on new values - - Note that to construct Automaton that can be queried for any - (blo, bhi), each node would need to store a store a set of - indices. And this is prone to O(n^2) memory explosion. - Current approach maintains reasonable memory guarantees - and is also much simpler in comparison. + Automaton needs to rebuild for every (start2, stop2) + It is made to cache the last one and only rebuilds for new range """ key = (blo, bhi) root = self._root - if root is None or self._cache != key: - root = [0, None, {}, -1] - b = self.b - junk = self.junk - last_len = 0 - last = root - for j in range(blo, bhi): - c = b[j] - if c in junk: - last_len = 0 - last = root + if root is not None and self._cache == key: + return root + + LEN, LINK, NEXT, EPOS = 0, 1, 2, 3 + root = [0, None, {}, -1] + b = self.b + junk = self.junk + last_len = 0 + last = root + for j in range(blo, bhi): + c = b[j] + if c in junk: + last_len = 0 + last = root + else: + last_len += 1 + curr = [last_len, None, {}, j] + + p = last + p_next = p[NEXT] + while c not in p_next: + p_next[c] = curr + if p is root: + curr[LINK] = root + break + p = p[LINK] + p_next = p[NEXT] else: - last_len += 1 - curr = [last_len, None, {}, j] - - p = last - p_next = p[_NEXT] - while c not in p_next: - p_next[c] = curr - if p is root: - curr[_LINK] = root - break - p = p[_LINK] - p_next = p[_NEXT] + q = p_next[c] + p_len_p1 = p[LEN] + 1 + if p_len_p1 == q[LEN]: + curr[LINK] = q else: - q = p_next[c] - p_length_p1 = p[_LENGTH] + 1 - if p_length_p1 == q[_LENGTH]: - curr[_LINK] = q - else: - # Copy `q[_POS]` to ensure leftmost match in b - clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]] - while (p_next := p[_NEXT]).get(c) is q: - p_next[c] = clone - if p is root: - break - p = p[_LINK] - - q[_LINK] = curr[_LINK] = clone - - last = curr - - self._root = root - self._cache = key + # Copy `q[EPOS]` to ensure leftmost match in b + clone = [p_len_p1, q[LINK], q[NEXT].copy(), q[EPOS]] + while (p_next := p[NEXT]).get(c) is q: + p_next[c] = clone + if p is root: + break + p = p[LINK] + + q[LINK] = curr[LINK] = clone + + last = curr + self._root = root + self._cache = key return root def find(self, a, alo=0, ahi=None, blo=0, bhi=None): @@ -221,6 +215,7 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None): if alo >= ahi or blo >= bhi: return (alo, blo, 0) + LEN, LINK, NEXT, EPOS = 0, 1, 2, 3 root = self._get_root(blo, bhi) junk = self.junk v = root @@ -235,11 +230,11 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None): v = root l = 0 else: - while v is not root and c not in v[_NEXT]: - v = v[_LINK] - l = v[_LENGTH] + while v is not root and c not in v[NEXT]: + v = v[LINK] + l = v[LEN] - v_next = v[_NEXT] + v_next = v[NEXT] if c in v_next: v = v_next[c] l += 1 @@ -252,8 +247,7 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None): return (alo, blo, 0) start_in_s1 = best_pos + 1 - best_len - end_in_s2 = best_state[_POS] - start_in_s2 = end_in_s2 + 1 - best_len + start_in_s2 = best_state[EPOS] + 1 - best_len return (start_in_s1, start_in_s2, best_len)