1 /**
2  * This module implements the Python difflib module's sequence matcher and
3  * provides a diff() function.
4  *
5  * The function takes two slicable forward ranges of < and == comparable
6  * objects (e.g., two slices of strings, two strings, two slices of ints,
7  * two slices of "items") and diffs them using a slightly simplified
8  * version of the Python difflib's sequence matcher algorithm. The
9  * function returns a (possibly empty) slice of Diff structs, each with a
10  * tag (equal, insert, delete, replace) and the two relevant subslices.
11  *
12  * See tests.d for some examples including Test #17, #18 and #19.
13  *
14  * Authors: Mark Summerfield, mark@qtrac.eu
15  * License: Apache 2.0
16  * Copyright: © 2020 Mark Summerfield. All rights reserved.
17 */
18 module qtrac.ddiff;
19 
20 import std.range: ElementType, front, hasSlicing, isForwardRange;
21 import std.typecons: Tuple;
22 
23 /**
24  * Iterates two slices and diffs them.
25  * Params:
26  *  a = a forward sliceable range
27  *  b = a forward sliceable range
28  *  equalSpan = an EqualSpan
29  * Elements from a and b must be comparable for equality.
30  * If equalSpan is Drop (the default), then only Diff structs with Insert,
31  * Delete, or Replace tags will be returned; otherwise Diff structs
32  * covering all the input will be returned, so will additionally include
33  * Diff structs with Equal tags.
34  * Returns: A slice of Diff structs.
35 */
36 auto diff(R)(R a, R b, EqualSpan equalSpan=EqualSpan.Drop) if (
37         isForwardRange!R && // R is a range that can be iterated repeatedly
38         hasSlicing!R &&
39         is(typeof(R.init.front == R.init.front)) && // Elements support ==
40         is(typeof(R.init.front < R.init.front)) // Elements support <
41         ) {
42     auto diff = new Differ!R(a, b);
43     return diff.diff(equalSpan);
44 }
45 
46 /**
47  * A difference with a tag to specify what the difference is and the two
48  * relevant subslices.
49 */
50 struct Diff(E) {
51     Tag tag; /// What kind of difference this is.
52     E[] a; /// The relevant subslice of the original `a` slice.
53     E[] b; /// The relevant subslice of the original `b` slice.
54 
55     /**
56      * This method is primarily for testing.
57      * Returns: a string indicating the difference.
58     */
59     string toString() const {
60         import std.format: format;
61 
62         final switch (tag) {
63         case Tag.Equal: return format!"= %s"(a);
64         case Tag.Insert: return format!"+ %s"(b);
65         case Tag.Delete: return format!"- %s"(a);
66         case Tag.Replace: return format!"< %s |> %s"(a, b);
67         }
68     }
69 }
70 
71 /**
72  * Used to tell the diff() function whether to include Diff structs in
73  * its output slice that have all possible tags, including the Equal tag
74  * (Keep), or only difference tags, Insert, Delete, Replace (Drop).
75 */
76 enum EqualSpan { Drop, Keep }
77 
78 /**
79  * What kind of difference the Diff struct represents.
80 */
81 enum Tag { Equal, Insert, Delete, Replace }
82 
83 private alias Quad = Tuple!(int, "aStart", int, "aEnd",
84                             int, "bStart", int, "bEnd");
85 
86 private struct Match {
87     int aStart;
88     int bStart;
89     int length;
90 
91     int opCmp(const Match other) const {
92         return (aStart == other.aStart) ? (
93                     (bStart == other.bStart) ? length - other.length
94                                              : bStart - other.bStart)
95                     : aStart - other.aStart;
96     }
97 }
98 
99 private class Differ(R) if (
100         isForwardRange!R && // R is a range that can be iterated repeatedly
101         hasSlicing!R &&
102         is(typeof(R.init.front == R.init.front)) && // Elements support ==
103         is(typeof(R.init.front < R.init.front)) // Elements support <
104         ) {
105     import std.conv: to;
106     import std.math: floor;
107 
108     alias E = ElementType!R;
109 
110     private R a;
111     private R b;
112     private int[][E] b2j;
113 
114     private this(R a, R b) {
115         this.a = a;
116         this.b = b;
117         chainB();
118     }
119 
120     private void chainB() {
121         import std.algorithm: each, filter;
122         import std.range: iota, zip;
123 
124         alias Unit = void[0];
125         enum unit = Unit.init;
126 
127         each!(t => b2j[t[0]] ~= t[1].to!int)(zip(b, iota(b.length)));
128         Unit[E] popular;
129         auto len = b.length;
130         if (len > 200) {
131             immutable minLen = to!int(floor((to!double(len) / 100.0))) + 1;
132             each!(kv => popular[kv.key] = unit)(
133                 filter!(kv => kv.value.length > minLen)(b2j.byKeyValue));
134             each!(element => b2j.remove(element))(popular.byKey);
135         }
136     }
137 
138     private Match[] matches() {
139         import std.algorithm: sort;
140         import std.array: back, empty;
141         import std.range.primitives: popBack;
142 
143         immutable aLen = a.length.to!int;
144         immutable bLen = b.length.to!int;
145         Quad[] quads = [Quad(0, aLen, 0, bLen)];
146         Match[] matches;
147         while (!quads.empty) {
148             auto quad = quads.back();
149             quads.popBack();
150             auto match = longestMatch(quad);
151             auto i = match.aStart;
152             auto j = match.bStart;
153             auto k = match.length;
154             if (k > 0) {
155                 matches ~= match;
156                 if (quad.aStart < i && quad.bStart < j)
157                     quads ~= Quad(quad.aStart, i, quad.bStart, j);
158                 if (i + k < quad.aEnd && j + k < quad.bEnd)
159                     quads ~= Quad(i + k, quad.aEnd, j + k, quad.bEnd);
160             }
161         }
162         sort(matches);
163         int aStart;
164         int bStart;
165         int length;
166         Match[] nonAdjacent;
167         foreach (match; matches) {
168             if (aStart + length == match.aStart &&
169                     bStart + length == match.bStart)
170                 length += match.length;
171             else {
172                 if (length)
173                     nonAdjacent ~= Match(aStart, bStart, length);
174                 aStart = match.aStart;
175                 bStart = match.bStart;
176                 length = match.length;
177             }
178         }
179         if (length)
180             nonAdjacent ~= Match(aStart, bStart, length);
181         nonAdjacent ~= Match(aLen, bLen, 0);
182         return nonAdjacent;
183     }
184 
185     private Match longestMatch(Quad quad) {
186         int bestI = quad.aStart;
187         int bestJ = quad.bStart;
188         int bestSize;
189         int[int] j2Len;
190         for (int i = quad.aStart; i < quad.aEnd; i++) {
191             int[int] newJ2Len;
192             if (auto indexes = a[i] in b2j) {
193                 foreach (j; *indexes) {
194                     if (j < quad.bStart)
195                         continue;
196                     if (j >= quad.bEnd)
197                         break;
198                     immutable k = j2Len.get(j - 1, 0) + 1;
199                     newJ2Len[j] = k;
200                     if (k > bestSize) {
201                         bestI = i - k + 1;
202                         bestJ = j - k + 1;
203                         bestSize = k;
204                     }
205                 }
206             }
207             j2Len = newJ2Len;
208         }
209         while (bestI > quad.aStart && bestJ > quad.bStart &&
210                 a[bestI - 1] == b[bestJ - 1]) {
211             bestI--;
212             bestJ--;
213             bestSize++;
214         }
215         while (bestI + bestSize < quad.aEnd &&
216                 bestJ + bestSize < quad.bEnd &&
217                 a[bestI + bestSize] == b[bestJ + bestSize])
218             bestSize++;
219         return Match(bestI, bestJ, bestSize);
220     }
221 
222     private Diff!E[] diff(EqualSpan equalSpan) {
223         Diff!E[] diffs;
224         int i;
225         int j;
226         foreach (match; matches()) {
227             auto diff = Diff!E(Tag.Equal, a[i .. match.aStart],
228                                           b[j .. match.bStart]);
229             if (i < match.aStart && j < match.bStart)
230                 diff.tag = Tag.Replace;
231             else if (i < match.aStart)
232                 diff.tag = Tag.Delete;
233             else if (j < match.bStart)
234                 diff.tag = Tag.Insert;
235             if (diff.tag != Tag.Equal)
236                 diffs ~= diff;
237             i = match.aStart + match.length;
238             j = match.bStart + match.length;
239             if (match.length && equalSpan == EqualSpan.Keep)
240                 diffs ~= Diff!E(Tag.Equal, a[match.aStart .. i],
241                                            b[match.bStart .. j]);
242         }
243         return diffs;
244     }
245 }