1 /**
2  * This module provides a diff() function.
3  *
4  * The function takes two slicable forward ranges of < and == comparable
5  * objects (e.g., two slices of strings, two strings, two slices of ints,
6  * two slices of "items") and diffs them using a slightly simplified
7  * version of the Python difflib's sequence matcher algorithm. The
8  * function returns a (possibly empty) slice of Diff structs, each with a
9  * tag (equal, insert, delete, replace) and the two relevant subslices.
10  *
11  * See tests.d for some examples including Test #17, #18 and #19.
12  *
13  * Authors: Mark Summerfield, mark@qtrac.eu
14  * License: Apache 2.0
15  * Copyright: © 2020 Mark Summerfield. All rights reserved.
16 */
17 module ddiff;
18 
19 import std.container.rbtree: RedBlackTree;
20 import std.range: ElementType, front, hasSlicing, isForwardRange;
21 import std.typecons: Tuple;
22 
23 /**
24  * Iterates two slices and diffs them.
25  * Params:
26  *  a = a forward sliceable range
27  *  b = a forward sliceable range
28  *  equalSpan = an EqualSpan
29  * Elements from a and b must be comparable for equality.
30  * If equalSpan is Drop (the default), then only Diff structs with Insert,
31  * Delete, or Replace tags will be returned; otherwise Diff structs
32  * covering all the input will be returned, so will additionally include
33  * Diff structs with Equal tags.
34  * Returns: A slice of Diff structs.
35 */
36 auto diff(R)(R a, R b, EqualSpan equalSpan=EqualSpan.Drop) if (
37         isForwardRange!R && // R is a range that can be iterated repeatedly
38         hasSlicing!R &&
39         is(typeof(R.init.front == R.init.front)) && // Elements support ==
40         is(typeof(R.init.front < R.init.front)) // Elements support <
41         ) {
42     auto diff = new Differ!R(a, b);
43     return diff.diff(equalSpan);
44 }
45 
46 /**
47  * A difference with a tag to specify what the difference is and the two
48  * relevant subslices.
49 */
50 struct Diff(E) {
51     Tag tag; /// What kind of difference this is.
52     E[] a; /// The relevant subslice of the original `a` slice.
53     E[] b; /// The relevant subslice of the original `b` slice.
54 
55     /**
56      * This method is primarily for testing.
57      * Returns: a string indicating the difference.
58     */
59     string toString() const {
60         import std.format: format;
61 
62         final switch (tag) {
63         case Tag.Equal: return format!"= %s"(a);
64         case Tag.Insert: return format!"+ %s"(b);
65         case Tag.Delete: return format!"- %s"(a);
66         case Tag.Replace: return format!"< %s |> %s"(a, b);
67         }
68     }
69 }
70 
71 /**
72  * Used to tell the diff() function whether to include Diff structs in
73  * its output slice that have all possible tags, including the Equal tag
74  * (Keep), or only difference tags, Insert, Delete, Replace (Drop).
75 */
76 enum EqualSpan { Drop, Keep }
77 
78 /**
79  * What kind of difference the Diff struct represents.
80 */
81 enum Tag { Equal, Insert, Delete, Replace }
82 
83 private alias Quad = Tuple!(int, "aStart", int, "aEnd",
84                             int, "bStart", int, "bEnd");
85 
86 private struct Match {
87     int aStart;
88     int bStart;
89     int length;
90 
91     int opCmp(const Match other) const {
92         return (aStart == other.aStart) ? (
93                     (bStart == other.bStart) ? length - other.length
94                                              : bStart - other.bStart)
95                     : aStart - other.aStart;
96     }
97 }
98 
99 private class Differ(R) if (
100         isForwardRange!R && // R is a range that can be iterated repeatedly
101         hasSlicing!R &&
102         is(typeof(R.init.front == R.init.front)) && // Elements support ==
103         is(typeof(R.init.front < R.init.front)) // Elements support <
104         ) {
105     import std.conv: to;
106     import std.math: floor;
107 
108     alias E = ElementType!R;
109 
110     private R a;
111     private R b;
112     private int[][E] b2j;
113 
114     private this(R a, R b) {
115         this.a = a;
116         this.b = b;
117         chainB();
118     }
119 
120     private void chainB() {
121         import std.algorithm: each, filter;
122         import std.range: iota, zip;
123 
124         each!(t => b2j[t[0]] ~= t[1].to!int)(zip(b, iota(b.length)));
125         auto popular = new RedBlackTree!E;
126         auto len = b.length;
127         if (len > 200) {
128             immutable minLen = to!int(floor((to!double(len) / 100.0))) + 1;
129             each!(kv => popular.insert(kv.key))(
130                 filter!(kv => kv.value.length > minLen)(b2j.byKeyValue));
131             each!(element => b2j.remove(element))(popular);
132         }
133     }
134 
135     private Match[] matches() {
136         import std.algorithm: sort;
137         import std.array: back, empty;
138         import std.range.primitives: popBack;
139 
140         immutable aLen = a.length.to!int;
141         immutable bLen = b.length.to!int;
142         Quad[] quads = [Quad(0, aLen, 0, bLen)];
143         Match[] matches;
144         while (!quads.empty) {
145             auto quad = quads.back();
146             quads.popBack();
147             auto match = longestMatch(quad);
148             auto i = match.aStart;
149             auto j = match.bStart;
150             auto k = match.length;
151             if (k > 0) {
152                 matches ~= match;
153                 if (quad.aStart < i && quad.bStart < j)
154                     quads ~= Quad(quad.aStart, i, quad.bStart, j);
155                 if (i + k < quad.aEnd && j + k < quad.bEnd)
156                     quads ~= Quad(i + k, quad.aEnd, j + k, quad.bEnd);
157             }
158         }
159         sort(matches);
160         int aStart;
161         int bStart;
162         int length;
163         Match[] nonAdjacent;
164         foreach (match; matches) {
165             if (aStart + length == match.aStart &&
166                     bStart + length == match.bStart)
167                 length += match.length;
168             else {
169                 if (length)
170                     nonAdjacent ~= Match(aStart, bStart, length);
171                 aStart = match.aStart;
172                 bStart = match.bStart;
173                 length = match.length;
174             }
175         }
176         if (length)
177             nonAdjacent ~= Match(aStart, bStart, length);
178         nonAdjacent ~= Match(aLen, bLen, 0);
179         return nonAdjacent;
180     }
181 
182     private Match longestMatch(Quad quad) {
183         int bestI = quad.aStart;
184         int bestJ = quad.bStart;
185         int bestSize;
186         int[int] j2Len;
187         for (int i = quad.aStart; i < quad.aEnd; i++) {
188             int[int] newJ2Len;
189             if (auto indexes = a[i] in b2j) {
190                 foreach (j; *indexes) {
191                     if (j < quad.bStart)
192                         continue;
193                     if (j >= quad.bEnd)
194                         break;
195                     immutable k = j2Len.get(j - 1, 0) + 1;
196                     newJ2Len[j] = k;
197                     if (k > bestSize) {
198                         bestI = i - k + 1;
199                         bestJ = j - k + 1;
200                         bestSize = k;
201                     }
202                 }
203             }
204             j2Len = newJ2Len;
205         }
206         while (bestI > quad.aStart && bestJ > quad.bStart &&
207                 a[bestI - 1] == b[bestJ - 1]) {
208             bestI--;
209             bestJ--;
210             bestSize++;
211         }
212         while (bestI + bestSize < quad.aEnd &&
213                 bestJ + bestSize < quad.bEnd &&
214                 a[bestI + bestSize] == b[bestJ + bestSize])
215             bestSize++;
216         return Match(bestI, bestJ, bestSize);
217     }
218 
219     private Diff!E[] diff(EqualSpan equalSpan) {
220         Diff!E[] diffs;
221         int i;
222         int j;
223         foreach (match; matches()) {
224             auto diff = Diff!E(Tag.Equal, a[i .. match.aStart],
225                                           b[j .. match.bStart]);
226             if (i < match.aStart && j < match.bStart)
227                 diff.tag = Tag.Replace;
228             else if (i < match.aStart)
229                 diff.tag = Tag.Delete;
230             else if (j < match.bStart)
231                 diff.tag = Tag.Insert;
232             if (diff.tag != Tag.Equal)
233                 diffs ~= diff;
234             i = match.aStart + match.length;
235             j = match.bStart + match.length;
236             if (match.length && equalSpan == EqualSpan.Keep)
237                 diffs ~= Diff!E(Tag.Equal, a[match.aStart .. i],
238                                            b[match.bStart .. j]);
239         }
240         return diffs;
241     }
242 }