1 /** 2 * This module implements the Python difflib module's sequence matcher and 3 * provides a diff() function. 4 * 5 * The function takes two slicable forward ranges of < and == comparable 6 * objects (e.g., two slices of strings, two strings, two slices of ints, 7 * two slices of "items") and diffs them using a slightly simplified 8 * version of the Python difflib's sequence matcher algorithm. The 9 * function returns a (possibly empty) slice of Diff structs, each with a 10 * tag (equal, insert, delete, replace) and the two relevant subslices. 11 * 12 * See tests.d for some examples including Test #17, #18 and #19. 13 * 14 * Authors: Mark Summerfield, mark@qtrac.eu 15 * License: Apache 2.0 16 * Copyright: © 2020 Mark Summerfield. All rights reserved. 17 */ 18 module qtrac.ddiff; 19 20 import std.range: ElementType, front, hasSlicing, isForwardRange; 21 import std.typecons: Tuple; 22 23 /** 24 * Iterates two slices and diffs them. 25 * Params: 26 * a = a forward sliceable range 27 * b = a forward sliceable range 28 * equalSpan = an EqualSpan 29 * Elements from a and b must be comparable for equality. 30 * If equalSpan is Drop (the default), then only Diff structs with Insert, 31 * Delete, or Replace tags will be returned; otherwise Diff structs 32 * covering all the input will be returned, so will additionally include 33 * Diff structs with Equal tags. 34 * Returns: A slice of Diff structs. 35 */ 36 auto diff(R)(R a, R b, EqualSpan equalSpan=EqualSpan.Drop) if ( 37 isForwardRange!R && // R is a range that can be iterated repeatedly 38 hasSlicing!R && 39 is(typeof(R.init.front == R.init.front)) && // Elements support == 40 is(typeof(R.init.front < R.init.front)) // Elements support < 41 ) { 42 auto diff = new Differ!R(a, b); 43 return diff.diff(equalSpan); 44 } 45 46 /** 47 * A difference with a tag to specify what the difference is and the two 48 * relevant subslices. 49 */ 50 struct Diff(E) { 51 Tag tag; /// What kind of difference this is. 52 E[] a; /// The relevant subslice of the original `a` slice. 53 E[] b; /// The relevant subslice of the original `b` slice. 54 55 /** 56 * This method is primarily for testing. 57 * Returns: a string indicating the difference. 58 */ 59 string toString() const { 60 import std.format: format; 61 62 final switch (tag) { 63 case Tag.Equal: return format!"= %s"(a); 64 case Tag.Insert: return format!"+ %s"(b); 65 case Tag.Delete: return format!"- %s"(a); 66 case Tag.Replace: return format!"< %s |> %s"(a, b); 67 } 68 } 69 } 70 71 /** 72 * Used to tell the diff() function whether to include Diff structs in 73 * its output slice that have all possible tags, including the Equal tag 74 * (Keep), or only difference tags, Insert, Delete, Replace (Drop). 75 */ 76 enum EqualSpan { Drop, Keep } 77 78 /** 79 * What kind of difference the Diff struct represents. 80 */ 81 enum Tag { Equal, Insert, Delete, Replace } 82 83 private alias Quad = Tuple!(int, "aStart", int, "aEnd", 84 int, "bStart", int, "bEnd"); 85 86 private struct Match { 87 int aStart; 88 int bStart; 89 int length; 90 91 int opCmp(const Match other) const { 92 return (aStart == other.aStart) ? ( 93 (bStart == other.bStart) ? length - other.length 94 : bStart - other.bStart) 95 : aStart - other.aStart; 96 } 97 } 98 99 private class Differ(R) if ( 100 isForwardRange!R && // R is a range that can be iterated repeatedly 101 hasSlicing!R && 102 is(typeof(R.init.front == R.init.front)) && // Elements support == 103 is(typeof(R.init.front < R.init.front)) // Elements support < 104 ) { 105 import std.conv: to; 106 import std.math: floor; 107 108 alias E = ElementType!R; 109 110 private R a; 111 private R b; 112 private int[][E] b2j; 113 114 private this(R a, R b) { 115 this.a = a; 116 this.b = b; 117 chainB(); 118 } 119 120 private void chainB() { 121 import std.algorithm: each, filter; 122 import std.range: iota, zip; 123 124 alias Unit = void[0]; 125 enum unit = Unit.init; 126 127 each!(t => b2j[t[0]] ~= t[1].to!int)(zip(b, iota(b.length))); 128 Unit[E] popular; 129 auto len = b.length; 130 if (len > 200) { 131 immutable minLen = to!int(floor((to!double(len) / 100.0))) + 1; 132 each!(kv => popular[kv.key] = unit)( 133 filter!(kv => kv.value.length > minLen)(b2j.byKeyValue)); 134 each!(element => b2j.remove(element))(popular.byKey); 135 } 136 } 137 138 private Match[] matches() { 139 import std.algorithm: sort; 140 import std.array: back, empty; 141 import std.range.primitives: popBack; 142 143 immutable aLen = a.length.to!int; 144 immutable bLen = b.length.to!int; 145 Quad[] quads = [Quad(0, aLen, 0, bLen)]; 146 Match[] matches; 147 while (!quads.empty) { 148 auto quad = quads.back(); 149 quads.popBack(); 150 auto match = longestMatch(quad); 151 auto i = match.aStart; 152 auto j = match.bStart; 153 auto k = match.length; 154 if (k > 0) { 155 matches ~= match; 156 if (quad.aStart < i && quad.bStart < j) 157 quads ~= Quad(quad.aStart, i, quad.bStart, j); 158 if (i + k < quad.aEnd && j + k < quad.bEnd) 159 quads ~= Quad(i + k, quad.aEnd, j + k, quad.bEnd); 160 } 161 } 162 sort(matches); 163 int aStart; 164 int bStart; 165 int length; 166 Match[] nonAdjacent; 167 foreach (match; matches) { 168 if (aStart + length == match.aStart && 169 bStart + length == match.bStart) 170 length += match.length; 171 else { 172 if (length) 173 nonAdjacent ~= Match(aStart, bStart, length); 174 aStart = match.aStart; 175 bStart = match.bStart; 176 length = match.length; 177 } 178 } 179 if (length) 180 nonAdjacent ~= Match(aStart, bStart, length); 181 nonAdjacent ~= Match(aLen, bLen, 0); 182 return nonAdjacent; 183 } 184 185 private Match longestMatch(Quad quad) { 186 int bestI = quad.aStart; 187 int bestJ = quad.bStart; 188 int bestSize; 189 int[int] j2Len; 190 for (int i = quad.aStart; i < quad.aEnd; i++) { 191 int[int] newJ2Len; 192 if (auto indexes = a[i] in b2j) { 193 foreach (j; *indexes) { 194 if (j < quad.bStart) 195 continue; 196 if (j >= quad.bEnd) 197 break; 198 immutable k = j2Len.get(j - 1, 0) + 1; 199 newJ2Len[j] = k; 200 if (k > bestSize) { 201 bestI = i - k + 1; 202 bestJ = j - k + 1; 203 bestSize = k; 204 } 205 } 206 } 207 j2Len = newJ2Len; 208 } 209 while (bestI > quad.aStart && bestJ > quad.bStart && 210 a[bestI - 1] == b[bestJ - 1]) { 211 bestI--; 212 bestJ--; 213 bestSize++; 214 } 215 while (bestI + bestSize < quad.aEnd && 216 bestJ + bestSize < quad.bEnd && 217 a[bestI + bestSize] == b[bestJ + bestSize]) 218 bestSize++; 219 return Match(bestI, bestJ, bestSize); 220 } 221 222 private Diff!E[] diff(EqualSpan equalSpan) { 223 Diff!E[] diffs; 224 int i; 225 int j; 226 foreach (match; matches()) { 227 auto diff = Diff!E(Tag.Equal, a[i .. match.aStart], 228 b[j .. match.bStart]); 229 if (i < match.aStart && j < match.bStart) 230 diff.tag = Tag.Replace; 231 else if (i < match.aStart) 232 diff.tag = Tag.Delete; 233 else if (j < match.bStart) 234 diff.tag = Tag.Insert; 235 if (diff.tag != Tag.Equal) 236 diffs ~= diff; 237 i = match.aStart + match.length; 238 j = match.bStart + match.length; 239 if (match.length && equalSpan == EqualSpan.Keep) 240 diffs ~= Diff!E(Tag.Equal, a[match.aStart .. i], 241 b[match.bStart .. j]); 242 } 243 return diffs; 244 } 245 }