1 /** 2 * This module provides a diff() function. 3 * 4 * The function takes two slicable forward ranges of < and == comparable 5 * objects (e.g., two slices of strings, two strings, two slices of ints, 6 * two slices of "items") and diffs them using a slightly simplified 7 * version of the Python difflib's sequence matcher algorithm. The 8 * function returns a (possibly empty) slice of Diff structs, each with a 9 * tag (equal, insert, delete, replace) and the two relevant subslices. 10 * 11 * See tests.d for some examples including Test #17, #18 and #19. 12 * 13 * Authors: Mark Summerfield, mark@qtrac.eu 14 * License: Apache 2.0 15 * Copyright: © 2020 Mark Summerfield. All rights reserved. 16 */ 17 module ddiff; 18 19 import std.container.rbtree: RedBlackTree; 20 import std.range: ElementType, front, hasSlicing, isForwardRange; 21 import std.typecons: Tuple; 22 23 /** 24 * Iterates two slices and diffs them. 25 * Params: 26 * a = a forward sliceable range 27 * b = a forward sliceable range 28 * equalSpan = an EqualSpan 29 * Elements from a and b must be comparable for equality. 30 * If equalSpan is Drop (the default), then only Diff structs with Insert, 31 * Delete, or Replace tags will be returned; otherwise Diff structs 32 * covering all the input will be returned, so will additionally include 33 * Diff structs with Equal tags. 34 * Returns: A slice of Diff structs. 35 */ 36 auto diff(R)(R a, R b, EqualSpan equalSpan=EqualSpan.Drop) if ( 37 isForwardRange!R && // R is a range that can be iterated repeatedly 38 hasSlicing!R && 39 is(typeof(R.init.front == R.init.front)) && // Elements support == 40 is(typeof(R.init.front < R.init.front)) // Elements support < 41 ) { 42 auto diff = new Differ!R(a, b); 43 return diff.diff(equalSpan); 44 } 45 46 /** 47 * A difference with a tag to specify what the difference is and the two 48 * relevant subslices. 49 */ 50 struct Diff(E) { 51 Tag tag; /// What kind of difference this is. 52 E[] a; /// The relevant subslice of the original `a` slice. 53 E[] b; /// The relevant subslice of the original `b` slice. 54 55 /** 56 * This method is primarily for testing. 57 * Returns: a string indicating the difference. 58 */ 59 string toString() const { 60 import std.format: format; 61 62 final switch (tag) { 63 case Tag.Equal: return format!"= %s"(a); 64 case Tag.Insert: return format!"+ %s"(b); 65 case Tag.Delete: return format!"- %s"(a); 66 case Tag.Replace: return format!"< %s |> %s"(a, b); 67 } 68 } 69 } 70 71 /** 72 * Used to tell the diff() function whether to include Diff structs in 73 * its output slice that have all possible tags, including the Equal tag 74 * (Keep), or only difference tags, Insert, Delete, Replace (Drop). 75 */ 76 enum EqualSpan { Drop, Keep } 77 78 /** 79 * What kind of difference the Diff struct represents. 80 */ 81 enum Tag { Equal, Insert, Delete, Replace } 82 83 private alias Quad = Tuple!(int, "aStart", int, "aEnd", 84 int, "bStart", int, "bEnd"); 85 86 private struct Match { 87 int aStart; 88 int bStart; 89 int length; 90 91 int opCmp(const Match other) const { 92 return (aStart == other.aStart) ? ( 93 (bStart == other.bStart) ? length - other.length 94 : bStart - other.bStart) 95 : aStart - other.aStart; 96 } 97 } 98 99 private class Differ(R) if ( 100 isForwardRange!R && // R is a range that can be iterated repeatedly 101 hasSlicing!R && 102 is(typeof(R.init.front == R.init.front)) && // Elements support == 103 is(typeof(R.init.front < R.init.front)) // Elements support < 104 ) { 105 import std.conv: to; 106 import std.math: floor; 107 108 alias E = ElementType!R; 109 110 private R a; 111 private R b; 112 private int[][E] b2j; 113 114 private this(R a, R b) { 115 this.a = a; 116 this.b = b; 117 chainB(); 118 } 119 120 private void chainB() { 121 import std.algorithm: each, filter; 122 import std.range: iota, zip; 123 124 each!(t => b2j[t[0]] ~= t[1].to!int)(zip(b, iota(b.length))); 125 auto popular = new RedBlackTree!E; 126 auto len = b.length; 127 if (len > 200) { 128 immutable minLen = to!int(floor((to!double(len) / 100.0))) + 1; 129 each!(kv => popular.insert(kv.key))( 130 filter!(kv => kv.value.length > minLen)(b2j.byKeyValue)); 131 each!(element => b2j.remove(element))(popular); 132 } 133 } 134 135 private Match[] matches() { 136 import std.algorithm: sort; 137 import std.array: back, empty; 138 import std.range.primitives: popBack; 139 140 immutable aLen = a.length.to!int; 141 immutable bLen = b.length.to!int; 142 Quad[] quads = [Quad(0, aLen, 0, bLen)]; 143 Match[] matches; 144 while (!quads.empty) { 145 auto quad = quads.back(); 146 quads.popBack(); 147 auto match = longestMatch(quad); 148 auto i = match.aStart; 149 auto j = match.bStart; 150 auto k = match.length; 151 if (k > 0) { 152 matches ~= match; 153 if (quad.aStart < i && quad.bStart < j) 154 quads ~= Quad(quad.aStart, i, quad.bStart, j); 155 if (i + k < quad.aEnd && j + k < quad.bEnd) 156 quads ~= Quad(i + k, quad.aEnd, j + k, quad.bEnd); 157 } 158 } 159 sort(matches); 160 int aStart; 161 int bStart; 162 int length; 163 Match[] nonAdjacent; 164 foreach (match; matches) { 165 if (aStart + length == match.aStart && 166 bStart + length == match.bStart) 167 length += match.length; 168 else { 169 if (length) 170 nonAdjacent ~= Match(aStart, bStart, length); 171 aStart = match.aStart; 172 bStart = match.bStart; 173 length = match.length; 174 } 175 } 176 if (length) 177 nonAdjacent ~= Match(aStart, bStart, length); 178 nonAdjacent ~= Match(aLen, bLen, 0); 179 return nonAdjacent; 180 } 181 182 private Match longestMatch(Quad quad) { 183 int bestI = quad.aStart; 184 int bestJ = quad.bStart; 185 int bestSize; 186 int[int] j2Len; 187 for (int i = quad.aStart; i < quad.aEnd; i++) { 188 int[int] newJ2Len; 189 if (auto indexes = a[i] in b2j) { 190 foreach (j; *indexes) { 191 if (j < quad.bStart) 192 continue; 193 if (j >= quad.bEnd) 194 break; 195 immutable k = j2Len.get(j - 1, 0) + 1; 196 newJ2Len[j] = k; 197 if (k > bestSize) { 198 bestI = i - k + 1; 199 bestJ = j - k + 1; 200 bestSize = k; 201 } 202 } 203 } 204 j2Len = newJ2Len; 205 } 206 while (bestI > quad.aStart && bestJ > quad.bStart && 207 a[bestI - 1] == b[bestJ - 1]) { 208 bestI--; 209 bestJ--; 210 bestSize++; 211 } 212 while (bestI + bestSize < quad.aEnd && 213 bestJ + bestSize < quad.bEnd && 214 a[bestI + bestSize] == b[bestJ + bestSize]) 215 bestSize++; 216 return Match(bestI, bestJ, bestSize); 217 } 218 219 private Diff!E[] diff(EqualSpan equalSpan) { 220 Diff!E[] diffs; 221 int i; 222 int j; 223 foreach (match; matches()) { 224 auto diff = Diff!E(Tag.Equal, a[i .. match.aStart], 225 b[j .. match.bStart]); 226 if (i < match.aStart && j < match.bStart) 227 diff.tag = Tag.Replace; 228 else if (i < match.aStart) 229 diff.tag = Tag.Delete; 230 else if (j < match.bStart) 231 diff.tag = Tag.Insert; 232 if (diff.tag != Tag.Equal) 233 diffs ~= diff; 234 i = match.aStart + match.length; 235 j = match.bStart + match.length; 236 if (match.length && equalSpan == EqualSpan.Keep) 237 diffs ~= Diff!E(Tag.Equal, a[match.aStart .. i], 238 b[match.bStart .. j]); 239 } 240 return diffs; 241 } 242 }