File-Deduplicator/candidate_dupe.go at main · Seb-MCaw/File-Deduplicator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
// Structs which describe potential candidates for duplicates which might
// be used to deduplicate a file.
//
// These candidates are intended to be assembled gradually, and then compared
// with others to find the optimal duplicate which can then be recorded in
// the deduplicated file's index.

package main

// AnyCandidate is a union of the three types of duplicate candidates:
// CandidateDupe, RepeatingCandidateDupe and DupeIndexEntry.
type AnyCandidate interface {
	// Return the value of the candidates DupeOffset field; this is the start
	// of the duplicate, except in the case of a RepeatingCandidateDupe, for
	// which it is the start of the repeating part of the duplicate.
	DupeOff() uint64

	// Return the offset of the end of the duplicate part of the candidate
	// (ie the offset of the first byte which is not part of the duplicate)
	DupeEnd() uint64

	// Return the offset of the end of the matching part of the candidate
	// (ie the offset of the first byte which is not part of the match)
	MatchEnd() uint64
}

// A CandidateDupe describes a potentially incomplete duplicate in a file.
//
// Encapsulates the extent of a duplicate which has been discovered so far.
// When it has only been constructed based on whole chunk matches both
// PreceedingBytesChecked and FollowingBytesChecked will be false, but they will
// be set to true to indicate when the relevant sections of the file have been
// compared byte-by-byte.
type CandidateDupe struct {
	// The offset of the first byte (that we know of so far) of the potential
	// duplicate
	DupeOffset uint64
	// The offset of the first byte of the matching data. The data starting
	// here is the same as that starting at DupeOffset
	MatchOffset uint64
	// The number of bytes for which the data starting at DupeOffset matches
	// that starting at MatchOffset
	Length uint64
	// Bool whether the preceeding chunk has been checked byte for byte
	PreceedingBytesChecked bool
	// Bool whether the following chunk has been checked byte for byte
	FollowingBytesChecked bool
}

// Implement AnyCandidate interface
func (cd *CandidateDupe) DupeOff() uint64 {
	return cd.DupeOffset
}
func (cd *CandidateDupe) DupeEnd() uint64 {
	return cd.DupeOffset + cd.Length
}
func (cd *CandidateDupe) MatchEnd() uint64 {
	return cd.MatchOffset + cd.Length
}

// If we haven't yet successfully done so, check byte-by-byte whether the
// duplicate that this candidate represents extends earlier than its current
// start point. No more than chunkSize bytes will be checked.
//
// The candidate is initially compiled chunk-by-chunk, so this method checks how
// how much (if any) of the trailing part of the immediately preceeding chunk
// also matches.
//
// The data for the comparison is read from dataSource. Specifically, the
// chunkSize bytes immediately preceeding DupeOffset and MatchOffset will be
// read, and this candidate will only be modified if enough data was successfully
// retrieved to fully determine the extent of the duplicate.
func (cd *CandidateDupe) CheckPreceedingBytes(dataSource RangeGetter, chunkSize uint64) {
	if !cd.PreceedingBytesChecked {
		success, numBytes := CountMatchingPreceedingBytes(
			dataSource,
			cd.MatchOffset,
			cd.DupeOffset,
			chunkSize,
		)
		if success {
			cd.DupeOffset -= numBytes
			cd.MatchOffset -= numBytes
			cd.Length += numBytes
			cd.PreceedingBytesChecked = true
		}
	}
}

// If we haven't yet successfully done so, check byte-by-byte whether the
// duplicate that this candidate represents extends later than its current
// end point. No more than chunkSize bytes will be checked.
//
// The candidate is initially compiled chunk-by-chunk, so this method checks how
// how much (if any) of the leading part of the immediately following chunk
// also matches.
//
// The data for the comparison is read from dataSource. Specifically, the
// chunkSize bytes immediately following DupeEnd() and MatchEnd() will be
// read, and this candidate will only be modified if enough data was successfully
// retrieved to fully determine the extent of the duplicate.
func (cd *CandidateDupe) CheckFollowingBytes(dataSource RangeGetter, chunkSize uint64) {
	if !cd.FollowingBytesChecked {
		success, numBytes := CountMatchingFollowingBytes(
			dataSource,
			cd.MatchEnd(),
			cd.DupeEnd(),
			chunkSize,
		)
		if success {
			cd.Length += numBytes
			cd.FollowingBytesChecked = true
		}
	}
}

// Return an DupeIndexEntry describing the current state of this candidate
func (cd *CandidateDupe) ToDupeIndexEntry() DupeIndexEntry {
	return DupeIndexEntry{
		DupeOffset:  cd.DupeOffset,
		MatchOffset: cd.MatchOffset,
		Length:      cd.Length,
	}
}

// A RepeatingCandidateDupe contains a potentially incomplete description of
// a duplicate in a file which consists primarily of a repeating pattern of bytes.
//
// Encapsulates the extent of the duplicate which has been discovered so far.
// When it has only been constructed based on whole chunk matches both
// PreceedingBytesChecked and FollowingBytesChecked will be false, but they will
// be set to true to indicate when the relevant sections of the file have been
// compared byte-by-byte.
//
// Specifically, it describes a duplicate part and an earlier matching part
// which consist entirely of the same repeating pattern, but may have different
// lengths. It also records the length of any non-repeating sections immediately
// before their beginnings and after their ends which also match.
type RepeatingCandidateDupe struct {
	// The length of the pattern of bytes which is repeating
	PatternLength uint64
	// The offset of the start of the repeating part of the duplicate
	DupeOffset uint64
	// An offset near the start of the repeating part of the match.
	// This must correspond to the same position within the pattern as
	// DupeOffset does (ie the data starting here should match the data
	// starting there), so may be as much as PatternLength-1 after the actual
	// start of the repeating section.
	// MatchOffset should always be less than DupeOffset.
	MatchOffset uint64
	// The current length of the repeating part of the duplicate.
	DupeLength uint64
	// The current length of the repeating part of the match.
	MatchLength uint64
	// The number of contiguous bytes immediately preceeding MatchOffset
	// which match the corresponding bytes preceeding DupeOffset
	NumPreceedingBytes uint64
	// The number of contiguous bytes immediately following the end of the
	// repeating sections at the two locations which matched
	NumFollowingBytes uint64
	// Bool whether the chunks preceeding DupeOffset and MatchOffset have been
	// compared byte for byte
	PreceedingBytesChecked bool
	// Bool whether the chunks following the two repeating parts have been
	// compared byte for byte
	FollowingBytesChecked bool
}

// Implements AnyCandidate interface
func (rcd *RepeatingCandidateDupe) DupeOff() uint64 {
	return rcd.DupeOffset
}
func (rcd *RepeatingCandidateDupe) DupeEnd() uint64 {
	return rcd.DupeOffset + rcd.DupeLength + rcd.NumFollowingBytes
}
func (rcd *RepeatingCandidateDupe) MatchEnd() uint64 {
	return rcd.MatchOffset + rcd.MatchLength + rcd.NumFollowingBytes
}

// If we haven't yet successfully done so, check byte-by-byte whether the
// duplicate that this candidate represents extends earlier than its current
// start point. No more than chunkSize bytes will be checked.
//
// The candidate is initially compiled chunk-by-chunk, so this method checks how
// how much (if any) of the trailing part of the immediately preceeding chunk
// also matches. It checks both the extent of the repeating pattern (modifying
// DupeOffset, DupeLength, MatchOffset and MatchLength accordingly) and for
// any non-repeating bytes before the starts thereof which match (updating
// NumPreceedingBytes accordingly). Note that DupeOffset/MatchOffset and
// DupeEnd()/MatchEnd() do not include these non-repeating bytes.
//
// The data for the comparison is read from dataSource. Specifically, the
// chunkSize bytes immediately preceeding DupeOffset and MatchOffset will be
// read, as well as the first PatternLength bytes following each of those
// offsets, and this candidate will only be modified if enough data was
// successfully retrieved to fully determine the extent of the duplicate.
func (rcd *RepeatingCandidateDupe) CheckPreceedingBytes(dataSource RangeGetter, chunkSize uint64) {
	if !rcd.PreceedingBytesChecked {
		// Determine the extent of the repeats preceeding MatchOffset
		matchSuccess, numRptngBytsPrecMatch := CountMatchingPreceedingBytes(
			dataSource,
			rcd.MatchOffset,
			rcd.MatchOffset+rcd.PatternLength,
			chunkSize,
		)
		newMatchOffset := rcd.MatchOffset - numRptngBytsPrecMatch
		newMatchLength := rcd.MatchLength + numRptngBytsPrecMatch
		// Determine the extent of the repeats preceeding DupeOffset
		dupeSuccess, numRptngBytsPrecDupe := CountMatchingPreceedingBytes(
			dataSource,
			rcd.DupeOffset,
			rcd.DupeOffset+rcd.PatternLength,
			chunkSize,
		)
		newDupeOffset := rcd.DupeOffset - numRptngBytsPrecDupe
		newDupeLength := rcd.DupeLength + numRptngBytsPrecDupe
		// Ensure that DupeOffset corresponds to the same position
		// in the pattern as MatchOffset
		dupeRemainder := numRptngBytsPrecDupe % rcd.PatternLength
		matchRemainder := numRptngBytsPrecMatch % rcd.PatternLength
		if matchRemainder > dupeRemainder {
			newMatchOffset += matchRemainder - dupeRemainder
			newMatchLength -= matchRemainder - dupeRemainder
		} else {
			newDupeOffset += dupeRemainder - matchRemainder
			newDupeLength -= dupeRemainder - matchRemainder
		}
		// Ensure that DupeOffset is greater than MatchOffset
		if newDupeOffset <= newMatchOffset {
			newDupeLength -= newMatchOffset + rcd.PatternLength - newDupeOffset
			newDupeOffset = newMatchOffset + rcd.PatternLength
		}
		if matchSuccess && dupeSuccess {
			// Determine how many of the bytes immediately preceeding both repeating
			// sections are equal
			precSuccess, numPrecBytes := CountMatchingPreceedingBytes(
				dataSource,
				newMatchOffset,
				newDupeOffset,
				chunkSize-Max(numRptngBytsPrecMatch, numRptngBytsPrecDupe),
			)
			// If all of the above was successful, update the candidate accordingly
			if precSuccess {
				rcd.MatchOffset = newMatchOffset
				rcd.MatchLength = newMatchLength
				rcd.DupeOffset = newDupeOffset
				rcd.DupeLength = newDupeLength
				rcd.NumPreceedingBytes = numPrecBytes
				rcd.PreceedingBytesChecked = true
			}
		}
	}
}

// If we haven't yet successfully done so, check byte-by-byte whether the
// duplicate that this candidate represents extends later than its current
// end point. No more than chunkSize bytes will be checked.
//
// The candidate is initially compiled chunk-by-chunk, so this method checks how
// how much (if any) of the leading part of the immediately following chunk
// also matches. It checks both the extent of the repeating pattern (modifying
// DupeLength and MatchLength accordingly) and for any non-repeating bytes
// after the ends thereof which match (updating NumFollowingBytes accordingly).
//
// The data for the comparison is read from dataSource. Specifically, the
// chunkSize bytes immediately following DupeEnd() and MatchEnd() will be
// read, as well as the PatternLength bytes immediately preceeding each of those
// offsets, and this candidate will only be modified if enough data was
// successfully retrieved to fully determine the extent of the duplicate.
func (rcd *RepeatingCandidateDupe) CheckFollowingBytes(dataSource RangeGetter, chunkSize uint64) {
	if !rcd.FollowingBytesChecked {
		// Determine the extent of the repeats following the end of the match
		matchSuccess, numRptngBytsFollMatch := CountMatchingFollowingBytes(
			dataSource,
			rcd.MatchEnd()-rcd.PatternLength,
			rcd.MatchEnd(),
			chunkSize,
		)
		newMatchLength := rcd.MatchLength + numRptngBytsFollMatch
		// Determine the extent of the repeats following the end of the dupe
		dupeSuccess, numRptngBytsFollDupe := CountMatchingFollowingBytes(
			dataSource,
			rcd.DupeEnd()-rcd.PatternLength,
			rcd.DupeEnd(),
			chunkSize,
		)
		newDupeLength := rcd.DupeLength + numRptngBytsFollDupe
		if matchSuccess && dupeSuccess {
			// Determine how many of the bytes immediately following both repeating
			// sections are equal
			follSuccess, numFollBytes := CountMatchingFollowingBytes(
				dataSource,
				rcd.MatchOffset+newMatchLength,
				rcd.DupeOffset+newDupeLength,
				chunkSize-Max(numRptngBytsFollMatch, numRptngBytsFollDupe),
			)
			// If all of the above was successful, update the candidate accordingly
			if follSuccess {
				rcd.MatchLength = newMatchLength
				rcd.DupeLength = newDupeLength
				rcd.NumFollowingBytes = numFollBytes
				rcd.FollowingBytesChecked = true
			}
		}
	}
}

// Return DupeIndexEntrys describing the one or two potentially optimal
// duplicates that this repeating pattern can produce so far.
//
// To be more precise, the duplicates returned will be optimal for this
// candidate, assuming that there are no overlapping entries already present
// in the index. There are rare cases when a repeating candidate can do better
// by extending an existing entry in a way that the duplicates this method
// returns can't.
func (rcd *RepeatingCandidateDupe) BestDupes() []DupeIndexEntry {
	// Either line up the starts of the two repeating sections, or the ends;
	// anything else will give a shorter duplicate length
	if rcd.DupeLength == rcd.MatchLength {
		// The two repeating sections are the same length, so we can achieve
		// both of the above simultaneously
		e := DupeIndexEntry{
			DupeOffset:  rcd.DupeOffset - rcd.NumPreceedingBytes,
			MatchOffset: rcd.MatchOffset - rcd.NumPreceedingBytes,
			Length:      rcd.DupeLength + rcd.NumPreceedingBytes + rcd.NumFollowingBytes,
		}
		return []DupeIndexEntry{e}
	} else {
		repeatingLength := Min(rcd.DupeLength, rcd.MatchLength)

		// Line up the starts
		e1 := DupeIndexEntry{
			DupeOffset:  rcd.DupeOffset - rcd.NumPreceedingBytes,
			MatchOffset: rcd.MatchOffset - rcd.NumPreceedingBytes,
			Length:      repeatingLength + rcd.NumPreceedingBytes,
		}

		dupeRepeatEnd := rcd.DupeOffset + rcd.DupeLength
		matchRepeatEnd := rcd.MatchOffset + rcd.MatchLength
		if matchRepeatEnd > rcd.DupeOffset {
			// This is a self match, so e1 is optimal (and the below would
			// give something nonsensical)
			return []DupeIndexEntry{e1}
		}

		if rcd.DupeLength%rcd.PatternLength != rcd.MatchLength%rcd.PatternLength {
			// The ends of the the two repeating sections do not correspond to
			// the same part of the pattern, so there can be no matching following
			// bytes.
			shortenMatchBy := ModuloDiff(rcd.MatchLength, rcd.DupeLength, rcd.PatternLength)
			repeatingLength := Min(rcd.DupeLength, rcd.MatchLength-shortenMatchBy)
			e2 := DupeIndexEntry{
				DupeOffset:  dupeRepeatEnd - repeatingLength,
				MatchOffset: matchRepeatEnd - shortenMatchBy - repeatingLength,
				Length:      repeatingLength,
			}
			return []DupeIndexEntry{e1, e2}
		} else {
			// The ends of the two repeating regions do match each other, so
			// we can produce a duplicate by lining up the ends and including
			// the following bytes
			e2 := DupeIndexEntry{
				DupeOffset:  dupeRepeatEnd - repeatingLength,
				MatchOffset: matchRepeatEnd - repeatingLength,
				Length:      repeatingLength + rcd.NumFollowingBytes,
			}
			return []DupeIndexEntry{e1, e2}
		}
	}
}