-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcandidate_dupe.go
More file actions
370 lines (353 loc) · 15 KB
/
candidate_dupe.go
File metadata and controls
370 lines (353 loc) · 15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
// Structs which describe potential candidates for duplicates which might
// be used to deduplicate a file.
//
// These candidates are intended to be assembled gradually, and then compared
// with others to find the optimal duplicate which can then be recorded in
// the deduplicated file's index.
package main
// AnyCandidate is a union of the three types of duplicate candidates:
// CandidateDupe, RepeatingCandidateDupe and DupeIndexEntry.
type AnyCandidate interface {
// Return the value of the candidates DupeOffset field; this is the start
// of the duplicate, except in the case of a RepeatingCandidateDupe, for
// which it is the start of the repeating part of the duplicate.
DupeOff() uint64
// Return the offset of the end of the duplicate part of the candidate
// (ie the offset of the first byte which is not part of the duplicate)
DupeEnd() uint64
// Return the offset of the end of the matching part of the candidate
// (ie the offset of the first byte which is not part of the match)
MatchEnd() uint64
}
// A CandidateDupe describes a potentially incomplete duplicate in a file.
//
// Encapsulates the extent of a duplicate which has been discovered so far.
// When it has only been constructed based on whole chunk matches both
// PreceedingBytesChecked and FollowingBytesChecked will be false, but they will
// be set to true to indicate when the relevant sections of the file have been
// compared byte-by-byte.
type CandidateDupe struct {
// The offset of the first byte (that we know of so far) of the potential
// duplicate
DupeOffset uint64
// The offset of the first byte of the matching data. The data starting
// here is the same as that starting at DupeOffset
MatchOffset uint64
// The number of bytes for which the data starting at DupeOffset matches
// that starting at MatchOffset
Length uint64
// Bool whether the preceeding chunk has been checked byte for byte
PreceedingBytesChecked bool
// Bool whether the following chunk has been checked byte for byte
FollowingBytesChecked bool
}
// Implement AnyCandidate interface
func (cd *CandidateDupe) DupeOff() uint64 {
return cd.DupeOffset
}
func (cd *CandidateDupe) DupeEnd() uint64 {
return cd.DupeOffset + cd.Length
}
func (cd *CandidateDupe) MatchEnd() uint64 {
return cd.MatchOffset + cd.Length
}
// If we haven't yet successfully done so, check byte-by-byte whether the
// duplicate that this candidate represents extends earlier than its current
// start point. No more than chunkSize bytes will be checked.
//
// The candidate is initially compiled chunk-by-chunk, so this method checks how
// how much (if any) of the trailing part of the immediately preceeding chunk
// also matches.
//
// The data for the comparison is read from dataSource. Specifically, the
// chunkSize bytes immediately preceeding DupeOffset and MatchOffset will be
// read, and this candidate will only be modified if enough data was successfully
// retrieved to fully determine the extent of the duplicate.
func (cd *CandidateDupe) CheckPreceedingBytes(dataSource RangeGetter, chunkSize uint64) {
if !cd.PreceedingBytesChecked {
success, numBytes := CountMatchingPreceedingBytes(
dataSource,
cd.MatchOffset,
cd.DupeOffset,
chunkSize,
)
if success {
cd.DupeOffset -= numBytes
cd.MatchOffset -= numBytes
cd.Length += numBytes
cd.PreceedingBytesChecked = true
}
}
}
// If we haven't yet successfully done so, check byte-by-byte whether the
// duplicate that this candidate represents extends later than its current
// end point. No more than chunkSize bytes will be checked.
//
// The candidate is initially compiled chunk-by-chunk, so this method checks how
// how much (if any) of the leading part of the immediately following chunk
// also matches.
//
// The data for the comparison is read from dataSource. Specifically, the
// chunkSize bytes immediately following DupeEnd() and MatchEnd() will be
// read, and this candidate will only be modified if enough data was successfully
// retrieved to fully determine the extent of the duplicate.
func (cd *CandidateDupe) CheckFollowingBytes(dataSource RangeGetter, chunkSize uint64) {
if !cd.FollowingBytesChecked {
success, numBytes := CountMatchingFollowingBytes(
dataSource,
cd.MatchEnd(),
cd.DupeEnd(),
chunkSize,
)
if success {
cd.Length += numBytes
cd.FollowingBytesChecked = true
}
}
}
// Return an DupeIndexEntry describing the current state of this candidate
func (cd *CandidateDupe) ToDupeIndexEntry() DupeIndexEntry {
return DupeIndexEntry{
DupeOffset: cd.DupeOffset,
MatchOffset: cd.MatchOffset,
Length: cd.Length,
}
}
// A RepeatingCandidateDupe contains a potentially incomplete description of
// a duplicate in a file which consists primarily of a repeating pattern of bytes.
//
// Encapsulates the extent of the duplicate which has been discovered so far.
// When it has only been constructed based on whole chunk matches both
// PreceedingBytesChecked and FollowingBytesChecked will be false, but they will
// be set to true to indicate when the relevant sections of the file have been
// compared byte-by-byte.
//
// Specifically, it describes a duplicate part and an earlier matching part
// which consist entirely of the same repeating pattern, but may have different
// lengths. It also records the length of any non-repeating sections immediately
// before their beginnings and after their ends which also match.
type RepeatingCandidateDupe struct {
// The length of the pattern of bytes which is repeating
PatternLength uint64
// The offset of the start of the repeating part of the duplicate
DupeOffset uint64
// An offset near the start of the repeating part of the match.
// This must correspond to the same position within the pattern as
// DupeOffset does (ie the data starting here should match the data
// starting there), so may be as much as PatternLength-1 after the actual
// start of the repeating section.
// MatchOffset should always be less than DupeOffset.
MatchOffset uint64
// The current length of the repeating part of the duplicate.
DupeLength uint64
// The current length of the repeating part of the match.
MatchLength uint64
// The number of contiguous bytes immediately preceeding MatchOffset
// which match the corresponding bytes preceeding DupeOffset
NumPreceedingBytes uint64
// The number of contiguous bytes immediately following the end of the
// repeating sections at the two locations which matched
NumFollowingBytes uint64
// Bool whether the chunks preceeding DupeOffset and MatchOffset have been
// compared byte for byte
PreceedingBytesChecked bool
// Bool whether the chunks following the two repeating parts have been
// compared byte for byte
FollowingBytesChecked bool
}
// Implements AnyCandidate interface
func (rcd *RepeatingCandidateDupe) DupeOff() uint64 {
return rcd.DupeOffset
}
func (rcd *RepeatingCandidateDupe) DupeEnd() uint64 {
return rcd.DupeOffset + rcd.DupeLength + rcd.NumFollowingBytes
}
func (rcd *RepeatingCandidateDupe) MatchEnd() uint64 {
return rcd.MatchOffset + rcd.MatchLength + rcd.NumFollowingBytes
}
// If we haven't yet successfully done so, check byte-by-byte whether the
// duplicate that this candidate represents extends earlier than its current
// start point. No more than chunkSize bytes will be checked.
//
// The candidate is initially compiled chunk-by-chunk, so this method checks how
// how much (if any) of the trailing part of the immediately preceeding chunk
// also matches. It checks both the extent of the repeating pattern (modifying
// DupeOffset, DupeLength, MatchOffset and MatchLength accordingly) and for
// any non-repeating bytes before the starts thereof which match (updating
// NumPreceedingBytes accordingly). Note that DupeOffset/MatchOffset and
// DupeEnd()/MatchEnd() do not include these non-repeating bytes.
//
// The data for the comparison is read from dataSource. Specifically, the
// chunkSize bytes immediately preceeding DupeOffset and MatchOffset will be
// read, as well as the first PatternLength bytes following each of those
// offsets, and this candidate will only be modified if enough data was
// successfully retrieved to fully determine the extent of the duplicate.
func (rcd *RepeatingCandidateDupe) CheckPreceedingBytes(dataSource RangeGetter, chunkSize uint64) {
if !rcd.PreceedingBytesChecked {
// Determine the extent of the repeats preceeding MatchOffset
matchSuccess, numRptngBytsPrecMatch := CountMatchingPreceedingBytes(
dataSource,
rcd.MatchOffset,
rcd.MatchOffset+rcd.PatternLength,
chunkSize,
)
newMatchOffset := rcd.MatchOffset - numRptngBytsPrecMatch
newMatchLength := rcd.MatchLength + numRptngBytsPrecMatch
// Determine the extent of the repeats preceeding DupeOffset
dupeSuccess, numRptngBytsPrecDupe := CountMatchingPreceedingBytes(
dataSource,
rcd.DupeOffset,
rcd.DupeOffset+rcd.PatternLength,
chunkSize,
)
newDupeOffset := rcd.DupeOffset - numRptngBytsPrecDupe
newDupeLength := rcd.DupeLength + numRptngBytsPrecDupe
// Ensure that DupeOffset corresponds to the same position
// in the pattern as MatchOffset
dupeRemainder := numRptngBytsPrecDupe % rcd.PatternLength
matchRemainder := numRptngBytsPrecMatch % rcd.PatternLength
if matchRemainder > dupeRemainder {
newMatchOffset += matchRemainder - dupeRemainder
newMatchLength -= matchRemainder - dupeRemainder
} else {
newDupeOffset += dupeRemainder - matchRemainder
newDupeLength -= dupeRemainder - matchRemainder
}
// Ensure that DupeOffset is greater than MatchOffset
if newDupeOffset <= newMatchOffset {
newDupeLength -= newMatchOffset + rcd.PatternLength - newDupeOffset
newDupeOffset = newMatchOffset + rcd.PatternLength
}
if matchSuccess && dupeSuccess {
// Determine how many of the bytes immediately preceeding both repeating
// sections are equal
precSuccess, numPrecBytes := CountMatchingPreceedingBytes(
dataSource,
newMatchOffset,
newDupeOffset,
chunkSize-Max(numRptngBytsPrecMatch, numRptngBytsPrecDupe),
)
// If all of the above was successful, update the candidate accordingly
if precSuccess {
rcd.MatchOffset = newMatchOffset
rcd.MatchLength = newMatchLength
rcd.DupeOffset = newDupeOffset
rcd.DupeLength = newDupeLength
rcd.NumPreceedingBytes = numPrecBytes
rcd.PreceedingBytesChecked = true
}
}
}
}
// If we haven't yet successfully done so, check byte-by-byte whether the
// duplicate that this candidate represents extends later than its current
// end point. No more than chunkSize bytes will be checked.
//
// The candidate is initially compiled chunk-by-chunk, so this method checks how
// how much (if any) of the leading part of the immediately following chunk
// also matches. It checks both the extent of the repeating pattern (modifying
// DupeLength and MatchLength accordingly) and for any non-repeating bytes
// after the ends thereof which match (updating NumFollowingBytes accordingly).
//
// The data for the comparison is read from dataSource. Specifically, the
// chunkSize bytes immediately following DupeEnd() and MatchEnd() will be
// read, as well as the PatternLength bytes immediately preceeding each of those
// offsets, and this candidate will only be modified if enough data was
// successfully retrieved to fully determine the extent of the duplicate.
func (rcd *RepeatingCandidateDupe) CheckFollowingBytes(dataSource RangeGetter, chunkSize uint64) {
if !rcd.FollowingBytesChecked {
// Determine the extent of the repeats following the end of the match
matchSuccess, numRptngBytsFollMatch := CountMatchingFollowingBytes(
dataSource,
rcd.MatchEnd()-rcd.PatternLength,
rcd.MatchEnd(),
chunkSize,
)
newMatchLength := rcd.MatchLength + numRptngBytsFollMatch
// Determine the extent of the repeats following the end of the dupe
dupeSuccess, numRptngBytsFollDupe := CountMatchingFollowingBytes(
dataSource,
rcd.DupeEnd()-rcd.PatternLength,
rcd.DupeEnd(),
chunkSize,
)
newDupeLength := rcd.DupeLength + numRptngBytsFollDupe
if matchSuccess && dupeSuccess {
// Determine how many of the bytes immediately following both repeating
// sections are equal
follSuccess, numFollBytes := CountMatchingFollowingBytes(
dataSource,
rcd.MatchOffset+newMatchLength,
rcd.DupeOffset+newDupeLength,
chunkSize-Max(numRptngBytsFollMatch, numRptngBytsFollDupe),
)
// If all of the above was successful, update the candidate accordingly
if follSuccess {
rcd.MatchLength = newMatchLength
rcd.DupeLength = newDupeLength
rcd.NumFollowingBytes = numFollBytes
rcd.FollowingBytesChecked = true
}
}
}
}
// Return DupeIndexEntrys describing the one or two potentially optimal
// duplicates that this repeating pattern can produce so far.
//
// To be more precise, the duplicates returned will be optimal for this
// candidate, assuming that there are no overlapping entries already present
// in the index. There are rare cases when a repeating candidate can do better
// by extending an existing entry in a way that the duplicates this method
// returns can't.
func (rcd *RepeatingCandidateDupe) BestDupes() []DupeIndexEntry {
// Either line up the starts of the two repeating sections, or the ends;
// anything else will give a shorter duplicate length
if rcd.DupeLength == rcd.MatchLength {
// The two repeating sections are the same length, so we can achieve
// both of the above simultaneously
e := DupeIndexEntry{
DupeOffset: rcd.DupeOffset - rcd.NumPreceedingBytes,
MatchOffset: rcd.MatchOffset - rcd.NumPreceedingBytes,
Length: rcd.DupeLength + rcd.NumPreceedingBytes + rcd.NumFollowingBytes,
}
return []DupeIndexEntry{e}
} else {
repeatingLength := Min(rcd.DupeLength, rcd.MatchLength)
// Line up the starts
e1 := DupeIndexEntry{
DupeOffset: rcd.DupeOffset - rcd.NumPreceedingBytes,
MatchOffset: rcd.MatchOffset - rcd.NumPreceedingBytes,
Length: repeatingLength + rcd.NumPreceedingBytes,
}
dupeRepeatEnd := rcd.DupeOffset + rcd.DupeLength
matchRepeatEnd := rcd.MatchOffset + rcd.MatchLength
if matchRepeatEnd > rcd.DupeOffset {
// This is a self match, so e1 is optimal (and the below would
// give something nonsensical)
return []DupeIndexEntry{e1}
}
if rcd.DupeLength%rcd.PatternLength != rcd.MatchLength%rcd.PatternLength {
// The ends of the the two repeating sections do not correspond to
// the same part of the pattern, so there can be no matching following
// bytes.
shortenMatchBy := ModuloDiff(rcd.MatchLength, rcd.DupeLength, rcd.PatternLength)
repeatingLength := Min(rcd.DupeLength, rcd.MatchLength-shortenMatchBy)
e2 := DupeIndexEntry{
DupeOffset: dupeRepeatEnd - repeatingLength,
MatchOffset: matchRepeatEnd - shortenMatchBy - repeatingLength,
Length: repeatingLength,
}
return []DupeIndexEntry{e1, e2}
} else {
// The ends of the two repeating regions do match each other, so
// we can produce a duplicate by lining up the ends and including
// the following bytes
e2 := DupeIndexEntry{
DupeOffset: dupeRepeatEnd - repeatingLength,
MatchOffset: matchRepeatEnd - repeatingLength,
Length: repeatingLength + rcd.NumFollowingBytes,
}
return []DupeIndexEntry{e1, e2}
}
}
}