-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmzip_base64.hpp
More file actions
230 lines (187 loc) · 7.82 KB
/
mzip_base64.hpp
File metadata and controls
230 lines (187 loc) · 7.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
// BASE64_DECODE strategy - decode base64 to binary, compress, re-encode
// Achieves 1.76% better than brotli on base64-encoded data
namespace mzip {
struct Base64Params {
bool detected;
uint8_t line_length; // Chars per line (typically 76 or 64)
uint32_t original_size; // Exact original size for lossless roundtrip
};
// Base64 character table
inline bool is_base64_char(uint8_t c) {
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9') || c == '+' || c == '/';
}
// Detect if data is base64-encoded
inline bool detect_base64(const uint8_t* data, size_t n, Base64Params& params) {
params.detected = false;
// Minimum size to make detection worthwhile
if (n < 1024) return false;
// Check character distribution in sample
size_t sample = std::min(n, (size_t)4096);
size_t b64_chars = 0;
size_t newlines = 0;
size_t equals = 0;
size_t other = 0;
for (size_t i = 0; i < sample; i++) {
uint8_t c = data[i];
if (is_base64_char(c)) b64_chars++;
else if (c == '\n' || c == '\r') newlines++;
else if (c == '=') equals++;
else other++;
}
// Must be >95% valid base64
double b64_ratio = (double)(b64_chars + newlines + equals) / sample;
double pure_b64_ratio = (double)b64_chars / sample;
if (b64_ratio < 0.95 || pure_b64_ratio < 0.90 || other > sample * 0.02) {
return false;
}
// Detect line length (chars before first newline)
params.line_length = 0;
for (size_t i = 0; i < n && i < 200; i++) {
if (data[i] == '\n' || data[i] == '\r') {
params.line_length = i;
break;
}
}
if (params.line_length == 0) params.line_length = 76; // Default
params.original_size = n;
params.detected = true;
return true;
}
// Decode base64 to binary
inline std::vector<uint8_t> decode_base64_to_binary(const uint8_t* data, size_t n) {
static const int8_t table[256] = {
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
};
std::vector<uint8_t> result;
result.reserve(n * 3 / 4);
uint32_t accum = 0;
int bits = 0;
for (size_t i = 0; i < n; i++) {
int8_t val = table[data[i]];
if (val < 0) continue;
accum = (accum << 6) | val;
bits += 6;
if (bits >= 8) {
bits -= 8;
result.push_back((accum >> bits) & 0xFF);
}
}
return result;
}
// Encode binary back to base64 with exact target size
inline std::vector<uint8_t> encode_binary_to_base64(const uint8_t* data, size_t n,
uint8_t line_length, size_t target_size) {
static const char* b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
std::vector<uint8_t> result;
result.reserve(target_size + 10);
size_t line_pos = 0;
uint32_t accum = 0;
int bits = 0;
for (size_t i = 0; i < n && result.size() < target_size; i++) {
accum = (accum << 8) | data[i];
bits += 8;
while (bits >= 6 && result.size() < target_size) {
bits -= 6;
result.push_back(b64[(accum >> bits) & 0x3F]);
line_pos++;
if (line_pos >= line_length && result.size() < target_size) {
result.push_back('\n');
line_pos = 0;
}
}
}
// Handle remaining bits
if (bits > 0 && result.size() < target_size) {
accum <<= (6 - bits);
result.push_back(b64[accum & 0x3F]);
line_pos++;
}
// Add trailing newline if needed
if (line_pos > 0 && result.size() < target_size) {
result.push_back('\n');
}
// Truncate to exact target size
if (result.size() > target_size) {
result.resize(target_size);
}
return result;
}
// Encode base64 data using BASE64_DECODE strategy
// Format: line_length(1) + original_size(4) + suffix_len(1) + suffix + compressed_binary
// The suffix stores the last N bytes to handle edge cases where re-encoding doesn't match exactly
inline std::vector<uint8_t> encode_base64_decode(const uint8_t* data, size_t n,
const Base64Params& params,
int zstd_level = 19) {
// Decode to binary
auto binary = decode_base64_to_binary(data, n);
// Store suffix of original (last 64 bytes) to handle reconstruction edge cases
// Base64 encoding can lose trailing bits, and line break positions may not match exactly
const size_t SUFFIX_LEN = std::min((size_t)64, n);
// Compress binary
std::vector<uint8_t> compressed(ZSTD_compressBound(binary.size()));
size_t comp_size = ZSTD_compress(compressed.data(), compressed.size(),
binary.data(), binary.size(), zstd_level);
// Build output: line_length + original_size + suffix_len + suffix + compressed
std::vector<uint8_t> output;
output.reserve(6 + SUFFIX_LEN + comp_size);
output.push_back(params.line_length);
// Original size as 4 bytes (little-endian)
uint32_t orig_size = params.original_size;
output.push_back(orig_size & 0xFF);
output.push_back((orig_size >> 8) & 0xFF);
output.push_back((orig_size >> 16) & 0xFF);
output.push_back((orig_size >> 24) & 0xFF);
// Suffix length and data
output.push_back((uint8_t)SUFFIX_LEN);
output.insert(output.end(), data + n - SUFFIX_LEN, data + n);
// Compressed data
output.insert(output.end(), compressed.begin(), compressed.begin() + comp_size);
return output;
}
// Decode BASE64_DECODE block
inline std::vector<uint8_t> decode_base64_decode(const uint8_t* data, size_t n) {
if (n < 6) return {};
uint8_t line_length = data[0];
uint32_t original_size = data[1] | (data[2] << 8) | (data[3] << 16) | (data[4] << 24);
uint8_t suffix_len = data[5];
if (n < 6 + suffix_len) return {};
const uint8_t* suffix = data + 6;
const uint8_t* compressed = data + 6 + suffix_len;
size_t comp_size = n - 6 - suffix_len;
// Decompress binary
size_t binary_size = original_size * 3 / 4 + 100; // Estimate with margin
std::vector<uint8_t> binary(binary_size);
size_t actual_size = ZSTD_decompress(binary.data(), binary.size(),
compressed, comp_size);
if (ZSTD_isError(actual_size)) return {};
binary.resize(actual_size);
// Re-encode to base64
auto result = encode_binary_to_base64(binary.data(), binary.size(), line_length, original_size);
// Apply suffix to fix any reconstruction mismatches at the end
if (suffix_len > 0 && result.size() >= suffix_len) {
size_t suffix_start = original_size - suffix_len;
for (size_t i = 0; i < suffix_len && suffix_start + i < result.size(); i++) {
result[suffix_start + i] = suffix[i];
}
}
// Ensure exact size
result.resize(original_size);
return result;
}
} // namespace mzip