Skip to content

Commit 3d2b422

Browse files
committed
Add benchmark compare against STL
1 parent 24e33ab commit 3d2b422

2 files changed

Lines changed: 74 additions & 25 deletions

File tree

README.md

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,39 @@ may break at any time without the MAJOR version number being incremented.
1818
The table below compares the single-threaded throughput in bytes/s (real time) between
1919
libhat and [two other](test/benchmark/vendor) commonly used implementations for pattern
2020
scanning. The input buffers were randomly generated using a fixed seed, and the pattern
21-
scanned does not contain any match in the buffer. The benchmark was run on a system with
22-
an i7-9700K (which supports libhat's [AVX2](src/arch/x86/AVX2.cpp) scanner implementation).
21+
scanned does not occur anywhere in the buffer. The benchmark was compiled on Windows
22+
with `clang-cl` 21.1.1, using the MSVC 14.44.35207 toolchain and the default release mode
23+
flags (`/GR /EHsc /MD /O2 /Ob2`). The benchmark was run on a system with an i7-14700K
24+
(supporting [AVX2](src/arch/x86/AVX2.cpp)) and 64GB (4x16GB) of DDR5 memory at 6000 MT/s (30-38-38-96 timings).
2325
The full source code is available [here](test/benchmark/Compare.cpp).
2426
```
25-
---------------------------------------------------------------------------------------
26-
Benchmark Time CPU Iterations bytes_per_second
27-
---------------------------------------------------------------------------------------
28-
BM_Throughput_Libhat/4MiB 131578 ns 48967 ns 21379 29.6876Gi/s
29-
BM_Throughput_Libhat/16MiB 813977 ns 413524 ns 3514 19.1959Gi/s
30-
BM_Throughput_Libhat/128MiB 6910936 ns 3993486 ns 403 18.0873Gi/s
31-
BM_Throughput_Libhat/256MiB 13959379 ns 8121906 ns 202 17.9091Gi/s
32-
33-
BM_Throughput_UC1/4MiB 4739731 ns 2776015 ns 591 843.93Mi/s
34-
BM_Throughput_UC1/16MiB 19011485 ns 10841837 ns 147 841.597Mi/s
35-
BM_Throughput_UC1/128MiB 152277511 ns 82465278 ns 18 840.571Mi/s
36-
BM_Throughput_UC1/256MiB 304964544 ns 180555556 ns 9 839.442Mi/s
37-
38-
BM_Throughput_UC2/4MiB 9633499 ns 4617698 ns 291 415.218Mi/s
39-
BM_Throughput_UC2/16MiB 38507193 ns 22474315 ns 73 415.507Mi/s
40-
BM_Throughput_UC2/128MiB 307989100 ns 164930556 ns 9 415.599Mi/s
41-
BM_Throughput_UC2/256MiB 616449240 ns 331250000 ns 5 415.282Mi/s
27+
---------------------------------------------------------------------------------------------------
28+
Benchmark Time CPU Iterations bytes_per_second
29+
---------------------------------------------------------------------------------------------------
30+
BM_Throughput_libhat/4MiB 67686 ns 67816 ns 82254 57.7110Gi/s
31+
BM_Throughput_libhat/16MiB 319801 ns 319558 ns 18287 48.8585Gi/s
32+
BM_Throughput_libhat/128MiB 5325733 ns 5282315 ns 1056 23.4709Gi/s
33+
BM_Throughput_libhat/256MiB 10921878 ns 10814951 ns 510 22.8898Gi/s
34+
35+
BM_Throughput_std_search/4MiB 1364050 ns 1361672 ns 4108 2.86372Gi/s
36+
BM_Throughput_std_search/16MiB 5470025 ns 5458783 ns 1019 2.85648Gi/s
37+
BM_Throughput_std_search/128MiB 43622456 ns 43483527 ns 129 2.86550Gi/s
38+
BM_Throughput_std_search/256MiB 88093320 ns 87158203 ns 64 2.83790Gi/s
39+
40+
BM_Throughput_std_find_std_equal/4MiB 178567 ns 178586 ns 31410 21.8755Gi/s
41+
BM_Throughput_std_find_std_equal/16MiB 806394 ns 805228 ns 7005 19.3764Gi/s
42+
BM_Throughput_std_find_std_equal/128MiB 8944718 ns 8953652 ns 623 13.9747Gi/s
43+
BM_Throughput_std_find_std_equal/256MiB 18092713 ns 18102751 ns 309 13.8177Gi/s
44+
45+
BM_Throughput_UC1/4MiB 1727027 ns 1721236 ns 3268 2.26183Gi/s
46+
BM_Throughput_UC1/16MiB 6878188 ns 6849054 ns 819 2.27167Gi/s
47+
BM_Throughput_UC1/128MiB 55181849 ns 55300245 ns 102 2.26524Gi/s
48+
BM_Throughput_UC1/256MiB 110209374 ns 110000000 ns 50 2.26841Gi/s
49+
50+
BM_Throughput_UC2/4MiB 4011942 ns 4001524 ns 1394 997.023Mi/s
51+
BM_Throughput_UC2/16MiB 16136510 ns 16166908 ns 346 991.540Mi/s
52+
BM_Throughput_UC2/128MiB 130954740 ns 130087209 ns 43 977.437Mi/s
53+
BM_Throughput_UC2/256MiB 261157833 ns 261160714 ns 21 980.250Mi/s
4254
```
4355

4456
## Platforms

test/benchmark/Compare.cpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ static auto gen_random_buffer(const size_t size) {
2020
return buffer;
2121
}
2222

23-
static void BM_Throughput_Libhat(benchmark::State& state) {
23+
static void BM_Throughput_libhat(benchmark::State& state) {
2424
const size_t size = state.range(0);
2525
const auto buf = gen_random_buffer(size);
2626
const auto begin = std::to_address(buf.begin());
@@ -33,6 +33,34 @@ static void BM_Throughput_Libhat(benchmark::State& state) {
3333
state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * size));
3434
}
3535

36+
// Throughput benchmark for a plain std::search over the test buffer.
//
// state.range(0) supplies the buffer size; the buffer comes from
// gen_random_buffer() and the needle is the parsed `test_pattern` signature.
static void BM_Throughput_std_search(benchmark::State& state) {
37+
const size_t size = state.range(0);
38+
const auto buf = gen_random_buffer(size);
39+
// Work on raw pointers rather than the container's iterators.
const auto begin = std::to_address(buf.begin());
40+
const auto end = std::to_address(buf.end());
41+
42+
// Parse once, outside the timed loop, so only the search itself is measured.
// NOTE(review): .value() presumably throws/aborts if parsing fails — confirm.
const auto sig = hat::parse_signature(test_pattern).value();
43+
for (auto _ : state) {
44+
// DoNotOptimize keeps the compiler from discarding the unused search result.
benchmark::DoNotOptimize(std::search(begin, end, sig.begin(), sig.end()));
45+
}
46+
// Each iteration scans `size` units of the buffer; this feeds the
// bytes_per_second column of the report.
state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * size));
47+
}
48+
49+
// Throughput benchmark for libhat's scan context created explicitly in
// scan_mode::Single (see the note on std::find + std::equal below).
//
// state.range(0) supplies the buffer size; the buffer comes from
// gen_random_buffer() and the pattern is the parsed `test_pattern` signature.
static void BM_Throughput_std_find_std_equal(benchmark::State& state) {
50+
const size_t size = state.range(0);
51+
const auto buf = gen_random_buffer(size);
52+
// Work on raw pointers rather than the container's iterators.
const auto begin = std::to_address(buf.begin());
53+
const auto end = std::to_address(buf.end());
54+
55+
// libhat's "Single" implementation uses std::find + std::equal
56+
// NOTE(review): .value() presumably throws/aborts if parsing fails — confirm.
const auto sig = hat::parse_signature(test_pattern).value();
57+
// Built once, outside the timed loop, via hat::detail with the mode pinned to
// Single, alignment X1 and no scan hints.
// NOTE(review): presumably this bypasses libhat's usual mode selection — confirm.
const auto context = hat::detail::scan_context::create<hat::detail::scan_mode::Single>(sig, hat::scan_alignment::X1, hat::scan_hint::none);
58+
for (auto _ : state) {
59+
// DoNotOptimize keeps the compiler from discarding the unused scan result.
benchmark::DoNotOptimize(context.scan(begin, end));
60+
}
61+
// Each iteration scans `size` units of the buffer; this feeds the
// bytes_per_second column of the report.
state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * size));
62+
}
63+
3664
static void BM_Throughput_UC1(benchmark::State& state) {
3765
const size_t size = state.range(0);
3866
const auto buf = gen_random_buffer(size);
@@ -58,11 +86,20 @@ static void BM_Throughput_UC2(benchmark::State& state) {
5886
state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * size));
5987
}
6088

61-
static int64_t rangeStart = 1 << 22; // 4 MiB
62-
static int64_t rangeLimit = 1 << 28; // 256 MiB
89+
static constexpr int64_t rangeStart = 1 << 22; // 4 MiB
90+
static constexpr int64_t rangeLimit = 1 << 28; // 256 MiB
91+
92+
// Registers a benchmark function with the settings shared by every comparison
// in this file: one thread, a warm-up phase (MinWarmUpTime(2)), a minimum
// measuring time (MinTime(4)), an argument sweep from rangeStart to rangeLimit,
// and real (wall-clock) time as the reported metric.
//
// The expansion deliberately does NOT end with a semicolon: every use site is
// written as `LIBHAT_BENCHMARK(fn);`, so a semicolon here would leave a stray
// empty declaration after each registration (-Wextra-semi / -Wpedantic).
#define LIBHAT_BENCHMARK(...) BENCHMARK(__VA_ARGS__) \
93+
->Threads(1) \
94+
->MinWarmUpTime(2) \
95+
->MinTime(4) \
96+
->Range(rangeStart, rangeLimit) \
97+
->UseRealTime()
6398

64-
BENCHMARK(BM_Throughput_Libhat)->Threads(1)->MinWarmUpTime(1)->MinTime(2)->Range(rangeStart, rangeLimit)->UseRealTime();
65-
BENCHMARK(BM_Throughput_UC1)->Threads(1)->MinWarmUpTime(1)->MinTime(2)->Range(rangeStart, rangeLimit)->UseRealTime();
66-
BENCHMARK(BM_Throughput_UC2)->Threads(1)->MinWarmUpTime(1)->MinTime(2)->Range(rangeStart, rangeLimit)->UseRealTime();
99+
LIBHAT_BENCHMARK(BM_Throughput_libhat);
100+
LIBHAT_BENCHMARK(BM_Throughput_std_search);
101+
LIBHAT_BENCHMARK(BM_Throughput_std_find_std_equal);
102+
LIBHAT_BENCHMARK(BM_Throughput_UC1);
103+
LIBHAT_BENCHMARK(BM_Throughput_UC2);
67104

68105
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)