Skip to content

Commit ca90ba0

Browse files
committed
Merge branch 'master' into release/2.x
2 parents eeec80f + 800e212 commit ca90ba0

File tree

17 files changed

+19631
-18411
lines changed

17 files changed

+19631
-18411
lines changed

.github/workflows/test.yml

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ on:
1212
- 'LICENSE'
1313
- 'doc/**'
1414
- 'tools/**'
15+
workflow_dispatch:
1516

1617
jobs:
1718
define-matrix:
@@ -32,7 +33,8 @@ jobs:
3233
3334
macos_map = {
3435
'macos-14': "15.4",
35-
'macos-15': "16.2"
36+
'macos-15': "16.4",
37+
'macos-26': "26.2"
3638
}
3739
3840
win_paltforms = ["x64", "Win32"]
@@ -60,7 +62,8 @@ jobs:
6062
}
6163
emscripten_versions = [
6264
'3.1.74',
63-
'4.0.15',
65+
'4.0.22',
66+
'5.0.0'
6467
]
6568
6669
hosts = []
@@ -177,6 +180,12 @@ jobs:
177180
shell: bash
178181
run: |
179182
cmake $CMAKE_GENERATOR -S . -B out $CMAKE_ARGS -DCMAKE_BUILD_TYPE=MinSizeRel
183+
if [[ '${{ matrix.os }}' == windows-* ]]; then
184+
if [ -f out/test/pythonloc.txt ]; then
185+
pythloc=$(cat out/test/pythonloc.txt)
186+
echo "$pythloc" >> $GITHUB_PATH
187+
fi
188+
fi
180189
181190
- name: Build and Test
182191
shell: bash
@@ -337,15 +346,15 @@ jobs:
337346
fail-fast: false
338347
matrix:
339348
python-version: [
340-
"3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t",
349+
"3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14",
341350
"pypy-3.9", "pypy-3.10", "pypy-3.11"
342351
]
343352
steps:
344353
- name: Checkout
345354
uses: actions/checkout@v4
346355

347356
- name: Set up Python
348-
uses: actions/setup-python@v5
357+
uses: actions/setup-python@v6
349358
id: setup-python
350359
with:
351360
python-version: ${{matrix.python-version}}

CHANGELOG.md

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,138 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55

66
## Unreleased
77

8+
### Added
9+
- Support for Unicode 17
10+
11+
### Fixed
12+
- Regression in ICU 77+ StringByteSink causing memory corruption
13+
- Bogus warnings on newer clang-cl
14+
- Debug assertions when using Python strings with Python 3.14
15+
16+
## [3.7] - 2025-10-14
17+
18+
### Added
19+
- This library now builds and works properly on Haiku OS
20+
21+
## [3.6] - 2025-09-26
22+
23+
### Added
24+
- On Emscripten it is now possible to construct `sys_string` directly from `__externref_t` and convert `sys_string` back to it.
25+
This is faster and usually more ergonomic than going through `Emval`.
26+
- On Emscripten it is now possible to make `sys_string` use [WASM JavaScript Builtins](https://developer.mozilla.org/en-US/docs/WebAssembly/Guides/JavaScript_builtins) (if your WASM platform supports them). This can make construction of `sys_string` from `__externref_t` significantly faster.
27+
Define `SYS_STRING_USE_WASM_JS_STRING` to 1 to enable this functionality.
28+
29+
### Fixed
30+
- This library can now be used with `clang-cl` on Windows
31+
- Bugs in BSTR storage support on 32-bit x86
32+
- Warnings on clang 21
33+
34+
## [3.5] - 2025-07-16
35+
36+
### Fixed
37+
- Incorrect CMake test for Python presence at build time
38+
39+
## [3.4] - 2025-06-27
40+
41+
### Fixed
42+
- Inadvertent pessimization in `compare_no_case`
43+
- Concepts rejecting `Char *` as eligible for addition to `sys_string`
44+
45+
## [3.3] - 2025-05-12
46+
47+
### Fixed
48+
49+
- CMake build now correctly handles when Python detected on the build machine is too old to be used
50+
- Tests now run correctly with PyPy Python installation
51+
52+
## [3.2] - 2025-02-10
53+
54+
### Added
55+
56+
- `sys_string` can now be `normalize()`-ed to NFC and NFD.
57+
- It is now possible to use ICU instead of internal data/code for case conversion, grapheme
58+
iteration and normalization. This makes those operations a tiny bit slower but saves
59+
~100kB in a final executable if you use all of them.
60+
61+
### Changed
62+
63+
- This library is now header only
64+
- Speed improvements to case conversions, case insensitive comparisons and grapheme iteration.
65+
66+
### Fixed
67+
68+
- Addressed some false positive sanitizer warnings.
69+
70+
## [3.1] - 2025-01-10
71+
72+
### Added
73+
74+
- `grapheme_view` and `graphemes` adapter which provide ability to iterate over grapheme clusters in `sys_string` and any UTF range.
75+
76+
### Changed
77+
78+
- Unicode data has been optimized for better size/speed balance
79+
- `sys_string_t::hash_type` has been changed from `unsigned` to `size_t` on some platforms.
80+
81+
### Fixed
82+
83+
- Invalid character access in unicode mappings.
84+
- Crash when sys_string_builder is re-used after `build()` on Apple and Python platforms.
85+
- `utf_ref_view` and `utf_owning_view` now actually work with forward and input underlying ranges
86+
- MSVC warnings when using `std::hash<sys_string>`
87+
88+
## [3.0] - 2024-12-02
89+
90+
This is a major release with some breaking changes
91+
92+
### Changed
93+
94+
- C++20 or higher is now required for compilation. In particular, the following C++20 features must be available:
95+
- Ranges support in standard library (`__cpp_lib_ranges >= 201911`)
96+
- Three-way comparison (spaceship operator)
97+
- `char8_t` type
98+
- `std::endian` support in standard library (`__cpp_lib_endian >= 201907`)
99+
- Minimal compilers known to work include: GCC 12, Clang 16, Apple Clang 15.4 and MSVC 17.6.
100+
- The library has been _range_-ified.
101+
- All methods that used to accept iterator pairs now take iterator/sentinel pairs.
102+
- All these methods now also have overloads that accept ranges
103+
- Existing informal ranges (`sys_string::char_access`, `sys_string::utf_view`, etc.) are now
104+
formal ranges or views.
105+
- As part of the above `sys_string::utfX_view` classes have been renamed to `sys_string::utfX_access` (because they are
106+
not formally views as defined by standard library). The old names have been retained for compatibility but annotated
107+
as deprecated. Note that `sys_string_builder::utf_view` remains under the same name since it *is* a view.
108+
- Breaking change: as part of the above change the `sys_string::utf_access` and `sys_string_builder::utf_view` now
109+
return distinct iterators and sentinels (that is they no longer satisfy `std::ranges::common_range` concept).
110+
You will need to use ranges algorithms with their iterators (e.g. `std::ranges::for_each` rather than `std::for_each`).
111+
- The global `utf_view` template has been split into two: `utf_ref_view` that takes underlying range by reference (similar
112+
to `std::ref_view`) and `utf_owning_view` that owns a movable underlying range (similar to `std::owning_view`). These
113+
are automatically produced by `as_utf` range adapter closures (see below in Added section)
114+
- Breaking change: the non-standard `Cursor` classes have been removed.
115+
- The library has been _concept_-ified.
116+
- Most templated library calls now have concepts checks that validate their argument types.
117+
- Primitive `std::enable_if` used before have been subsumed by these and removed.
118+
- Unicode data used for case folding and whitespace detection has been updated to version 16.0.0
119+
120+
### Added
121+
- `sys_string_t` can now be `+`-ed with any forward range of any type of character (including C strings and std::string).
122+
This results in the same optimized addition as when adding `sys_string_t` objects.
123+
- `sys_string_t` objects can now be formatted via `std::format` (if available in your library). On platforms
124+
where `wchar_t` is UTF-16 or UTF-32 you can also use wide character formatting.
125+
- `sys_string_t::std_format` method. This formats a new `sys_string_t` (similar to the existing `sys_string_t::format`)
126+
but uses `std::format` machinery and formatting string syntax.
127+
- Range adapter closures: `as_utf8`, `as_utf16`, `as_utf32` and generic `as_utf<encoding>`.
128+
- These can be used to create `utf_ref_view`/`utf_owning_view` from any range/view. For example `as_utf16(std::string("abc"))`
129+
- If your library supports custom adapter closures (usually `__cpp_lib_ranges >= 202202L`) they can be used in
130+
view pipelines like `std::string("abc") | as_utf16 | std::views::take(2)` etc.
131+
132+
### Fixed
133+
- Printing `sys_string_t` objects into `std::ostream` (and `std::wostream` if available) now functions correctly in presence
134+
of stream formatting flags. Flags are currently ignored. This might change in a future version.
135+
- Printing/formatting `sys_string_t` objects that use `char` storage type now does not perform sanitizing transcoding. The content
136+
of the string is printed as-is. This allows faithful round-tripping and support for invalid Unicode for those scenarios. Similar
137+
behavior applies to `wchar_t` on platforms where it is UTF-16 or UTF-32.
138+
- `operator<<` no longer pollutes global namespace
139+
8140
## [2.22] - 2025-10-14
9141

10142
## Added
@@ -226,3 +358,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
226358
[2.20]: https://github.com/gershnik/sys_string/releases/v2.20
227359
[2.21]: https://github.com/gershnik/sys_string/releases/v2.21
228360
[2.22]: https://github.com/gershnik/sys_string/releases/v2.22
361+
[3.0]: https://github.com/gershnik/sys_string/releases/v3.0
362+
[3.1]: https://github.com/gershnik/sys_string/releases/v3.1
363+
[3.2]: https://github.com/gershnik/sys_string/releases/v3.2
364+
[3.3]: https://github.com/gershnik/sys_string/releases/v3.3
365+
[3.4]: https://github.com/gershnik/sys_string/releases/v3.4
366+
[3.5]: https://github.com/gershnik/sys_string/releases/v3.5
367+
[3.6]: https://github.com/gershnik/sys_string/releases/v3.6
368+
[3.7]: https://github.com/gershnik/sys_string/releases/v3.7

lib/inc/sys_string/impl/platforms/python_any.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,18 @@ namespace sysstr::util
460460
obj.ready = value;
461461
}
462462

463+
template <typename T, typename = int>
464+
struct HasStaticallyAllocatedMember : std::false_type { };
465+
466+
template <typename T>
467+
struct HasStaticallyAllocatedMember <T, decltype((void)T::statically_allocated, 0)> : std::true_type { };
468+
469+
template<class T>
470+
constexpr void gccIsAPieceOfShit_assignStaticallyAllocated(T & obj, int value) {
471+
if constexpr(HasStaticallyAllocatedMember<T>::value)
472+
obj.statically_allocated = value;
473+
}
474+
463475
template<class T>
464476
constexpr void gccIsAPieceOfShit_assignData(T & obj, void * data) {
465477
obj.any = data;
@@ -483,6 +495,7 @@ namespace sysstr::util
483495
gccIsAPieceOfShit_assignHash(this->_base._base); //pypy lacks hash in PyASCIIObject
484496
this->_base._base.state.kind = Kind;
485497
gccIsAPieceOfShit_assignReady(this->_base._base.state, 1);
498+
gccIsAPieceOfShit_assignStaticallyAllocated(this->_base._base.state, 1);
486499
gccIsAPieceOfShit_assignData(this->data, const_cast<void *>(chars));
487500
if constexpr (Kind == PyUnicode_1BYTE_KIND)
488501
{

lib/inc/sys_string/impl/platforms/windows_bstr.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ namespace sysstr::util
145145
}
146146
data(const data & src) noexcept
147147
{
148-
memcpy(this, &src, sizeof(data));
148+
memcpy((void*)this, &src, sizeof(data));
149149
if (flags.value == dynamic_flag && dynamic_data.ptr)
150150
{
151151
dynamic_data.ptr = dynamic_bstr::allocate(src.dynamic_data.ptr->size());
@@ -154,7 +154,7 @@ namespace sysstr::util
154154
}
155155
data(data && src) noexcept
156156
{
157-
memcpy(this, &src, sizeof(data));
157+
memcpy((void*)this, &src, sizeof(data));
158158
src.init();
159159
}
160160
data & operator=(const data & rhs) noexcept
@@ -172,9 +172,9 @@ namespace sysstr::util
172172
void swap(data & other) noexcept
173173
{
174174
data temp;
175-
memcpy(&temp, &other, sizeof(data));
176-
memcpy(&other, this, sizeof(data));
177-
memcpy(this, &temp, sizeof(m_data));
175+
memcpy((void*)&temp, &other, sizeof(data));
176+
memcpy((void*)&other, this, sizeof(data));
177+
memcpy((void*)this, &temp, sizeof(m_data));
178178
}
179179

180180
void init() noexcept

lib/inc/sys_string/impl/unicode/mappings.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,11 @@ namespace sysstr::util::unicode
5151

5252
static const std::array<uint8_t, 2048> stage1;
5353

54-
static const std::array<std::array<uint16_t, 32>, 64> stage2;
54+
static const std::array<std::array<uint16_t, 32>, 65> stage2;
5555

56-
static const std::array<std::array<uint16_t, 16>, 359> stage3;
56+
static const std::array<std::array<uint16_t, 16>, 366> stage3;
5757

58-
static const std::array<uint32_t, 754> stage4;
58+
static const std::array<uint32_t, 752> stage4;
5959

6060
public:
6161
using value = decltype(stage4)::value_type;
@@ -84,7 +84,7 @@ namespace sysstr::util::unicode
8484
};
8585

8686

87-
static const char16_t cased_data[3940];
87+
static const char16_t cased_data[4046];
8888

8989
public:
9090
static constexpr size_t data_size = lookup::data_size + sizeof(cased_data);
@@ -189,9 +189,9 @@ namespace sysstr::util::unicode
189189

190190
static const std::array<uint8_t, 2048> stage1;
191191

192-
static const std::array<std::array<uint16_t, 32>, 54> stage2;
192+
static const std::array<std::array<uint16_t, 32>, 55> stage2;
193193

194-
static const std::array<std::array<uint16_t, 16>, 308> stage3;
194+
static const std::array<std::array<uint16_t, 16>, 312> stage3;
195195

196196
static const std::array<uint16_t, 498> stage4;
197197

0 commit comments

Comments
 (0)