diff options
-rw-r--r-- | CHANGELOG.txt | 7 | ||||
-rw-r--r-- | u8c/include/u8c/format.h | 4 | ||||
-rw-r--r-- | u8c/include/u8c/u8c.h | 20 | ||||
-rw-r--r-- | u8c/source/format/decode_utf8.c | 46 | ||||
-rw-r--r-- | u8c/source/format/decode_utf8_length.c | 14 |
5 files changed, 68 insertions, 23 deletions
diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 6ede68a..7091b5a 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,10 @@ +# 26 + +* Update comments +* Fix u8c_decode_utf8_length not validating +* Add attribute u8c_DEPRECATED +* Deprecate u8c_encode_utf16 and u8c_encode_utf16_length as they're untested (this is not permanent) + # 25 * Rename source directory: src => source diff --git a/u8c/include/u8c/format.h b/u8c/include/u8c/format.h index 60a85e9..d019581 100644 --- a/u8c/include/u8c/format.h +++ b/u8c/include/u8c/format.h @@ -42,8 +42,8 @@ u8c_NO_DISCARD u8c_NO_THROW size_t u8c_decode_utf16_length(uint_least16_t const* u8c_NO_THROW size_t u8c_encode_utf8(char* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count); u8c_NO_THROW size_t u8c_decode_utf8(uint_least32_t* u8c_RESTRICT _buffer, char const* u8c_RESTRICT _source, size_t _count); -u8c_NO_THROW size_t u8c_encode_utf16(uint_least16_t* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count); -u8c_NO_THROW size_t u8c_decode_utf16(uint_least32_t* u8c_RESTRICT _buffer, uint_least16_t const* u8c_RESTRICT _source, size_t _count); +u8c_DEPRECATED("utf-16 may not be safe") u8c_NO_THROW size_t u8c_encode_utf16(uint_least16_t* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count); +u8c_DEPRECATED("utf-16 may not be safe") u8c_NO_THROW size_t u8c_decode_utf16(uint_least32_t* u8c_RESTRICT _buffer, uint_least16_t const* u8c_RESTRICT _source, size_t _count); #ifdef __cplusplus } diff --git a/u8c/include/u8c/u8c.h b/u8c/include/u8c/u8c.h index 94791e3..05d9089 100644 --- a/u8c/include/u8c/u8c.h +++ b/u8c/include/u8c/u8c.h @@ -56,35 +56,39 @@ #ifdef __GNUC__ -#define u8c_ALWAYS_INLINE __attribute__ ((__always_inline__)) -#define u8c_NO_DISCARD __attribute__ ((__warn_unused_result__)) -#define u8c_NO_THROW __attribute__ ((__nothrow__)) -#define u8c_UNSEQUENCED __attribute__ ((__const__)) +#define u8c_ALWAYS_INLINE __attribute__ 
((__always_inline__)) +#define u8c_DEPRECATED(_m) __attribute__ ((__deprecated__((_m)))) +#define u8c_NO_DISCARD __attribute__ ((__warn_unused_result__)) +#define u8c_NO_THROW __attribute__ ((__nothrow__)) +#define u8c_UNSEQUENCED __attribute__ ((__const__)) #elif __STDC_VERSION__ >= 202311 #define u8c_ALWAYS_INLINE -#define u8c_NO_DISCARD [[nodiscard]] +#define u8c_DEPRECATED(_m) [[deprecated((_m))]] +#define u8c_NO_DISCARD [[nodiscard]] #define u8c_NO_THROW -#define u8c_UNSEQUENCED [[unsequenced]] +#define u8c_UNSEQUENCED [[unsequenced]] #elif __cplusplus >= 201703 #define u8c_ALWAYS_INLINE -#define u8c_NO_DISCARD [[nodiscard]] +#define u8c_DEPRECATED(_m) [[deprecated((_m))]] +#define u8c_NO_DISCARD [[nodiscard]] #define u8c_NO_THROW #define u8c_UNSEQUENCED #else #define u8c_ALWAYS_INLINE +#define u8c_DEPRECATED(_m) #define u8c_NO_DISCARD #define u8c_NO_THROW #define u8c_UNSEQUENCED #endif -#define u8c_VERSION ((uint_least32_t)+UINT32_C(0x1D)) +#define u8c_VERSION ((uint_least32_t)+UINT32_C(0x1E)) #define u8c_MAXIMUM_CODE_POINT ((uint_least32_t)+UINT32_C(0x0010FFFF)) diff --git a/u8c/source/format/decode_utf8.c b/u8c/source/format/decode_utf8.c index 999095f..de92eaf 100644 --- a/u8c/source/format/decode_utf8.c +++ b/u8c/source/format/decode_utf8.c @@ -39,7 +39,31 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const uint_least32_t code_point = UINT32_C(0x0); + // For each octet in the input, we assert the + // following: + // + // 1. It has an appropriate value for its position. + // 2. The amount of remaining octets is + // sufficient to fully decode the current + // sequence. + // + // If these predicates are not true, the octet is + // discarded and the replacement character U+FFFD + // written in its place. + // + // If the decoded code point lies outside the + // defined valid range of a UTF-32 value - that is, + // it's a surrogate point or larger than + // U+0010FFFF - it is likewise replaced. 
+ // + // If an octet sequence with an otherwise valid + // initiating octet contains any amount of invalid + // values, it is skipped in its entirety and + // replaced. + if ((octet & UINT32_C(0xF8)) == UINT32_C(0xF0)) { + // Four octets: + if (remaining < 0x3u) { code_point = UINT32_C(0xFFFD); } else { @@ -48,9 +72,9 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const uint_least32_t const octet3 = (uint_least32_t)source[index_in + 0x3]; if ( - (octet1 & 0xC0) != 0x80 - || (octet2 & 0xC0) != 0x80 - || (octet3 & 0xC0) != 0x80 + (octet1 & UINT32_C(0xC0)) != UINT32_C(0x80) + || (octet2 & UINT32_C(0xC0)) != UINT32_C(0x80) + || (octet3 & UINT32_C(0xC0)) != UINT32_C(0x80) ) { code_point = UINT32_C(0xFFFD); } else { @@ -63,6 +87,8 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const index_in += 0x4; } else if ((octet & UINT32_C(0xF0)) == UINT32_C(0xE0)) { + // Three octets: + if (remaining < 0x2u) { code_point = UINT32_C(0xFFFD); } else { @@ -70,8 +96,8 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const uint_least32_t const octet2 = (uint_least32_t)source[index_in + 0x2]; if ( - (octet1 & 0xC0) != 0x80 - || (octet2 & 0xC0) != 0x80 + (octet1 & UINT32_C(0xC0)) != UINT32_C(0x80) + || (octet2 & UINT32_C(0xC0)) != UINT32_C(0x80) ) { code_point = UINT32_C(0xFFFD); } else { @@ -82,13 +108,15 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const } index_in += 0x3; - } else if ((octet & UINT32_C(0xE0)) == 0xC0) { + } else if ((octet & UINT32_C(0xE0)) == UINT32_C(0xC0)) { + // Two octets: + if (remaining < 0x1u) { code_point = UINT32_C(0xFFFD); } else { uint_least32_t const octet1 = (uint_least32_t)source[index_in + 0x1]; - if ((octet1 & 0xC0) != 0x80) { + if ((octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)) { code_point = UINT32_C(0xFFFD); } else { code_point |= (octet ^ UINT32_C(0xC0)) << UINT32_C(0x6); @@ -98,10 +126,14 @@ size_t u8c_decode_utf8(uint_least32_t* 
const restrict buffer, char const* const index_in += 0x2; } else if ((octet & UINT32_C(0x80)) == UINT32_C(0x0)) { + // One octet: + code_point |= octet; ++index_in; } else { + // Invalid: + code_point = UINT32_C(0xFFFD); ++index_in; diff --git a/u8c/source/format/decode_utf8_length.c b/u8c/source/format/decode_utf8_length.c index 09d27a4..1259ab8 100644 --- a/u8c/source/format/decode_utf8_length.c +++ b/u8c/source/format/decode_utf8_length.c @@ -34,13 +34,15 @@ size_t u8c_decode_utf8_length(char const* const restrict _source, size_t const c for (ptrdiff_t index = 0x0; index < (ptrdiff_t)count; ++length) { char unsigned const octet = source[index]; - if (octet >= 0xF0u) { - index += 0x4u; - } else if (octet >= 0xE0u) { - index += 0x3u; - } else if (octet >= 0xC0u) { - index += 0x2u; + if ((octet & UINT32_C(0xF8)) == UINT32_C(0xF0)) { + index += 0x4; + } else if ((octet & UINT32_C(0xF0)) == UINT32_C(0xE0)) { + index += 0x3; + } else if ((octet & UINT32_C(0xE0)) == UINT32_C(0xC0)) { + index += 0x2; } else { + // Valid or not, this is decoded as a single code + // point. ++index; } } |