summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG.txt7
-rw-r--r--u8c/include/u8c/format.h4
-rw-r--r--u8c/include/u8c/u8c.h20
-rw-r--r--u8c/source/format/decode_utf8.c46
-rw-r--r--u8c/source/format/decode_utf8_length.c14
5 files changed, 68 insertions, 23 deletions
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 6ede68a..7091b5a 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,3 +1,10 @@
+# 26
+
+* Update comments
+* Fix u8c_decode_utf8_length not validating
+* Add attribute u8c_DEPRECATED
+* Deprecate u8c_encode_utf16 and u8c_decode_utf16 as they're untested (this is not permanent)
+
# 25
* Rename source directory: src => source
diff --git a/u8c/include/u8c/format.h b/u8c/include/u8c/format.h
index 60a85e9..d019581 100644
--- a/u8c/include/u8c/format.h
+++ b/u8c/include/u8c/format.h
@@ -42,8 +42,8 @@ u8c_NO_DISCARD u8c_NO_THROW size_t u8c_decode_utf16_length(uint_least16_t const*
u8c_NO_THROW size_t u8c_encode_utf8(char* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
u8c_NO_THROW size_t u8c_decode_utf8(uint_least32_t* u8c_RESTRICT _buffer, char const* u8c_RESTRICT _source, size_t _count);
-u8c_NO_THROW size_t u8c_encode_utf16(uint_least16_t* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
-u8c_NO_THROW size_t u8c_decode_utf16(uint_least32_t* u8c_RESTRICT _buffer, uint_least16_t const* u8c_RESTRICT _source, size_t _count);
+u8c_DEPRECATED("utf-16 may not be safe") u8c_NO_THROW size_t u8c_encode_utf16(uint_least16_t* u8c_RESTRICT _buffer, uint_least32_t const* u8c_RESTRICT _source, size_t _count);
+u8c_DEPRECATED("utf-16 may not be safe") u8c_NO_THROW size_t u8c_decode_utf16(uint_least32_t* u8c_RESTRICT _buffer, uint_least16_t const* u8c_RESTRICT _source, size_t _count);
#ifdef __cplusplus
}
diff --git a/u8c/include/u8c/u8c.h b/u8c/include/u8c/u8c.h
index 94791e3..05d9089 100644
--- a/u8c/include/u8c/u8c.h
+++ b/u8c/include/u8c/u8c.h
@@ -56,35 +56,39 @@
#ifdef __GNUC__
-#define u8c_ALWAYS_INLINE __attribute__ ((__always_inline__))
-#define u8c_NO_DISCARD __attribute__ ((__warn_unused_result__))
-#define u8c_NO_THROW __attribute__ ((__nothrow__))
-#define u8c_UNSEQUENCED __attribute__ ((__const__))
+#define u8c_ALWAYS_INLINE __attribute__ ((__always_inline__))
+#define u8c_DEPRECATED(_m) __attribute__ ((__deprecated__((_m))))
+#define u8c_NO_DISCARD __attribute__ ((__warn_unused_result__))
+#define u8c_NO_THROW __attribute__ ((__nothrow__))
+#define u8c_UNSEQUENCED __attribute__ ((__const__))
#elif __STDC_VERSION__ >= 202311
#define u8c_ALWAYS_INLINE
-#define u8c_NO_DISCARD [[nodiscard]]
+#define u8c_DEPRECATED(_m) [[deprecated((_m))]]
+#define u8c_NO_DISCARD [[nodiscard]]
#define u8c_NO_THROW
-#define u8c_UNSEQUENCED [[unsequenced]]
+#define u8c_UNSEQUENCED [[unsequenced]]
#elif __cplusplus >= 201703
#define u8c_ALWAYS_INLINE
-#define u8c_NO_DISCARD [[nodiscard]]
+#define u8c_DEPRECATED(_m) [[deprecated((_m))]]
+#define u8c_NO_DISCARD [[nodiscard]]
#define u8c_NO_THROW
#define u8c_UNSEQUENCED
#else
#define u8c_ALWAYS_INLINE
+#define u8c_DEPRECATED(_m)
#define u8c_NO_DISCARD
#define u8c_NO_THROW
#define u8c_UNSEQUENCED
#endif
-#define u8c_VERSION ((uint_least32_t)+UINT32_C(0x1D))
+#define u8c_VERSION ((uint_least32_t)+UINT32_C(0x1E))
#define u8c_MAXIMUM_CODE_POINT ((uint_least32_t)+UINT32_C(0x0010FFFF))
diff --git a/u8c/source/format/decode_utf8.c b/u8c/source/format/decode_utf8.c
index 999095f..de92eaf 100644
--- a/u8c/source/format/decode_utf8.c
+++ b/u8c/source/format/decode_utf8.c
@@ -39,7 +39,31 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
uint_least32_t code_point = UINT32_C(0x0);
+ // For each octet in the input, we assert the
+ // following:
+ //
+ // 1. It has an appropriate value for its position.
+ // 2. The number of remaining octets is
+ // sufficient to fully decode the current
+ // sequence.
+ //
+ // If these predicates are not true, the octet is
+ // discarded and the replacement character U+FFFD
+ // is written in its place.
+ //
+ // If the decoded code point lies outside the
+ // defined valid range of a UTF-32 value - that is,
+ // it's a surrogate point or larger than
+ // U+0010FFFF - it is likewise replaced.
+ //
+ // If an octet sequence with an otherwise valid
+ // initiating octet contains any number of invalid
+ // values, it is skipped in its entirety and
+ // replaced.
+
if ((octet & UINT32_C(0xF8)) == UINT32_C(0xF0)) {
+ // Four octets:
+
if (remaining < 0x3u) {
code_point = UINT32_C(0xFFFD);
} else {
@@ -48,9 +72,9 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
uint_least32_t const octet3 = (uint_least32_t)source[index_in + 0x3];
if (
- (octet1 & 0xC0) != 0x80
- || (octet2 & 0xC0) != 0x80
- || (octet3 & 0xC0) != 0x80
+ (octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)
+ || (octet2 & UINT32_C(0xC0)) != UINT32_C(0x80)
+ || (octet3 & UINT32_C(0xC0)) != UINT32_C(0x80)
) {
code_point = UINT32_C(0xFFFD);
} else {
@@ -63,6 +87,8 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
index_in += 0x4;
} else if ((octet & UINT32_C(0xF0)) == UINT32_C(0xE0)) {
+ // Three octets:
+
if (remaining < 0x2u) {
code_point = UINT32_C(0xFFFD);
} else {
@@ -70,8 +96,8 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
uint_least32_t const octet2 = (uint_least32_t)source[index_in + 0x2];
if (
- (octet1 & 0xC0) != 0x80
- || (octet2 & 0xC0) != 0x80
+ (octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)
+ || (octet2 & UINT32_C(0xC0)) != UINT32_C(0x80)
) {
code_point = UINT32_C(0xFFFD);
} else {
@@ -82,13 +108,15 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
}
index_in += 0x3;
- } else if ((octet & UINT32_C(0xE0)) == 0xC0) {
+ } else if ((octet & UINT32_C(0xE0)) == UINT32_C(0xC0)) {
+ // Two octets:
+
if (remaining < 0x1u) {
code_point = UINT32_C(0xFFFD);
} else {
uint_least32_t const octet1 = (uint_least32_t)source[index_in + 0x1];
- if ((octet1 & 0xC0) != 0x80) {
+ if ((octet1 & UINT32_C(0xC0)) != UINT32_C(0x80)) {
code_point = UINT32_C(0xFFFD);
} else {
code_point |= (octet ^ UINT32_C(0xC0)) << UINT32_C(0x6);
@@ -98,10 +126,14 @@ size_t u8c_decode_utf8(uint_least32_t* const restrict buffer, char const* const
index_in += 0x2;
} else if ((octet & UINT32_C(0x80)) == UINT32_C(0x0)) {
+ // One octet:
+
code_point |= octet;
++index_in;
} else {
+ // Invalid:
+
code_point = UINT32_C(0xFFFD);
++index_in;
diff --git a/u8c/source/format/decode_utf8_length.c b/u8c/source/format/decode_utf8_length.c
index 09d27a4..1259ab8 100644
--- a/u8c/source/format/decode_utf8_length.c
+++ b/u8c/source/format/decode_utf8_length.c
@@ -34,13 +34,15 @@ size_t u8c_decode_utf8_length(char const* const restrict _source, size_t const c
for (ptrdiff_t index = 0x0; index < (ptrdiff_t)count; ++length) {
char unsigned const octet = source[index];
- if (octet >= 0xF0u) {
- index += 0x4u;
- } else if (octet >= 0xE0u) {
- index += 0x3u;
- } else if (octet >= 0xC0u) {
- index += 0x2u;
+ if ((octet & UINT32_C(0xF8)) == UINT32_C(0xF0)) {
+ index += 0x4;
+ } else if ((octet & UINT32_C(0xF0)) == UINT32_C(0xE0)) {
+ index += 0x3;
+ } else if ((octet & UINT32_C(0xE0)) == UINT32_C(0xC0)) {
+ index += 0x2;
} else {
+ // Valid or not, this is decoded as a single code
+ // point.
++index;
}
}