aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Lars Henriksen <LarsHenriksen@get2net.dk> 2017-11-29 22:19:10 +0100
committer: Lukas Fleischer <lfleischer@calcurse.org> 2017-12-07 09:02:58 +0100
commit: 95c5d576fafa2f705e6562f57bab9a9d583c8776 (patch)
tree: 475b7ef3fb9831a094aec2eb562bef38a94be0f2
parent: edc44d613bdc57566a48ea855af86a9df0b3d13d (diff)
download: calcurse-95c5d576fafa2f705e6562f57bab9a9d583c8776.tar.gz
download: calcurse-95c5d576fafa2f705e6562f57bab9a9d583c8776.zip
Update UTF-8 base code
UTF-8 encodes characters in one to four bytes (since 2003). Because 0 is a valid code point, the decode function utf8_ord() should return -1, not 0, on error. As a consequence, utf8_width() should return 0 for a continuation byte (as it did previously).

Signed-off-by: Lukas Fleischer <lfleischer@calcurse.org>
-rw-r--r-- src/calcurse.h 9
-rw-r--r-- src/utf8.c 25
2 files changed, 11 insertions, 23 deletions
diff --git a/src/calcurse.h b/src/calcurse.h
index f4f0e6c..5bf32cf 100644
--- a/src/calcurse.h
+++ b/src/calcurse.h
@@ -225,13 +225,10 @@
#define TOSTRING(x) STRINGIFY(x)
#define __FILE_POS__ __FILE__ ":" TOSTRING(__LINE__)
-#define UTF8_MAXLEN 6
-#define UTF8_LENGTH(ch) ((unsigned char)ch >= 0xFC ? 6 : \
- ((unsigned char)ch >= 0xF8 ? 5 : \
- ((unsigned char)ch >= 0xF0 ? 4 : \
+#define UTF8_MAXLEN 4
+#define UTF8_LENGTH(ch) ((unsigned char)ch >= 0xF0 ? 4 : \
((unsigned char)ch >= 0xE0 ? 3 : \
- ((unsigned char)ch >= 0xC0 ? 2 : 1)))))
-#define UTF8_ISMULTI(ch) ((unsigned char)ch >= 0x80)
+ ((unsigned char)ch >= 0xC0 ? 2 : 1)))
#define UTF8_ISCONT(ch) ((unsigned char)ch >= 0x80 && \
(unsigned char)ch <= 0xBF)
diff --git a/src/utf8.c b/src/utf8.c
index 47d83dc..e7754ae 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -269,11 +269,11 @@ static const struct utf8_range utf8_widthtab[] = {
{0xe0100, 0xe01ef, 0}
};
-/* Decode a UTF-8 code point. */
+/* Decode a UTF-8 encoded character. Return the Unicode code point. */
int utf8_ord(const char *s)
{
if (UTF8_ISCONT(*s))
- return 0;
+ return -1;
switch (UTF8_LENGTH(*s)) {
case 1:
@@ -285,17 +285,9 @@ int utf8_ord(const char *s)
(s[0] & 0x0f) << 12;
case 4:
return (((s[3] & 0x3f) | (s[2] & 0x3f) << 6) |
- (s[1] & 0x3f) << 12) | (s[0] & 0x3f) << 18;
- case 5:
- return ((((s[4] & 0x3f) | (s[3] & 0x3f) << 6) |
- (s[2] & 0x3f) << 12) | (s[1] & 0x3f) << 18) |
- (s[0] & 0x3f) << 24;
- case 6:
- return (((((s[5] & 0x3f) | (s[4] & 0x3f) << 6) |
- (s[3] & 0x3f) << 12) | (s[2] & 0x3f) << 18) |
- (s[1] & 0x3f) << 24) | (s[0] & 0x3f) << 30;
+ (s[1] & 0x3f) << 12) | (s[0] & 0x7) << 18;
default:
- return 0;
+ return -1;
}
}
@@ -304,6 +296,8 @@ int utf8_width(char *s)
{
int val, low, high, cur;
+ if (UTF8_ISCONT(*s))
+ return 0;
val = utf8_ord(s);
low = 0;
high = ARRAY_SIZE(utf8_widthtab);
@@ -328,11 +322,8 @@ int utf8_strwidth(char *s)
{
int width = 0;
- for (; s && *s; s++) {
- if (!UTF8_ISCONT(*s))
- width += utf8_width(s);
- }
-
+ for (; *s; s++)
+ width += utf8_width(s);
return width;
}