From 496f0d98f8b1852a2e7721682e792a850ae66483 Mon Sep 17 00:00:00 2001 From: Lukas Fleischer Date: Thu, 9 Jun 2011 18:09:22 +0200 Subject: utf8_width() performance improvements * Sort character width lookup table by character ranges. * Use binary search instead of linear search for UTF-8 character width lookups which will speed up utf8_width() (O(log n) instead of O(n)). Signed-off-by: Lukas Fleischer --- src/utf8.c | 89 +++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 39 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index 3ce85d5..1bb199c 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -152,6 +152,9 @@ static const struct utf8_range utf8_widthtab[] = { { 0x01082, 0x0108d, 0 }, { 0x0108f, 0x0108f, 0 }, { 0x0109a, 0x0109d, 0 }, + { 0x01100, 0x0115f, 2 }, + { 0x011a3, 0x011a7, 2 }, + { 0x011fa, 0x011ff, 2 }, { 0x0135f, 0x0135f, 0 }, { 0x01712, 0x01714, 0 }, { 0x01732, 0x01734, 0 }, @@ -182,10 +185,29 @@ static const struct utf8_range utf8_widthtab[] = { { 0x01dc0, 0x01de6, 0 }, { 0x01dfd, 0x01dff, 0 }, { 0x020d0, 0x020f0, 0 }, + { 0x02329, 0x0232a, 2 }, { 0x02cef, 0x02cf1, 0 }, { 0x02de0, 0x02dff, 0 }, + { 0x02e80, 0x02e99, 2 }, + { 0x02e9b, 0x02ef3, 2 }, + { 0x02f00, 0x02fd5, 2 }, + { 0x02ff0, 0x02ffb, 2 }, + { 0x03000, 0x03029, 2 }, { 0x0302a, 0x0302f, 0 }, + { 0x03030, 0x0303e, 2 }, + { 0x03041, 0x03096, 2 }, { 0x03099, 0x0309a, 0 }, + { 0x0309b, 0x030ff, 2 }, + { 0x03105, 0x0312d, 2 }, + { 0x03131, 0x0318e, 2 }, + { 0x03190, 0x031b7, 2 }, + { 0x031c0, 0x031e3, 2 }, + { 0x031f0, 0x0321e, 2 }, + { 0x03220, 0x03247, 2 }, + { 0x03250, 0x032fe, 2 }, + { 0x03300, 0x04dbf, 2 }, + { 0x04e00, 0x0a48c, 2 }, + { 0x0a490, 0x0a4c6, 2 }, { 0x0a66f, 0x0a672, 0 }, { 0x0a67c, 0x0a67d, 0 }, { 0x0a6f0, 0x0a6f1, 0 }, @@ -198,6 +220,7 @@ static const struct utf8_range utf8_widthtab[] = { { 0x0a8e0, 0x0a8f1, 0 }, { 0x0a926, 0x0a92d, 0 }, { 0x0a947, 0x0a953, 0 }, + { 0x0a960, 0x0a97c, 2 }, { 0x0a980, 0x0a983, 0 }, { 0x0a9b3, 0x0a9c0, 0 }, { 0x0aa29, 0x0aa36, 0 }, @@ -211,9 +234,19 @@ static const struct utf8_range utf8_widthtab[] = { { 0x0aac1, 0x0aac1, 0 }, { 0x0abe3, 0x0abea, 0 }, { 0x0abec, 0x0abed, 0 }, + { 0x0ac00, 0x0d7a3, 2 }, + { 0x0d7b0, 0x0d7c6, 2 }, + { 0x0d7cb, 0x0d7fb, 2 }, + { 0x0f900, 0x0faff, 2 }, { 0x0fb1e, 0x0fb1e, 0 }, { 0x0fe00, 0x0fe0f, 0 }, + { 0x0fe10, 0x0fe19, 2 }, { 0x0fe20, 0x0fe26, 0 }, + { 0x0fe30, 0x0fe52, 2 }, + { 0x0fe54, 0x0fe66, 2 }, + { 0x0fe68, 0x0fe6b, 2 }, + { 0x0ff01, 0x0ff60, 2 }, + { 0x0ffe0, 0x0ffe6, 2 }, { 0x101fd, 0x101fd, 0 }, { 0x10a01, 0x10a03, 0 }, { 0x10a05, 0x10a06, 0 }, @@ -228,52 +261,19 @@ static const struct utf8_range utf8_widthtab[] = { { 0x1d185, 0x1d18b, 0 }, { 0x1d1aa, 0x1d1ad, 0 }, { 0x1d242, 0x1d244, 0 }, - { 0xe0100, 0xe01ef, 0 }, - { 0x01100, 0x0115f, 2 }, - { 0x011a3, 0x011a7, 2 }, - { 0x011fa, 0x011ff, 2 }, - { 0x02329, 0x0232a, 2 }, - { 0x02e80, 0x02e99, 2 }, - { 0x02e9b, 0x02ef3, 2 }, - { 0x02f00, 0x02fd5, 2 }, - { 0x02ff0, 0x02ffb, 2 }, - { 0x03000, 0x03029, 2 }, - { 0x03030, 0x0303e, 2 }, - { 0x03041, 0x03096, 2 }, - { 0x0309b, 0x030ff, 2 }, - { 0x03105, 0x0312d, 2 }, - { 0x03131, 0x0318e, 2 }, - { 0x03190, 0x031b7, 2 }, - { 0x031c0, 0x031e3, 2 }, - { 0x031f0, 0x0321e, 2 }, - { 0x03220, 0x03247, 2 }, - { 0x03250, 0x032fe, 2 }, - { 0x03300, 0x04dbf, 2 }, - { 0x04e00, 0x0a48c, 2 }, - { 0x0a490, 0x0a4c6, 2 }, - { 0x0a960, 0x0a97c, 2 }, - { 0x0ac00, 0x0d7a3, 2 }, - { 0x0d7b0, 0x0d7c6, 2 }, - { 0x0d7cb, 0x0d7fb, 2 }, - { 0x0f900, 0x0faff, 2 }, - { 0x0fe10, 0x0fe19, 2 }, - { 0x0fe30, 0x0fe52, 2 }, - { 0x0fe54, 0x0fe66, 2 }, - { 0x0fe68, 0x0fe6b, 2 }, - { 0x0ff01, 0x0ff60, 2 }, - { 0x0ffe0, 0x0ffe6, 2 }, { 0x1f200, 0x1f200, 2 }, { 0x1f210, 0x1f231, 2 }, { 0x1f240, 0x1f248, 2 }, { 0x20000, 0x2fffd, 2 }, - { 0x30000, 0x3fffd, 2 } + { 0x30000, 0x3fffd, 2 }, + { 0xe0100, 0xe01ef, 0 } }; /* Get the width of a UTF-8 character. */ int utf8_width (char *s) { - int val, i; + int val, low, high, cur; if (UTF8_ISCONT (*s)) return 0; @@ -308,11 +308,22 @@ utf8_width (char *s) return 0; } - for (i = 0; i < sizeof(utf8_widthtab) / sizeof(utf8_widthtab[0]); i++) + low = 0; + high = sizeof(utf8_widthtab) / sizeof(utf8_widthtab[0]); + do { - if (val >= utf8_widthtab[i].min && val <= utf8_widthtab[i].max) - return utf8_widthtab[i].width; + cur = (low + high) / 2; + if (val >= utf8_widthtab[cur].min) + { + if (val <= utf8_widthtab[cur].max) + return utf8_widthtab[cur].width; + else + low = cur + 1; + } + else + high = cur - 1; } + while (low <= high); return 1; } -- cgit v1.2.3-54-g00ecf