From 070f1999168d14051103dd59a25500fcba1f80e9 Mon Sep 17 00:00:00 2001
From: Michael Jumper
Date: Mon, 5 Aug 2013 14:00:16 -0700
Subject: [PATCH] Implement additional UTF-8 read/write functions for convenience.

---
 src/libguac/unicode.c | 122 ++++++++++++++++++++++++++++++++++++++++--
 src/libguac/unicode.h |  28 ++++++++++
 2 files changed, 145 insertions(+), 5 deletions(-)

diff --git a/src/libguac/unicode.c b/src/libguac/unicode.c
index f634c10c..babdeff1 100644
--- a/src/libguac/unicode.c
+++ b/src/libguac/unicode.c
@@ -42,11 +42,10 @@
 size_t guac_utf8_charsize(unsigned char c) {
 
     /* Determine size in bytes of character */
-    if ((c >>= 1) == 0x7E) return 6;
-    if ((c >>= 1) == 0x3E) return 5;
-    if ((c >>= 1) == 0x1E) return 4;
-    if ((c >>= 1) == 0x0E) return 3;
-    if ((c >>= 1) == 0x06) return 2;
+    if ((c | 0x7F) == 0x7F) return 1;
+    if ((c | 0x1F) == 0xDF) return 2;
+    if ((c | 0x0F) == 0xEF) return 3;
+    if ((c | 0x07) == 0xF7) return 4;
 
     /* Default to one character */
     return 1;
@@ -85,3 +84,116 @@ size_t guac_utf8_strlen(const char* str) {
 
 }
 
+int guac_utf8_write(int codepoint, char* utf8, int length) {
+
+    int i;
+    int mask, bytes;
+
+    /* If not even one byte, cannot write */
+    if (length <= 0)
+        return 0;
+
+    /* Determine size and initial byte mask */
+    if (codepoint <= 0x007F) {
+        mask  = 0x00;
+        bytes = 1;
+    }
+    else if (codepoint <= 0x7FF) {
+        mask  = 0xC0;
+        bytes = 2;
+    }
+    else if (codepoint <= 0xFFFF) {
+        mask  = 0xE0;
+        bytes = 3;
+    }
+    else if (codepoint <= 0x1FFFFF) {
+        mask  = 0xF0;
+        bytes = 4;
+    }
+
+    /* Otherwise, invalid codepoint */
+    else {
+        *(utf8++) = '?';
+        return 1;
+    }
+
+    /* If not enough room, don't write anything */
+    if (bytes > length)
+        return 0;
+
+    /* Offset buffer by size */
+    utf8 += bytes - 1;
+
+    /* Add trailing bytes, if any */
+    for (i=1; i<bytes; i++) {
+        *(utf8--) = 0x80 | (codepoint & 0x3F);
+        codepoint >>= 6;
+    }
+
+    /* Set initial byte */
+    *utf8 = mask | codepoint;
+
+    /* Done */
+    return bytes;
+
+}
+
+int guac_utf8_read(const char* utf8, int length, int* codepoint) {
+
+    unsigned char initial;
+    int bytes;
+    int result;
+    int i;
+
+    /* If not even one byte, cannot read */
+    if (length <= 0)
+        return 0;
+
+    /* Read initial byte (unsigned, so tests below work if char is signed) */
+    initial = (unsigned char) *(utf8++);
+
+    /* 0xxxxxxx */
+    if ((initial | 0x7F) == 0x7F) {
+        result = initial;
+        bytes  = 1;
+    }
+
+    /* 110xxxxx 10xxxxxx */
+    else if ((initial | 0x1F) == 0xDF) {
+        result = initial & 0x1F;
+        bytes  = 2;
+    }
+
+    /* 1110xxxx 10xxxxxx 10xxxxxx */
+    else if ((initial | 0x0F) == 0xEF) {
+        result = initial & 0x0F;
+        bytes  = 3;
+    }
+
+    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    else if ((initial | 0x07) == 0xF7) {
+        result = initial & 0x07;
+        bytes  = 4;
+    }
+
+    /* Otherwise, invalid codepoint */
+    else {
+        *codepoint = 0xFFFD; /* Replacement character */
+        return 1;
+    }
+
+    /* If not enough room, don't read anything */
+    if (bytes > length)
+        return 0;
+
+    /* Read trailing bytes, if any */
+    for (i=1; i<bytes; i++) {
+        result <<= 6;
+        result |= *(utf8++) & 0x3F;
+    }
+
+    *codepoint = result;
+    return bytes;
+
+}
+
diff --git a/src/libguac/unicode.h b/src/libguac/unicode.h
index cace74e2..fac3a651 100644
--- a/src/libguac/unicode.h
+++ b/src/libguac/unicode.h
@@ -64,5 +64,33 @@ size_t guac_utf8_charsize(unsigned char c);
  */
 size_t guac_utf8_strlen(const char* str);
 
+/**
+ * Given destination buffer and its length, writes the given codepoint as UTF-8
+ * to the buffer, returning the number of bytes written. If there is not enough
+ * space in the buffer to write the character, no bytes are written at all.
+ *
+ * @param codepoint The Unicode codepoint to write to the buffer.
+ * @param utf8 The buffer to write to.
+ * @param length The length of the buffer, in bytes.
+ * @return The number of bytes written, which may be zero if there is not
+ *         enough space in the buffer to write the UTF-8 character.
+ */
+int guac_utf8_write(int codepoint, char* utf8, int length);
+
+/**
+ * Given a buffer containing UTF-8 characters, reads the first codepoint in the
+ * buffer, returning the length of the codepoint in bytes. If no codepoint
+ * could be read, zero is returned.
+ *
+ * @param utf8 A buffer containing UTF-8 characters.
+ * @param length The length of the buffer, in bytes.
+ * @param codepoint A pointer to an integer which will contain the codepoint
+ *                  read, if any. If no character can be read, the integer
+ *                  will be left untouched.
+ * @return The number of bytes read, which may be zero if there is not enough
+ *         space in the buffer to read a character.
+ */
+int guac_utf8_read(const char* utf8, int length, int* codepoint);
+
 #endif
 