Implement additional UTF-8 read/write functions for convenience.

2013-08-05 14:00:16 -07:00 · 2013-08-05 14:00:16 -07:00 · 070f199916
commit 070f199916
parent 8099a57fc6
2 changed files with 140 additions and 5 deletions
--- a/src/libguac/unicode.c
+++ b/src/libguac/unicode.c
@ -42,11 +42,10 @@
 size_t guac_utf8_charsize(unsigned char c) {

    /* Determine size in bytes of character */
-    if ((c >>= 1) == 0x7E) return 6;
-    if ((c >>= 1) == 0x3E) return 5;
-    if ((c >>= 1) == 0x1E) return 4;
-    if ((c >>= 1) == 0x0E) return 3;
-    if ((c >>= 1) == 0x06) return 2;
+    if ((c | 0x7F) == 0x7F) return 1;
+    if ((c | 0x1F) == 0xDF) return 2;
+    if ((c | 0x0F) == 0xEF) return 3;
+    if ((c | 0x07) == 0xF7) return 4;

    /* Default to one character */
    return 1;
@ -85,3 +84,111 @@ size_t guac_utf8_strlen(const char* str) {

 }

+int guac_utf8_write(int codepoint, char* utf8, int length) {
+
+    int i;
+    int mask, bytes;
+
+    /* If not even one byte, cannot write */
+    if (length <= 0)
+        return 0;
+
+    /* Determine size and initial byte mask */
+    if (codepoint <= 0x007F) {
+        mask  = 0x00;
+        bytes = 1;
+    }
+    else if (codepoint <= 0x7FF) {
+        mask  = 0xC0;
+        bytes = 2;
+    }
+    else if (codepoint <= 0xFFFF) {
+        mask  = 0xE0;
+        bytes = 3;
+    }
+    else if (codepoint <= 0x1FFFFF) {
+        mask  = 0xF0;
+        bytes = 4;
+    }
+
+    /* Otherwise, invalid codepoint */
+    else {
+        *(utf8++) = '?';
+        return 1;
+    }
+
+    /* If not enough room, don't write anything */
+    if (bytes > length)
+        return 0;
+
+    /* Offset buffer by size */
+    utf8 += bytes - 1;
+
+    /* Add trailing bytes, if any */
+    for (i=1; i<bytes; i++) {
+        *(utf8--) = 0x80 | (codepoint & 0x3F);
+        codepoint >>= 6;
+    }
+
+    /* Set initial byte */
+    *utf8 = mask | codepoint;
+
+    /* Done */
+    return bytes;
+
+}
+
+int guac_utf8_read(const char* utf8, int length, int* codepoint) {
+
+    char initial;
+    int bytes;
+    int result;
+
+    /* If not even one byte, cannot read */
+    if (length <= 0)
+        return 0;
+
+    /* Read initial byte */
+    initial = *(utf8++);
+
+    /* 0xxxxxxx */
+    if ((initial | 0x7F) == 0x7F) {
+        result = initial;
+        bytes  = 1;
+    }
+
+    /* 110xxxxx 10xxxxxx */
+    else if ((initial | 0x1F) == 0xDF) {
+        result = initial & 0x1F;
+        bytes  = 2;
+    }
+
+    /* 1110xxxx 10xxxxxx 10xxxxxx */
+    else if ((initial | 0x0F) == 0xEF) {
+        result = initial & 0x0F;
+        bytes  = 3;
+    }
+
+    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    else if ((initial | 0x07) == 0xF7) {
+        result = initial & 0x07;
+        bytes  = 4;
+    }
+
+    /* Otherwise, invalid codepoint */
+    else {
+        *codepoint = 0xFFFD; /* Replacement character */
+        return 1;
+    }
+
+    /* If not enough room, don't read anything */
+    if (bytes > length)
+        return 0;
+
+    /* STUB: Read. */
+
+    *codepoint = result;
+    return bytes;
+
+}
+
--- a/src/libguac/unicode.h
+++ b/src/libguac/unicode.h
@ -64,5 +64,33 @@ size_t guac_utf8_charsize(unsigned char c);
 */
 size_t guac_utf8_strlen(const char* str);

+/**
+ * Given destination buffer and its length, writes the given codepoint as UTF-8
+ * to the buffer, returning the number of bytes written. If there is not enough
+ * space in the buffer to write the character, no bytes are written at all.
+ *
+ * @param codepoint The Unicode codepoint to write to the buffer.
+ * @param utf8 The buffer to write to.
+ * @param length The length of the buffer, in bytes.
+ * @return The number of bytes written, which may be zero if there is not
+ *         enough space in the buffer to write the UTF-8 character.
+ */
+int guac_utf8_write(int codepoint, char* utf8, int length);
+
+/**
+ * Given a buffer containing UTF-8 characters, reads the first codepoint in the
+ * buffer, returning the length of the codepoint in bytes. If no codepoint
+ * could be read, zero is returned.
+ *
+ * @param utf8 A buffer containing UTF-8 characters.
+ * @param length The length of the buffer, in bytes.
+ * @param codepoint A pointer to an integer which will contain the codepoint
+ *                  read, if any. If no character can be read, the integer
+ *                  will be left untouched.
+ * @return The number of bytes read, which may be zero if there is not enough
+ *         space in the buffer to read a character.
+ */
+int guac_utf8_read(const char* utf8, int length, int* codepoint);
+
 #endif