341 lines
7.6 KiB
C
341 lines
7.6 KiB
C
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
|
|
/*
|
|
* Authors: Michael Zucchi <notzed@ximian.com>
|
|
*
|
|
* Copyright 2003 Ximian, Inc. (www.ximian.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
|
|
*
|
|
*/
|
|
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <glib.h>
|
|
#include "camel-utf8.h"
|
|
|
|
|
|
/**
 * camel_utf8_putc:
 * @ptr: address of the output cursor; on return it points just past the
 * octets that were written.
 * @c: the 32 bit Unicode character to encode.
 *
 * Encode @c as utf8 at *@ptr.  Between 1 and 4 octets are written
 * depending on the magnitude of @c; no terminating NUL is added and no
 * bounds checking is performed on the destination.
 **/
void
camel_utf8_putc(unsigned char **ptr, guint32 c)
{
	unsigned char *out = *ptr;

	if (c < 0x80) {
		/* 7 bits: single octet */
		out[0] = c;
		out += 1;
	} else if (c < 0x800) {
		/* 11 bits: 110xxxxx 10xxxxxx */
		out[0] = 0xc0 | (c >> 6);
		out[1] = 0x80 | (c & 0x3f);
		out += 2;
	} else if (c < 0x10000) {
		/* 16 bits: 1110xxxx 10xxxxxx 10xxxxxx */
		out[0] = 0xe0 | (c >> 12);
		out[1] = 0x80 | ((c >> 6) & 0x3f);
		out[2] = 0x80 | (c & 0x3f);
		out += 3;
	} else {
		/* see unicode standard 3.0, S 3.8, max 4 octets:
		   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		out[0] = 0xf0 | (c >> 18);
		out[1] = 0x80 | ((c >> 12) & 0x3f);
		out[2] = 0x80 | ((c >> 6) & 0x3f);
		out[3] = 0x80 | (c & 0x3f);
		out += 4;
	}

	*ptr = out;
}
|
|
|
|
/**
 * camel_utf8_getc:
 * @ptr: address of a cursor into a NUL terminated utf8 string; on
 * return it points just past the octets that were consumed.
 *
 * Get a Unicode character from a utf8 stream.  Octets that cannot
 * start a sequence (stray continuation octets 0x80-0xbf, and octets
 * >= 0xf8) are silently skipped.  If a multi-octet sequence is
 * interrupted by an octet that is not a continuation octet, the partial
 * sequence is discarded and decoding restarts at the interrupting
 * octet.  Overlong encodings are not rejected.
 *
 * @ptr must point to a NUL terminated array: the terminating NUL is
 * what stops the scan.
 *
 * Return value: The next Unicode character, or 0 once the terminating
 * NUL has been consumed.  @ptr is always advanced.
 **/
guint32
camel_utf8_getc(const unsigned char **ptr)
{
	const unsigned char *p = *ptr;
	unsigned char c, r;
	guint32 v, m;

again:
	r = *p++;
loop:
	if (r < 0x80) {
		/* plain ascii, including the terminating NUL */
		*ptr = p;
		v = r;
	} else if (r >= 0xc0 && r < 0xf8) { /* valid start char? (max 4 octets) */
		v = r;
		m = 0x7f80;	/* used to mask out the length bits */
		do {
			c = *p++;
			if ((c & 0xc0) != 0x80) {
				/* sequence cut short: restart decoding at this octet */
				r = c;
				goto loop;
			}
			v = (v << 6) | (c & 0x3f);
			/* each remaining length bit in the lead octet requests
			   one more continuation octet */
			r <<= 1;
			m <<= 5;
		} while (r & 0x40);

		*ptr = p;

		/* strip what is left of the lead octet's length marker */
		v &= ~m;
	} else {
		/* stray continuation octet or invalid lead octet: skip it */
		goto again;
	}

	return v;
}
|
|
|
|
/**
 * camel_utf8_getc_limit:
 * @ptr: address of a cursor into a utf8 buffer; advanced past the
 * consumed octets only when a character is returned.
 * @end: one past the last readable octet; must not be NULL.
 *
 * Get the next utf8 char at @ptr, and return it, advancing @ptr to
 * the next character.  Octets that cannot start a sequence (stray
 * continuation octets 0x80-0xbf, and octets >= 0xf8) are skipped.  If
 * a multi-octet sequence is interrupted by an octet that is not a
 * continuation octet, the partial sequence is discarded and decoding
 * restarts at the interrupting octet.
 *
 * If @end is reached before a full utf8 character can be read, then
 * the invalid Unicode char 0xffff is returned as a sentinel (Unicode
 * 3.1, section 2.7), and @ptr is not advanced.
 *
 * Return value: The next utf8 char, or 0xffff.
 **/
guint32
camel_utf8_getc_limit(const unsigned char **ptr, const unsigned char *end)
{
	const unsigned char *p = *ptr;
	unsigned char c, r;
	guint32 v, m;

	while (p < end) {
		r = *p++;
loop:
		if (r < 0x80) {
			*ptr = p;
			return r;
		} else if (r >= 0xc0 && r < 0xf8) { /* valid start char? (max 4 octets) */
			v = r;
			m = 0x7f80;	/* used to mask out the length bits */
			do {
				/* ran out of input mid-sequence: *ptr is left untouched */
				if (p >= end)
					return 0xffff;

				c = *p++;
				if ((c & 0xc0) != 0x80) {
					/* sequence cut short: restart decoding at this octet */
					r = c;
					goto loop;
				}
				v = (v << 6) | (c & 0x3f);
				/* each remaining length bit in the lead octet
				   requests one more continuation octet */
				r <<= 1;
				m <<= 5;
			} while (r & 0x40);

			*ptr = p;

			/* strip what is left of the lead octet's length marker */
			return v & ~m;
		}

		/* stray continuation octet or invalid lead octet: skip it and
		   let the loop condition re-check the limit */
	}

	return 0xffff;
}
|
|
|
|
void
|
|
g_string_append_u(GString *out, guint32 c)
|
|
{
|
|
unsigned char buffer[8];
|
|
unsigned char *p = buffer;
|
|
|
|
camel_utf8_putc(&p, c);
|
|
*p = 0;
|
|
g_string_append(out, buffer);
|
|
}
|
|
|
|
/* The 64 digits of the modified base64 alphabet, indexed by 6 bit value.
   Differs from standard base64 in that the last digit is ',' not '/'. */
static const char utf7_alphabet[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
|
|
|
|
/* Reverse of the modified base64 alphabet: maps an octet to its 6 bit
   value, or 0xff if the octet is not a modified base64 digit.
   One row per 16 octets. */
static const unsigned char utf7_rank[256] = {
	/* 0x00 */
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	/* 0x10 */
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	/* 0x20: '+' and ',' */
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff,
	/* 0x30: '0'-'9' */
	0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff,
	/* 0x40: 'A'-'O' */
	0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
	/* 0x50: 'P'-'Z' */
	0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff,
	/* 0x60: 'a'-'o' */
	0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
	/* 0x70: 'p'-'z' */
	0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff,
	/* 0x80 - 0xff: never valid */
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
};
|
|
|
|
/**
|
|
* camel_utf7_utf8:
|
|
* @ptr:
|
|
*
|
|
* Convert a modified utf7 string to utf8. If the utf7 string
|
|
* contains 8 bit characters, they are treated as iso-8859-1.
|
|
*
|
|
* The IMAP rules [rfc2060] are used in the utf7 encoding.
|
|
*
|
|
* Return value: The converted string.
|
|
**/
|
|
char *
|
|
camel_utf7_utf8(const char *ptr)
|
|
{
|
|
const unsigned char *p = (unsigned char *)ptr;
|
|
unsigned int c;
|
|
guint32 v=0, x;
|
|
GString *out;
|
|
int i=0;
|
|
int state = 0;
|
|
char *ret;
|
|
|
|
out = g_string_new("");
|
|
do {
|
|
c = *p++;
|
|
switch(state) {
|
|
case 0:
|
|
if (c == '&')
|
|
state = 1;
|
|
else
|
|
g_string_append_u(out, c);
|
|
break;
|
|
case 1:
|
|
if (c == '-') {
|
|
g_string_append_c(out, '&');
|
|
state = 0;
|
|
} else if (utf7_rank[c] != 0xff) {
|
|
v = utf7_rank[c];
|
|
i = 6;
|
|
state = 2;
|
|
} else {
|
|
/* invalid */
|
|
g_string_append(out, "&-");
|
|
state = 0;
|
|
}
|
|
break;
|
|
case 2:
|
|
if (c == '-') {
|
|
state = 0;
|
|
} else if (utf7_rank[c] != 0xff) {
|
|
v = (v<<6) | utf7_rank[c];
|
|
i+=6;
|
|
if (i >= 16) {
|
|
x = (v >> (i-16)) & 0xffff;
|
|
g_string_append_u(out, x);
|
|
i-=16;
|
|
}
|
|
} else {
|
|
g_string_append_u(out, c);
|
|
state = 0;
|
|
}
|
|
break;
|
|
}
|
|
} while (c);
|
|
|
|
ret = g_strdup(out->str);
|
|
g_string_free(out, TRUE);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* Terminate a base64 run on @out: if @i (> 0) bits are still pending in
   the low end of @v, emit them left-aligned in one final zero-padded
   digit, then emit the closing '-'. */
static void utf7_closeb64(GString *out, guint32 v, guint32 i)
{
	if (i > 0) {
		guint32 pad = 6 - i;
		guint32 digit = (v << pad) & 0x3f;

		g_string_append_c(out, utf7_alphabet[digit]);
	}

	g_string_append_c(out, '-');
}
|
|
|
|
/**
|
|
* camel_utf8_utf7:
|
|
* @ptr:
|
|
*
|
|
* Convert a utf8 string to a modified utf7 format.
|
|
*
|
|
* The IMAP rules [rfc2060] are used in the utf7 encoding.
|
|
*
|
|
* Return value:
|
|
**/
|
|
char *
|
|
camel_utf8_utf7(const char *ptr)
|
|
{
|
|
const unsigned char *p = (unsigned char *)ptr;
|
|
unsigned int c;
|
|
guint32 x, v = 0;
|
|
int state = 0;
|
|
GString *out;
|
|
int i = 0;
|
|
char *ret;
|
|
|
|
out = g_string_new("");
|
|
|
|
while ( (c = camel_utf8_getc(&p)) ) {
|
|
if (c >= 0x20 && c <= 0x7e) {
|
|
if (state == 1) {
|
|
utf7_closeb64(out, v, i);
|
|
state = 0;
|
|
i = 0;
|
|
}
|
|
if (c == '&')
|
|
g_string_append(out, "&-");
|
|
else
|
|
g_string_append_c(out, c);
|
|
} else {
|
|
if (state == 0) {
|
|
g_string_append_c(out, '&');
|
|
state = 1;
|
|
}
|
|
v = (v << 16) | c;
|
|
i += 16;
|
|
while (i >= 6) {
|
|
x = (v >> (i-6)) & 0x3f;
|
|
g_string_append_c(out, utf7_alphabet[x]);
|
|
i -= 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (state == 1)
|
|
utf7_closeb64(out, v, i);
|
|
|
|
ret = g_strdup(out->str);
|
|
g_string_free(out, TRUE);
|
|
|
|
return ret;
|
|
}
|