2003-02-20 Jeffrey Stedfast <fejj@ximian.com> * camel.c (camel_init): Call camel_iconv_init(). (camel_shutdown): Call camel_iconv_shutdown(). * camel-sasl-digest-md5.c (digest_response): Updated to use camel-iconv and the new camel-charset-map functions. * camel-mime-utils.c: Updated to use camel-iconv and the new camel-charset-map functions. * camel-mime-part-utils.c (check_html_charset): Use camel_charset_canonical_name() instead of e_iconv_charset_name() which is no longer available. (convert_buffer): Use camel-iconv. (simple_data_wrapper_construct_from_parser): Since camel_charset_iso_to_windows() returns the charset in its canonical format, no need to re-canonicalise it. * camel-mime-part.c (process_header): Use camel_charset_canonical_name() instead of e_iconv_charset_name() which is no longer available. * camel-mime-message.c (process_header): Use camel_charset_canonical_name() instead of e_iconv_charset_name() which is no longer available. * camel-mime-filter-charset.c: Use camel-iconv. * camel-folder-summary.c (message_info_new): Use camel_charset_canonical_name() instead of e_iconv_charset_name() which is no longer available. (content_info_new): Use camel_charset_locale_name(). (camel_message_info_new_from_header): Same as message_info_new(). * camel-search-private.c: Use g_alloca() instead of alloca(). * camel-filter-search.c (check_header): Use camel_charset_canonical_name() instead of e_iconv_charset_name() which is no longer available. * camel-charset-map.c (camel_charset_locale_name): New function, replaces e_iconv_locale_charset(). (camel_charset_canonical_name): New function, similar to e_iconv_charset_name() but instead of returning the iconv-friendly name, it returns the canonical name. (g_iconv will do the iconv-friendly name conversions for us). svn path=/trunk/; revision=19977
528 lines
12 KiB
C
528 lines
12 KiB
C
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */
|
|
|
|
/*
|
|
* Authors:
|
|
* Michael Zucchi <notzed@ximian.com>
|
|
* Jeffrey Stedfast <fejj@ximian.com>
|
|
* Dan Winship <danw@ximian.com>
|
|
*
|
|
* Copyright 2000, 2003 Ximian, Inc. (www.ximian.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of version 2 of the GNU General Public
|
|
* License as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
|
* USA
|
|
*/
|
|
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
|
|
/*
|
|
if you want to build the charset map, compile this with something like:
|
|
gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags`
|
|
(plus any -I/-L/-l flags you need for iconv), then run it as
|
|
./a.out > camel-charset-map-private.h
|
|
|
|
Note that the big-endian variant isn't tested...
|
|
|
|
The tables generated work like this:
|
|
|
|
An indirect array for each page of unicode character
|
|
Each array element has an indirect pointer to one of the bytes of
|
|
the generated bitmask.
|
|
*/
|
|
|
|
#ifdef BUILD_MAP
|
|
#include <iconv.h>
|
|
#include <glib.h>
|
|
|
|
/*
 * The 8-bit character sets the map generator probes.  iso-8859-1 is not
 * listed because it is special-cased.  Only charsets supported by both
 * other mailers and the GNOME environment belong here.  Entries are tried
 * in the order given, so the more popular ones go first.  The table ends
 * with a NULL-named sentinel entry.
 */
static struct {
	char *name;		/* charset name as passed to iconv_open() */
	unsigned int bit;	/* assigned bit */
} tables[] = {
	{ "iso-8859-2",   0 },	/* Central/Eastern European */
	{ "iso-8859-4",   0 },	/* Baltic */
	{ "koi8-r",       0 },	/* Russian */
	{ "koi8-u",       0 },	/* Ukrainian */
	{ "iso-8859-5",   0 },	/* Least-popular Russian encoding */
	{ "iso-8859-7",   0 },	/* Greek */
	{ "iso-8859-8",   0 },	/* Hebrew; Visual */
	{ "iso-8859-9",   0 },	/* Turkish */
	{ "iso-8859-13",  0 },	/* Baltic again */
	{ "iso-8859-15",  0 },	/* New-and-improved iso-8859-1, but most
				 * programs that support this support UTF8
				 */
	{ "windows-1251", 0 },	/* Russian */
	{ NULL, 0 }
};
|
|
|
|
unsigned int encoding_map[256 * 256];
|
|
|
|
#if G_BYTE_ORDER == G_BIG_ENDIAN
|
|
#define UCS "UCS-4BE"
|
|
#else
|
|
#define UCS "UCS-4LE"
|
|
#endif
|
|
|
|
int main (void)
|
|
{
|
|
int i, j;
|
|
int max, min;
|
|
int bit = 0x01;
|
|
int k;
|
|
int bytes;
|
|
iconv_t cd;
|
|
char in[128];
|
|
guint32 out[128];
|
|
char *inptr, *outptr;
|
|
size_t inlen, outlen;
|
|
|
|
/* dont count the terminator */
|
|
bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
|
|
|
|
for (i = 0; i < 128; i++)
|
|
in[i] = i + 128;
|
|
|
|
for (j = 0; tables[j].name; j++) {
|
|
cd = iconv_open (UCS, tables[j].name);
|
|
inptr = in;
|
|
outptr = (char *)(out);
|
|
inlen = sizeof (in);
|
|
outlen = sizeof (out);
|
|
while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) {
|
|
if (errno == EILSEQ) {
|
|
inptr++;
|
|
inlen--;
|
|
} else {
|
|
printf ("%s\n", strerror (errno));
|
|
exit (1);
|
|
}
|
|
}
|
|
iconv_close (cd);
|
|
|
|
for (i = 0; i < 128 - outlen / 4; i++) {
|
|
encoding_map[i] |= bit;
|
|
encoding_map[out[i]] |= bit;
|
|
}
|
|
|
|
tables[j].bit = bit;
|
|
bit <<= 1;
|
|
}
|
|
|
|
printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
|
|
|
|
for (i=0;i<256;i++) {
|
|
/* first, do we need this block? */
|
|
for (k=0;k<bytes;k++) {
|
|
for (j=0;j<256;j++) {
|
|
if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
|
|
break;
|
|
}
|
|
if (j < 256) {
|
|
/* yes, dump it */
|
|
printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
|
|
for (j=0;j<256;j++) {
|
|
printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
|
|
if (((j+1)&7) == 0 && j<255)
|
|
printf("\n\t");
|
|
}
|
|
printf("\n};\n\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
printf("struct {\n");
|
|
for (k=0;k<bytes;k++) {
|
|
printf("\tunsigned char *bits%d;\n", k);
|
|
}
|
|
printf("} camel_charmap[256] = {\n\t");
|
|
for (i=0;i<256;i++) {
|
|
/* first, do we need this block? */
|
|
printf("{ ");
|
|
for (k=0;k<bytes;k++) {
|
|
for (j=0;j<256;j++) {
|
|
if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
|
|
break;
|
|
}
|
|
if (j < 256) {
|
|
printf("m%02x%x, ", i, k);
|
|
} else {
|
|
printf("0, ");
|
|
}
|
|
}
|
|
printf("}, ");
|
|
if (((i+1)&7) == 0 && i<255)
|
|
printf("\n\t");
|
|
}
|
|
printf("\n};\n\n");
|
|
|
|
printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
|
|
for (j=0;tables[j].name;j++) {
|
|
printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
|
|
}
|
|
printf("};\n\n");
|
|
|
|
printf("#define charset_mask(x) \\\n");
|
|
for (k=0;k<bytes;k++) {
|
|
if (k!=0)
|
|
printf("\t| ");
|
|
else
|
|
printf("\t");
|
|
printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
|
|
if (k<bytes-1)
|
|
printf("\t\\\n");
|
|
}
|
|
printf("\n\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
#else
|
|
|
|
#include "camel-charset-map.h"
|
|
#include "camel-charset-map-private.h"
|
|
#include "string-utils.h"
|
|
|
|
#include <glib.h>
|
|
#include <glib/gunicode.h>
|
|
#include <locale.h>
|
|
#include <ctype.h>
|
|
#ifdef ENABLE_THREADS
|
|
#include <pthread.h>
|
|
#endif
|
|
#ifdef HAVE_CODESET
|
|
#include <langinfo.h>
|
|
#endif
|
|
|
|
void
|
|
camel_charset_init (CamelCharset *c)
|
|
{
|
|
c->mask = ~0;
|
|
c->level = 0;
|
|
}
|
|
|
|
void
|
|
camel_charset_step (CamelCharset *c, const char *in, int len)
|
|
{
|
|
register unsigned int mask;
|
|
register int level;
|
|
const char *inptr = in, *inend = in+len;
|
|
|
|
mask = c->mask;
|
|
level = c->level;
|
|
|
|
/* check what charset a given string will fit in */
|
|
while (inptr < inend) {
|
|
gunichar c;
|
|
const char *newinptr;
|
|
newinptr = g_utf8_next_char(inptr);
|
|
c = g_utf8_get_char(inptr);
|
|
if (newinptr == NULL || !g_unichar_validate (c)) {
|
|
inptr++;
|
|
continue;
|
|
}
|
|
|
|
inptr = newinptr;
|
|
if (c<=0xffff) {
|
|
mask &= charset_mask(c);
|
|
|
|
if (c>=128 && c<256)
|
|
level = MAX(level, 1);
|
|
else if (c>=256)
|
|
level = MAX(level, 2);
|
|
} else {
|
|
mask = 0;
|
|
level = MAX(level, 2);
|
|
}
|
|
}
|
|
|
|
c->mask = mask;
|
|
c->level = level;
|
|
}
|
|
|
|
/* gets the best charset from the mask of chars in it */
|
|
static const char *
|
|
camel_charset_best_mask(unsigned int mask)
|
|
{
|
|
int i;
|
|
|
|
for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) {
|
|
if (camel_charinfo[i].bit & mask)
|
|
return camel_charinfo[i].name;
|
|
}
|
|
return "UTF-8";
|
|
}
|
|
|
|
const char *
|
|
camel_charset_best_name (CamelCharset *charset)
|
|
{
|
|
if (charset->level == 1)
|
|
return "ISO-8859-1";
|
|
else if (charset->level == 2)
|
|
return camel_charset_best_mask (charset->mask);
|
|
else
|
|
return NULL;
|
|
|
|
}
|
|
|
|
/* finds the minimum charset for this string NULL means US-ASCII */
|
|
const char *
|
|
camel_charset_best (const char *in, int len)
|
|
{
|
|
CamelCharset charset;
|
|
|
|
camel_charset_init (&charset);
|
|
camel_charset_step (&charset, in, len);
|
|
return camel_charset_best_name (&charset);
|
|
}
|
|
|
|
|
|
#ifdef G_THREADS_ENABLED
|
|
static GStaticMutex lock = G_STATIC_MUTEX_INIT;
|
|
#define LOCK() g_static_mutex_lock(&lock)
|
|
#define UNLOCK() g_static_mutex_unlock(&lock)
|
|
#else
|
|
#define LOCK()
|
|
#define UNLOCK()
|
|
#endif
|
|
|
|
static char *locale_charset = NULL;
|
|
static GHashTable *canon_charsets = NULL;
|
|
|
|
static void
|
|
canon_charsets_init (int keep)
|
|
{
|
|
char *locale;
|
|
|
|
LOCK ();
|
|
|
|
if (canon_charsets != NULL) {
|
|
if (!keep)
|
|
UNLOCK ();
|
|
return;
|
|
}
|
|
|
|
canon_charsets = g_hash_table_new (g_str_hash, g_str_equal);
|
|
|
|
locale = setlocale (LC_ALL, NULL);
|
|
|
|
if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) {
|
|
/* The locale "C" or "POSIX" is a portable locale; its
|
|
* LC_CTYPE part corresponds to the 7-bit ASCII character
|
|
* set.
|
|
*/
|
|
|
|
locale_charset = NULL;
|
|
} else {
|
|
#ifdef HAVE_CODESET
|
|
locale_charset = g_strdup (nl_langinfo (CODESET));
|
|
g_ascii_strdown (locale_charset, -1);
|
|
#else
|
|
/* A locale name is typically of the form language[_terri-
|
|
* tory][.codeset][@modifier], where language is an ISO 639
|
|
* language code, territory is an ISO 3166 country code, and
|
|
* codeset is a character set or encoding identifier like
|
|
* ISO-8859-1 or UTF-8.
|
|
*/
|
|
char *codeset, *p;
|
|
|
|
codeset = strchr (locale, '.');
|
|
if (codeset) {
|
|
codeset++;
|
|
|
|
/* ; is a hack for debian systems and / is a hack for Solaris systems */
|
|
for (p = codeset; *p && !strchr ("@;/", *p); p++)
|
|
;
|
|
locale_charset = g_strndup (codeset, p - codeset);
|
|
g_ascii_strdown (locale_charset, -1);
|
|
} else {
|
|
/* charset unknown */
|
|
locale_charset = NULL;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if (!keep)
|
|
UNLOCK ();
|
|
}
|
|
|
|
|
|
/**
|
|
* camel_charset_locale_name:
|
|
*
|
|
* Returns the name of the system's locale charset.
|
|
**/
|
|
const char *
|
|
camel_charset_locale_name (void)
|
|
{
|
|
canon_charsets_init (FALSE);
|
|
|
|
return locale_charset;
|
|
}
|
|
|
|
|
|
/**
|
|
* camel_charset_canonical_name:
|
|
* @charset: charset to canonicalise
|
|
*
|
|
* Returns the charset in its canonical format. This is currently only
|
|
* needed for iso charsets but also handles canonicalisation of
|
|
* windows charsets. May need to expand this to handle canincalisation
|
|
* of more charsets in the future?
|
|
**/
|
|
const char *
|
|
camel_charset_canonical_name (const char *charset)
|
|
{
|
|
char *name, *canon, *tmp;
|
|
|
|
if (charset == NULL)
|
|
return NULL;
|
|
|
|
name = g_alloca (strlen (charset));
|
|
strcpy (name, charset);
|
|
g_ascii_strdown (name, -1);
|
|
|
|
canon_charsets_init (TRUE);
|
|
canon = g_hash_table_lookup (canon_charsets, name);
|
|
if (canon != NULL) {
|
|
UNLOCK ();
|
|
return canon;
|
|
}
|
|
|
|
/* Unknown, try canonicalise some basic charset types to something that should work */
|
|
if (strncmp (name, "iso", 3) == 0) {
|
|
/* Convert iso-nnnn-n or isonnnn-n or iso_nnnn-n to iso-nnnn-n or isonnnn-n */
|
|
int iso, codepage;
|
|
char *p;
|
|
|
|
tmp = name + 3;
|
|
if (*tmp == '-' || *tmp == '_')
|
|
tmp++;
|
|
|
|
iso = strtoul (tmp, &p, 10);
|
|
|
|
if (iso == 10646) {
|
|
/* they all become iso-10646 */
|
|
canon = g_strdup ("iso-10646");
|
|
} else {
|
|
/* iso-8859-# */
|
|
tmp = p;
|
|
if (*tmp == '-' || *tmp == '_')
|
|
tmp++;
|
|
|
|
codepage = strtoul (tmp, &p, 10);
|
|
|
|
if (p > tmp) {
|
|
/* codepage is numeric */
|
|
canon = g_strdup_printf ("iso-%d-%d", iso, codepage);
|
|
} else {
|
|
/* codepage is a string - probably iso-2022-jp or something */
|
|
canon = g_strdup_printf ("iso-%d-%s", iso, p);
|
|
}
|
|
}
|
|
} else if (strncmp (name, "windows-", 8) == 0) {
|
|
/* Convert windows-#### and windows-cp#### to windows-cp#### */
|
|
tmp = name + 8;
|
|
if (!strncmp (tmp, "cp", 2))
|
|
tmp += 2;
|
|
canon = g_strdup_printf ("windows-cp%s", tmp);
|
|
} else if (strncmp (name, "microsoft-", 10) == 0) {
|
|
/* Convert microsoft-#### or microsoft-cp#### to windows-cp#### */
|
|
tmp = name + 10;
|
|
if (!strncmp (tmp, "cp", 2))
|
|
tmp += 2;
|
|
canon = g_strdup_printf ("windows-cp%s", tmp);
|
|
} else if (strncmp (name, "cp125", 5) == 0) {
|
|
/* Convert cp125# to windows-cp#### */
|
|
canon = g_strdup_printf ("windows-%s", name);
|
|
} else {
|
|
/* Just assume its ok enough as is, case and all */
|
|
canon = g_strdup (charset);
|
|
}
|
|
|
|
g_hash_table_insert (canon_charsets, g_strdup (name), canon);
|
|
UNLOCK ();
|
|
|
|
return canon;
|
|
}
|
|
|
|
|
|
/**
 * camel_charset_iso_to_windows:
 * @isocharset: a canonicalised ISO charset
 *
 * Looks @isocharset up (ignoring case) in a fixed table of ISO charsets
 * and their Windows codepage equivalents.
 *
 * Returns the equivalent Windows charset, or @isocharset itself when the
 * table has no entry for it.
 **/
const char *
camel_charset_iso_to_windows (const char *isocharset)
{
	/* Mapping taken from http://czyborra.com/charsets/codepages.html.
	 * That page gives no Windows codepage for iso-8859-3, -4, -10, -11
	 * and -12.
	 *
	 * Assumption: iso-8859-4 is Baltic like iso-8859-13, so it is
	 * mapped to windows-cp1257 as well.
	 */
	static const struct {
		const char *iso;
		const char *windows;
	} equivalents[] = {
		{ "iso-8859-1",  "windows-cp1252" },
		{ "us-ascii",    "windows-cp1252" },
		{ "iso-8859-2",  "windows-cp1250" },
		{ "iso-8859-4",  "windows-cp1257" },
		{ "iso-8859-5",  "windows-cp1251" },
		{ "iso-8859-6",  "windows-cp1256" },
		{ "iso-8859-7",  "windows-cp1253" },
		{ "iso-8859-8",  "windows-cp1255" },
		{ "iso-8859-9",  "windows-cp1254" },
		{ "iso-8859-13", "windows-cp1257" },
	};
	size_t n;

	for (n = 0; n < sizeof (equivalents) / sizeof (equivalents[0]); n++) {
		if (strcasecmp (isocharset, equivalents[n].iso) == 0)
			return equivalents[n].windows;
	}

	/* no known equivalent: hand the input back unchanged */
	return isocharset;
}
|
|
|
|
#endif /* !BUILD_MAP */
|