* camel-charset-map.c: Redo the BUILD_MAP code to not depend on libunicode. Now it only generates a map of "popular" 8bit encodings. (It's not worthwhile to support obscure encodings, because any mailer that supports them will support UTF8 too. And Chinese and Japanese use mostly the same UTF8 characters so you need to decide between those encodings based on the locale or the charset of the message you're replying to or the input method you used. So this is sufficient for camel_charset_best's use.) * camel-charset-map-private.h: Regenerated. * camel.c (camel_shutdown): Move #ifdefs around to prevent a warning. svn path=/trunk/; revision=10055
322 lines
7.5 KiB
C
322 lines
7.5 KiB
C
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */
|
|
|
|
/*
|
|
* Authors:
|
|
* Michael Zucchi <notzed@ximian.com>
|
|
* Dan Winship <danw@ximian.com>
|
|
*
|
|
* Copyright 2000, 2001 Ximian, Inc. (http://www.ximian.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License as
|
|
* published by the Free Software Foundation; either version 2 of the
|
|
* License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
|
* USA
|
|
*/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <stdio.h>
|
|
|
|
/*
|
|
if you want to build the charset map, compile this with something like:
|
|
gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags`
|
|
(plus any -I/-L/-l flags you need for iconv), then run it as
|
|
./a.out > camel-charset-map-private.h
|
|
|
|
Note that the big-endian variant isn't tested...
|
|
|
|
The generated tables work like this:
|
|
|
|
An indirect array for each page of unicode character
|
|
Each array element has an indirect pointer to one of the bytes of
|
|
the generated bitmask.
|
|
*/
|
|
|
|
#ifdef BUILD_MAP
|
|
#include <errno.h>
#include <iconv.h>
#include <stdlib.h>
#include <string.h>

#include <glib.h>
|
|
|
|
/* Candidate charsets for the generated map.  Every "bit" starts out as 0
 * and is filled in by main() with the mask bit assigned to that charset.
 * The list ends with an entry whose name is NULL.
 */
static struct {
	char *name;		/* charset name as passed to iconv_open() */
	unsigned int bit;	/* assigned bit */
} tables[] = {
	/* The 8bit character sets (iso-8859-1 excepted, since it is
	 * special-cased) that both other mailers and the GNOME
	 * environment support.  Entries are tried in the order listed
	 * here, so the more popular ones go first.
	 */

	/* Central/Eastern European */
	{ "iso-8859-2", 0 },
	/* Baltic */
	{ "iso-8859-4", 0 },
	/* Russian */
	{ "koi8-r", 0 },
	/* Russian */
	{ "windows-1251", 0 },
	/* Ukrainian */
	{ "koi8-u", 0 },
	/* Least-popular Russian encoding */
	{ "iso-8859-5", 0 },
	/* Greek */
	{ "iso-8859-7", 0 },
	/* Turkish */
	{ "iso-8859-9", 0 },
	/* Baltic again */
	{ "iso-8859-13", 0 },
	/* New-and-improved iso-8859-1, but most programs that support
	 * this support UTF8
	 */
	{ "iso-8859-15", 0 },

	/* terminator */
	{ NULL, 0 }
};
|
|
|
|
/* One bitmask per 16-bit code point (index = code point).  main() ORs a
 * charset's assigned bit into the entries for the code points it records
 * for that charset, then dumps the array page by page (256 entries each).
 */
unsigned int encoding_map[256 * 256];
|
|
|
|
/* iconv name of the UCS-4 variant used as the conversion target in main():
 * the big-endian form on big-endian hosts, the little-endian form
 * everywhere else.
 */
#if G_BYTE_ORDER != G_BIG_ENDIAN
#define UCS "UCS-4LE"
#else
#define UCS "UCS-4BE"
#endif
|
|
|
|
void main(void)
|
|
{
|
|
int i, j;
|
|
int max, min;
|
|
int bit = 0x01;
|
|
int k;
|
|
int bytes;
|
|
iconv_t cd;
|
|
char in[128];
|
|
guint32 out[128];
|
|
char *inptr, *outptr;
|
|
size_t inlen, outlen;
|
|
|
|
/* dont count the terminator */
|
|
bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
|
|
|
|
for (i = 0; i < 128; i++)
|
|
in[i] = i + 128;
|
|
|
|
for (j = 0; tables[j].name; j++) {
|
|
cd = iconv_open (UCS, tables[j].name);
|
|
inptr = in;
|
|
outptr = (char *)(out);
|
|
inlen = sizeof (in);
|
|
outlen = sizeof (out);
|
|
while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) {
|
|
if (errno == EILSEQ) {
|
|
inptr++;
|
|
inlen--;
|
|
} else {
|
|
printf ("%s\n", strerror (errno));
|
|
exit (1);
|
|
}
|
|
}
|
|
iconv_close (cd);
|
|
|
|
for (i = 0; i < 128 - outlen / 4; i++) {
|
|
encoding_map[i] |= bit;
|
|
encoding_map[out[i]] |= bit;
|
|
}
|
|
|
|
tables[j].bit = bit;
|
|
bit <<= 1;
|
|
}
|
|
|
|
printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
|
|
|
|
for (i=0;i<256;i++) {
|
|
/* first, do we need this block? */
|
|
for (k=0;k<bytes;k++) {
|
|
for (j=0;j<256;j++) {
|
|
if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
|
|
break;
|
|
}
|
|
if (j < 256) {
|
|
/* yes, dump it */
|
|
printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
|
|
for (j=0;j<256;j++) {
|
|
printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
|
|
if (((j+1)&7) == 0 && j<255)
|
|
printf("\n\t");
|
|
}
|
|
printf("\n};\n\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
printf("struct {\n");
|
|
for (k=0;k<bytes;k++) {
|
|
printf("\tunsigned char *bits%d;\n", k);
|
|
}
|
|
printf("} camel_charmap[256] = {\n\t");
|
|
for (i=0;i<256;i++) {
|
|
/* first, do we need this block? */
|
|
printf("{ ");
|
|
for (k=0;k<bytes;k++) {
|
|
for (j=0;j<256;j++) {
|
|
if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
|
|
break;
|
|
}
|
|
if (j < 256) {
|
|
printf("m%02x%x, ", i, k);
|
|
} else {
|
|
printf("0, ");
|
|
}
|
|
}
|
|
printf("}, ");
|
|
if (((i+1)&7) == 0 && i<255)
|
|
printf("\n\t");
|
|
}
|
|
printf("\n};\n\n");
|
|
|
|
printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
|
|
for (j=0;tables[j].name;j++) {
|
|
printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
|
|
}
|
|
printf("};\n\n");
|
|
|
|
printf("#define charset_mask(x) \\\n");
|
|
for (k=0;k<bytes;k++) {
|
|
if (k!=0)
|
|
printf("\t| ");
|
|
else
|
|
printf("\t");
|
|
printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
|
|
if (k<bytes-1)
|
|
printf("\t\\\n");
|
|
}
|
|
printf("\n\n");
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
#include "camel-charset-map.h"
|
|
#include "camel-charset-map-private.h"
|
|
#include <gal/unicode/gunicode.h>
|
|
#include <locale.h>
|
|
#include <string.h>
|
|
#include <glib.h>
|
|
|
|
/* Puts @c into its starting state: level 0 (no character >= 128 seen yet)
 * and a mask with every bit set, which camel_charset_step() then narrows
 * down.
 */
void camel_charset_init(CamelCharset *c)
{
	c->level = 0;
	c->mask = ~0;
}
|
|
|
|
void
|
|
camel_charset_step(CamelCharset *c, const char *in, int len)
|
|
{
|
|
register unsigned int mask;
|
|
register int level;
|
|
const char *inptr = in, *inend = in+len;
|
|
|
|
mask = c->mask;
|
|
level = c->level;
|
|
|
|
/* check what charset a given string will fit in */
|
|
while (inptr < inend) {
|
|
gunichar c;
|
|
const char *newinptr;
|
|
newinptr = g_utf8_next_char(inptr);
|
|
c = g_utf8_get_char(inptr);
|
|
if (newinptr == NULL || !g_unichar_validate (c)) {
|
|
inptr++;
|
|
continue;
|
|
}
|
|
|
|
inptr = newinptr;
|
|
if (c<=0xffff) {
|
|
mask &= charset_mask(c);
|
|
|
|
if (c>=128 && c<256)
|
|
level = MAX(level, 1);
|
|
else if (c>=256)
|
|
level = MAX(level, 2);
|
|
} else {
|
|
mask = 0;
|
|
level = MAX(level, 2);
|
|
}
|
|
}
|
|
|
|
c->mask = mask;
|
|
c->level = level;
|
|
}
|
|
|
|
/* gets the best charset from the mask of chars in it */
|
|
static const char *
|
|
camel_charset_best_mask(unsigned int mask)
|
|
{
|
|
int i;
|
|
|
|
for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) {
|
|
if (camel_charinfo[i].bit & mask)
|
|
return camel_charinfo[i].name;
|
|
}
|
|
return "UTF-8";
|
|
}
|
|
|
|
/* Maps the state accumulated in @charset to a charset name:
 *   level 1 -> "ISO-8859-1"
 *   level 2 -> whatever camel_charset_best_mask() picks for the mask
 *   other   -> NULL
 */
const char *camel_charset_best_name(CamelCharset *charset)
{
	switch (charset->level) {
	case 1:
		return "ISO-8859-1";
	case 2:
		return camel_charset_best_mask(charset->mask);
	default:
		return NULL;
	}
}
|
|
|
|
/* finds the minimum charset for this string NULL means US-ASCII */
|
|
const char *
|
|
camel_charset_best(const char *in, int len)
|
|
{
|
|
CamelCharset charset;
|
|
|
|
camel_charset_init(&charset);
|
|
camel_charset_step(&charset, in, len);
|
|
return camel_charset_best_name(&charset);
|
|
}
|
|
|
|
/*
 * Extracts the codeset from the current locale name.
 *
 * A locale name is typically of the form
 * language[_territory][.codeset][@modifier], where language is an ISO 639
 * language code, territory is an ISO 3166 country code, and codeset is a
 * character set or encoding identifier like ISO-8859-1 or UTF-8.
 *
 * Returns the codeset, lower-cased, in a string allocated with
 * g_strndup(), or NULL if the locale is unset, is "C" or "POSIX" (whose
 * LC_CTYPE part corresponds to 7-bit ASCII), or names no codeset.
 */
char *
camel_charset_locale_name (void)
{
	const char *locale, *dot, *end;
	char *charset;

	/* NOTE(review): when the categories differ, setlocale (LC_ALL, NULL)
	 * may return a composite string; LC_CTYPE might be the better
	 * category to query -- confirm before changing. */
	locale = setlocale (LC_ALL, NULL);

	if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX"))
		return NULL;

	/* the codeset ends at the modifier, or at the end of the name */
	end = strchr (locale, '@');
	if (!end)
		end = locale + strlen (locale);

	/* a '.' that only occurs inside the modifier is not a codeset */
	dot = strchr (locale, '.');
	if (!dot || dot > end)
		return NULL;

	/* copy just the characters between the '.' and the end of the
	 * codeset, excluding both the '.' and any "@modifier" */
	charset = g_strndup (dot + 1, end - (dot + 1));
	g_strdown (charset);

	return charset;
}
|
|
|
|
#endif /* !BUILD_MAP */
|
|
|