evolution/camel/camel-charset-map.c


#include <stdio.h>

/*
  if you want to build the charset map, add the root directory of
  libunicode to the include path and define BUILD_MAP,
  then run it as
    ./a.out > camel-charset-map-private.h

  The generated tables work like this:

   An indirect array for each page of unicode character
   Each array element has an indirect pointer to one of the bytes of
   the generated bitmask.
*/

#ifdef BUILD_MAP
#include "iso/iso8859-2.h"
#include "iso/iso8859-3.h"
#include "iso/iso8859-4.h"
#include "iso/iso8859-5.h"
#include "iso/iso8859-6.h"
#include "iso/iso8859-7.h"
#include "iso/iso8859-8.h"
#include "iso/iso8859-9.h"
#include "iso/iso8859-10.h"
#include "iso/iso8859-13.h"
#include "iso/iso8859-14.h"
#include "iso/iso8859-15.h"
#include "iso/windows-1250.h"
#include "iso/windows-1252.h"
#include "iso/windows-1257.h"
#include "iso/koi8-r.h"
#include "iso/koi8-u.h"
#include "iso/tis620.2533-1.h"
#include "iso/armscii-8.h"
#include "iso/georgian-academy.h"
#include "iso/georgian-ps.h"
#include "msft/cp932.h"
#include "jis/shiftjis.h"

/*
 * Source charsets the map is generated from.  main() walks this array
 * until it reaches the entry whose table pointer is null, assigning one
 * mask bit per entry in array order.
 */
static struct {
	unsigned short *table;	/* conversion table from the included header */
	char *name;		/* charset name emitted into camel_charinfo */
	int type;		/* 0: table from 128-256, 1: sparse table */
	unsigned int bit;	/* assigned bit, filled in by main() */
} tables[] = {
	{ iso8859_2_table,        "iso-8859-2",       0, 0 },
	{ iso8859_3_table,        "iso-8859-3",       0, 0 },
	{ iso8859_4_table,        "iso-8859-4",       0, 0 },
	{ iso8859_5_table,        "iso-8859-5",       0, 0 },
	/* apparently -6 has special digits? */
	{ iso8859_6_table,        "iso-8859-6",       0, 0 },
	{ iso8859_7_table,        "iso-8859-7",       0, 0 },
	{ iso8859_8_table,        "iso-8859-8",       0, 0 },
	{ iso8859_9_table,        "iso-8859-9",       0, 0 },
	{ iso8859_10_table,       "iso-8859-10",      0, 0 },
	{ iso8859_13_table,       "iso-8859-13",      0, 0 },
	{ iso8859_14_table,       "iso-8859-14",      0, 0 },
	{ iso8859_15_table,       "iso-8859-15",      0, 0 },
	{ windows_1250_table,     "windows-1250",     0, 0 },
	{ windows_1252_table,     "windows-1252",     0, 0 },
	{ windows_1257_table,     "windows-1257",     0, 0 },
	{ koi8_r_table,           "koi8-r",           0, 0 },
	{ koi8_u_table,           "koi8-u",           0, 0 },
	{ tis_620_table,          "tis620.2533-1",    0, 0 },
	{ armscii_8_table,        "armscii-8",        0, 0 },
	{ georgian_academy_table, "georgian-academy", 0, 0 },
	{ georgian_ps_table,      "georgian-ps",      0, 0 },
	{ cp932_table,            "CP932",            1, 0 },
	{ sjis_table,             "Shift-JIS",        1, 0 },
	/* terminator: a null table pointer ends the list */
	{ NULL, NULL, 0, 0 }
};

/* One entry per 16-bit character value; each entry is a bitmask of the
   charsets (one bit per tables[] entry) that contain that character. */
unsigned int encoding_map[256 * 256];

/*
 * Merge a two-level (sparse) table into encoding_map.
 *
 * table: array of 256 page pointers; a null page is skipped, a non-null
 *        page is read as 256 unsigned short entries.
 * bit:   mask OR-ed into encoding_map[] for every non-zero entry found.
 *
 * Zero entries are treated as "no mapping" and leave the map untouched.
 */
static void
add_bigmap(unsigned short **table, int bit)
{
	int row;

	for (row = 0; row < 256; row++) {
		const unsigned short *page = table[row];
		int col;

		if (!page)
			continue;

		for (col = 0; col < 256; col++) {
			unsigned short value = page[col];

			if (value != 0)
				encoding_map[value] |= bit;
		}
	}
}

/*
 * Returns non-zero when at least one character of 256-character page
 * `page` has a bit set within byte number `byte` of its encoding_map
 * entry, i.e. when a bitmask array must be emitted for that page/byte.
 */
static int
page_uses_byte(int page, int byte)
{
	/* unsigned constant: a signed 0xff shifted into the top byte overflows */
	unsigned int mask = 0xffU << (byte * 8);
	int j;

	for (j = 0; j < 256; j++) {
		if ((encoding_map[page * 256 + j] & mask) != 0)
			return 1;
	}
	return 0;
}

/*
 * Generator entry point (BUILD_MAP builds only).
 *
 * Assigns one mask bit to every entry of tables[], records in
 * encoding_map[] which charsets contain each character, and then writes
 * to stdout:
 *   - one "m<page><byte>" array per page/byte pair that has any bit set,
 *   - the camel_charmap[] page index pointing at those arrays,
 *   - the camel_charinfo[] name/bit list,
 *   - the charset_mask() lookup macro.
 *
 * Returns 0 so that the exit status is well defined when the output is
 * redirected into camel-charset-map-private.h from a build script.
 */
int main(void)
{
	int i, j;
	unsigned short *tab;
	unsigned int bit = 0x01;	/* unsigned: shifted left once per table */
	int k;
	int bytes;

#if 0
	/* iso-latin-1 (not needed-detected in code) */
	for (i=0;i<256;i++) {
		encoding_map[i] |= bit;
	}
	bit <<= 1;
#endif

	/* bytes needed to hold one bit per table; the element count includes
	   the terminator entry, hence the extra -1 in the round-up */
	bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;

	/* the other latin charsets */
	for (j = 0; tables[j].table; j++) {
		switch (tables[j].type) {
		case 0:		/* table from 128-256 */
			tab = tables[j].table;
			for (i = 0; i < 128; i++) {
				/* 0-127 is the common */
				encoding_map[i] |= bit;
				encoding_map[tab[i]] |= bit;
			}
			break;
		case 1:		/* sparse table */
			/* type 1 entries are really arrays of page pointers;
			   make the reinterpretation explicit */
			add_bigmap((unsigned short **) tables[j].table, bit);
			break;
		}
		tables[j].bit = bit;
		bit <<= 1;
	}

	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");

	/* emit a 256-entry bitmask array for every page/byte that is in use */
	for (i = 0; i < 256; i++) {
		for (k = 0; k < bytes; k++) {
			if (!page_uses_byte(i, k))
				continue;

			printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
			for (j = 0; j < 256; j++) {
				printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
				if (((j+1)&7) == 0 && j<255)
					printf("\n\t");
			}
			printf("\n};\n\n");
		}
	}

	/* page index: one pointer per mask byte, 0 where no array was emitted */
	printf("struct {\n");
	for (k = 0; k < bytes; k++) {
		printf("\tunsigned char *bits%d;\n", k);
	}
	printf("} camel_charmap[256] = {\n\t");
	for (i = 0; i < 256; i++) {
		printf("{ ");
		for (k = 0; k < bytes; k++) {
			if (page_uses_byte(i, k)) {
				printf("m%02x%x, ", i, k);
			} else {
				printf("0, ");
			}
		}
		printf("}, ");
		if (((i+1)&7) == 0 && i<255)
			printf("\n\t");
	}
	printf("\n};\n\n");

	/* charset names with their assigned bits, in tables[] order */
	printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
	for (j = 0; tables[j].table; j++) {
		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
	}
	printf("};\n\n");

	/* lookup macro: recombine the per-byte arrays into one mask value */
	printf("#define charset_mask(x) \\\n");
	for (k = 0; k < bytes; k++) {
		if (k != 0)
			printf("\t| ");
		else
			printf("\t");
		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
		if (k < bytes-1)
			printf("\t\\\n");
	}
	printf("\n\n");

	return 0;
}

#else

#include "camel-charset-map.h"
#include "camel-charset-map-private.h"
#include <unicode.h>
#include <glib.h>

/*
 * Reset a charset detection state before feeding it text.
 *
 * The level starts at 0 and the mask starts with every bit set;
 * camel_charset_step() only ever raises the level and clears mask bits.
 */
void
camel_charset_init(CamelCharset *c)
{
	c->level = 0;
	c->mask = ~0;
}

/*
 * Feed `len` bytes starting at `in` into the detection state `c`.
 *
 * Each character decoded by unicode_get_utf8() narrows c->mask to the
 * charsets reported by charset_mask() and may raise c->level:
 *   128..255      -> level at least 1
 *   256 and above -> level at least 2
 * Characters above 0xffff are outside the generated map, so they clear
 * the mask entirely.  When unicode_get_utf8() returns NULL the current
 * byte is skipped and decoding resumes at the next one.
 */
void
camel_charset_step(CamelCharset *c, const char *in, int len)
{
	const char *cur = in;
	const char *end = in + len;
	unsigned int mask = c->mask;
	int level = c->level;

	/* check what charset a given string will fit in */
	while (cur < end) {
		unicode_char_t ch;
		const char *next = unicode_get_utf8(cur, &ch);

		if (next == NULL) {
			/* could not decode here: step over a single byte */
			cur++;
			continue;
		}
		cur = next;

		if (ch > 0xffff) {
			/* not covered by the map: no table charset can hold it */
			mask = 0;
			level = MAX(level, 2);
			continue;
		}

		mask &= charset_mask(ch);

		if (ch >= 256)
			level = MAX(level, 2);
		else if (ch >= 128)
			level = MAX(level, 1);
	}

	c->mask = mask;
	c->level = level;
}

/*
 * Pick a charset name for a mask accumulated by camel_charset_step().
 *
 * camel_charinfo[] is searched in order and the first entry whose bit is
 * present in `mask` wins; "UTF-8" is returned when no entry matches.
 */
static const char *
camel_charset_best_mask(unsigned int mask)
{
	const size_t count = sizeof(camel_charinfo) / sizeof(camel_charinfo[0]);
	size_t idx;

	for (idx = 0; idx < count; idx++) {
		if ((camel_charinfo[idx].bit & mask) != 0)
			return camel_charinfo[idx].name;
	}

	return "UTF-8";
}

/*
 * Translate a detection state into a charset name.
 *
 * level 2 -> name chosen from the mask by camel_charset_best_mask()
 * level 1 -> "ISO-8859-1"
 * other   -> NULL
 */
const char *
camel_charset_best_name(CamelCharset *charset)
{
	if (charset->level == 2)
		return camel_charset_best_mask(charset->mask);

	if (charset->level == 1)
		return "ISO-8859-1";

	return NULL;
}

/*
 * One-shot helper: find the minimum charset for the `len` bytes at `in`.
 * Runs init, a single step and the name lookup on a temporary state.
 * A NULL result means US-ASCII.
 */
const char *
camel_charset_best(const char *in, int len)
{
	CamelCharset state;

	camel_charset_init(&state);
	camel_charset_step(&state, in, len);

	return camel_charset_best_name(&state);
}


#endif /* !BUILD_MAP */