2000-11-24 Not Zed <NotZed@HelixCode.com> * Makefile.am (SUBDIRS): Add tests. * camel-mime-filter-basic.c (filter): Well, I'll add the extra bytes here too, lathough not strictly needed, might save a re-malloc when we get to complete(). * camel-mime-filter-charset.c (filter): Make sure we have room if we only convert very short data. (complete): and here too. * tests/Makefile.am: Initial test harness & tests. Requires gcc for this. * camel-internet-address.c (d): Turn off debug. * camel-charset-map.c (camel_charset_step): Oops, & masks for set intersection, not | them. Dunno how this got even close to working. 2000-11-23 Not Zed <NotZed@HelixCode.com> * camel-mime-filter-basic.c (filter): For base64 encoding, the output size for 0, 1, or 2 bytes of input can exceed input*2, so make sure we account for that as well. (complete): And here. (complete): Similarly for qp encoding, if we have a trailing space, we need some extra bytes (not needed for 'filter()', as any such bytes are stored in state/save). * camel-mime-utils.c (quoted_decode_step): Removed fixme not required. (quoted_encode_close): Dont append a trailing afterall. Otherwise a pass through the encode/decode will grow the message each time. svn path=/trunk/; revision=6656
288 lines
6.3 KiB
C
288 lines
6.3 KiB
C
|
|
#include <stdio.h>
|
|
|
|
/*
|
|
if you want to build the charset map, add the root directory of
|
|
libunicode to the include path and define BUILD_MAP,
|
|
then run it as
|
|
./a.out > camel-charset-map-private.h
|
|
|
|
The tables generated work like this:
|
|
|
|
An indirect array for each page of unicode character
|
|
Each array element has an indirect pointer to one of the bytes of
|
|
the generated bitmask.
|
|
*/
|
|
|
|
#ifdef BUILD_MAP
|
|
#include "iso/iso8859-2.h"
|
|
#include "iso/iso8859-3.h"
|
|
#include "iso/iso8859-4.h"
|
|
#include "iso/iso8859-5.h"
|
|
#include "iso/iso8859-6.h"
|
|
#include "iso/iso8859-7.h"
|
|
#include "iso/iso8859-8.h"
|
|
#include "iso/iso8859-9.h"
|
|
#include "iso/iso8859-10.h"
|
|
#include "iso/iso8859-13.h"
|
|
#include "iso/iso8859-14.h"
|
|
#include "iso/iso8859-15.h"
|
|
#include "iso/windows-1250.h"
|
|
#include "iso/windows-1252.h"
|
|
#include "iso/windows-1257.h"
|
|
#include "iso/koi8-r.h"
|
|
#include "iso/koi8-u.h"
|
|
#include "iso/tis620.2533-1.h"
|
|
#include "iso/armscii-8.h"
|
|
#include "iso/georgian-academy.h"
|
|
#include "iso/georgian-ps.h"
|
|
#include "msft/cp932.h"
|
|
#include "jis/shiftjis.h"
|
|
|
|
static struct {
|
|
unsigned short *table;
|
|
char *name;
|
|
int type; /* type of table */
|
|
unsigned int bit; /* assigned bit */
|
|
} tables[] = {
|
|
{ iso8859_2_table, "iso-8859-2", 0, 0} ,
|
|
{ iso8859_3_table, "iso-8859-3", 0, 0} ,
|
|
{ iso8859_4_table, "iso-8859-4", 0, 0},
|
|
{ iso8859_5_table, "iso-8859-5", 0, 0},
|
|
/* apparently -6 has special digits? */
|
|
{ iso8859_6_table, "iso-8859-6", 0, 0},
|
|
{ iso8859_7_table, "iso-8859-7", 0, 0},
|
|
{ iso8859_8_table, "iso-8859-8", 0, 0},
|
|
{ iso8859_9_table, "iso-8859-9", 0, 0},
|
|
{ iso8859_10_table, "iso-8859-10", 0, 0},
|
|
{ iso8859_13_table, "iso-8859-13", 0, 0},
|
|
{ iso8859_14_table, "iso-8859-14", 0, 0},
|
|
{ iso8859_15_table, "iso-8859-15", 0, 0},
|
|
{ windows_1250_table, "windows-1250", 0, 0},
|
|
{ windows_1252_table, "windows-1252", 0, 0},
|
|
{ windows_1257_table, "windows-1257", 0, 0},
|
|
{ koi8_r_table, "koi8-r", 0, 0},
|
|
{ koi8_u_table, "koi8-u", 0, 0},
|
|
{ tis_620_table, "tis620.2533-1", 0, 0},
|
|
{ armscii_8_table, "armscii-8", 0, 0},
|
|
{ georgian_academy_table, "georgian-academy", 0, 0},
|
|
{ georgian_ps_table, "georgian-ps", 0, 0},
|
|
{ cp932_table, "CP932", 1, 0},
|
|
{ sjis_table, "Shift-JIS", 1, 0},
|
|
{ 0, 0}
|
|
};
|
|
|
|
/*
 * One bitmask per 16-bit value: main() ORs a charset's bit into every
 * entry that the charset's table refers to.
 */
unsigned int encoding_map[256 * 256];

/*
 * Merge a two-level (sparse) table into encoding_map.
 *
 * `table' is an array of 256 pointers; each non-null pointer refers to an
 * array of 256 unsigned shorts.  Every non-zero value found there is used
 * as an index into encoding_map and has `bit' OR-ed into its entry.  Null
 * rows and zero values are skipped.
 */
static void
add_bigmap(unsigned short **table, int bit)
{
	int row;

	for (row = 0; row < 256; row++) {
		const unsigned short *entries = table[row];
		int col;

		if (!entries)
			continue;

		for (col = 0; col < 256; col++) {
			const unsigned short value = entries[col];

			if (value != 0)
				encoding_map[value] |= bit;
		}
	}
}
|
|
|
|
void main(void)
|
|
{
|
|
int i, j;
|
|
unsigned short *tab;
|
|
int max, min;
|
|
int bit = 0x01;
|
|
int k;
|
|
int bytes;
|
|
|
|
#if 0
|
|
/* iso-latin-1 (not needed-detected in code) */
|
|
for (i=0;i<256;i++) {
|
|
encoding_map[i] |= bit;
|
|
}
|
|
bit <<= 1;
|
|
#endif
|
|
|
|
/* dont count the terminator */
|
|
bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
|
|
|
|
/* the other latin charsets */
|
|
for (j=0;tables[j].table;j++) {
|
|
switch (tables[j].type) {
|
|
case 0: /* table from 128-256 */
|
|
tab = tables[j].table;
|
|
for (i=0;i<128;i++) {
|
|
/* 0-127 is the common */
|
|
encoding_map[i] |= bit;
|
|
encoding_map[tab[i]] |= bit;
|
|
}
|
|
break;
|
|
case 1: /* sparse table */
|
|
add_bigmap(tables[j].table, bit);
|
|
break;
|
|
}
|
|
tables[j].bit = bit;
|
|
bit <<= 1;
|
|
}
|
|
|
|
printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
|
|
|
|
for (i=0;i<256;i++) {
|
|
/* first, do we need this block? */
|
|
for (k=0;k<bytes;k++) {
|
|
for (j=0;j<256;j++) {
|
|
if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
|
|
break;
|
|
}
|
|
if (j < 256) {
|
|
/* yes, dump it */
|
|
printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
|
|
for (j=0;j<256;j++) {
|
|
printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
|
|
if (((j+1)&7) == 0 && j<255)
|
|
printf("\n\t");
|
|
}
|
|
printf("\n};\n\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
printf("struct {\n");
|
|
for (k=0;k<bytes;k++) {
|
|
printf("\tunsigned char *bits%d;\n", k);
|
|
}
|
|
printf("} camel_charmap[256] = {\n\t");
|
|
for (i=0;i<256;i++) {
|
|
/* first, do we need this block? */
|
|
printf("{ ");
|
|
for (k=0;k<bytes;k++) {
|
|
for (j=0;j<256;j++) {
|
|
if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
|
|
break;
|
|
}
|
|
if (j < 256) {
|
|
printf("m%02x%x, ", i, k);
|
|
} else {
|
|
printf("0, ");
|
|
}
|
|
}
|
|
printf("}, ");
|
|
if (((i+1)&7) == 0 && i<255)
|
|
printf("\n\t");
|
|
}
|
|
printf("\n};\n\n");
|
|
|
|
printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
|
|
for (j=0;tables[j].table;j++) {
|
|
printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
|
|
}
|
|
printf("};\n\n");
|
|
|
|
printf("#define charset_mask(x) \\\n");
|
|
for (k=0;k<bytes;k++) {
|
|
if (k!=0)
|
|
printf("\t| ");
|
|
else
|
|
printf("\t");
|
|
printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
|
|
if (k<bytes-1)
|
|
printf("\t\\\n");
|
|
}
|
|
printf("\n\n");
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
#include "camel-charset-map.h"
|
|
#include "camel-charset-map-private.h"
|
|
#include <unicode.h>
|
|
#include <glib.h>
|
|
|
|
/*
 * Reset a charset scanner before any text is fed to it.
 *
 * The level starts at 0 and the mask starts with every bit set, so that
 * camel_charset_step() can narrow it down by AND-ing in the mask of each
 * character it sees.
 */
void camel_charset_init(CamelCharset *c)
{
	c->level = 0;
	c->mask = ~0;
}
|
|
|
|
void
|
|
camel_charset_step(CamelCharset *c, const char *in, int len)
|
|
{
|
|
register unsigned int mask;
|
|
register int level;
|
|
const char *inptr = in, *inend = in+len;
|
|
|
|
mask = c->mask;
|
|
level = c->level;
|
|
|
|
/* check what charset a given string will fit in */
|
|
while (inptr < inend) {
|
|
unicode_char_t c;
|
|
const char *newinptr;
|
|
newinptr = unicode_get_utf8(inptr, &c);
|
|
if (newinptr == NULL) {
|
|
inptr++;
|
|
continue;
|
|
}
|
|
inptr = newinptr;
|
|
if (c<=0xffff) {
|
|
mask &= charset_mask(c);
|
|
|
|
if (c>=128 && c<256)
|
|
level = MAX(level, 1);
|
|
else if (c>=256)
|
|
level = MAX(level, 2);
|
|
} else {
|
|
mask = 0;
|
|
level = MAX(level, 2);
|
|
}
|
|
}
|
|
|
|
c->mask = mask;
|
|
c->level = level;
|
|
}
|
|
|
|
/* gets the best charset from the mask of chars in it */
|
|
static const char *
|
|
camel_charset_best_mask(unsigned int mask)
|
|
{
|
|
int i;
|
|
|
|
for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) {
|
|
if (camel_charinfo[i].bit & mask)
|
|
return camel_charinfo[i].name;
|
|
}
|
|
return "UTF-8";
|
|
}
|
|
|
|
/*
 * Translate the state collected by camel_charset_step() into a charset
 * name:
 *   level 1 -> "ISO-8859-1"
 *   level 2 -> whatever camel_charset_best_mask() picks for the mask
 *   other   -> NULL
 */
const char *camel_charset_best_name(CamelCharset *charset)
{
	switch (charset->level) {
	case 1:
		return "ISO-8859-1";
	case 2:
		return camel_charset_best_mask(charset->mask);
	default:
		return NULL;
	}
}
|
|
|
|
/* finds the minimum charset for this string NULL means US-ASCII */
|
|
const char *
|
|
camel_charset_best(const char *in, int len)
|
|
{
|
|
CamelCharset charset;
|
|
|
|
camel_charset_init(&charset);
|
|
camel_charset_step(&charset, in, len);
|
|
return camel_charset_best_name(&charset);
|
|
}
|
|
|
|
|
|
#endif /* !BUILD_MAP */
|
|
|