Re-read the iconv man page, and treat the return value properly. It returns the number of non-reversible conversions performed, not the number of output characters.

2001-08-27  Not Zed  <NotZed@Ximian.com>

        * camel-mime-part-utils.c (convert_buffer): re-read the iconv man
        page, and treat the return value properly.  It returns the number
        of non-reversible conversions performed, not the number of output
        characters, sigh.
        (check_html_charset): Changed to just take a buffer of data, and
        not the mime parser.
        (simple_data_wrapper_construct_from_parser): Since we don't need
        the charset till we have all the data, search for the charset
        after we've read the data, if we have html data with no charset in
        the header.
        (simple_data_wrapper_construct_from_parser): Remove the
        seekable_source stuff.

        * Re-apply patches from before.

svn path=/trunk/; revision=12481
This commit is contained in:
Not Zed
2001-08-27 06:33:41 +00:00
committed by Michael Zucci
parent bfca62333d
commit 6fbbb872fe
5 changed files with 135 additions and 150 deletions

View File

@ -1,3 +1,20 @@
2001-08-27 Not Zed <NotZed@Ximian.com>
* camel-mime-part-utils.c (convert_buffer): re-read the iconv man
page, and treat the return value properly. It returns the number
of non-reversible conversions performed, not the number of output
characters, sigh.
(check_html_charset): Changed to just take a buffer of data, and
not the mime parser.
(simple_data_wrapper_construct_from_parser): Since we dont need
the charset till we have all the data, search for the charset
after we've read the data, if we have html data with no charset in
the header.
(simple_data_wrapper_construct_from_parser): Remove the
seekable_source stuff.
* Re-apply patches from before.
2001-08-25 Not Zed <NotZed@Ximian.com> 2001-08-25 Not Zed <NotZed@Ximian.com>
["Summarising" and "Synchronising" are spelt with a "s" in ["Summarising" and "Synchronising" are spelt with a "s" in

View File

@ -71,6 +71,7 @@ camel_data_wrapper_init (gpointer object, gpointer klass)
camel_data_wrapper->mime_type = header_content_type_new ("application", "octet-stream"); camel_data_wrapper->mime_type = header_content_type_new ("application", "octet-stream");
camel_data_wrapper->offline = FALSE; camel_data_wrapper->offline = FALSE;
camel_data_wrapper->rawtext = FALSE;
} }
static void static void

View File

@ -48,7 +48,9 @@ struct _CamelDataWrapper
CamelContentType *mime_type; CamelContentType *mime_type;
CamelStream *stream; CamelStream *stream;
gboolean offline;
unsigned int offline:1;
unsigned int rawtext:1;
}; };
typedef struct { typedef struct {

View File

@ -29,6 +29,7 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <unistd.h> #include <unistd.h>
#include <errno.h>
#include "string-utils.h" #include "string-utils.h"
#include "camel-mime-part-utils.h" #include "camel-mime-part-utils.h"
@ -49,40 +50,16 @@
/* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */ /* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */
static const char * static const char *
check_html_charset (CamelMimeParser *mp, CamelMimeFilterBasicType enctype) check_html_charset(char *buffer, int length)
{ {
const char *buf;
off_t offset;
int length;
CamelHTMLParser *hp; CamelHTMLParser *hp;
const char *charset = NULL; const char *charset = NULL;
camel_html_parser_t state; camel_html_parser_t state;
struct _header_content_type *ct; struct _header_content_type *ct;
CamelMimeFilterBasic *fdec = NULL;
/* if we can't find the charset within the first 2k, we ain't gonna find it */
offset = camel_mime_parser_tell(mp);
length = camel_mime_parser_read(mp, &buf, 2048);
d(printf("Checking html for meta content-type: '%.*s'", len, buf));
if (length == 0) {
camel_mime_parser_seek(mp, offset, SEEK_SET);
return NULL;
}
/* if we need to first base64/qp decode, do this here, sigh */ /* if we need to first base64/qp decode, do this here, sigh */
hp = camel_html_parser_new(); hp = camel_html_parser_new();
if (enctype != 0) { camel_html_parser_set_data(hp, buffer, length, TRUE);
int dummy, len;
char *buffer;
fdec = camel_mime_filter_basic_new_type(enctype);
camel_mime_filter_filter((CamelMimeFilter *)fdec, (char *)buf, length, 0, &buffer, &len, &dummy);
camel_html_parser_set_data(hp, buffer, len, TRUE);
} else {
camel_html_parser_set_data(hp, buf, length, TRUE);
}
do { do {
const char *data; const char *data;
@ -96,7 +73,7 @@ check_html_charset (CamelMimeParser *mp, CamelMimeFilterBasicType enctype)
switch(state) { switch(state) {
case CAMEL_HTML_PARSER_ELEMENT: case CAMEL_HTML_PARSER_ELEMENT:
val = camel_html_parser_tag(hp); val = camel_html_parser_tag(hp);
d(printf("Got tag: %s\n", tag)); d(printf("Got tag: %s\n", val));
if (g_strcasecmp(val, "meta") == 0 if (g_strcasecmp(val, "meta") == 0
&& (val = camel_html_parser_attr(hp, "http-equiv")) && (val = camel_html_parser_attr(hp, "http-equiv"))
&& g_strcasecmp(val, "content-type") == 0 && g_strcasecmp(val, "content-type") == 0
@ -115,47 +92,85 @@ check_html_charset (CamelMimeParser *mp, CamelMimeFilterBasicType enctype)
} while (charset == NULL && state != CAMEL_HTML_PARSER_EOF); } while (charset == NULL && state != CAMEL_HTML_PARSER_EOF);
camel_object_unref((CamelObject *)hp); camel_object_unref((CamelObject *)hp);
if (fdec)
camel_object_unref((CamelObject *)fdec);
camel_mime_parser_seek(mp, offset, SEEK_SET);
return charset; return charset;
} }
static GByteArray *convert_buffer(GByteArray *in, const char *to, const char *from)
{
iconv_t ic;
int inlen, outlen, i=2;
char *inbuf, *outbuf;
char *buffer;
GByteArray *out = NULL;
d(printf("converting buffer from %s to %s: '%.*s'\n", from, to, (int)in->len, in->data));
ic = iconv_open(to, from);
if (ic == (iconv_t) -1) {
g_warning("Cannot convert from '%s' to '%s': %s", from, to, strerror(errno));
return NULL;
}
do {
/* make plenty of space? */
outlen = in->len * i + 16;
buffer = g_malloc(outlen);
inbuf = in->data;
inlen = in->len;
outbuf = buffer;
if (iconv(ic, (const char **)&inbuf, &inlen, &outbuf, &outlen) == -1) {
g_free(buffer);
g_warning("conversion failed: %s", strerror(errno));
/* we didn't have enough space */
if (errno == E2BIG && i<6) {
i++;
continue;
}
break;
}
out = g_byte_array_new();
g_byte_array_append(out, buffer, (in->len*i+16) - outlen);
/* close off the conversion */
outbuf = buffer;
outlen = in->len * i + 16;
if (iconv(ic, NULL, 0, &outbuf, &outlen) != -1)
g_byte_array_append(out, buffer, (in->len*i+16) - outlen);
g_free(buffer);
d(printf("converted: '%.*s'\n", (int)out->len, out->data));
break;
} while (1);
iconv_close(ic);
return out;
}
/* simple data wrapper */ /* simple data wrapper */
static void static void
simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp) simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp)
{ {
CamelMimeFilter *fdec = NULL, *fcrlf = NULL, *fch = NULL; CamelMimeFilter *fdec = NULL, *fcrlf = NULL;
int len, decid = -1, crlfid = -1, chrid = -1; int len, decid = -1, crlfid = -1;
struct _header_content_type *ct; struct _header_content_type *ct;
CamelSeekableStream *seekable_source = NULL;
CamelStream *source;
GByteArray *buffer; GByteArray *buffer;
off_t start = 0, end;
char *encoding, *buf; char *encoding, *buf;
const char *charset = NULL;
CamelMimeFilterBasicType enctype = 0; CamelMimeFilterBasicType enctype = 0;
CamelStream *mem;
d(printf("constructing data-wrapper\n")); d(printf("constructing data-wrapper\n"));
/* Ok, try and be smart. If we're storing a small message (typical) convert it,
and store it in memory as we parse it ... if not, throw away the conversion
and scan till the end ... */
/* if we can't seek, dont have a stream/etc, then we must cache it */
source = camel_mime_parser_stream (mp);
if (source) {
camel_object_ref ((CamelObject *)source);
if (CAMEL_IS_SEEKABLE_STREAM (source)) {
seekable_source = CAMEL_SEEKABLE_STREAM (source);
}
}
/* first, work out conversion, if any, required, we dont care about what we dont know about */ /* first, work out conversion, if any, required, we dont care about what we dont know about */
encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "content-transfer-encoding", NULL)); encoding = header_content_encoding_decode(camel_mime_parser_header(mp, "content-transfer-encoding", NULL));
if (encoding) { if (encoding) {
if (!strcasecmp (encoding, "base64")) { if (!strcasecmp(encoding, "base64")) {
d(printf("Adding base64 decoder ...\n")); d(printf("Adding base64 decoder ...\n"));
enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC; enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC;
} else if (!strcasecmp(encoding, "quoted-printable")) { } else if (!strcasecmp(encoding, "quoted-printable")) {
@ -171,106 +186,61 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser
} }
/* If we're doing text, we also need to do CRLF->LF and may have to convert it to UTF8 as well. */ /* If we're doing text, we also need to do CRLF->LF and may have to convert it to UTF8 as well. */
ct = camel_mime_parser_content_type (mp); ct = camel_mime_parser_content_type(mp);
if (header_content_type_is (ct, "text", "*")) { if (header_content_type_is(ct, "text", "*")) {
const char *charset = header_content_type_param (ct, "charset"); charset = header_content_type_param(ct, "charset");
if (fdec) { if (fdec) {
d(printf("Adding CRLF conversion filter\n")); d(printf("Adding CRLF conversion filter\n"));
fcrlf = (CamelMimeFilter *)camel_mime_filter_crlf_new (CAMEL_MIME_FILTER_CRLF_DECODE, fcrlf = (CamelMimeFilter *)camel_mime_filter_crlf_new(CAMEL_MIME_FILTER_CRLF_DECODE,
CAMEL_MIME_FILTER_CRLF_MODE_CRLF_ONLY); CAMEL_MIME_FILTER_CRLF_MODE_CRLF_ONLY);
crlfid = camel_mime_parser_filter_add (mp, fcrlf); crlfid = camel_mime_parser_filter_add(mp, fcrlf);
}
/* Possible Lame Mailer Alert... check the META tags for a charset */
if (!charset && header_content_type_is (ct, "text", "html"))
charset = check_html_charset (mp, enctype);
/* if the charset is not us-ascii or utf-8, then we need to convert to utf-8 */
if (charset && !(g_strcasecmp (charset, "us-ascii") == 0 || g_strcasecmp (charset, "utf-8") == 0)) {
d(printf("Adding conversion filter from %s to UTF-8\n", charset));
fch = (CamelMimeFilter *)camel_mime_filter_charset_new_convert (charset, "UTF-8");
if (fch) {
chrid = camel_mime_parser_filter_add (mp, (CamelMimeFilter *)fch);
} else {
g_warning ("Cannot convert '%s' to 'UTF-8', message display may be corrupt", charset);
}
} }
} }
buffer = g_byte_array_new (); /* read in the entire content */
buffer = g_byte_array_new();
if (seekable_source /* !cache */) { while (camel_mime_parser_step(mp, &buf, &len) != HSCAN_BODY_END) {
start = camel_mime_parser_tell (mp) + seekable_source->bound_start;
}
while (camel_mime_parser_step (mp, &buf, &len) != HSCAN_BODY_END) {
d(printf("appending o/p data: %d: %.*s\n", len, len, buf)); d(printf("appending o/p data: %d: %.*s\n", len, len, buf));
if (buffer) { g_byte_array_append(buffer, buf, len);
if (buffer->len > 20480 && seekable_source) {
/* is this a 'big' message? Yes? We dont want to convert it all then. */
camel_mime_parser_filter_remove (mp, decid);
camel_mime_parser_filter_remove (mp, chrid);
decid = -1;
chrid = -1;
g_byte_array_free (buffer, TRUE);
buffer = NULL;
} else {
g_byte_array_append (buffer, buf, len);
}
}
} }
if (buffer) { /* Possible Lame Mailer Alert... check the META tags for a charset */
CamelStream *mem; if (!charset && header_content_type_is (ct, "text", "html"))
charset = check_html_charset(buffer->data, buffer->len);
d(printf("Small message part, kept in memory!\n")); /* if we need to do charset conversion, see if we can/it works/etc */
if (charset && !(strcasecmp(charset, "us-ascii") == 0
|| strcasecmp(charset, "utf-8") == 0
|| strncasecmp(charset, "x-", 2) == 0)) {
GByteArray *out;
mem = camel_stream_mem_new_with_byte_array (buffer); out = convert_buffer(buffer, "UTF-8", charset);
camel_data_wrapper_construct_from_stream (dw, mem); if (out) {
camel_object_unref ((CamelObject *)mem); /* converted ok, use this data instead */
} else { g_byte_array_free(buffer, TRUE);
CamelStream *sub; buffer = out;
CamelStreamFilter *filter;
d(printf("Big message part, left on disk ...\n"));
end = camel_mime_parser_tell (mp) + seekable_source->bound_start;
sub = camel_seekable_substream_new_with_seekable_stream_and_bounds (seekable_source, start, end);
if (fdec || fch) {
filter = camel_stream_filter_new_with_stream (sub);
if (fdec) {
camel_mime_filter_reset (fdec);
camel_stream_filter_add (filter, fdec);
}
if (fcrlf) {
camel_mime_filter_reset (fcrlf);
camel_stream_filter_add (filter, fcrlf);
}
if (fch) {
camel_mime_filter_reset (fch);
camel_stream_filter_add (filter, fch);
}
camel_data_wrapper_construct_from_stream (dw, (CamelStream *)filter);
camel_object_unref ((CamelObject *)filter);
} else { } else {
camel_data_wrapper_construct_from_stream (dw, sub); g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset);
/* else failed to convert, leave as raw? */
dw->rawtext = TRUE;
/* should we change the content-type header? */
} }
camel_object_unref ((CamelObject *)sub);
} }
camel_mime_parser_filter_remove (mp, decid); d(printf("message part kept in memory!\n"));
camel_mime_parser_filter_remove (mp, crlfid);
camel_mime_parser_filter_remove (mp, chrid); mem = camel_stream_mem_new_with_byte_array(buffer);
camel_data_wrapper_construct_from_stream(dw, mem);
camel_object_unref((CamelObject *)mem);
camel_mime_parser_filter_remove(mp, decid);
camel_mime_parser_filter_remove(mp, crlfid);
if (fdec) if (fdec)
camel_object_unref ((CamelObject *)fdec); camel_object_unref((CamelObject *)fdec);
if (fcrlf) if (fcrlf)
camel_object_unref ((CamelObject *)fcrlf); camel_object_unref((CamelObject *)fcrlf);
if (fch)
camel_object_unref ((CamelObject *)fch);
if (source)
camel_object_unref ((CamelObject *)source);
} }
/* This replaces the data wrapper repository ... and/or could be replaced by it? */ /* This replaces the data wrapper repository ... and/or could be replaced by it? */
@ -295,9 +265,7 @@ camel_mime_part_construct_content_from_parser (CamelMimePart *dw, CamelMimeParse
case HSCAN_MULTIPART: { case HSCAN_MULTIPART: {
CamelDataWrapper *bodypart; CamelDataWrapper *bodypart;
#ifndef NO_WARNINGS /* FIXME: we should use a came-mime-mutlipart, not jsut a camel-multipart, but who cares */
#warning This should use a camel-mime-multipart
#endif
d(printf("Creating multi-part\n")); d(printf("Creating multi-part\n"));
content = (CamelDataWrapper *)camel_multipart_new (); content = (CamelDataWrapper *)camel_multipart_new ();
@ -321,9 +289,6 @@ camel_mime_part_construct_content_from_parser (CamelMimePart *dw, CamelMimeParse
g_warning("Invalid state encountered???: %d", camel_mime_parser_state (mp)); g_warning("Invalid state encountered???: %d", camel_mime_parser_state (mp));
} }
if (content) { if (content) {
#ifndef NO_WARNINGS
#warning there just has got to be a better way ... to transfer the mime-type to the datawrapper
#endif
/* would you believe you have to set this BEFORE you set the content object??? oh my god !!!! */ /* would you believe you have to set this BEFORE you set the content object??? oh my god !!!! */
camel_data_wrapper_set_mime_type_field (content, camel_data_wrapper_set_mime_type_field (content,
camel_mime_part_get_content_type ((CamelMimePart *)dw)); camel_mime_part_get_content_type ((CamelMimePart *)dw));

View File

@ -606,10 +606,10 @@ write_to_stream(CamelDataWrapper *data_wrapper, CamelStream *stream)
break; break;
} }
if (header_content_type_is (mp->content_type, "text", "*")) { if (!data_wrapper->rawtext && header_content_type_is(mp->content_type, "text", "*")) {
charset = header_content_type_param (mp->content_type, "charset"); charset = header_content_type_param(mp->content_type, "charset");
if (charset && !(!g_strcasecmp (charset, "us-ascii") || !g_strcasecmp (charset, "utf-8"))) { if (charset && !(!strcasecmp(charset, "us-ascii") || !strcasecmp(charset, "utf-8"))) {
charenc = (CamelMimeFilter *)camel_mime_filter_charset_new_convert ("UTF-8", charset); charenc = (CamelMimeFilter *)camel_mime_filter_charset_new_convert("UTF-8", charset);
} }
} }