New function to parse an HTML meta-tag.

2001-07-10 Jeffrey Stedfast <fejj@ximian.com> * camel-mime-utils.c (html_meta_param_list_decode): New function to parse an HTML meta-tag. * camel-mime-part-utils.c (simple_data_wrapper_construct_from_parser): If the Content-Type did not contain a charset parameter and it's also a text/html part, we have 1 last place to look - in the META html tags. *sigh* * camel-mime-message.c (camel_mime_message_get_source): s/gint/unsigned since that's what it should be. svn path=/trunk/; revision=10976
2001-07-10 22:06:56 +00:00
parent 456227c7b4
commit e39d94c5ef
7 changed files with 365 additions and 233 deletions
--- a/camel/ChangeLog
+++ b/camel/ChangeLog
@ -1,3 +1,16 @@
+2001-07-10  Jeffrey Stedfast  <fejj@ximian.com>
+
+	* camel-mime-utils.c (html_meta_param_list_decode): New function
+	to parse an HTML meta-tag.
+
+	* camel-mime-part-utils.c
+	(simple_data_wrapper_construct_from_parser): If the Content-Type
+	did not contain a charset parameter and it's also a text/html
+	part, we have 1 last place to look - in the META html tags. *sigh*
+
+	* camel-mime-message.c (camel_mime_message_get_source):
+	s/gint/unsigned since that's what it should be.
+
 2001-07-09  Jeffrey Stedfast  <fejj@ximian.com>

 	* camel-pgp-context.c (pgp_sign): Forget the passphrase if the
--- a/camel/camel-mime-message.c
+++ b/camel/camel-mime-message.c
@ -171,7 +171,8 @@ camel_mime_message_get_type (void)
 	return camel_mime_message_type;
 }

-static void unref_recipient (gpointer key, gpointer value, gpointer user_data)
+static void
+unref_recipient (gpointer key, gpointer value, gpointer user_data)
 {
 	camel_object_unref (CAMEL_OBJECT (value));
 }
@ -193,6 +194,7 @@ camel_mime_message_set_date(CamelMimeMessage *message,  time_t date, int offset)
 	char *datestr;
 	
 	g_assert(message);
+	
 	if (date == CAMEL_MESSAGE_DATE_CURRENT) {
 		struct tm *local;
 		int tz;
@ -418,10 +420,12 @@ const char *
 camel_mime_message_get_source (CamelMimeMessage *mime_message)
 {
 	const char *src;
+	
 	g_assert(mime_message);
+	
 	src = camel_medium_get_header (CAMEL_MEDIUM (mime_message), "X-Evolution-Source");
 	if (src) {
-		while (*src && isspace ((gint) *src))
+		while (*src && isspace ((unsigned) *src))
 			++src;
 	}
 	return src;
@ -514,15 +518,15 @@ process_header (CamelMedium *medium, const char *header_name, const char *header
 	switch (header_type) {
 	case HEADER_FROM:
 		if (message->from)
-			camel_object_unref((CamelObject *)message->from);
+			camel_object_unref (CAMEL_OBJECT (message->from));
 		message->from = camel_internet_address_new ();
-		camel_address_decode((CamelAddress *)message->from, header_value);
+		camel_address_decode (CAMEL_ADDRESS (message->from), header_value);
 		break;
 	case HEADER_REPLY_TO:
 		if (message->reply_to)
-			camel_object_unref((CamelObject *)message->reply_to);
+			camel_object_unref (CAMEL_OBJECT (message->reply_to));
 		message->reply_to = camel_internet_address_new ();
-		camel_address_decode((CamelAddress *)message->reply_to, header_value);
+		camel_address_decode (CAMEL_ADDRESS (message->reply_to), header_value);
 		break;
 	case HEADER_SUBJECT:
 		g_free(message->subject);
@ -555,6 +559,7 @@ process_header (CamelMedium *medium, const char *header_name, const char *header
 	default:
 		return FALSE;
 	}
+	
 	return TRUE;
 }

@ -733,11 +738,11 @@ find_best_encoding(CamelMimePart *part, CamelBestencRequired required, CamelBest
 		   it as binary data (and take the result we have so far) */
 		
 		if (charenc != NULL) {
-
 			/* otherwise, try another pass, converting to the real charset */
 			
 			camel_mime_filter_reset ((CamelMimeFilter *)bestenc);
-			camel_mime_filter_bestenc_set_flags(bestenc, CAMEL_BESTENC_GET_ENCODING|CAMEL_BESTENC_LF_IS_CRLF|callerflags);
+			camel_mime_filter_bestenc_set_flags (bestenc, CAMEL_BESTENC_GET_ENCODING |
+							     CAMEL_BESTENC_LF_IS_CRLF | callerflags);
 			
 			camel_stream_filter_add (filter, (CamelMimeFilter *)charenc);
 			camel_stream_filter_add (filter, (CamelMimeFilter *)bestenc);
@ -790,7 +795,8 @@ best_encoding(CamelMimeMessage *msg, CamelMimePart *part, void *datap)
 				char *newct;
 				
 				/* FIXME: ick, the part content_type interface needs fixing bigtime */
-				header_content_type_set_param(part->content_type, "charset", charset?charset:"us-ascii");
+				header_content_type_set_param (part->content_type, "charset",
+							       charset ? charset : "us-ascii");
 				newct = header_content_type_format (part->content_type);
 				if (newct) {
 					d(printf("Setting content-type to %s\n", newct));
--- a/camel/camel-mime-part-utils.c
+++ b/camel/camel-mime-part-utils.c
@ -89,6 +89,7 @@ simple_data_wrapper_construct_from_parser(CamelDataWrapper *dw, CamelMimeParser
 	ct = camel_mime_parser_content_type (mp);
 	if (header_content_type_is (ct, "text", "*")) {
 		const char *charset = header_content_type_param (ct, "charset");
+		char *acharset; /* to be alloca'd if needed */
 		
 		if (fdec) {
 			d(printf("Adding CRLF conversion filter\n"));
@ -97,9 +98,67 @@ simple_data_wrapper_construct_from_parser(CamelDataWrapper *dw, CamelMimeParser
 			crlfid = camel_mime_parser_filter_add (mp, fcrlf);
 		}
 		
-		if (charset!=NULL
-		    && !(strcasecmp(charset, "us-ascii")==0
-			 || strcasecmp(charset, "utf-8")==0)) {
+		/* Possible Lame Mailer Alert... check the META tag for a charset */
+		if (!charset && header_content_type_is (ct, "text", "html")) {
+			/* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */
+			const char *data, *slashhead;
+			CamelStream *mem;
+			
+			mem = camel_stream_mem_new ();
+			camel_data_wrapper_write_to_stream (dw, mem);
+			camel_stream_write (mem, "", 1);
+			
+			data = CAMEL_STREAM_MEM (mem)->buffer->data;
+			slashhead = strstrcase (data, "</head");
+			if (!slashhead)
+				slashhead = data + CAMEL_STREAM_MEM (mem)->buffer->len;
+			
+			/* Yea, this is ugly */
+			while (data < slashhead) {
+				struct _header_param *params;
+				char *meta, *metaend;
+				const char *val;
+				
+				meta = strstrcase (data, "<meta");
+				if (!meta)
+					break;
+				
+				metaend = strchr (meta, '>');
+				if (!metaend)
+					metaend = slashhead;
+				
+				params = html_meta_param_list_decode (meta, metaend - meta);
+				if (params) {
+					val = header_param (params, "http-equiv");
+					if (val && !g_strcasecmp (val, "Content-Type")) {
+						struct _header_content_type *content_type;
+						
+						content_type = header_content_type_decode (val);
+						charset = header_content_type_param (content_type, "charset");
+						if (charset) {
+							acharset = alloca (strlen (charset) + 1);
+							strcpy (acharset, charset);
+							charset = acharset;
+						}
+						
+						header_content_type_unref (content_type);
+					}
+					
+					header_param_list_free (params);
+					
+					/* break as soon as we find a charset */
+					if (charset)
+						break;
+				}
+				
+				data = metaend;
+			}
+			
+			camel_object_unref (CAMEL_OBJECT (mem));
+		}
+		
+		/* if the charset is not us-ascii or utf-8, then we need to convert to utf-8 */
+		if (charset && !(g_strcasecmp (charset, "us-ascii") == 0 || g_strcasecmp (charset, "utf-8") == 0)) {
 			d(printf("Adding conversion filter from %s to UTF-8\n", charset));
 			fch = (CamelMimeFilter *)camel_mime_filter_charset_new_convert (charset, "UTF-8");
 			if (fch) {
@ -108,7 +167,6 @@ simple_data_wrapper_construct_from_parser(CamelDataWrapper *dw, CamelMimeParser
 				g_warning ("Cannot convert '%s' to 'UTF-8', message display may be corrupt", charset);
 			}
 		}
-
 	}
 	
 	buffer = g_byte_array_new();
--- a/camel/camel-mime-utils.c
+++ b/camel/camel-mime-utils.c
@ -2719,6 +2719,58 @@ header_param_list_decode(const char *in)
 	return header_decode_param_list(&in);
 }

+/**
+ * html_meta_param_list_decode:
+ * @in: text beginning at the '<' of an HTML META tag
+ * @inlen: number of bytes of @in that belong to the tag
+ *
+ * Parses the name=value attributes of a "<meta ...>" tag into a linked
+ * list of header params, in the order they appear. Parsing stops at
+ * '>', at @in + @inlen, or at the first attribute that is not of the
+ * form name=value.
+ *
+ * The token helpers used here take no end pointer, so @in is presumably
+ * expected to be NUL-terminated as well (the caller in
+ * camel-mime-part-utils.c appends a NUL to its buffer).
+ *
+ * Returns the head of a newly allocated list, to be released with
+ * header_param_list_free(), or NULL if @in is NULL, does not start
+ * with "<meta" (case-insensitive), or yields no attributes.
+ **/
+struct _header_param *
+html_meta_param_list_decode (const char *in, int inlen)
+{
+	struct _header_param *params = NULL, *last = NULL;
+	const char *inptr, *inend;
+	
+	if (in == NULL)
+		return NULL;
+	
+	inptr = in;
+	inend = inptr + inlen;
+	
+	/* the tag must open with "<meta", in any letter case */
+	if (g_strncasecmp (inptr, "<meta", 5) != 0)
+		return NULL;
+	
+	inptr += 5;
+	header_decode_lwsp (&inptr);
+	
+	while (inptr < inend && *inptr != '>') {
+		char *name = NULL, *value = NULL;
+		struct _header_param *param;
+		
+		name = decode_token (&inptr);
+		header_decode_lwsp (&inptr);
+		
+		/* stop on anything that is not "name=": without a name the
+		 * param could not be looked up, and without '=' there is no
+		 * value to read (g_free is a no-op on NULL) */
+		if (name == NULL || *inptr != '=') {
+			g_free (name);
+			break;
+		}
+		
+		/* step over the '=' separator so the value decoder starts on
+		 * the value itself; this also guarantees the cursor advances
+		 * on every iteration, so the loop cannot spin in place */
+		inptr++;
+		
+		value = header_decode_value (&inptr);
+		header_decode_lwsp (&inptr);
+		
+		param = g_malloc (sizeof (struct _header_param));
+		param->next = NULL;
+		param->name = name;
+		param->value = value;
+		
+		/* append, keeping attributes in document order */
+		if (last)
+			last->next = param;
+		else
+			params = param;
+		last = param;
+	}
+	
+	return params;
+}
+
 /* FIXME: I wrote this in a quick & dirty fashion - it may not be 100% correct */
 static char *
 header_encode_param (const unsigned char *in, gboolean *encoded)
--- a/camel/camel-mime-utils.h
+++ b/camel/camel-mime-utils.h
@ -109,6 +109,9 @@ void header_param_list_format_append(GString *out, struct _header_param *p);
 char *header_param_list_format(struct _header_param *p);
 void header_param_list_free(struct _header_param *p);

+/* for decoding META tags in text/html stuff */
+struct _header_param *html_meta_param_list_decode (const char *in, int inlen);
+
 /* Content-Type header */
 struct _header_content_type *header_content_type_new(const char *type, const char *subtype);
 struct _header_content_type *header_content_type_decode(const char *in);
--- a/camel/string-utils.c
+++ b/camel/string-utils.c
@ -209,7 +209,7 @@ strip (gchar *string, gchar c)
 }

 char *
-strstrcase (char *haystack, const char *needle)
+strstrcase (const char *haystack, const char *needle)
 {
 	/* find the needle in the haystack neglecting case */
 	const char *ptr;
--- a/camel/string-utils.h
+++ b/camel/string-utils.h
@ -61,7 +61,7 @@ void    string_unquote          (gchar *string);

 gchar   *strip                  (gchar *string, gchar c);

-char    *strstrcase             (char *haystack, const char *needle);
+char    *strstrcase             (const char *haystack, const char *needle);

 #ifdef __cplusplus
 }