I#1849 - markdown-utils: Ignore character encoding hints in HTML when converting to text

The passed-in HTML is already in UTF-8, so ignore any encoding hints it contains instead of re-encoding it. Closes https://gitlab.gnome.org/GNOME/evolution/-/issues/1849
2022-03-30 09:42:02 +02:00 · 2022-03-30 09:42:02 +02:00 · 97cd791810
commit 97cd791810
parent ea233a6b24
1 changed files with 6 additions and 5 deletions
--- a/src/e-util/e-markdown-utils.c
+++ b/src/e-util/e-markdown-utils.c
@@ -571,9 +571,9 @@ e_markdown_utils_html_to_text (const gchar *html,
 	sax.warning = markdown_utils_sax_warning_cb;
 	sax.error = markdown_utils_sax_error_cb;

-	ctxt = htmlCreatePushParserCtxt (&sax, &data, html ? html : "", length, "", XML_CHAR_ENCODING_UTF8);
-
-	htmlParseChunk (ctxt, "", 0, 1);
+	ctxt = htmlCreatePushParserCtxt (&sax, &data, "", 0, "", XML_CHAR_ENCODING_UTF8);
+	htmlCtxtUseOptions (ctxt, HTML_PARSE_RECOVER | HTML_PARSE_NONET | HTML_PARSE_IGNORE_ENC);
+	htmlParseChunk (ctxt, html ? html : "", length, 1);

 	/* The libxml doesn't read elements after </html>, but the quirks can be stored after them,
 	   thus retry after that element end, if it exists. */
@@ -585,8 +585,9 @@ e_markdown_utils_html_to_text (const gchar *html,

 			data.composer_quirks.reading_html_end = TRUE;

-			ctxt2 = htmlCreatePushParserCtxt (&sax, &data, (const gchar *) ctxt->input->cur, html_end_length, "", XML_CHAR_ENCODING_UTF8);
-			htmlParseChunk (ctxt2, "", 0, 1);
+			ctxt2 = htmlCreatePushParserCtxt (&sax, &data, "", 0, "", XML_CHAR_ENCODING_UTF8);
+			htmlCtxtUseOptions (ctxt2, HTML_PARSE_RECOVER | HTML_PARSE_NONET | HTML_PARSE_IGNORE_ENC);
+			htmlParseChunk (ctxt2, (const gchar *) ctxt->input->cur, html_end_length, 1);
 			htmlFreeParserCtxt (ctxt2);
 		}
 	}