
2002-02-28 Not Zed <NotZed@Ximian.com> * camel-mime-utils.c (header_fold): Use the FOLD_SIZE as a recommended folding size, but add a new FOLD_MAX_SIZE (=998, the smtp max line size) as the hard limit for any output. svn path=/trunk/; revision=15866
499 lines
14 KiB
C
499 lines
14 KiB
C
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
|
|
/*
|
|
* Authors: Jeffrey Stedfast <fejj@ximian.com>
|
|
* Michael Zucchi <NotZed@Ximian.com>
|
|
*
|
|
* Copyright 2000 Ximian, Inc. (www.ximian.com)
|
|
* Copyright 2001 Ximian Inc. (www.ximian.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of version 2 of the GNU General Public
|
|
* License as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public
|
|
* License along with this program; if not, write to the
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
* Boston, MA 02111-1307, USA.
|
|
*
|
|
*/
|
|
|
|
/* (from glibc headers:
|
|
POSIX says that <sys/types.h> must be included (by the caller) before <regex.h>. */
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <sys/types.h>
|
|
#include <regex.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
|
|
#include "camel-exception.h"
|
|
#include "camel-mime-message.h"
|
|
#include "camel-multipart.h"
|
|
#include "camel-stream-mem.h"
|
|
#include "e-util/e-sexp.h"
|
|
|
|
#include "camel-search-private.h"
|
|
|
|
#include <gal/unicode/gunicode.h>
|
|
|
|
#define d(x)
|
|
|
|
/* builds the regex into pattern */
|
|
/* taken from camel-folder-search, with added isregex & exception parameter */
|
|
/* Basically, we build a new regex, either based on subset regex's, or substrings,
|
|
that can be executed once over the whoel body, to match anything suitable.
|
|
This is more efficient than multiple searches, and probably most (naive) strstr
|
|
implementations, over long content.
|
|
|
|
A small issue is that case-insenstivity wont work entirely correct for utf8 strings. */
|
|
int
|
|
camel_search_build_match_regex (regex_t *pattern, camel_search_flags_t type, int argc,
|
|
struct _ESExpResult **argv, CamelException *ex)
|
|
{
|
|
GString *match = g_string_new("");
|
|
int c, i, count=0, err;
|
|
char *word;
|
|
int flags;
|
|
|
|
/* build a regex pattern we can use to match the words, we OR them together */
|
|
if (argc>1)
|
|
g_string_append_c (match, '(');
|
|
for (i = 0; i < argc; i++) {
|
|
if (argv[i]->type == ESEXP_RES_STRING) {
|
|
if (count > 0)
|
|
g_string_append_c (match, '|');
|
|
|
|
word = argv[i]->value.string;
|
|
if (type & CAMEL_SEARCH_MATCH_REGEX) {
|
|
/* no need to escape because this should already be a valid regex */
|
|
g_string_append (match, word);
|
|
} else {
|
|
/* escape any special chars (not sure if this list is complete) */
|
|
if (type & CAMEL_SEARCH_MATCH_START)
|
|
g_string_append_c (match, '^');
|
|
while ((c = *word++)) {
|
|
if (strchr ("*\\.()[]^$+", c) != NULL) {
|
|
g_string_append_c (match, '\\');
|
|
}
|
|
g_string_append_c (match, c);
|
|
}
|
|
if (type & CAMEL_SEARCH_MATCH_END)
|
|
g_string_append_c (match, '^');
|
|
}
|
|
count++;
|
|
} else {
|
|
g_warning("Invalid type passed to body-contains match function");
|
|
}
|
|
}
|
|
if (argc > 1)
|
|
g_string_append_c (match, ')');
|
|
flags = REG_EXTENDED|REG_NOSUB;
|
|
if (type & CAMEL_SEARCH_MATCH_ICASE)
|
|
flags |= REG_ICASE;
|
|
if (type & CAMEL_SEARCH_MATCH_NEWLINE)
|
|
flags |= REG_NEWLINE;
|
|
err = regcomp (pattern, match->str, flags);
|
|
if (err != 0) {
|
|
/* regerror gets called twice to get the full error string
|
|
length to do proper posix error reporting */
|
|
int len = regerror (err, pattern, 0, 0);
|
|
char *buffer = g_malloc0 (len + 1);
|
|
|
|
regerror (err, pattern, buffer, len);
|
|
camel_exception_setv (ex, CAMEL_EXCEPTION_SYSTEM,
|
|
_("Regular expression compilation failed: %s: %s"),
|
|
match->str, buffer);
|
|
|
|
regfree (pattern);
|
|
}
|
|
d(printf("Built regex: '%s'\n", match->str));
|
|
g_string_free (match, TRUE);
|
|
|
|
return err;
|
|
}
|
|
|
|
static unsigned char soundex_table[256] = {
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0,
|
|
49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0,
|
|
0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0,
|
|
49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
};
|
|
|
|
static void
|
|
soundexify (const gchar *sound, gchar code[5])
|
|
{
|
|
guchar *c, last = '\0';
|
|
gint n;
|
|
|
|
for (c = (guchar *) sound; *c && !isalpha (*c); c++);
|
|
code[0] = toupper (*c);
|
|
memset (code + 1, '0', 3);
|
|
for (n = 1; *c && n < 5; c++) {
|
|
guchar ch = soundex_table[*c];
|
|
|
|
if (ch && ch != last) {
|
|
code[n++] = ch;
|
|
last = ch;
|
|
}
|
|
}
|
|
code[4] = '\0';
|
|
}
|
|
|
|
static gboolean
|
|
header_soundex (const char *header, const char *match)
|
|
{
|
|
char mcode[5], hcode[5];
|
|
const char *p;
|
|
char c;
|
|
GString *word;
|
|
int truth = FALSE;
|
|
|
|
soundexify (match, mcode);
|
|
|
|
/* split the header into words, and soundexify and compare each one */
|
|
/* FIXME: Should this convert to utf8, and split based on that, and what not?
|
|
soundex only makes sense for us-ascii though ... */
|
|
|
|
word = g_string_new("");
|
|
p = header;
|
|
do {
|
|
c = *p++;
|
|
if (c == 0 || isspace (c)) {
|
|
if (word->len > 0) {
|
|
soundexify (word->str, hcode);
|
|
if (strcmp (hcode, mcode) == 0)
|
|
truth = TRUE;
|
|
}
|
|
g_string_truncate (word, 0);
|
|
} else if (isalpha (c))
|
|
g_string_append_c (word, c);
|
|
} while (c && !truth);
|
|
g_string_free (word, TRUE);
|
|
|
|
return truth;
|
|
}
|
|
|
|
static gunichar
|
|
utf8_get (const char **inp)
|
|
{
|
|
const unsigned char *p = *inp;
|
|
gunichar c;
|
|
|
|
if (p == NULL)
|
|
return 0;
|
|
|
|
c = g_utf8_get_char (p);
|
|
*inp = g_unichar_validate (c) ? g_utf8_next_char (p) : NULL;
|
|
|
|
return c;
|
|
}
|
|
|
|
static const char *
|
|
camel_ustrstrcase (const char *haystack, const char *needle)
|
|
{
|
|
gunichar *nuni, *puni;
|
|
gunichar u;
|
|
const char *p;
|
|
|
|
g_return_val_if_fail (haystack != NULL, NULL);
|
|
g_return_val_if_fail (needle != NULL, NULL);
|
|
|
|
if (strlen (needle) == 0)
|
|
return haystack;
|
|
if (strlen (haystack) == 0)
|
|
return NULL;
|
|
|
|
puni = nuni = alloca (sizeof (gunichar) * strlen (needle));
|
|
|
|
p = needle;
|
|
while ((u = utf8_get (&p)))
|
|
*puni++ = g_unichar_tolower (u);
|
|
|
|
/* NULL means there was illegal utf-8 sequence */
|
|
if (!p)
|
|
return NULL;
|
|
|
|
p = haystack;
|
|
while ((u = utf8_get (&p))) {
|
|
gunichar c;
|
|
|
|
c = g_unichar_tolower (u);
|
|
/* We have valid stripped char */
|
|
if (c == nuni[0]) {
|
|
const gchar *q = p;
|
|
gint npos = 1;
|
|
|
|
while (nuni + npos < puni) {
|
|
u = utf8_get (&q);
|
|
if (!q || !u)
|
|
return NULL;
|
|
|
|
c = g_unichar_tolower (u);
|
|
if (c != nuni[npos])
|
|
break;
|
|
|
|
npos++;
|
|
}
|
|
|
|
if (nuni + npos == puni)
|
|
return p;
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
#define CAMEL_SEARCH_COMPARE(x, y, z) G_STMT_START { \
|
|
if ((x) == (z)) { \
|
|
if ((y) == (z)) \
|
|
return 0; \
|
|
else \
|
|
return -1; \
|
|
} else if ((y) == (z)) \
|
|
return 1; \
|
|
} G_STMT_END
|
|
|
|
static int
|
|
camel_ustrcasecmp (const char *s1, const char *s2)
|
|
{
|
|
gunichar u1, u2 = 0;
|
|
|
|
CAMEL_SEARCH_COMPARE (s1, s2, NULL);
|
|
|
|
u1 = utf8_get (&s1);
|
|
u2 = utf8_get (&s2);
|
|
while (u1 && u2) {
|
|
u1 = g_unichar_tolower (u1);
|
|
u2 = g_unichar_tolower (u2);
|
|
if (u1 < u2)
|
|
return -1;
|
|
else if (u1 > u2)
|
|
return 1;
|
|
|
|
u1 = utf8_get (&s1);
|
|
u2 = utf8_get (&s2);
|
|
}
|
|
|
|
/* end of one of the strings ? */
|
|
CAMEL_SEARCH_COMPARE (u1, u2, 0);
|
|
|
|
/* if we have invalid utf8 sequence ? */
|
|
CAMEL_SEARCH_COMPARE (s1, s2, NULL);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
camel_ustrncasecmp (const char *s1, const char *s2, size_t len)
|
|
{
|
|
gunichar u1, u2 = 0;
|
|
|
|
CAMEL_SEARCH_COMPARE (s1, s2, NULL);
|
|
|
|
u1 = utf8_get (&s1);
|
|
u2 = utf8_get (&s2);
|
|
while (len > 0 && u1 && u2) {
|
|
u1 = g_unichar_tolower (u1);
|
|
u2 = g_unichar_tolower (u2);
|
|
if (u1 < u2)
|
|
return -1;
|
|
else if (u1 > u2)
|
|
return 1;
|
|
|
|
len--;
|
|
u1 = utf8_get (&s1);
|
|
u2 = utf8_get (&s2);
|
|
}
|
|
|
|
if (len == 0)
|
|
return 0;
|
|
|
|
/* end of one of the strings ? */
|
|
CAMEL_SEARCH_COMPARE (u1, u2, 0);
|
|
|
|
/* if we have invalid utf8 sequence ? */
|
|
CAMEL_SEARCH_COMPARE (s1, s2, NULL);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* value is the match value suitable for exact match if required */
|
|
static int
|
|
header_match(const char *value, const char *match, camel_search_match_t how)
|
|
{
|
|
const char *p;
|
|
int vlen, mlen;
|
|
|
|
if (how == CAMEL_SEARCH_MATCH_SOUNDEX)
|
|
return header_soundex (value, match);
|
|
|
|
vlen = strlen(value);
|
|
mlen = strlen(match);
|
|
if (vlen < mlen)
|
|
return FALSE;
|
|
|
|
/* from dan the man, if we have mixed case, perform a case-sensitive match,
|
|
otherwise not */
|
|
p = match;
|
|
while (*p) {
|
|
if (isupper(*p)) {
|
|
switch (how) {
|
|
case CAMEL_SEARCH_MATCH_EXACT:
|
|
return strcmp(value, match) == 0;
|
|
case CAMEL_SEARCH_MATCH_CONTAINS:
|
|
return strstr(value, match) != NULL;
|
|
case CAMEL_SEARCH_MATCH_STARTS:
|
|
return strncmp(value, match, mlen) == 0;
|
|
case CAMEL_SEARCH_MATCH_ENDS:
|
|
return strcmp(value + vlen - mlen, match) == 0;
|
|
default:
|
|
break;
|
|
}
|
|
return FALSE;
|
|
}
|
|
p++;
|
|
}
|
|
|
|
switch (how) {
|
|
case CAMEL_SEARCH_MATCH_EXACT:
|
|
return camel_ustrcasecmp(value, match) == 0;
|
|
case CAMEL_SEARCH_MATCH_CONTAINS:
|
|
return camel_ustrstrcase(value, match) != NULL;
|
|
case CAMEL_SEARCH_MATCH_STARTS:
|
|
return camel_ustrncasecmp(value, match, mlen) == 0;
|
|
case CAMEL_SEARCH_MATCH_ENDS:
|
|
return camel_ustrcasecmp(value + vlen - mlen, match) == 0;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* searhces for match inside value, if match is mixed case, hten use case-sensitive,
|
|
else insensitive */
|
|
gboolean
|
|
camel_search_header_match (const char *value, const char *match, camel_search_match_t how, camel_search_t type, const char *default_charset)
|
|
{
|
|
const char *name, *addr;
|
|
int truth = FALSE, i;
|
|
CamelInternetAddress *cia;
|
|
char *v, *vdom, *mdom;
|
|
|
|
while (*value && isspace (*value))
|
|
value++;
|
|
|
|
switch(type) {
|
|
case CAMEL_SEARCH_TYPE_ENCODED:
|
|
v = header_decode_string(value, default_charset); /* FIXME: Find header charset */
|
|
truth = header_match(v, match, how);
|
|
g_free(v);
|
|
break;
|
|
case CAMEL_SEARCH_TYPE_MLIST:
|
|
/* Special mailing list old-version domain hack
|
|
If one of the mailing list names doesn't have an @ in it, its old-style, so
|
|
only match against the pre-domain part, which should be common */
|
|
|
|
vdom = strchr(value, '@');
|
|
mdom = strchr(match, '@');
|
|
if (mdom == NULL && vdom != NULL) {
|
|
v = alloca(vdom-value+1);
|
|
memcpy(v, value, vdom-value);
|
|
v[vdom-value] = 0;
|
|
value = (char *)v;
|
|
} else if (mdom != NULL && vdom == NULL) {
|
|
v = alloca(mdom-match+1);
|
|
memcpy(v, match, mdom-match);
|
|
v[mdom-match] = 0;
|
|
match = (char *)v;
|
|
}
|
|
/* Falls through */
|
|
case CAMEL_SEARCH_TYPE_ASIS:
|
|
truth = header_match(value, match, how);
|
|
break;
|
|
case CAMEL_SEARCH_TYPE_ADDRESS_ENCODED:
|
|
case CAMEL_SEARCH_TYPE_ADDRESS:
|
|
/* possible simple case to save some work if we can */
|
|
if (header_match(value, match, how))
|
|
return TRUE;
|
|
|
|
/* Now we decode any addresses, and try asis matches on name and address parts */
|
|
cia = camel_internet_address_new();
|
|
if (type == CAMEL_SEARCH_TYPE_ADDRESS_ENCODED)
|
|
camel_address_decode((CamelAddress *)cia, value);
|
|
else
|
|
camel_address_unformat((CamelAddress *)cia, value);
|
|
|
|
for (i=0; !truth && camel_internet_address_get(cia, i, &name, &addr);i++)
|
|
truth = (name && header_match(name, match, how)) || (addr && header_match(addr, match, how));
|
|
|
|
camel_object_unref((CamelObject *)cia);
|
|
break;
|
|
}
|
|
|
|
return truth;
|
|
}
|
|
|
|
/* performs a 'slow' content-based match */
|
|
/* there is also an identical copy of this in camel-filter-search.c */
|
|
gboolean
|
|
camel_search_message_body_contains (CamelDataWrapper *object, regex_t *pattern)
|
|
{
|
|
CamelDataWrapper *containee;
|
|
int truth = FALSE;
|
|
int parts, i;
|
|
|
|
containee = camel_medium_get_content_object (CAMEL_MEDIUM (object));
|
|
|
|
if (containee == NULL)
|
|
return FALSE;
|
|
|
|
/* TODO: I find it odd that get_part and get_content_object do not
|
|
add a reference, probably need fixing for multithreading */
|
|
|
|
/* using the object types is more accurate than using the mime/types */
|
|
if (CAMEL_IS_MULTIPART (containee)) {
|
|
parts = camel_multipart_get_number (CAMEL_MULTIPART (containee));
|
|
for (i = 0; i < parts && truth == FALSE; i++) {
|
|
CamelDataWrapper *part = (CamelDataWrapper *)camel_multipart_get_part (CAMEL_MULTIPART (containee), i);
|
|
if (part)
|
|
truth = camel_search_message_body_contains (part, pattern);
|
|
}
|
|
} else if (CAMEL_IS_MIME_MESSAGE (containee)) {
|
|
/* for messages we only look at its contents */
|
|
truth = camel_search_message_body_contains ((CamelDataWrapper *)containee, pattern);
|
|
} else if (header_content_type_is(CAMEL_DATA_WRAPPER (containee)->mime_type, "text", "*")) {
|
|
/* for all other text parts, we look inside, otherwise we dont care */
|
|
CamelStreamMem *mem = (CamelStreamMem *)camel_stream_mem_new ();
|
|
|
|
camel_data_wrapper_write_to_stream (containee, CAMEL_STREAM (mem));
|
|
camel_stream_write (CAMEL_STREAM (mem), "", 1);
|
|
truth = regexec (pattern, mem->buffer->data, 0, NULL, 0) == 0;
|
|
camel_object_unref (CAMEL_OBJECT (mem));
|
|
}
|
|
|
|
return truth;
|
|
}
|
|
|