
2003-04-16 Jeffrey Stedfast <fejj@ximian.com> * camel-url-scanner.c (camel_url_web_end): URLs are unlikely to end with punctuation or closing braces, so strip any of those off the end of the URL. Fixes bug #41461. * tests/mime-filter/data/html.1.out: Removed a trailing \n at the end of the file; it was incorrect (since our input file does not contain one). * camel-mime-filter-tohtml.c (html_convert): Only output a <br> if we found an eoln in the input. Along the same lines, don't write a '\n' to the output buffer unless we've encountered that eoln in the input. Fixes bug #41407. svn path=/trunk/; revision=20882
451 lines
9.9 KiB
C
451 lines
9.9 KiB
C
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
|
|
/*
|
|
* Authors: Jeffrey Stedfast <fejj@ximian.com>
|
|
*
|
|
* Copyright 2002 Ximian, Inc. (www.ximian.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
|
|
*
|
|
*/
|
|
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <string.h>
|
|
|
|
#include "e-util/e-trie.h"
|
|
#include "camel-url-scanner.h"
|
|
|
|
|
|
struct _CamelUrlScanner {
|
|
GPtrArray *patterns;
|
|
ETrie *trie;
|
|
};
|
|
|
|
|
|
CamelUrlScanner *
|
|
camel_url_scanner_new (void)
|
|
{
|
|
CamelUrlScanner *scanner;
|
|
|
|
scanner = g_new (CamelUrlScanner, 1);
|
|
scanner->patterns = g_ptr_array_new ();
|
|
scanner->trie = e_trie_new (TRUE);
|
|
|
|
return scanner;
|
|
}
|
|
|
|
|
|
void
|
|
camel_url_scanner_free (CamelUrlScanner *scanner)
|
|
{
|
|
g_return_if_fail (scanner != NULL);
|
|
|
|
g_ptr_array_free (scanner->patterns, TRUE);
|
|
e_trie_free (scanner->trie);
|
|
g_free (scanner);
|
|
}
|
|
|
|
|
|
void
|
|
camel_url_scanner_add (CamelUrlScanner *scanner, urlpattern_t *pattern)
|
|
{
|
|
g_return_if_fail (scanner != NULL);
|
|
|
|
e_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len);
|
|
g_ptr_array_add (scanner->patterns, pattern);
|
|
}
|
|
|
|
|
|
gboolean
|
|
camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, urlmatch_t *match)
|
|
{
|
|
const char *pos, *inend;
|
|
urlpattern_t *pat;
|
|
int pattern;
|
|
|
|
g_return_val_if_fail (scanner != NULL, FALSE);
|
|
g_return_val_if_fail (in != NULL, FALSE);
|
|
|
|
if (!(pos = e_trie_search (scanner->trie, in, inlen, &pattern)))
|
|
return FALSE;
|
|
|
|
pat = g_ptr_array_index (scanner->patterns, pattern);
|
|
|
|
match->pattern = pat->pattern;
|
|
match->prefix = pat->prefix;
|
|
|
|
inend = in + inlen;
|
|
if (!pat->start (in, pos, inend, match))
|
|
return FALSE;
|
|
|
|
if (!pat->end (in, pos, inend, match))
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
/* Per-byte character-class flags (IS_* bits), indexed by unsigned char.
 * The values match what the BUILD_TABLE program at the end of this
 * file prints; it is not const because that program fills it in place. */
static unsigned char url_scanner_table[256] = {
	/* 0x00 - 0x0f */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  9,  9,  1,  1,  9,  1,  1,
	/* 0x10 - 0x1f */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0x20 - 0x2f */
	 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
	/* 0x30 - 0x3f */
	 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
	/* 0x40 - 0x4f */
	160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
	/* 0x50 - 0x5f */
	 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128,
	/* 0x60 - 0x6f */
	128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
	/* 0x70 - 0x7f */
	 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128,  1,
	/* 0x80 - 0x8f */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0x90 - 0x9f */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0xa0 - 0xaf */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0xb0 - 0xbf */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0xc0 - 0xcf */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0xd0 - 0xdf */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0xe0 - 0xef */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0xf0 - 0xff */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
};
|
|
|
|
/* Character-class bits stored in url_scanner_table; one bit per class. */
enum {
	IS_CTRL    = 0x01,
	IS_ALPHA   = 0x02,
	IS_DIGIT   = 0x04,
	IS_LWSP    = 0x08,
	IS_SPACE   = 0x10,
	IS_SPECIAL = 0x20,
	IS_DOMAIN  = 0x40,
	IS_URLSAFE = 0x80,
};
|
|
|
|
/* Class flags for one character; the cast keeps high-bit chars from
 * producing a negative table index. */
#define url_char_flags(x) (url_scanner_table[(unsigned char) (x)])

/* Character-class predicates, each evaluating to 0 or 1.  An "atom"
 * char is anything that is not special, space or control; a "urlsafe"
 * char is any letter, digit or explicitly url-safe symbol. */
#define is_ctrl(x)    ((url_char_flags (x) & IS_CTRL) != 0)
#define is_lwsp(x)    ((url_char_flags (x) & IS_LWSP) != 0)
#define is_atom(x)    ((url_char_flags (x) & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0)
#define is_alpha(x)   ((url_char_flags (x) & IS_ALPHA) != 0)
#define is_digit(x)   ((url_char_flags (x) & IS_DIGIT) != 0)
#define is_domain(x)  ((url_char_flags (x) & IS_DOMAIN) != 0)
#define is_urlsafe(x) ((url_char_flags (x) & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
|
|
|
|
|
|
gboolean
|
|
camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
register const char *inptr = pos;
|
|
|
|
g_assert (*inptr == '@');
|
|
|
|
inptr--;
|
|
|
|
while (inptr > in) {
|
|
if (is_atom (*inptr))
|
|
inptr--;
|
|
else
|
|
break;
|
|
|
|
while (inptr > in && is_atom (*inptr))
|
|
inptr--;
|
|
|
|
if (inptr > in && *inptr == '.')
|
|
inptr--;
|
|
}
|
|
|
|
if (!is_atom (*inptr))
|
|
inptr++;
|
|
|
|
if (inptr == pos)
|
|
return FALSE;
|
|
|
|
match->um_so = (inptr - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
const char *inptr = pos;
|
|
int parts = 0, digits;
|
|
|
|
g_assert (*inptr == '@');
|
|
|
|
inptr++;
|
|
|
|
if (*inptr == '[') {
|
|
/* domain literal */
|
|
do {
|
|
inptr++;
|
|
|
|
digits = 0;
|
|
while (inptr < inend && is_digit (*inptr) && digits < 3) {
|
|
inptr++;
|
|
digits++;
|
|
}
|
|
|
|
parts++;
|
|
|
|
if (*inptr != '.' && parts != 4)
|
|
return FALSE;
|
|
} while (parts < 4);
|
|
|
|
if (*inptr == ']')
|
|
inptr++;
|
|
else
|
|
return FALSE;
|
|
} else {
|
|
while (inptr < inend) {
|
|
if (is_domain (*inptr))
|
|
inptr++;
|
|
else
|
|
break;
|
|
|
|
while (inptr < inend && is_domain (*inptr))
|
|
inptr++;
|
|
|
|
if (inptr < inend && *inptr == '.' && is_domain (inptr[1]))
|
|
inptr++;
|
|
}
|
|
}
|
|
|
|
if (inptr == pos + 1)
|
|
return FALSE;
|
|
|
|
match->um_eo = (inptr - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Bracket pairs that may enclose a URL in running text. */
static const struct {
	char open;
	char close;
} url_braces[] = {
	{ '(', ')' },
	{ '{', '}' },
	{ '[', ']' },
	{ '<', '>' },
};

/*
 * url_stop_at_brace:
 * @in: start of the scanned buffer
 * @so: offset in @in at which the URL starts
 *
 * Checks whether the character immediately before the URL is one of
 * the opening brackets in url_braces.
 *
 * Returns the matching closing bracket, or '\0' when the URL is at
 * offset 0 or is not preceded by an opening bracket.
 */
static char
url_stop_at_brace (const char *in, size_t so)
{
	size_t i;

	/* nothing precedes a URL that starts the buffer */
	if (so == 0)
		return '\0';

	for (i = 0; i < sizeof (url_braces) / sizeof (url_braces[0]); i++) {
		if (in[so - 1] == url_braces[i].open)
			return url_braces[i].close;
	}

	return '\0';
}
|
|
|
|
gboolean
|
|
camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
match->um_so = (pos - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
register const char *inptr = pos;
|
|
char close_brace;
|
|
|
|
inptr += strlen (match->pattern);
|
|
|
|
if (*inptr == '/')
|
|
inptr++;
|
|
|
|
close_brace = url_stop_at_brace (in, match->um_so);
|
|
|
|
while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
|
|
inptr++;
|
|
|
|
if (inptr == pos)
|
|
return FALSE;
|
|
|
|
match->um_eo = (inptr - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
match->um_so = (pos - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
register const char *inptr = pos;
|
|
int parts = 0, digits, port;
|
|
char close_brace;
|
|
|
|
inptr += strlen (match->pattern);
|
|
|
|
close_brace = url_stop_at_brace (in, match->um_so);
|
|
|
|
/* find the end of the domain */
|
|
if (is_digit (*inptr)) {
|
|
/* domain-literal */
|
|
do {
|
|
digits = 0;
|
|
while (inptr < inend && is_digit (*inptr) && digits < 3) {
|
|
inptr++;
|
|
digits++;
|
|
}
|
|
|
|
parts++;
|
|
|
|
if (*inptr != '.' && parts != 4)
|
|
return FALSE;
|
|
else if (*inptr == '.')
|
|
inptr++;
|
|
|
|
} while (parts < 4);
|
|
} else if (is_domain (*inptr)) {
|
|
while (inptr < inend) {
|
|
if (is_domain (*inptr))
|
|
inptr++;
|
|
else
|
|
break;
|
|
|
|
while (inptr < inend && is_domain (*inptr))
|
|
inptr++;
|
|
|
|
if (inptr < inend && *inptr == '.' && is_domain (inptr[1]))
|
|
inptr++;
|
|
}
|
|
} else {
|
|
return FALSE;
|
|
}
|
|
|
|
if (inptr < inend) {
|
|
switch (*inptr) {
|
|
case ':': /* port notation */
|
|
inptr++;
|
|
port = 0;
|
|
|
|
while (inptr < inend && is_digit (*inptr) && port < 65536)
|
|
port = (port * 10) + (*inptr++ - '0');
|
|
|
|
if (port >= 65536)
|
|
inptr--;
|
|
|
|
if (inptr >= inend || *inptr != '/')
|
|
break;
|
|
|
|
/* we have a '/' so there could be a path - fall through */
|
|
case '/': /* we've detected a path component to our url */
|
|
inptr++;
|
|
|
|
while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
|
|
inptr++;
|
|
|
|
/* urls are extremely unlikely to end with any
|
|
* punctuation, so strip any trailing
|
|
* punctuation off. Also strip off any closing
|
|
* braces. */
|
|
while (inptr > pos && strchr (",.?!)}]", inptr[-1]))
|
|
inptr--;
|
|
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
match->um_eo = (inptr - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
|
|
#ifdef BUILD_TABLE
|
|
|
|
#include <stdio.h>
|
|
|
|
/* got these from rfc1738 */
|
|
#define CHARS_LWSP " \t\n\r" /* linear whitespace chars */
|
|
#define CHARS_SPECIAL "()<>@,;:\\\".[]"
|
|
|
|
/* got these from rfc1738 */
|
|
#define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&="
|
|
|
|
|
|
static void
|
|
table_init_bits (unsigned int mask, const unsigned char *vals)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; vals[i] != '\0'; i++)
|
|
url_scanner_table[vals[i]] |= mask;
|
|
}
|
|
|
|
static void
|
|
url_scanner_table_init (void)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 256; i++) {
|
|
url_scanner_table[i] = 0;
|
|
if (i < 32)
|
|
url_scanner_table[i] |= IS_CTRL;
|
|
if ((i >= '0' && i <= '9'))
|
|
url_scanner_table[i] |= IS_DIGIT | IS_DOMAIN;
|
|
if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z'))
|
|
url_scanner_table[i] |= IS_ALPHA | IS_DOMAIN;
|
|
if (i >= 127)
|
|
url_scanner_table[i] |= IS_CTRL;
|
|
}
|
|
|
|
url_scanner_table[' '] |= IS_SPACE;
|
|
url_scanner_table['-'] |= IS_DOMAIN;
|
|
|
|
/* not defined to be special in rfc0822, but when scanning
|
|
backwards to find the beginning of the email address we do
|
|
not want to include this char if we come accross it - so
|
|
this is kind of a hack */
|
|
url_scanner_table['/'] |= IS_SPECIAL;
|
|
|
|
table_init_bits (IS_LWSP, CHARS_LWSP);
|
|
table_init_bits (IS_SPECIAL, CHARS_SPECIAL);
|
|
table_init_bits (IS_URLSAFE, CHARS_URLSAFE);
|
|
}
|
|
|
|
int main (int argc, char **argv)
|
|
{
|
|
int i;
|
|
|
|
url_scanner_table_init ();
|
|
|
|
printf ("static unsigned char url_scanner_table[256] = {");
|
|
for (i = 0; i < 256; i++) {
|
|
printf ("%s%3d%s", (i % 16) ? "" : "\n\t",
|
|
url_scanner_table[i], i != 255 ? "," : "\n");
|
|
}
|
|
printf ("};\n\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
#endif /* BUILD_TABLE */
|