
2002-12-09 Jeffrey Stedfast <fejj@ximian.com> * camel-url-scanner.c (camel_url_addrspec_end): Fixed to not be fooled in the case where the address is followed immediately by a period. (camel_url_web_end): Made more robust. (camel_url_scanner_scan): Oops. We need to set the match->pattern string pointer to the correct pattern before executing the start/end methods (as some of them rely on this info). svn path=/trunk/; revision=19077
412 lines
9.2 KiB
C
412 lines
9.2 KiB
C
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
|
|
/*
|
|
* Authors: Jeffrey Stedfast <fejj@ximian.com>
|
|
*
|
|
* Copyright 2002 Ximian, Inc. (www.ximian.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
|
|
*
|
|
*/
|
|
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <string.h>
|
|
|
|
#include "e-util/e-trie.h"
|
|
#include "camel-url-scanner.h"
|
|
|
|
|
|
struct _CamelUrlScanner {
|
|
GPtrArray *patterns;
|
|
ETrie *trie;
|
|
};
|
|
|
|
|
|
CamelUrlScanner *
|
|
camel_url_scanner_new (void)
|
|
{
|
|
CamelUrlScanner *scanner;
|
|
|
|
scanner = g_new (CamelUrlScanner, 1);
|
|
scanner->patterns = g_ptr_array_new ();
|
|
scanner->trie = e_trie_new (TRUE);
|
|
|
|
return scanner;
|
|
}
|
|
|
|
|
|
void
|
|
camel_url_scanner_free (CamelUrlScanner *scanner)
|
|
{
|
|
g_return_if_fail (scanner != NULL);
|
|
|
|
g_ptr_array_free (scanner->patterns, TRUE);
|
|
e_trie_free (scanner->trie);
|
|
g_free (scanner);
|
|
}
|
|
|
|
|
|
void
|
|
camel_url_scanner_add (CamelUrlScanner *scanner, urlpattern_t *pattern)
|
|
{
|
|
g_return_if_fail (scanner != NULL);
|
|
|
|
e_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len);
|
|
g_ptr_array_add (scanner->patterns, pattern);
|
|
}
|
|
|
|
|
|
gboolean
|
|
camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, urlmatch_t *match)
|
|
{
|
|
const char *pos, *inend;
|
|
urlpattern_t *pat;
|
|
int pattern;
|
|
|
|
g_return_val_if_fail (scanner != NULL, FALSE);
|
|
g_return_val_if_fail (in != NULL, FALSE);
|
|
|
|
if (!(pos = e_trie_search (scanner->trie, in, inlen, &pattern)))
|
|
return FALSE;
|
|
|
|
pat = g_ptr_array_index (scanner->patterns, pattern);
|
|
|
|
match->pattern = pat->pattern;
|
|
match->prefix = pat->prefix;
|
|
|
|
inend = in + inlen;
|
|
if (!pat->start (in, pos, inend, match))
|
|
return FALSE;
|
|
|
|
if (!pat->end (in, pos, inend, match))
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
/*
 * Character-class lookup table, indexed by byte value.  Each entry is a
 * bitwise OR of the IS_* flags.  The contents are produced by the
 * BUILD_TABLE program at the bottom of this file; regenerate rather than
 * editing by hand.  Bytes 0x80-0xff carry no flags.
 */
static unsigned char url_scanner_table[256] = {
	/* 0x00 - 0x0f */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  9,  9,  1,  1,  9,  1,  1,
	/* 0x10 - 0x1f */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	/* 0x20 - 0x2f:  space ! " # $ % & ' ( ) * + , - . / */
	 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
	/* 0x30 - 0x3f:  0-9 : ; < = > ? */
	 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
	/* 0x40 - 0x4f:  @ A-O */
	160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
	/* 0x50 - 0x5f:  P-Z [ \ ] ^ _ */
	 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128,
	/* 0x60 - 0x6f:  ` a-o */
	128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
	/* 0x70 - 0x7f:  p-z { | } ~ DEL */
	 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128,  1,
	/* 0x80 - 0xff */
	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
|
|
|
|
/* Character-class flag bits stored in url_scanner_table[]. */
enum {
	IS_CTRL    = 0x01,	/* control character */
	IS_ALPHA   = 0x02,	/* a-z, A-Z */
	IS_DIGIT   = 0x04,	/* 0-9 */
	IS_LWSP    = 0x08,	/* linear whitespace */
	IS_SPACE   = 0x10,	/* the space character */
	IS_SPECIAL = 0x20,	/* may not appear in an address atom */
	IS_DOMAIN  = 0x40,	/* may appear in a host name label */
	IS_URLSAFE = 0x80	/* punctuation allowed in a URL path */
};
|
|
|
|
/* Flag byte for character x; the cast keeps negative chars in range. */
#define url_char_flags(x) (url_scanner_table[(unsigned char) (x)])

/* Character-class predicates; each evaluates to 1 or 0. */
#define is_ctrl(x)    ((url_char_flags (x) & IS_CTRL) != 0)
#define is_lwsp(x)    ((url_char_flags (x) & IS_LWSP) != 0)
/* an atom char is anything that is not special, a space or a control */
#define is_atom(x)    ((url_char_flags (x) & (IS_SPECIAL | IS_SPACE | IS_CTRL)) == 0)
#define is_alpha(x)   ((url_char_flags (x) & IS_ALPHA) != 0)
#define is_digit(x)   ((url_char_flags (x) & IS_DIGIT) != 0)
#define is_domain(x)  ((url_char_flags (x) & IS_DOMAIN) != 0)
/* letters and digits are URL-safe in addition to the flagged punctuation */
#define is_urlsafe(x) ((url_char_flags (x) & (IS_ALPHA | IS_DIGIT | IS_URLSAFE)) != 0)
|
|
|
|
|
|
gboolean
|
|
camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
register const char *inptr = pos;
|
|
|
|
g_assert (*inptr == '@');
|
|
|
|
inptr--;
|
|
|
|
while (inptr > in) {
|
|
if (is_atom (*inptr))
|
|
inptr--;
|
|
else
|
|
break;
|
|
|
|
while (inptr > in && is_atom (*inptr))
|
|
inptr--;
|
|
|
|
if (inptr > in && *inptr == '.')
|
|
inptr--;
|
|
}
|
|
|
|
if (!is_atom (*inptr))
|
|
inptr++;
|
|
|
|
if (inptr == pos)
|
|
return FALSE;
|
|
|
|
match->um_so = (inptr - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
const char *inptr = pos;
|
|
int parts = 0, digits;
|
|
|
|
g_assert (*inptr == '@');
|
|
|
|
inptr++;
|
|
|
|
if (*inptr == '[') {
|
|
/* domain literal */
|
|
do {
|
|
inptr++;
|
|
|
|
digits = 0;
|
|
while (inptr < inend && is_digit (*inptr) && digits < 3) {
|
|
inptr++;
|
|
digits++;
|
|
}
|
|
|
|
parts++;
|
|
|
|
if (*inptr != '.' && parts != 4)
|
|
return FALSE;
|
|
} while (parts < 4);
|
|
|
|
if (*inptr == ']')
|
|
inptr++;
|
|
else
|
|
return FALSE;
|
|
} else {
|
|
while (inptr < inend) {
|
|
if (is_domain (*inptr))
|
|
inptr++;
|
|
else
|
|
break;
|
|
|
|
while (inptr < inend && is_domain (*inptr))
|
|
inptr++;
|
|
|
|
if (inptr < inend && *inptr == '.' && is_domain (inptr[1]))
|
|
inptr++;
|
|
}
|
|
}
|
|
|
|
if (inptr == pos)
|
|
return FALSE;
|
|
|
|
match->um_eo = (inptr - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
match->um_so = (pos - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
register const char *inptr = pos;
|
|
|
|
inptr += strlen (match->pattern);
|
|
|
|
if (*inptr == '/')
|
|
inptr++;
|
|
|
|
while (inptr < inend && is_urlsafe (*inptr))
|
|
inptr++;
|
|
|
|
if (inptr == pos)
|
|
return FALSE;
|
|
|
|
match->um_eo = (inptr - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
match->um_so = (pos - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
|
|
{
|
|
register const char *inptr = pos;
|
|
int parts = 0, digits, port;
|
|
|
|
inptr += strlen (match->pattern);
|
|
|
|
/* find the end of the domain */
|
|
if (is_digit (*inptr)) {
|
|
/* domain-literal */
|
|
do {
|
|
digits = 0;
|
|
while (inptr < inend && is_digit (*inptr) && digits < 3) {
|
|
inptr++;
|
|
digits++;
|
|
}
|
|
|
|
parts++;
|
|
|
|
if (*inptr != '.' && parts != 4)
|
|
return FALSE;
|
|
else if (*inptr == '.')
|
|
inptr++;
|
|
|
|
} while (parts < 4);
|
|
} else if (is_domain (*inptr)) {
|
|
while (inptr < inend) {
|
|
if (is_domain (*inptr))
|
|
inptr++;
|
|
else
|
|
break;
|
|
|
|
while (inptr < inend && is_domain (*inptr))
|
|
inptr++;
|
|
|
|
if (inptr < inend && *inptr == '.' && is_domain (inptr[1]))
|
|
inptr++;
|
|
}
|
|
} else {
|
|
return FALSE;
|
|
}
|
|
|
|
if (inptr < inend) {
|
|
switch (*inptr) {
|
|
case ':': /* port notation */
|
|
inptr++;
|
|
port = 0;
|
|
|
|
while (inptr < inend && is_digit (*inptr) && port < 65536)
|
|
port = (port * 10) + (*inptr++ - '0');
|
|
|
|
if (port >= 65536)
|
|
inptr--;
|
|
|
|
if (inptr >= inend || *inptr != '/')
|
|
break;
|
|
|
|
/* we have a '/' so there could be a path - fall through */
|
|
case '/': /* we've detected a path component to our url */
|
|
inptr++;
|
|
|
|
while (inptr < inend && is_urlsafe (*inptr))
|
|
inptr++;
|
|
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
match->um_eo = (inptr - in);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
|
|
#ifdef BUILD_TABLE
|
|
|
|
#include <stdio.h>
|
|
|
|
/* got these from rfc0822 (linear-white-space and specials) */
|
|
#define CHARS_LWSP " \t\n\r" /* linear whitespace chars */
|
|
#define CHARS_SPECIAL "()<>@,;:\\\".[]"
|
|
|
|
/* got these from rfc1738 */
|
|
#define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&="
|
|
|
|
|
|
static void
|
|
table_init_bits (unsigned int mask, const unsigned char *vals)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; vals[i] != '\0'; i++)
|
|
url_scanner_table[vals[i]] |= mask;
|
|
}
|
|
|
|
static void
|
|
url_scanner_table_init (void)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 256; i++) {
|
|
url_scanner_table[i] = 0;
|
|
if (i < 32)
|
|
url_scanner_table[i] |= IS_CTRL;
|
|
if ((i >= '0' && i <= '9'))
|
|
url_scanner_table[i] |= IS_DIGIT | IS_DOMAIN;
|
|
if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z'))
|
|
url_scanner_table[i] |= IS_ALPHA | IS_DOMAIN;
|
|
}
|
|
|
|
url_scanner_table[127] |= IS_CTRL;
|
|
url_scanner_table[' '] |= IS_SPACE;
|
|
url_scanner_table['-'] |= IS_DOMAIN;
|
|
|
|
/* not defined to be special in rfc0822, but when scanning
|
|
backwards to find the beginning of the email address we do
|
|
not want to include this char if we come accross it - so
|
|
this is kind of a hack */
|
|
url_scanner_table['/'] |= IS_SPECIAL;
|
|
|
|
table_init_bits (IS_LWSP, CHARS_LWSP);
|
|
table_init_bits (IS_SPECIAL, CHARS_SPECIAL);
|
|
table_init_bits (IS_URLSAFE, CHARS_URLSAFE);
|
|
}
|
|
|
|
int main (int argc, char **argv)
|
|
{
|
|
int i;
|
|
|
|
url_scanner_table_init ();
|
|
|
|
printf ("static unsigned char url_scanner_table[256] = {");
|
|
for (i = 0; i < 256; i++) {
|
|
printf ("%s%3d%s", (i % 16) ? "" : "\n\t",
|
|
url_scanner_table[i], i != 255 ? "," : "\n");
|
|
}
|
|
printf ("};\n\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
#endif /* BUILD_TABLE */
|