
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */

/*
 * e-searching-tokenizer.c
 *
 * Copyright (C) 2001 Ximian, Inc.
 *
 * Developed by Jon Trowbridge <trow@ximian.com>
 */

/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA.
 */

#include <config.h>
#include <string.h>
#include <ctype.h>
#include <gal/unicode/gunicode.h>
#include "e-searching-tokenizer.h"

enum {
	EST_MATCH_SIGNAL,
	EST_LAST_SIGNAL
};
guint e_searching_tokenizer_signals[EST_LAST_SIGNAL] = { 0 };

#define START_MAGIC "<\n>S<\n>"
#define END_MAGIC "<\n>E<\n>"

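/*
 * START_MAGIC and END_MAGIC are internal placeholder tokens.  They are
 * spliced into the token queue to mark where a match begins and ends,
 * and are replaced by the real highlighting tags in queue_matched()
 * (or stripped again in queue_match_failed()).
 */
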
static void     e_searching_tokenizer_begin      (HTMLTokenizer *, gchar *);
static void     e_searching_tokenizer_end        (HTMLTokenizer *);
static gchar   *e_searching_tokenizer_peek_token (HTMLTokenizer *);
static gchar   *e_searching_tokenizer_next_token (HTMLTokenizer *);
static gboolean e_searching_tokenizer_has_more   (HTMLTokenizer *);

static HTMLTokenizer *e_searching_tokenizer_clone (HTMLTokenizer *);

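/*
 * Tags in ignored_tags may appear in the middle of a match without
 * breaking it; tags in space_tags may stand in for a whitespace
 * character of the search string (see search_info_compare below).
 */
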
static const gchar *ignored_tags[] = { "b", "i", NULL };
static const gchar *space_tags[] = { "br", NULL };

GtkObjectClass *parent_class = NULL;

/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/

typedef enum {
	MATCH_FAILED = 0,
	MATCH_COMPLETE,
	MATCH_START,
	MATCH_CONTINUES,
	MATCH_END
} MatchInfo;

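/*
 * MATCH_COMPLETE:  the whole search string was found inside one token.
 * MATCH_START:     the search string starts in this token but runs past its end.
 * MATCH_CONTINUES: the token fits the match, but the search string is not yet exhausted.
 * MATCH_END:       the remainder of the search string ends inside this token.
 * MATCH_FAILED:    the token broke the match, or no match could be started.
 */
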
typedef struct _SearchInfo SearchInfo;
struct _SearchInfo {
	gchar *search;
	gchar *current;

	gboolean case_sensitive;
	gboolean allow_space_tags_to_match_whitespace;

	gint match_size_incr;
	gchar *match_color;
	gboolean match_bold;
};

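/*
 * In a SearchInfo, 'search' holds the string being looked for and
 * 'current' carries the state of a match in progress: it is NULL when no
 * match is underway, otherwise it points at the next character of
 * 'search' that still has to be matched by an upcoming token.
 */
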
typedef struct _SharedState SharedState;
struct _SharedState {
	gint refs;
	gchar *str_primary;
	gchar *str_secondary;
	gboolean case_sensitive_primary;
	gboolean case_sensitive_secondary;
};

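/*
 * A SharedState is reference-counted and shared between a tokenizer and
 * its clones, so search strings and case-sensitivity settings made on the
 * original also apply to cloned tokenizers.
 */
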
struct _ESearchingTokenizerPrivate {
	gint match_count;
	SearchInfo *search;
	GList *pending;
	GList *trash;
	SharedState *shared;
};

/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/

static SharedState *
shared_state_new (void)
{
	SharedState *shared = g_new0 (SharedState, 1);
	shared->refs = 1;
	return shared;
}

static void
shared_state_ref (SharedState *shared)
{
	g_return_if_fail (shared != NULL);
	g_return_if_fail (shared->refs > 0);
	++shared->refs;
}

static void
shared_state_unref (SharedState *shared)
{
	if (shared) {
		g_return_if_fail (shared->refs > 0);
		--shared->refs;
		if (shared->refs == 0) {
			g_free (shared->str_primary);
			g_free (shared->str_secondary);
			g_free (shared);
		}
	}
}

/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/

static SearchInfo *
search_info_new (void)
{
	SearchInfo *si;

	si = g_new0 (SearchInfo, 1);
	si->case_sensitive = FALSE;

	si->match_size_incr = 1;
	si->match_color = g_strdup ("red");
	si->match_bold = FALSE;

	si->allow_space_tags_to_match_whitespace = TRUE;

	return si;
}

static void
search_info_free (SearchInfo *si)
{
	if (si) {
		g_free (si->search);
		g_free (si->match_color);
		g_free (si);
	}
}

static SearchInfo *
search_info_clone (SearchInfo *si)
{
	SearchInfo *new_si = NULL;

	if (si) {
		new_si = search_info_new ();
		new_si->search = g_strdup (si->search);
		new_si->case_sensitive = si->case_sensitive;
	}

	return new_si;
}

static void
search_info_set_string (SearchInfo *si, const gchar *str)
{
	g_return_if_fail (si);
	g_return_if_fail (str);

	g_free (si->search);
	si->search = g_strdup (str);
	si->current = NULL;
}

static void
search_info_set_case_sensitivity (SearchInfo *si, gboolean flag)
{
	g_return_if_fail (si);

	si->case_sensitive = flag;
}

static void
search_info_set_match_size_increase (SearchInfo *si, gint incr)
{
	g_return_if_fail (si);
	g_return_if_fail (incr >= 0);

	si->match_size_incr = incr;
}

static void
search_info_set_match_color (SearchInfo *si, const gchar *color)
{
	g_return_if_fail (si);

	g_free (si->match_color);
	si->match_color = g_strdup (color);
}

static void
search_info_set_match_bold (SearchInfo *si, gboolean flag)
{
	g_return_if_fail (si);

	si->match_bold = flag;
}

static void
search_info_reset (SearchInfo *si)
{
	if (si == NULL)
		return;
	si->current = NULL;
}

/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */

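/*
 * UTF-8 scanning helpers.  find_whole() returns the first complete
 * occurrence of 'needle' within 'haystack'; find_head() returns the start
 * of a trailing portion of 'haystack' that matches the beginning of
 * 'needle' (i.e. the haystack ends mid-needle); find_partial() returns the
 * position in 'haystack' just past a leading occurrence of 'needle', or
 * NULL if 'needle' is not a prefix of 'haystack'.  All three honor the
 * SearchInfo's case-sensitivity flag.
 */
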
static const gchar *
find_whole (SearchInfo *si, const gchar *haystack, const gchar *needle)
{
	const gchar *h, *n;

	g_return_val_if_fail (si, NULL);
	g_return_val_if_fail (haystack && needle, NULL);
	g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL);
	g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL);

	while (*haystack) {
		h = haystack;
		n = needle;
		while (*h && *n) {
			gunichar c1 = g_utf8_get_char (h);
			gunichar c2 = g_utf8_get_char (n);

			if (!si->case_sensitive) {
				c1 = g_unichar_tolower (c1);
				c2 = g_unichar_tolower (c2);
			}

			if (c1 != c2)
				break;

			h = g_utf8_next_char (h);
			n = g_utf8_next_char (n);
		}
		if (*n == '\0')
			return haystack;
		if (*h == '\0')
			return NULL;
		haystack = g_utf8_next_char (haystack);
	}

	return NULL;
}

/* This is a really stupid implementation of this function. */
static const gchar *
find_head (SearchInfo *si, const gchar *haystack, const gchar *needle)
{
	const gchar *h, *n;

	g_return_val_if_fail (si, NULL);
	g_return_val_if_fail (haystack && needle, NULL);
	g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL);
	g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL);

	while (*haystack) {
		h = haystack;
		n = needle;
		while (*h && *n) {
			gunichar c1 = g_utf8_get_char (h);
			gunichar c2 = g_utf8_get_char (n);

			if (!si->case_sensitive) {
				c1 = g_unichar_tolower (c1);
				c2 = g_unichar_tolower (c2);
			}

			if (c1 != c2)
				break;

			h = g_utf8_next_char (h);
			n = g_utf8_next_char (n);
		}
		if (*h == '\0')
			return haystack;
		haystack = g_utf8_next_char (haystack);
	}

	return NULL;
}

static const gchar *
find_partial (SearchInfo *si, const gchar *haystack, const gchar *needle)
{
	g_return_val_if_fail (si, NULL);
	g_return_val_if_fail (haystack && needle, NULL);
	g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL);
	g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL);

	while (*needle) {
		gunichar c1 = g_utf8_get_char (haystack);
		gunichar c2 = g_utf8_get_char (needle);

		if (!si->case_sensitive) {
			c1 = g_unichar_tolower (c1);
			c2 = g_unichar_tolower (c2);
		}

		if (c1 != c2)
			return NULL;

		needle = g_utf8_next_char (needle);
		haystack = g_utf8_next_char (haystack);
	}
	return haystack;
}

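/*
 * Tokens that represent HTML tags arrive from the tokenizer prefixed with
 * TAG_ESCAPE followed by '<'.  tag_match() checks, case-insensitively,
 * whether such a token is the given tag or its closing counterpart.
 */
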
static gboolean
tag_match (const gchar *token, const gchar *tag)
{
	token += 2; /* Skip past TAG_ESCAPE and < */
	if (*token == '/')
		++token;
	while (*token && *tag) {
		gunichar c1 = g_unichar_tolower (g_utf8_get_char (token));
		gunichar c2 = g_unichar_tolower (g_utf8_get_char (tag));
		if (c1 != c2)
			return FALSE;
		token = g_utf8_next_char (token);
		tag = g_utf8_next_char (tag);
	}
	return (*tag == '\0' && *token == '>');
}

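/*
 * The heart of the matcher: compare one token against the search string,
 * using si->current to carry the state of a match that spans several
 * tokens.  *start_pos and *end_pos are set to byte offsets within the
 * token that delimit the matched text (only *start_pos is meaningful for
 * MATCH_START, only *end_pos for MATCH_END).
 */
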
static MatchInfo
search_info_compare (SearchInfo *si, const gchar *token, gint *start_pos, gint *end_pos)
{
	gboolean token_is_tag;
	const gchar *s;
	gint i;

	g_return_val_if_fail (si != NULL, MATCH_FAILED);
	g_return_val_if_fail (token != NULL, MATCH_FAILED);
	g_return_val_if_fail (start_pos != NULL, MATCH_FAILED);
	g_return_val_if_fail (end_pos != NULL, MATCH_FAILED);

	token_is_tag = (*token == TAG_ESCAPE);

	/* Try to start a new match. */
	if (si->current == NULL) {

		/* A match can never start inside a tag. */
		if (token_is_tag)
			return MATCH_FAILED;

		/* Check to see if the search string is entirely embedded within the token. */
		s = find_whole (si, token, si->search);
		if (s) {
			const gchar *pos = s;
			i = g_utf8_strlen (si->search, -1);
			while (i > 0) {
				pos = g_utf8_next_char (pos);
				--i;
			}
			*start_pos = s - token;
			*end_pos = pos - token;

			return MATCH_COMPLETE;
		}

		/* Check to see if the beginning of the search string lies in this token. */
		s = find_head (si, token, si->search);
		if (s) {
			*start_pos = s - token;
			si->current = si->search;
			while (*s) {
				s = g_utf8_next_char (s);
				si->current = g_utf8_next_char (si->current);
			}

			return MATCH_START;
		}

		return MATCH_FAILED;
	}

	/* Try to continue a previously-started match. */

	/* Deal with tags that we encounter mid-match. */
	if (token_is_tag) {

		/* "Ignored tags" will never mess up a match. */
		for (i = 0; ignored_tags[i]; ++i) {
			if (tag_match (token, ignored_tags[i]))
				return MATCH_CONTINUES;
		}

		/* "Space tags" only match whitespace in our ongoing match. */
		if (si->allow_space_tags_to_match_whitespace
		    && g_unichar_isspace (g_utf8_get_char (si->current))) {
			for (i = 0; space_tags[i]; ++i) {
				if (tag_match (token, space_tags[i])) {
					si->current = g_utf8_next_char (si->current);
					return MATCH_CONTINUES;
				}
			}
		}

		/* All other tags derail our match. */
		return MATCH_FAILED;
	}

	s = find_partial (si, token, si->current);
	if (s) {
		if (start_pos)
			*start_pos = 0;
		if (end_pos)
			*end_pos = s - token;
		return MATCH_END;
	}

	s = find_partial (si, si->current, token);
	if (s) {
		si->current = (gchar *) s;
		return MATCH_CONTINUES;
	}

	return MATCH_FAILED;
}

/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/

static void
e_searching_tokenizer_cleanup (ESearchingTokenizer *st)
{
	g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));

	if (st->priv->trash) {
		g_list_foreach (st->priv->trash, (GFunc) g_free, NULL);
		g_list_free (st->priv->trash);
		st->priv->trash = NULL;
	}

	if (st->priv->pending) {
		g_list_foreach (st->priv->pending, (GFunc) g_free, NULL);
		g_list_free (st->priv->pending);
		st->priv->pending = NULL;
	}
}

static void
e_searching_tokenizer_destroy (GtkObject *obj)
{
	ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (obj);

	e_searching_tokenizer_cleanup (st);

	search_info_free (st->priv->search);
	shared_state_unref (st->priv->shared);

	g_free (st->priv);
	st->priv = NULL;

	if (parent_class->destroy)
		parent_class->destroy (obj);
}

static void
e_searching_tokenizer_class_init (ESearchingTokenizerClass *klass)
{
	GtkObjectClass *obj_class = (GtkObjectClass *) klass;
	HTMLTokenizerClass *tok_class = HTML_TOKENIZER_CLASS (klass);

	e_searching_tokenizer_signals[EST_MATCH_SIGNAL] =
		gtk_signal_new ("match",
				GTK_RUN_LAST,
				obj_class->type,
				GTK_SIGNAL_OFFSET (ESearchingTokenizerClass, match),
				gtk_marshal_NONE__NONE,
				GTK_TYPE_NONE,
				0);
	gtk_object_class_add_signals (obj_class, e_searching_tokenizer_signals, EST_LAST_SIGNAL);

	obj_class->destroy = e_searching_tokenizer_destroy;

	tok_class->begin = e_searching_tokenizer_begin;
	tok_class->end = e_searching_tokenizer_end;

	tok_class->peek_token = e_searching_tokenizer_peek_token;
	tok_class->next_token = e_searching_tokenizer_next_token;
	tok_class->has_more = e_searching_tokenizer_has_more;
	tok_class->clone = e_searching_tokenizer_clone;

	parent_class = gtk_type_class (HTML_TYPE_TOKENIZER);
}

static void
e_searching_tokenizer_init (ESearchingTokenizer *st)
{
	st->priv = g_new0 (struct _ESearchingTokenizerPrivate, 1);
	st->priv->shared = shared_state_new ();
}

GtkType
e_searching_tokenizer_get_type (void)
{
	static GtkType e_searching_tokenizer_type = 0;
	if (! e_searching_tokenizer_type) {
		static GtkTypeInfo e_searching_tokenizer_info = {
			"ESearchingTokenizer",
			sizeof (ESearchingTokenizer),
			sizeof (ESearchingTokenizerClass),
			(GtkClassInitFunc) e_searching_tokenizer_class_init,
			(GtkObjectInitFunc) e_searching_tokenizer_init,
			NULL, NULL,
			(GtkClassInitFunc) NULL
		};
		e_searching_tokenizer_type = gtk_type_unique (HTML_TYPE_TOKENIZER,
							      &e_searching_tokenizer_info);
	}
	return e_searching_tokenizer_type;
}

HTMLTokenizer *
e_searching_tokenizer_new (void)
{
	return (HTMLTokenizer *) gtk_type_new (E_TYPE_SEARCHING_TOKENIZER);
}

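/*
 * Typical usage, as an illustrative sketch only (my_match_cb is a
 * placeholder callback, not part of this file): create the tokenizer,
 * hand it to whichever HTML engine should use it, and set the search
 * strings before the next document is parsed.
 *
 *	ESearchingTokenizer *st;
 *
 *	st = E_SEARCHING_TOKENIZER (e_searching_tokenizer_new ());
 *	e_searching_tokenizer_set_primary_search_string (st, "foo");
 *	e_searching_tokenizer_set_primary_case_sensitivity (st, FALSE);
 *	gtk_signal_connect (GTK_OBJECT (st), "match",
 *			    GTK_SIGNAL_FUNC (my_match_cb), NULL);
 *	... parse the document ...
 *	g_print ("%d matches\n", e_searching_tokenizer_match_count (st));
 *
 * The "match" signal is emitted once per highlighted occurrence, and
 * e_searching_tokenizer_match_count() reports the running total.
 */
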
/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/

static GList *
g_list_remove_head (GList *x)
{
	GList *repl = NULL;
	if (x) {
		repl = g_list_remove_link (x, x);
		g_list_free_1 (x);
	}
	return repl;
}

/* I can't believe that there isn't a better way to do this. */
static GList *
g_list_insert_before (GList *list, GList *llink, gpointer data)
{
	gint pos = g_list_position (list, llink);
	return g_list_insert (list, data, pos);
}

static gchar *
pop_pending (ESearchingTokenizer *st)
{
	gchar *token = NULL;
	if (st->priv->pending) {
		token = (gchar *) st->priv->pending->data;
		st->priv->trash = g_list_prepend (st->priv->trash, token);
		st->priv->pending = g_list_remove_head (st->priv->pending);
	}
	return token;
}

static inline void
add_pending (ESearchingTokenizer *st, gchar *tok)
{
	st->priv->pending = g_list_append (st->priv->pending, tok);
}

static void
add_pending_match_begin (ESearchingTokenizer *st, SearchInfo *si)
{
	gchar *size_str = NULL;
	gchar *color_str = NULL;

	if (si->match_size_incr > 0)
		size_str = g_strdup_printf (" size=+%d", si->match_size_incr);
	if (si->match_color)
		color_str = g_strdup_printf (" color=%s", si->match_color);

	if (size_str || color_str)
		add_pending (st, g_strdup_printf ("%c<font%s%s>",
						  TAG_ESCAPE,
						  size_str ? size_str : "",
						  color_str ? color_str : ""));

	g_free (size_str);
	g_free (color_str);

	if (si->match_bold)
		add_pending (st, g_strdup_printf ("%c<b>", TAG_ESCAPE));
}

static void
add_pending_match_end (ESearchingTokenizer *st, SearchInfo *si)
{
	if (si->match_bold)
		add_pending (st, g_strdup_printf ("%c</b>", TAG_ESCAPE));

	if (si->match_size_incr > 0 || si->match_color)
		add_pending (st, g_strdup_printf ("%c</font>", TAG_ESCAPE));
}

static void
add_to_trash (ESearchingTokenizer *st, gchar *txt)
{
	st->priv->trash = g_list_prepend (st->priv->trash, txt);
}

static gchar *
get_next_token (ESearchingTokenizer *st)
{
	HTMLTokenizer *ht = HTML_TOKENIZER (st);
	HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class);

	return klass->has_more (ht) ? klass->next_token (ht) : NULL;
}

/*
 * Move the matched part of the queue into pending, replacing the start and end placeholders by
 * the appropriate tokens.
 */
static GList *
queue_matched (ESearchingTokenizer *st, SearchInfo *si, GList *q)
{
	GList *qh = q;
	gboolean post_start = FALSE;

	while (q != NULL) {
		GList *q_next = g_list_next (q);
		if (!strcmp ((gchar *) q->data, START_MAGIC)) {
			add_pending_match_begin (st, si);
			post_start = TRUE;
		} else if (!strcmp ((gchar *) q->data, END_MAGIC)) {
			add_pending_match_end (st, si);
			q_next = NULL;
		} else {
			gboolean is_tag = *((gchar *) q->data) == TAG_ESCAPE;
			if (is_tag && post_start)
				add_pending_match_end (st, si);
			add_pending (st, g_strdup ((gchar *) q->data));
			if (is_tag && post_start)
				add_pending_match_begin (st, si);
		}
		qh = g_list_remove_link (qh, q);
		g_list_free_1 (q);
		q = q_next;
	}

	return qh;
}

/*
 * Strip the start and end placeholders out of the queue.
 */
static GList *
queue_match_failed (ESearchingTokenizer *st, GList *q)
{
	GList *qh = q;

	/* If we do find the START_MAGIC token in the queue, we want
	   to drop everything up to and including the token immediately
	   following START_MAGIC. */
	while (q != NULL && strcmp ((gchar *) q->data, START_MAGIC))
		q = g_list_next (q);
	if (q) {
		q = g_list_next (q);
		/* If there is no token following START_MAGIC, something is
		   very wrong. */
		if (q == NULL) {
			g_assert_not_reached ();
		}
	}

	/* Otherwise we just want to drop the first token. */
	if (q == NULL)
		q = qh;

	/* Now move everything up to and including q to pending. */
	while (qh && qh != q) {
		if (strcmp ((gchar *) qh->data, START_MAGIC))
			add_pending (st, g_strdup (qh->data));
		qh = g_list_remove_head (qh);
	}
	if (qh == q) {
		if (strcmp ((gchar *) qh->data, START_MAGIC))
			add_pending (st, g_strdup (qh->data));
		qh = g_list_remove_head (qh);
	}

	return qh;
}

static void
matched (ESearchingTokenizer *st)
{
	++st->priv->match_count;
	gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]);
}

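/*
 * Pull tokens from the parent tokenizer into a temporary queue and run the
 * matcher over them.  Tokens that cannot be part of a match are moved
 * straight to the pending list; when a match is found, the matched region
 * is bracketed with START_MAGIC/END_MAGIC and then rewritten into
 * highlighted output by queue_matched().  Whenever a potential match is
 * still open at the end of the queue, another token is pulled in and
 * appended.
 */
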
static void
get_pending_tokens (ESearchingTokenizer *st)
{
	GList *queue = NULL;
	gchar *token = NULL;
	MatchInfo result;
	gint start_pos, end_pos;
	GList *start_after = NULL;

	/* Get an initial token into the queue. */
	token = get_next_token (st);
	if (token) {
		queue = g_list_append (queue, token);
	}

	while (queue) {
		GList *q;
		gboolean finished = FALSE;
		search_info_reset (st->priv->search);

		if (start_after) {
			q = g_list_next (start_after);
			start_after = NULL;
		} else {
			q = queue;
		}

		while (q) {
			GList *q_next = g_list_next (q);
			token = (gchar *) q->data;

			result = search_info_compare (st->priv->search, token, &start_pos, &end_pos);

			switch (result) {

			case MATCH_FAILED:

				queue = queue_match_failed (st, queue);

				finished = TRUE;
				break;

			case MATCH_COMPLETE:

				if (start_pos != 0)
					add_pending (st, g_strndup (token, start_pos));
				add_pending_match_begin (st, st->priv->search);
				add_pending (st, g_strndup (token + start_pos, end_pos - start_pos));
				add_pending_match_end (st, st->priv->search);
				if (*(token + end_pos)) {
					queue->data = g_strdup (token + end_pos);
					add_to_trash (st, (gchar *) queue->data);
				} else {
					queue = g_list_remove_head (queue);
				}

				matched (st);

				finished = TRUE;
				break;

			case MATCH_START: {

				gchar *s1 = g_strndup (token, start_pos);
				gchar *s2 = g_strdup (START_MAGIC);
				gchar *s3 = g_strdup (token + start_pos);

				queue = g_list_insert_before (queue, q, s1);
				queue = g_list_insert_before (queue, q, s2);
				queue = g_list_insert_before (queue, q, s3);

				add_to_trash (st, s1);
				add_to_trash (st, s2);
				add_to_trash (st, s3);

				queue = g_list_remove_link (queue, q);

				finished = FALSE;
				break;
			}

			case MATCH_CONTINUES:
				/* Do nothing... */
				finished = FALSE;
				break;

			case MATCH_END: {
				gchar *s1 = g_strndup (token, end_pos);
				gchar *s2 = g_strdup (END_MAGIC);
				gchar *s3 = g_strdup (token + end_pos);

				queue = g_list_insert_before (queue, q, s1);
				queue = g_list_insert_before (queue, q, s2);
				queue = g_list_insert_before (queue, q, s3);

				add_to_trash (st, s1);
				add_to_trash (st, s2);
				add_to_trash (st, s3);

				queue = g_list_remove_link (queue, q);
				queue = queue_matched (st, st->priv->search, queue);

				matched (st);

				finished = TRUE;
				break;
			}

			default:
				g_assert_not_reached ();
			}

			/* If we reach the end of the queue but we aren't finished, try to pull in another
			   token and stick it onto the end. */
			if (q_next == NULL && !finished) {
				gchar *next_token = get_next_token (st);
				if (next_token) {
					queue = g_list_append (queue, next_token);
					q_next = g_list_last (queue);
				}
			}
			q = finished ? NULL : q_next;

		} /* while (q) */

		if (!finished && queue) { /* ...we add the token at the head of the queue to pending and try again. */
			add_pending (st, g_strdup ((gchar *) queue->data));
			queue = g_list_remove_head (queue);
		}

	} /* while (queue) */
}

/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/

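/*
 * Called when a new document starts streaming in.  Any SearchInfo left
 * over from a previous run is always discarded and, if a primary or
 * secondary search string has been set, a fresh one is built from the
 * shared state (primary matches are drawn in red, secondary in purple,
 * both bold).
 */
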
static void
e_searching_tokenizer_begin (HTMLTokenizer *t, gchar *content_type)
{
	ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t);
	SearchInfo *si;

	/* Reset our search */
	search_info_free (st->priv->search);
	st->priv->search = NULL;

	if (st->priv->shared && (st->priv->shared->str_primary || st->priv->shared->str_secondary)) {
		st->priv->search = search_info_new ();
	}
	si = st->priv->search;

	if (st->priv->shared && si) {
		if (st->priv->shared->str_primary) {

			search_info_set_string (si, st->priv->shared->str_primary);
			search_info_set_case_sensitivity (si, st->priv->shared->case_sensitive_primary);

			search_info_set_match_color (si, "red");
			search_info_set_match_bold (si, TRUE);

		} else if (st->priv->shared->str_secondary) {

			search_info_set_string (si, st->priv->shared->str_secondary);
			search_info_set_case_sensitivity (si, st->priv->shared->case_sensitive_secondary);

			search_info_set_match_color (si, "purple");
			search_info_set_match_bold (si, TRUE);
		}

	} else {

		search_info_free (st->priv->search);
		st->priv->search = NULL;

	}

	e_searching_tokenizer_cleanup (st);
	search_info_reset (st->priv->search);

	st->priv->match_count = 0;

	HTML_TOKENIZER_CLASS (parent_class)->begin (t, content_type);
}

static void
e_searching_tokenizer_end (HTMLTokenizer *t)
{
	ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t);
	e_searching_tokenizer_cleanup (st);

	HTML_TOKENIZER_CLASS (parent_class)->end (t);
}

static gchar *
e_searching_tokenizer_peek_token (HTMLTokenizer *tok)
{
	ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok);

	/* If no search is active, just use the default method. */
	if (st->priv->search == NULL)
		return HTML_TOKENIZER_CLASS (parent_class)->peek_token (tok);

	if (st->priv->pending == NULL)
		get_pending_tokens (st);
	return st->priv->pending ? (gchar *) st->priv->pending->data : NULL;
}

static gchar *
e_searching_tokenizer_next_token (HTMLTokenizer *tok)
{
	ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok);

	/* If no search is active, just use the default method. */
	if (st->priv->search == NULL)
		return HTML_TOKENIZER_CLASS (parent_class)->next_token (tok);

	if (st->priv->pending == NULL)
		get_pending_tokens (st);
	return pop_pending (st);
}

static gboolean
e_searching_tokenizer_has_more (HTMLTokenizer *tok)
{
	ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok);

	/* If no search is active, pending will always be NULL and thus
	   we'll always fall back to using the default method. */

	return st->priv->pending || HTML_TOKENIZER_CLASS (parent_class)->has_more (tok);
}

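/*
 * Clones share the original's SharedState (so search settings follow the
 * clone) and forward their "match" signal to the original tokenizer, which
 * keeps the original's match count in step with matches found by clones.
 */
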
static HTMLTokenizer *
e_searching_tokenizer_clone (HTMLTokenizer *tok)
{
	ESearchingTokenizer *orig_st = E_SEARCHING_TOKENIZER (tok);
	ESearchingTokenizer *new_st = E_SEARCHING_TOKENIZER (e_searching_tokenizer_new ());

	if (new_st->priv->search) {
		search_info_free (new_st->priv->search);
	}

	new_st->priv->search = search_info_clone (orig_st->priv->search);

	shared_state_ref (orig_st->priv->shared);
	shared_state_unref (new_st->priv->shared);
	new_st->priv->shared = orig_st->priv->shared;

	gtk_signal_connect_object (GTK_OBJECT (new_st),
				   "match",
				   GTK_SIGNAL_FUNC (matched),
				   GTK_OBJECT (orig_st));

	return HTML_TOKENIZER (new_st);
}
/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */

static gboolean
only_whitespace (const gchar *p)
{
	gunichar c;
	g_return_val_if_fail (p, FALSE);

	while (*p && g_unichar_validate (c = g_utf8_get_char (p))) {
		if (!g_unichar_isspace (c))
			return FALSE;
		p = g_utf8_next_char (p);
	}
	return TRUE;
}

void
e_searching_tokenizer_set_primary_search_string (ESearchingTokenizer *st, const gchar *search_str)
{
	g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));

	g_free (st->priv->shared->str_primary);
	st->priv->shared->str_primary = NULL;

	if (search_str != NULL
	    && g_utf8_validate (search_str, -1, NULL)
	    && !only_whitespace (search_str)) {

		st->priv->shared->str_primary = g_strdup (search_str);
	}
}

void
e_searching_tokenizer_set_primary_case_sensitivity (ESearchingTokenizer *st, gboolean is_case_sensitive)
{
	g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));

	st->priv->shared->case_sensitive_primary = is_case_sensitive;
}

void
e_searching_tokenizer_set_secondary_search_string (ESearchingTokenizer *st, const gchar *search_str)
{
	g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));

	g_free (st->priv->shared->str_secondary);
	st->priv->shared->str_secondary = NULL;

	if (search_str != NULL
	    && g_utf8_validate (search_str, -1, NULL)
	    && !only_whitespace (search_str)) {

		st->priv->shared->str_secondary = g_strdup (search_str);
	}
}

void
e_searching_tokenizer_set_secondary_case_sensitivity (ESearchingTokenizer *st, gboolean is_case_sensitive)
{
	g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));

	st->priv->shared->case_sensitive_secondary = is_case_sensitive;
}

gint
e_searching_tokenizer_match_count (ESearchingTokenizer *st)
{
	g_return_val_if_fail (st && E_IS_SEARCHING_TOKENIZER (st), -1);

	return st->priv->match_count;
}