app: improve file magic matching

Change file magic matching from using a simple boolean "magic matches"
logic to using a matching quality. The quality is measured by the
number of bytes that matched.

Matching a single file procedure's magics now tries all magics and
returns the best match quality.

Searching for a file procedure for a given file now tries all file
procedures and returns the one with the best match quality.

This fixes raw camera files being opened as TIFF, as long as a better
magic than just the generic TIFF magic is provided.
This commit is contained in:
Michael Natterer
2016-04-24 23:56:57 +02:00
parent dc3f276476
commit 6af83a5a08

View File

@ -40,9 +40,10 @@
typedef enum typedef enum
{ {
FILE_MATCH_NONE, /* positive values indicate the length of a matching magic */
FILE_MATCH_MAGIC,
FILE_MATCH_SIZE FILE_MATCH_NONE = 0,
FILE_MATCH_SIZE = -1
} FileMatchType; } FileMatchType;
@ -116,11 +117,13 @@ file_procedure_find (GSList *procs,
/* Then look for magics, but not on remote files */ /* Then look for magics, but not on remote files */
if (g_file_is_native (file)) if (g_file_is_native (file))
{ {
GSList *list; GSList *list;
GInputStream *input = NULL; GInputStream *input = NULL;
gboolean opened = FALSE; gboolean opened = FALSE;
gsize head_size = 0; gsize head_size = 0;
guchar head[256]; guchar head[256];
FileMatchType best_match_val = FILE_MATCH_NONE;
GimpPlugInProcedure *best_file_proc = NULL;
for (list = procs; list; list = g_slist_next (list)) for (list = procs; list; list = g_slist_next (list))
{ {
@ -172,24 +175,28 @@ file_procedure_find (GSList *procs,
} }
else if (match_val != FILE_MATCH_NONE) else if (match_val != FILE_MATCH_NONE)
{ {
g_object_unref (input); g_printerr ("magic match %d on %s\n",
match_val,
gimp_object_get_name (file_proc));
return file_proc; if (match_val > best_match_val)
{
best_match_val = match_val;
best_file_proc = file_proc;
}
} }
} }
} }
} }
if (input) if (input)
{ g_object_unref (input);
#if 0
if (ferror (ifp))
g_set_error_literal (error, G_FILE_ERROR,
g_file_error_from_errno (errno),
g_strerror (errno));
#endif
g_object_unref (input); if (best_file_proc)
{
g_printerr ("best magic match on %s\n",
gimp_object_get_name (best_file_proc));
return best_file_proc;
} }
} }
@ -414,7 +421,7 @@ file_check_single_magic (const gchar *offset,
FileMatchType found = FILE_MATCH_NONE; FileMatchType found = FILE_MATCH_NONE;
glong offs; glong offs;
gulong num_testval; gulong num_testval;
gulong num_operatorval; gulong num_operator_val;
gint numbytes, k; gint numbytes, k;
const gchar *num_operator_ptr; const gchar *num_operator_ptr;
gchar num_operator; gchar num_operator;
@ -461,18 +468,20 @@ file_check_single_magic (const gchar *offset,
if (g_ascii_isdigit (num_operator_ptr[1])) if (g_ascii_isdigit (num_operator_ptr[1]))
{ {
if (num_operator_ptr[1] != '0') /* decimal */ if (num_operator_ptr[1] != '0') /* decimal */
sscanf (num_operator_ptr+1, "%lu", &num_operatorval); sscanf (num_operator_ptr+1, "%lu", &num_operator_val);
else if (num_operator_ptr[2] == 'x') /* hexadecimal */ else if (num_operator_ptr[2] == 'x') /* hexadecimal */
sscanf (num_operator_ptr+3, "%lx", &num_operatorval); sscanf (num_operator_ptr+3, "%lx", &num_operator_val);
else /* octal */ else /* octal */
sscanf (num_operator_ptr+2, "%lo", &num_operatorval); sscanf (num_operator_ptr+2, "%lo", &num_operator_val);
num_operator = *num_operator_ptr; num_operator = *num_operator_ptr;
} }
} }
if (numbytes > 0) /* Numerical test ? */ if (numbytes > 0)
{ {
/* Numerical test */
gchar num_test = '='; gchar num_test = '=';
gulong fileval = 0; gulong fileval = 0;
@ -489,8 +498,10 @@ file_check_single_magic (const gchar *offset,
if (errno != 0) if (errno != 0)
return FILE_MATCH_NONE; return FILE_MATCH_NONE;
if (numbytes == 5) /* Check for file size ? */ if (numbytes == 5)
{ {
/* Check for file size */
GFileInfo *info = g_file_query_info (file, GFileInfo *info = g_file_query_info (file,
G_FILE_ATTRIBUTE_STANDARD_SIZE, G_FILE_ATTRIBUTE_STANDARD_SIZE,
G_FILE_QUERY_INFO_NONE, G_FILE_QUERY_INFO_NONE,
@ -502,13 +513,17 @@ file_check_single_magic (const gchar *offset,
g_object_unref (info); g_object_unref (info);
} }
else if (offs >= 0 && else if (offs >= 0 &&
(offs + numbytes <= headsize)) /* We have it in memory ? */ (offs + numbytes <= headsize))
{ {
/* We have it in memory */
for (k = 0; k < numbytes; k++) for (k = 0; k < numbytes; k++)
fileval = (fileval << 8) | (glong) file_head[offs + k]; fileval = (fileval << 8) | (glong) file_head[offs + k];
} }
else /* Read it from file */ else
{ {
/* Read it from file */
if (! g_seekable_seek (G_SEEKABLE (input), offs, if (! g_seekable_seek (G_SEEKABLE (input), offs,
(offs >= 0) ? G_SEEK_SET : G_SEEK_END, (offs >= 0) ? G_SEEK_SET : G_SEEK_END,
NULL, NULL)) NULL, NULL))
@ -532,20 +547,31 @@ file_check_single_magic (const gchar *offset,
} }
if (num_operator == '&') if (num_operator == '&')
fileval &= num_operatorval; fileval &= num_operator_val;
if (num_test == '<') if (num_test == '<')
found = (fileval < num_testval); {
if (fileval < num_testval)
found = numbytes;
}
else if (num_test == '>') else if (num_test == '>')
found = (fileval > num_testval); {
if (fileval > num_testval)
found = numbytes;
}
else else
found = (fileval == num_testval); {
if (fileval == num_testval)
found = numbytes;
}
if (found && (numbytes == 5)) if (found && (numbytes == 5))
found = FILE_MATCH_SIZE; found = FILE_MATCH_SIZE;
} }
else if (numbytes == 0) /* String test */ else if (numbytes == 0)
{ {
/* String test */
gchar mem_testval[256]; gchar mem_testval[256];
file_convert_string (value, file_convert_string (value,
@ -556,20 +582,23 @@ file_check_single_magic (const gchar *offset,
return FILE_MATCH_NONE; return FILE_MATCH_NONE;
if (offs >= 0 && if (offs >= 0 &&
(offs + numbytes <= headsize)) /* We have it in memory ? */ (offs + numbytes <= headsize))
{ {
found = (memcmp (mem_testval, file_head + offs, numbytes) == 0); /* We have it in memory */
if (memcmp (mem_testval, file_head + offs, numbytes) == 0)
found = numbytes;
} }
else /* Read it from file */ else
{ {
/* Read it from file */
if (! g_seekable_seek (G_SEEKABLE (input), offs, if (! g_seekable_seek (G_SEEKABLE (input), offs,
(offs >= 0) ? G_SEEK_SET : G_SEEK_END, (offs >= 0) ? G_SEEK_SET : G_SEEK_END,
NULL, NULL)) NULL, NULL))
return FILE_MATCH_NONE; return FILE_MATCH_NONE;
found = FILE_MATCH_MAGIC; for (k = 0; k < numbytes; k++)
for (k = 0; found && (k < numbytes); k++)
{ {
guchar byte; guchar byte;
GError *error = NULL; GError *error = NULL;
@ -579,12 +608,15 @@ file_check_single_magic (const gchar *offset,
if (error) if (error)
{ {
g_clear_error (&error); g_clear_error (&error);
return FILE_MATCH_NONE; return FILE_MATCH_NONE;
} }
if (byte != mem_testval[k]) if (byte != mem_testval[k])
found = FILE_MATCH_NONE; return FILE_MATCH_NONE;
} }
found = numbytes;
} }
} }
@ -599,36 +631,89 @@ file_check_magic_list (GSList *magics_list,
GInputStream *input) GInputStream *input)
{ {
const gchar *offset; gboolean and = FALSE;
const gchar *type; gboolean found = FALSE;
const gchar *value; FileMatchType best_match_val = FILE_MATCH_NONE;
gboolean and = FALSE; FileMatchType match_val = FILE_MATCH_NONE;
gboolean found = FALSE;
FileMatchType match_val;
while (magics_list) for (; magics_list; magics_list = magics_list->next)
{ {
if ((offset = magics_list->data) == NULL) break; const gchar *offset;
if ((magics_list = magics_list->next) == NULL) break; const gchar *type;
if ((type = magics_list->data) == NULL) break; const gchar *value;
if ((magics_list = magics_list->next) == NULL) break; FileMatchType single_match_val = FILE_MATCH_NONE;
if ((value = magics_list->data) == NULL) break;
magics_list = magics_list->next; if ((offset = magics_list->data) == NULL) return FILE_MATCH_NONE;
if ((magics_list = magics_list->next) == NULL) return FILE_MATCH_NONE;
if ((type = magics_list->data) == NULL) return FILE_MATCH_NONE;
if ((magics_list = magics_list->next) == NULL) return FILE_MATCH_NONE;
if ((value = magics_list->data) == NULL) return FILE_MATCH_NONE;
single_match_val = file_check_single_magic (offset, type, value,
head, headsize,
file, input);
match_val = file_check_single_magic (offset, type, value,
head, headsize,
file, input);
if (and) if (and)
found = found && (match_val != FILE_MATCH_NONE); found = found && (single_match_val != FILE_MATCH_NONE);
else else
found = (match_val != FILE_MATCH_NONE); found = (single_match_val != FILE_MATCH_NONE);
if (match_val == FILE_MATCH_NONE)
{
/* if we have no match yet, this is it in any case */
match_val = single_match_val;
}
else if (single_match_val != FILE_MATCH_NONE)
{
/* else if we have a match on this one, combine it with the
* existing return value
*/
if (single_match_val == FILE_MATCH_SIZE)
{
/* if we already have a magic match, simply increase
* that by one to indicate "better match", not perfect
* but better than losing the additional size match
* entirely
*/
if (match_val != FILE_MATCH_SIZE)
match_val += 1;
}
else
{
/* if we already have a magic match, simply add to its
* length; otherwise if we already have a size match,
* combine it with this match, see comment above
*/
if (match_val != FILE_MATCH_SIZE)
match_val += single_match_val;
else
match_val = single_match_val + 1;
}
}
if (best_match_val == FILE_MATCH_NONE)
{
/* if we have no best match yet, this is it */
best_match_val = match_val;
}
else if (match_val != FILE_MATCH_NONE)
{
/* otherwise if this was a match, update the best match, note
* that by using MAX we will not overwrite a magic match
* with a size match
*/
best_match_val = MAX (best_match_val, match_val);
}
and = (strchr (offset, '&') != NULL); and = (strchr (offset, '&') != NULL);
if (! and && found) if (! and)
return match_val; match_val = FILE_MATCH_NONE;
} }
return FILE_MATCH_NONE; return best_match_val;
} }