input: add confidence (detection strength) to format_match()

When users don't specify the input format, applications can try to have
the format auto-detected. Some of the tests were weak and could result
in false positives.

Add a 'confidence' parameter to the input modules' format_match()
method. Claim high confidence (1) for those formats which can check
magic strings or the presence of essential keywords (vcd, wav). Claim
medium confidence (10) for those formats which happened to mis-detect
inputs in the past (trace32_ad). Claim weak confidence (100) for those
formats which totally lack reliable conditions and rather guess than
detect (chronovu_la8).

Prefer the best match in public scan routines. Return at most one module
to callers even if multiple modules matched, to keep the current API.

This addresses part of bug #1200.
This commit is contained in:
Gerhard Sittig 2018-05-12 11:25:36 +02:00 committed by Uwe Hermann
parent 7102443a4a
commit 54ee427df0
6 changed files with 80 additions and 32 deletions

View File

@ -37,16 +37,23 @@ struct context {
uint64_t samplerate;
};
static int format_match(GHashTable *metadata)
static int format_match(GHashTable *metadata, unsigned int *confidence)
{
int size;
/*
* In the absence of a reliable condition like magic strings,
* we can only guess based on the file size. Since this is
* rather weak a condition, signal "little confidence" and
* optionally give precedence to better matches.
*/
size = GPOINTER_TO_INT(g_hash_table_lookup(metadata,
GINT_TO_POINTER(SR_INPUT_META_FILESIZE)));
if (size == CHRONOVU_LA8_FILESIZE)
return SR_OK;
if (size != CHRONOVU_LA8_FILESIZE)
return SR_ERR;
*confidence = 100;
return SR_OK;
}
static int init(struct sr_input *in, GHashTable *options)

View File

@ -342,7 +342,9 @@ static gboolean check_required_metadata(const uint8_t *metadata, uint8_t *avail)
* 128 bytes is normally enough.
*
* If an input module is found, an instance is created into *in.
* Otherwise, *in contains NULL.
* Otherwise, *in contains NULL. When multiple input moduless claim
* support for the format, the one with highest confidence takes
* precedence. Applications will see at most one input module spec.
*
* If an instance is created, it has the given buffer used for scanning
* already submitted to it, to be processed before more data is sent.
@ -353,9 +355,10 @@ static gboolean check_required_metadata(const uint8_t *metadata, uint8_t *avail)
*/
SR_API int sr_input_scan_buffer(GString *buf, const struct sr_input **in)
{
const struct sr_input_module *imod;
const struct sr_input_module *imod, *best_imod;
GHashTable *meta;
unsigned int m, i;
unsigned int conf, best_conf;
int ret;
uint8_t mitem, avail_metadata[8];
@ -364,7 +367,8 @@ SR_API int sr_input_scan_buffer(GString *buf, const struct sr_input **in)
avail_metadata[1] = 0;
*in = NULL;
ret = SR_ERR;
best_imod = NULL;
best_conf = ~0;
for (i = 0; input_module_list[i]; i++) {
imod = input_module_list[i];
if (!imod->metadata[0]) {
@ -388,45 +392,55 @@ SR_API int sr_input_scan_buffer(GString *buf, const struct sr_input **in)
continue;
}
sr_spew("Trying module %s.", imod->id);
ret = imod->format_match(meta);
ret = imod->format_match(meta, &conf);
g_hash_table_destroy(meta);
if (ret == SR_ERR_DATA) {
/* Module recognized this buffer, but cannot handle it. */
break;
continue;
} else if (ret == SR_ERR) {
/* Module didn't recognize this buffer. */
continue;
} else if (ret != SR_OK) {
/* Can be SR_ERR_NA. */
return ret;
continue;
}
/* Found a matching module. */
sr_spew("Module %s matched.", imod->id);
*in = sr_input_new(imod, NULL);
g_string_insert_len((*in)->buf, 0, buf->str, buf->len);
break;
sr_spew("Module %s matched, confidence %u.", imod->id, conf);
if (conf >= best_conf)
continue;
best_imod = imod;
best_conf = conf;
}
return ret;
if (best_imod) {
*in = sr_input_new(best_imod, NULL);
g_string_insert_len((*in)->buf, 0, buf->str, buf->len);
return SR_OK;
}
return SR_ERR;
}
/**
* Try to find an input module that can parse the given file.
*
* If an input module is found, an instance is created into *in.
* Otherwise, *in contains NULL.
* Otherwise, *in contains NULL. When multiple input moduless claim
* support for the format, the one with highest confidence takes
* precedence. Applications will see at most one input module spec.
*
*/
SR_API int sr_input_scan_file(const char *filename, const struct sr_input **in)
{
int64_t filesize;
FILE *stream;
const struct sr_input_module *imod;
const struct sr_input_module *imod, *best_imod;
GHashTable *meta;
GString *header;
size_t count;
unsigned int midx, i;
unsigned int conf, best_conf;
int ret;
uint8_t avail_metadata[8];
@ -475,8 +489,8 @@ SR_API int sr_input_scan_file(const char *filename, const struct sr_input **in)
avail_metadata[midx] = 0;
/* TODO: MIME type */
ret = SR_ERR;
best_imod = NULL;
best_conf = ~0;
for (i = 0; input_module_list[i]; i++) {
imod = input_module_list[i];
if (!imod->metadata[0]) {
@ -490,24 +504,30 @@ SR_API int sr_input_scan_file(const char *filename, const struct sr_input **in)
sr_dbg("Trying module %s.", imod->id);
ret = imod->format_match(meta);
ret = imod->format_match(meta, &conf);
if (ret == SR_ERR) {
/* Module didn't recognize this buffer. */
continue;
} else if (ret != SR_OK) {
/* Module recognized this buffer, but cannot handle it. */
break;
continue;
}
/* Found a matching module. */
sr_dbg("Module %s matched.", imod->id);
*in = sr_input_new(imod, NULL);
break;
sr_dbg("Module %s matched, confidence %u.", imod->id, conf);
if (conf >= best_conf)
continue;
best_imod = imod;
best_conf = conf;
}
g_hash_table_destroy(meta);
g_string_free(header, TRUE);
return ret;
if (best_imod) {
*in = sr_input_new(best_imod, NULL);
return SR_OK;
}
return SR_ERR;
}
/**

View File

@ -156,13 +156,19 @@ static int init(struct sr_input *in, GHashTable *options)
return SR_OK;
}
static int format_match(GHashTable *metadata)
static int format_match(GHashTable *metadata, unsigned int *confidence)
{
GString *buf;
int rc;
buf = g_hash_table_lookup(metadata, GINT_TO_POINTER(SR_INPUT_META_HEADER));
rc = process_header(buf, NULL);
return process_header(buf, NULL);
if (rc != SR_OK)
return rc;
*confidence = 10;
return SR_OK;
}
static int process_header(GString *buf, struct context *inc)

View File

@ -265,7 +265,7 @@ static gboolean parse_header(const struct sr_input *in, GString *buf)
return status;
}
static int format_match(GHashTable *metadata)
static int format_match(GHashTable *metadata, unsigned int *confidence)
{
GString *buf, *tmpbuf;
gboolean status;
@ -283,7 +283,11 @@ static int format_match(GHashTable *metadata)
g_free(name);
g_free(contents);
return status ? SR_OK : SR_ERR;
if (!status)
return SR_ERR;
*confidence = 1;
return SR_OK;
}
/* Send all accumulated bytes from inc->buffer. */

View File

@ -124,7 +124,7 @@ static int parse_wav_header(GString *buf, struct context *inc)
return SR_OK;
}
static int format_match(GHashTable *metadata)
static int format_match(GHashTable *metadata, unsigned int *confidence)
{
GString *buf;
int ret;
@ -143,6 +143,8 @@ static int format_match(GHashTable *metadata)
if ((ret = parse_wav_header(buf, NULL)) != SR_OK)
return ret;
*confidence = 1;
return SR_OK;
}

View File

@ -426,6 +426,8 @@ struct sr_input_module {
* Check if this input module can load and parse the specified stream.
*
* @param[in] metadata Metadata the module can use to identify the stream.
* @param[out] confidence "Strength" of the detection.
* Specialized handlers can take precedence over generic/basic support.
*
* @retval SR_OK This module knows the format.
* @retval SR_ERR_NA There wasn't enough data for this module to
@ -434,8 +436,15 @@ struct sr_input_module {
* it. This means the stream is either corrupt, or indicates a
* feature that the module does not support.
* @retval SR_ERR This module does not know the format.
*
* Lower numeric values of 'confidence' mean that the input module
* stronger believes in its capability to handle this specific format.
* This way, multiple input modules can claim support for a format,
* and the application can pick the best match, or try fallbacks
* in case of errors. This approach also copes with formats that
* are unreliable to detect in the absence of magic signatures.
*/
int (*format_match) (GHashTable *metadata);
int (*format_match) (GHashTable *metadata, unsigned int *confidence);
/**
* Initialize the input module.