input/csv: add automatic format match support
Implement .format_match() support in the CSV input module. Simple multi-column files will automatically load without an "-I csv" spec. Non-default option values still do require the module selection before options can get passed to the module.
This commit is contained in:
parent
43e1e23a11
commit
cd7c5f9655
123
src/input/csv.c
123
src/input/csv.c
|
@ -20,9 +20,11 @@
|
||||||
|
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
|
#include <ctype.h>
|
||||||
#include <glib.h>
|
#include <glib.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <strings.h>
|
||||||
|
|
||||||
#include <libsigrok/libsigrok.h>
|
#include <libsigrok/libsigrok.h>
|
||||||
#include "libsigrok-internal.h"
|
#include "libsigrok-internal.h"
|
||||||
|
@ -1026,6 +1028,125 @@ static const col_parse_cb col_parse_funcs[] = {
|
||||||
[FORMAT_TIME] = parse_timestamp,
|
[FORMAT_TIME] = parse_timestamp,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* BEWARE! Implementor's notes. Sync with feature set and default option
|
||||||
|
* values required during maintenance of the input module implementation.
|
||||||
|
*
|
||||||
|
* When applications invoke .format_match() routines, trying automatic
|
||||||
|
* determination of an input file's format handler, then no options are
|
||||||
|
* in effect. Because specifying options requires selection of an input
|
||||||
|
* module to pass the options to, which obsoletes the format-match check.
|
||||||
|
*
|
||||||
|
* Which means that we only need to deal with the default format here,
|
||||||
|
* which happens to be the simple multi-column format without header
|
||||||
|
* lines or leading garbage. Which means that the check can be rather
|
||||||
|
* strict, resulting in high levels of confidence upon match, never
|
||||||
|
* "accidently" winning for unreadable or unsupported-by-default formats.
|
||||||
|
*
|
||||||
|
* This .format_match() logic only needs to become more involved when
|
||||||
|
* default option values change, or when automatic detection of column
|
||||||
|
* data types improves. Then the supported-by-default types of input
|
||||||
|
* data must be considered acceptable here in the format-match check
|
||||||
|
* as well.
|
||||||
|
*
|
||||||
|
* Notice that the format check cannot re-use regular processing logic
|
||||||
|
* when their implementation assumes proper input data and wll generate
|
||||||
|
* diagnostics for unexpected input data. Failure to match the format is
|
||||||
|
* non-fatal here, mismatch must remain silent. It's up to applications
|
||||||
|
* how large a chunk of data gets passed here (start of the file's
|
||||||
|
* content). But inspection of the first few hundred bytes will usually
|
||||||
|
* be GoodEnough(TM) for the format-match purpose. Notice that filenames
|
||||||
|
* need not necessarily be available to the format-match routine.
|
||||||
|
*
|
||||||
|
* This implementation errs on the safe side. Users can always select
|
||||||
|
* the CSV input module when automatic format detection fails.
|
||||||
|
*/
|
||||||
|
static int format_match(GHashTable *metadata, unsigned int *confidence)
|
||||||
|
{
|
||||||
|
const int match_confidence = 100;
|
||||||
|
const char *default_extension = ".csv";
|
||||||
|
const char *line_termination = "\n";
|
||||||
|
const char *comment_leader = ";";
|
||||||
|
const char *column_separator = ",";
|
||||||
|
const char *binary_charset = "01";
|
||||||
|
|
||||||
|
const char *fn;
|
||||||
|
GString *buf;
|
||||||
|
size_t fn_len;
|
||||||
|
GString *tmpbuf;
|
||||||
|
gboolean status;
|
||||||
|
size_t line_idx, col_idx;
|
||||||
|
char *rdptr, **lines, *line;
|
||||||
|
char **cols, *col;
|
||||||
|
|
||||||
|
/* Get the application provided input data properties. */
|
||||||
|
fn = g_hash_table_lookup(metadata, GINT_TO_POINTER(SR_INPUT_META_FILENAME));
|
||||||
|
buf = g_hash_table_lookup(metadata, GINT_TO_POINTER(SR_INPUT_META_HEADER));
|
||||||
|
|
||||||
|
/* Filenames are a strong hint. Use then when available. */
|
||||||
|
if (fn && *fn && (fn_len = strlen(fn)) >= strlen(default_extension)) {
|
||||||
|
if (strcasecmp(&fn[fn_len - strlen(default_extension)], default_extension) == 0) {
|
||||||
|
*confidence = 10;
|
||||||
|
return SR_OK;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check file content for compatibility with the input module's
|
||||||
|
* default format. Which translates to:
|
||||||
|
* - Must be at least one text line worth of input data. Ignore
|
||||||
|
* incomplete lines at the end of the available buffer.
|
||||||
|
* - Must be LF terminated text lines, optional CR-LF sequence.
|
||||||
|
* (Drop CR-only for simplicity since that's rare and users
|
||||||
|
* can override the automatic detection.)
|
||||||
|
* - Strip comments and skip empty lines.
|
||||||
|
* - Data lines must be binary input (potentially multiple bits
|
||||||
|
* per column which then get ignored). Presence of comma is
|
||||||
|
* optional but then must be followed by another data column.
|
||||||
|
* - No other content is acceptable, there neither are ignored
|
||||||
|
* columns nor analog data nor timestamps in the default layout.
|
||||||
|
* (See the above "sync format match with default options"
|
||||||
|
* comment though during maintenance!)
|
||||||
|
* Run the check on a copy to not affect the caller's buffer.
|
||||||
|
*/
|
||||||
|
if (!buf || !buf->len || !buf->str || !*buf->str)
|
||||||
|
return SR_ERR;
|
||||||
|
rdptr = g_strstr_len(buf->str, buf->len, line_termination);
|
||||||
|
if (!rdptr)
|
||||||
|
return SR_ERR;
|
||||||
|
tmpbuf = g_string_new_len(buf->str, rdptr + 1 - buf->str);
|
||||||
|
tmpbuf->str[tmpbuf->len - 1] = '\0';
|
||||||
|
status = TRUE;
|
||||||
|
*confidence = match_confidence;
|
||||||
|
lines = g_strsplit(tmpbuf->str, line_termination, 0);
|
||||||
|
for (line_idx = 0; status && (line = lines[line_idx]); line_idx++) {
|
||||||
|
rdptr = strstr(line, comment_leader);
|
||||||
|
if (rdptr)
|
||||||
|
*rdptr = '\0';
|
||||||
|
line = g_strstrip(line);
|
||||||
|
if (!line || !*line)
|
||||||
|
continue;
|
||||||
|
cols = g_strsplit(line, column_separator, 0);
|
||||||
|
if (!cols) {
|
||||||
|
status = FALSE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
for (col_idx = 0; status && (col = cols[col_idx]); col_idx++) {
|
||||||
|
if (strspn(col, binary_charset) != strlen(col)) {
|
||||||
|
status = FALSE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g_strfreev(cols);
|
||||||
|
}
|
||||||
|
g_strfreev(lines);
|
||||||
|
g_string_free(tmpbuf, TRUE);
|
||||||
|
|
||||||
|
if (!status)
|
||||||
|
return SR_ERR;
|
||||||
|
return SR_OK;
|
||||||
|
}
|
||||||
|
|
||||||
static int init(struct sr_input *in, GHashTable *options)
|
static int init(struct sr_input *in, GHashTable *options)
|
||||||
{
|
{
|
||||||
struct context *inc;
|
struct context *inc;
|
||||||
|
@ -1678,7 +1799,9 @@ SR_PRIV struct sr_input_module input_csv = {
|
||||||
.name = "CSV",
|
.name = "CSV",
|
||||||
.desc = "Comma-separated values",
|
.desc = "Comma-separated values",
|
||||||
.exts = (const char*[]){"csv", NULL},
|
.exts = (const char*[]){"csv", NULL},
|
||||||
|
.metadata = { SR_INPUT_META_FILENAME, SR_INPUT_META_HEADER | SR_INPUT_META_REQUIRED },
|
||||||
.options = get_options,
|
.options = get_options,
|
||||||
|
.format_match = format_match,
|
||||||
.init = init,
|
.init = init,
|
||||||
.receive = receive,
|
.receive = receive,
|
||||||
.end = end,
|
.end = end,
|
||||||
|
|
Loading…
Reference in New Issue