Example usage of com.fasterxml.jackson.dataformat.csv.CsvFactory in the finos/tracdap project.
From class CsvDecoder, method decodeChunk (variant 1).
@Override
protected void decodeChunk(ByteBuf chunk) {

    // Fresh parser factory per chunk; trim padding and insist every column is present
    var factory = new CsvFactory()
            .enable(CsvParser.Feature.TRIM_SPACES)
            .enable(CsvParser.Feature.FAIL_ON_MISSING_COLUMNS);

    try (var stream = new ByteBufInputStream(chunk);
         var parser = (CsvParser) factory.createParser((InputStream) stream)) {

        // Build the CSV schema from the Arrow schema, honouring the default header setting
        var schema = CsvSchemaMapping.arrowToCsv(this.arrowSchema).build();
        schema = DEFAULT_HEADER_FLAG ? schema.withHeader() : schema.withoutHeader();
        parser.setSchema(schema);

        var nRows = 0;
        var nCols = 0;

        for (var token = parser.nextToken(); token != null; token = parser.nextToken()) {

            switch (token) {

                // CSV fields are always preceded by a (null) field name token - skip it
                case FIELD_NAME:
                    continue;

                // Any scalar value goes into the current column of the current row
                case VALUE_NULL:
                case VALUE_TRUE:
                case VALUE_FALSE:
                case VALUE_STRING:
                case VALUE_NUMBER_INT:
                case VALUE_NUMBER_FLOAT:
                    var targetVector = root.getVector(nCols);
                    JacksonValues.parseAndSet(targetVector, nRows, parser, token);
                    nCols++;
                    break;

                // Row start: (re)allocate vector storage at the top of each batch
                case START_OBJECT:
                    if (nRows == 0)
                        for (var fieldVector : root.getFieldVectors())
                            fieldVector.allocateNew();
                    break;

                // Row end: advance counters and dispatch a full batch when one is ready
                case END_OBJECT:
                    nRows++;
                    nCols = 0;
                    if (nRows == BATCH_SIZE) {
                        root.setRowCount(nRows);
                        dispatchBatch(root);
                        nRows = 0;
                    }
                    break;

                // Anything else is not valid in a flat CSV record stream
                default:
                    var msg = String.format("Unexpected token %s", token.name());
                    throw new CsvReadException(parser, msg, schema);
            }
        }

        // Flush any remaining rows in a final, partial batch
        if (nRows > 0 || nCols > 0) {
            root.setRowCount(nRows);
            dispatchBatch(root);
        }
    }
    catch (JacksonException e) {

        // This exception is a "well-behaved" parse failure, parse location and message should be meaningful
        var errorMessage = String.format("CSV decoding failed on line %d: %s", e.getLocation().getLineNr(), e.getOriginalMessage());
        log.error(errorMessage, e);
        throw new EDataCorruption(errorMessage, e);
    }
    catch (IOException e) {

        // Decoders work on a stream of buffers, "real" IO exceptions should not occur
        // IO exceptions here indicate parse failures, not file/socket communication errors
        // This is likely to be a more "badly-behaved" failure, or at least one that was not anticipated
        var errorMessage = "CSV decoding failed, content is garbled: " + e.getMessage();
        log.error(errorMessage, e);
        throw new EDataCorruption(errorMessage, e);
    }
    catch (Throwable e) {

        // Ensure unexpected errors are still reported to the Flow API
        log.error("Unexpected error in CSV decoding", e);
        throw new EUnexpected(e);
    }
    finally {

        // The decoder owns the incoming buffer - always release it
        chunk.release();
    }
}
Example usage of com.fasterxml.jackson.dataformat.csv.CsvFactory in the finos/tracdap project.
From class CsvDecoder, method decodeChunk (variant 2, with null / empty-string differentiation).
// Decode one buffer of CSV content into Arrow vectors, dispatching batches of BATCH_SIZE rows.
// Differs from the simpler variant by enabling EMPTY_STRING_AS_NULL and then re-detecting
// genuinely empty (quoted) strings for VARCHAR columns, so null and "" are kept distinct.
// NOTE(review): control flow in the VALUE_NULL case depends on switch fall-through - see below.
@Override
protected void decodeChunk(ByteBuf chunk) {
// Parser features: strict column count, empty tokens reported as VALUE_NULL, padding trimmed
var csvFactory = new CsvFactory().enable(CsvParser.Feature.FAIL_ON_MISSING_COLUMNS).enable(CsvParser.Feature.EMPTY_STRING_AS_NULL).enable(CsvParser.Feature.TRIM_SPACES);
// Both the stream and the parser are closed by try-with-resources; the buffer itself
// is released in the finally block
try (var stream = new ByteBufInputStream(chunk);
var parser = (CsvParser) csvFactory.createParser((InputStream) stream)) {
// Derive the CSV schema from the Arrow schema, honouring the default header setting
var csvSchema = CsvSchemaMapping.arrowToCsv(this.arrowSchema).build();
csvSchema = DEFAULT_HEADER_FLAG ? csvSchema.withHeader() : csvSchema.withoutHeader();
parser.setSchema(csvSchema);
// row / col track the position within the current batch, not the whole file
var row = 0;
var col = 0;
JsonToken token;
while ((token = parser.nextToken()) != null) {
switch(token) {
// For CSV files, a null field name is produced for every field
case FIELD_NAME:
continue;
case VALUE_NULL:
// Special handling to differentiate between null and empty strings
var nullVector = root.getVector(col);
var minorType = nullVector.getMinorType();
if (minorType == Types.MinorType.VARCHAR) {
// Null strings are encoded with no space between commas (or EOL): some_value,,next_value
// An empty string is encoded as "", i.e. token width = 2 (or more with padding)
// Using token end - token start, a gap between commas -> empty string instead of null
// It would be nicer to check the original bytes to see if there are quote chars in there
// But this is not possible with the current Jackson API
var tokenStart = parser.currentTokenLocation();
var tokenEnd = parser.currentLocation();
var tokenWidth = tokenEnd.getColumnNr() - tokenStart.getColumnNr();
if (tokenWidth > 1) {
// Quoted empty string: store "" and move on (continue skips the fall-through below)
JacksonValues.setEmptyString(nullVector, row);
col++;
continue;
}
}
// Deliberate fall-through: a true null (or null in a non-VARCHAR column)
// is stored through the common parseAndSet path below
case VALUE_TRUE:
case VALUE_FALSE:
case VALUE_STRING:
case VALUE_NUMBER_INT:
case VALUE_NUMBER_FLOAT:
// Scalar value: write it into the current column / row, then advance the column
var vector = root.getVector(col);
JacksonValues.parseAndSet(vector, row, parser, token);
col++;
break;
case START_OBJECT:
// Row start: at the top of each batch, (re)allocate storage for all vectors
if (row == 0)
for (var vector_ : root.getFieldVectors()) vector_.allocateNew();
break;
case END_OBJECT:
// Row end: advance the row counter and dispatch a full batch when one is ready
row++;
col = 0;
if (row == BATCH_SIZE) {
root.setRowCount(row);
dispatchBatch(root);
row = 0;
}
break;
default:
// Any other token is not valid in a flat CSV record stream
var msg = String.format("Unexpected token %s", token.name());
throw new CsvReadException(parser, msg, csvSchema);
}
}
// Flush any remaining rows in a final, partial batch
if (row > 0 || col > 0) {
root.setRowCount(row);
dispatchBatch(root);
}
} catch (JacksonException e) {
// This exception is a "well-behaved" parse failure, parse location and message should be meaningful
var errorMessage = String.format("CSV decoding failed on line %d: %s", e.getLocation().getLineNr(), e.getOriginalMessage());
log.error(errorMessage, e);
throw new EDataCorruption(errorMessage, e);
} catch (IOException e) {
// Decoders work on a stream of buffers, "real" IO exceptions should not occur
// IO exceptions here indicate parse failures, not file/socket communication errors
// This is likely to be a more "badly-behaved" failure, or at least one that was not anticipated
var errorMessage = "CSV decoding failed, content is garbled: " + e.getMessage();
log.error(errorMessage, e);
throw new EDataCorruption(errorMessage, e);
} catch (Throwable e) {
// Ensure unexpected errors are still reported to the Flow API
log.error("Unexpected error in CSV decoding", e);
throw new EUnexpected(e);
} finally {
// The decoder owns the incoming buffer - always release it
chunk.release();
}
}
Aggregations