Use of org.kitesdk.data.spi.filesystem.CSVFileReader in the Apache NiFi project.
The example below is the onTrigger method of the class ConvertCSVToAvro.
/**
 * Converts a single incoming CSV FlowFile into an Avro data file.
 *
 * <p>Routing performed by this method:
 * <ul>
 *   <li>The schema cannot be resolved: the incoming FlowFile is sent to FAILURE.</li>
 *   <li>At least one record was written: the converted content is sent to SUCCESS. If some
 *       records were rejected, a clone of the original CSV carrying an {@code errors}
 *       attribute is sent to INCOMPATIBLE; otherwise the clone is removed.</li>
 *   <li>No record was written: the converted FlowFile is removed and the clone, carrying an
 *       {@code errors} attribute, is sent to FAILURE.</li>
 *   <li>Reading or writing failed with an exception: the incoming FlowFile is sent to
 *       FAILURE and the clone is removed.</li>
 * </ul>
 *
 * @param context supplies the processor property values (CSV format, schema, compression)
 * @param session session used to read, write, clone, and route FlowFiles
 * @throws ProcessException as declared by the overridden method
 * @throws RuntimeException if closing the Avro writer raises an {@link IOException}
 */
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingCSV = session.get();
    if (incomingCSV == null) {
        return;
    }

    // All CSV parsing options are evaluated against the attributes of the incoming FlowFile.
    final CSVProperties props = new CSVProperties.Builder()
            .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(incomingCSV).getValue())
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(incomingCSV).getValue())
            .quote(context.getProperty(QUOTE).evaluateAttributeExpressions(incomingCSV).getValue())
            .escape(context.getProperty(ESCAPE).evaluateAttributeExpressions(incomingCSV).getValue())
            .hasHeader(context.getProperty(HAS_HEADER).evaluateAttributeExpressions(incomingCSV).asBoolean())
            .linesToSkip(context.getProperty(LINES_TO_SKIP).evaluateAttributeExpressions(incomingCSV).asInteger())
            .build();

    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingCSV).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        // Keep the cause in the log so the reason the lookup failed is not lost.
        getLogger().error("Cannot find schema: " + schemaProperty, e);
        session.transfer(incomingCSV, FAILURE);
        return;
    }

    try (final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
        writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));

        // Declared outside the inner try so the failure handlers can dispose of the clone.
        FlowFile badRecords = null;
        try {
            final AtomicLong written = new AtomicLong(0L);
            final FailureTracker failures = new FailureTracker();

            // The clone must be taken before session.write replaces the CSV content with Avro.
            badRecords = session.clone(incomingCSV);
            FlowFile outgoingAvro = session.write(incomingCSV, new StreamCallback() {
                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
                        reader.initialize();
                        try (DataFileWriter<Record> w = writer.create(schema, out)) {
                            while (reader.hasNext()) {
                                try {
                                    Record record = reader.next();
                                    w.append(record);
                                    written.incrementAndGet();
                                } catch (DatasetRecordException e) {
                                    // A bad record is tallied and skipped; conversion continues.
                                    failures.add(e);
                                }
                            }
                        }
                    }
                }
            });

            long errors = failures.count();
            session.adjustCounter("Converted records", written.get(), false);
            session.adjustCounter("Conversion errors", errors, false);

            if (written.get() > 0L) {
                session.transfer(outgoingAvro, SUCCESS);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro",
                            new Object[] { errors, errors + written.get() });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                    session.transfer(badRecords, INCOMPATIBLE);
                } else {
                    session.remove(badRecords);
                }
            } else {
                // Nothing was converted: drop the empty Avro output and fail the original CSV.
                session.remove(outgoingAvro);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro",
                            new Object[] { errors, errors });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                } else {
                    badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
                }
                session.transfer(badRecords, FAILURE);
            }
        } catch (ProcessException | DatasetIOException e) {
            getLogger().error("Failed reading or writing", e);
            // The clone would otherwise be left without a destination and block the commit.
            if (badRecords != null) {
                session.remove(badRecords);
            }
            session.transfer(incomingCSV, FAILURE);
        } catch (DatasetException e) {
            getLogger().error("Failed to read FlowFile", e);
            // The clone would otherwise be left without a destination and block the commit.
            if (badRecords != null) {
                session.remove(badRecords);
            }
            session.transfer(incomingCSV, FAILURE);
        }
    } catch (final IOException ioe) {
        throw new RuntimeException("Unable to close Avro Writer", ioe);
    }
}
Aggregations