Use of org.kitesdk.data.spi.filesystem.CSVProperties in project nifi by apache.
From class ConvertCSVToAvro, method onTrigger:
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingCSV = session.get();
    if (incomingCSV == null) {
        return;
    }
    // Build the CSV parsing options from processor properties, evaluating
    // expression language against the incoming FlowFile.
    CSVProperties props = new CSVProperties.Builder()
            .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(incomingCSV).getValue())
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(incomingCSV).getValue())
            .quote(context.getProperty(QUOTE).evaluateAttributeExpressions(incomingCSV).getValue())
            .escape(context.getProperty(ESCAPE).evaluateAttributeExpressions(incomingCSV).getValue())
            .hasHeader(context.getProperty(HAS_HEADER).evaluateAttributeExpressions(incomingCSV).asBoolean())
            .linesToSkip(context.getProperty(LINES_TO_SKIP).evaluateAttributeExpressions(incomingCSV).asInteger())
            .build();
    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingCSV).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + schemaProperty);
        session.transfer(incomingCSV, FAILURE);
        return;
    }
    try (final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
        writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
        try {
            final AtomicLong written = new AtomicLong(0L);
            final FailureTracker failures = new FailureTracker();
            FlowFile badRecords = session.clone(incomingCSV);
            // Stream the incoming CSV through the Kite reader, appending each
            // successfully parsed record to the Avro container written to "out".
            FlowFile outgoingAvro = session.write(incomingCSV, new StreamCallback() {
                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
                        reader.initialize();
                        try (DataFileWriter<Record> w = writer.create(schema, out)) {
                            while (reader.hasNext()) {
                                try {
                                    Record record = reader.next();
                                    w.append(record);
                                    written.incrementAndGet();
                                } catch (DatasetRecordException e) {
                                    failures.add(e);
                                }
                            }
                        }
                    }
                }
            });
            long errors = failures.count();
            session.adjustCounter("Converted records", written.get(), false);
            session.adjustCounter("Conversion errors", errors, false);
            // Route by outcome: any converted records go to SUCCESS; parse
            // failures are summarized on the badRecords FlowFile.
            if (written.get() > 0L) {
                session.transfer(outgoingAvro, SUCCESS);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors + written.get() });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                    session.transfer(badRecords, INCOMPATIBLE);
                } else {
                    session.remove(badRecords);
                }
            } else {
                session.remove(outgoingAvro);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                } else {
                    badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
                }
                session.transfer(badRecords, FAILURE);
            }
        } catch (ProcessException | DatasetIOException e) {
            getLogger().error("Failed reading or writing", e);
            session.transfer(incomingCSV, FAILURE);
        } catch (DatasetException e) {
            getLogger().error("Failed to read FlowFile", e);
            session.transfer(incomingCSV, FAILURE);
        }
    } catch (final IOException ioe) {
        throw new RuntimeException("Unable to close Avro Writer", ioe);
    }
}
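
For context, here is a minimal standalone sketch of the same Kite CSV-reading pattern outside NiFi. It assumes kite-data-core and Avro are on the classpath; the User schema, the class name CsvToAvroSketch, and the inline CSV sample are illustrative and not part of the processor above.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData.Record;
import org.kitesdk.data.spi.filesystem.CSVFileReader;
import org.kitesdk.data.spi.filesystem.CSVProperties;

public class CsvToAvroSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative schema matching the two-column CSV sample below.
        Schema schema = SchemaBuilder.record("User").fields()
                .requiredString("name")
                .requiredInt("age")
                .endRecord();

        // Same builder the processor uses, with literal values in place of
        // expression-evaluated processor properties.
        CSVProperties props = new CSVProperties.Builder()
                .charset("utf8")
                .delimiter(",")
                .quote("\"")
                .escape("\\")
                .hasHeader(true)
                .linesToSkip(0)
                .build();

        InputStream in = new ByteArrayInputStream(
                "name,age\nalice,34\nbob,29\n".getBytes(StandardCharsets.UTF_8));

        try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
            reader.initialize(); // required before hasNext()/next()
            while (reader.hasNext()) {
                System.out.println(reader.next());
            }
        }
    }
}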
Use of org.kitesdk.data.spi.filesystem.CSVProperties in project nifi by apache.
From class InferAvroSchema, method inferAvroSchemaFromCSV:
/**
 * Infers the Avro schema from the input FlowFile content. To infer an Avro schema for CSV content, a header line
 * is required. You can configure the processor to pull that header line from the first line of the CSV data if it
 * is present, or you can manually supply the desired header line as a property value.
 *
 * @param inputFlowFile
 *            The original input FlowFile containing the CSV content as it entered this processor.
 *
 * @param context
 *            ProcessContext from which to pull processor configuration.
 *
 * @param session
 *            ProcessSession used to read the FlowFile content.
 *
 * @return the inferred Avro schema, serialized as a JSON string
 */
private String inferAvroSchemaFromCSV(final FlowFile inputFlowFile, final ProcessContext context, final ProcessSession session) {
    // Determine the header line either from the property input or from the first line of the delimited file.
    final AtomicReference<String> header = new AtomicReference<>();
    final AtomicReference<Boolean> hasHeader = new AtomicReference<>();
    if (context.getProperty(GET_CSV_HEADER_DEFINITION_FROM_INPUT).asBoolean() == Boolean.TRUE) {
        // Read the first line of the file to get the header value.
        session.read(inputFlowFile, new InputStreamCallback() {
            @Override
            public void process(InputStream in) throws IOException {
                try (BufferedReader br = new BufferedReader(new InputStreamReader(in))) {
                    header.set(br.readLine());
                    hasHeader.set(Boolean.TRUE);
                }
            }
        });
    } else {
        header.set(context.getProperty(CSV_HEADER_DEFINITION).evaluateAttributeExpressions(inputFlowFile).getValue());
        hasHeader.set(Boolean.FALSE);
    }
    // Prepare the CSVProperties for Kite.
    CSVProperties props = new CSVProperties.Builder()
            .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(inputFlowFile).getValue())
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(inputFlowFile).getValue())
            .quote(context.getProperty(QUOTE_STRING).evaluateAttributeExpressions(inputFlowFile).getValue())
            .escape(context.getProperty(ESCAPE_STRING).evaluateAttributeExpressions(inputFlowFile).getValue())
            .linesToSkip(context.getProperty(HEADER_LINE_SKIP_COUNT).evaluateAttributeExpressions(inputFlowFile).asInteger())
            .header(header.get())
            .hasHeader(hasHeader.get())
            .build();
    final AtomicReference<String> avroSchema = new AtomicReference<>();
    session.read(inputFlowFile, new InputStreamCallback() {
        @Override
        public void process(InputStream in) throws IOException {
            avroSchema.set(CSVUtil.inferSchema(
                    context.getProperty(RECORD_NAME).evaluateAttributeExpressions(inputFlowFile).getValue(), in, props)
                    .toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
        }
    });
    return avroSchema.get();
}
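
As a point of reference, the same inference call can be exercised outside NiFi. The sketch below is illustrative: the record name "Example", the class name InferSchemaSketch, and the sample rows are made up, and it assumes kite-data-core is on the classpath.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.avro.Schema;
import org.kitesdk.data.spi.filesystem.CSVProperties;
import org.kitesdk.data.spi.filesystem.CSVUtil;

public class InferSchemaSketch {
    public static void main(String[] args) throws IOException {
        // hasHeader(true) tells Kite to take the field names from the first
        // line, mirroring the GET_CSV_HEADER_DEFINITION_FROM_INPUT branch above.
        CSVProperties props = new CSVProperties.Builder()
                .charset("utf8")
                .delimiter(",")
                .hasHeader(true)
                .build();

        InputStream in = new ByteArrayInputStream(
                "id,name,score\n1,alice,98.5\n2,bob,87.0\n".getBytes(StandardCharsets.UTF_8));

        // inferSchema samples the rows to choose field types.
        Schema inferred = CSVUtil.inferSchema("Example", in, props);
        System.out.println(inferred.toString(true)); // pretty-printed schema JSON
    }
}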