Search in sources :

Example 1 with CSVFileReader

Use of org.kitesdk.data.spi.filesystem.CSVFileReader in project nifi by apache.

The onTrigger method of the class ConvertCSVToAvro.

/**
 * Converts the CSV content of one incoming FlowFile into an Avro data file.
 *
 * <p>Flow of a single invocation:
 * <ol>
 *   <li>Take one FlowFile from the session; return immediately if none is available.</li>
 *   <li>Build the CSV parsing properties and resolve the target schema from the processor
 *       properties, evaluated against the FlowFile's attributes. If the schema cannot be
 *       found the FlowFile is routed to {@code FAILURE}.</li>
 *   <li>Clone the FlowFile (so the original CSV content stays available for error
 *       reporting), then rewrite the original's content as Avro, record by record.
 *       Records that raise {@code DatasetRecordException} are counted in a
 *       {@code FailureTracker} instead of aborting the conversion.</li>
 *   <li>Route the results: converted output to {@code SUCCESS} when at least one record was
 *       written, with the clone going to {@code INCOMPATIBLE} if some records failed; when
 *       nothing was written, the clone goes to {@code FAILURE} with an {@code errors}
 *       attribute.</li>
 * </ol>
 *
 * @param context provides the processor property values
 * @param session the session used to read, write, clone, transfer and remove FlowFiles
 * @throws ProcessException declared by the overridden method
 * @throws RuntimeException if closing the Avro writer fails with an {@code IOException}
 */
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingCSV = session.get();
    if (incomingCSV == null) {
        return;
    }

    // Every CSV setting is evaluated against the incoming FlowFile's attributes.
    CSVProperties props = new CSVProperties.Builder()
            .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(incomingCSV).getValue())
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(incomingCSV).getValue())
            .quote(context.getProperty(QUOTE).evaluateAttributeExpressions(incomingCSV).getValue())
            .escape(context.getProperty(ESCAPE).evaluateAttributeExpressions(incomingCSV).getValue())
            .hasHeader(context.getProperty(HAS_HEADER).evaluateAttributeExpressions(incomingCSV).asBoolean())
            .linesToSkip(context.getProperty(LINES_TO_SKIP).evaluateAttributeExpressions(incomingCSV).asInteger())
            .build();

    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingCSV).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        // Pass the exception along so the cause is not lost from the log.
        getLogger().error("Cannot find schema: " + schemaProperty, e);
        session.transfer(incomingCSV, FAILURE);
        return;
    }

    try (final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
        writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));

        // Declared outside the inner try so the exception handlers can clean up the clone.
        FlowFile badRecords = null;
        try {
            final AtomicLong written = new AtomicLong(0L);
            final FailureTracker failures = new FailureTracker();

            // The clone must be taken before session.write replaces the FlowFile's content.
            badRecords = session.clone(incomingCSV);
            FlowFile outgoingAvro = session.write(incomingCSV, new StreamCallback() {

                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
                        reader.initialize();
                        try (DataFileWriter<Record> w = writer.create(schema, out)) {
                            while (reader.hasNext()) {
                                try {
                                    Record record = reader.next();
                                    w.append(record);
                                    written.incrementAndGet();
                                } catch (DatasetRecordException e) {
                                    // A bad record is tallied; conversion continues with the next one.
                                    failures.add(e);
                                }
                            }
                        }
                    }
                }
            });

            long errors = failures.count();
            session.adjustCounter("Converted records", written.get(), false);
            session.adjustCounter("Conversion errors", errors, false);

            if (written.get() > 0L) {
                session.transfer(outgoingAvro, SUCCESS);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors + written.get() });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                    session.transfer(badRecords, INCOMPATIBLE);
                } else {
                    session.remove(badRecords);
                }
            } else {
                // Nothing was converted: drop the empty output and report through the clone.
                session.remove(outgoingAvro);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                } else {
                    badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
                }
                session.transfer(badRecords, FAILURE);
            }
        } catch (ProcessException | DatasetIOException e) {
            getLogger().error("Failed reading or writing", e);
            session.transfer(incomingCSV, FAILURE);
            // The clone was created before the failure and has not been routed anywhere;
            // remove it so it is not left in the session without a destination.
            if (badRecords != null) {
                session.remove(badRecords);
            }
        } catch (DatasetException e) {
            getLogger().error("Failed to read FlowFile", e);
            session.transfer(incomingCSV, FAILURE);
            // Same clean-up as above: the clone must not be left unhandled.
            if (badRecords != null) {
                session.remove(badRecords);
            }
        }
    } catch (final IOException ioe) {
        throw new RuntimeException("Unable to close Avro Writer", ioe);
    }
}
Also used : Schema(org.apache.avro.Schema) OutputStream(java.io.OutputStream) DatasetRecordException(org.kitesdk.data.DatasetRecordException) DatasetException(org.kitesdk.data.DatasetException) Record(org.apache.avro.generic.GenericData.Record) FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) StreamCallback(org.apache.nifi.processor.io.StreamCallback) CSVProperties(org.kitesdk.data.spi.filesystem.CSVProperties) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) CSVFileReader(org.kitesdk.data.spi.filesystem.CSVFileReader) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException) DatasetIOException(org.kitesdk.data.DatasetIOException)

Aggregations

IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 OutputStream (java.io.OutputStream)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 Schema (org.apache.avro.Schema)1 DataFileWriter (org.apache.avro.file.DataFileWriter)1 Record (org.apache.avro.generic.GenericData.Record)1 FlowFile (org.apache.nifi.flowfile.FlowFile)1 ProcessException (org.apache.nifi.processor.exception.ProcessException)1 StreamCallback (org.apache.nifi.processor.io.StreamCallback)1 DatasetException (org.kitesdk.data.DatasetException)1 DatasetIOException (org.kitesdk.data.DatasetIOException)1 DatasetRecordException (org.kitesdk.data.DatasetRecordException)1 SchemaNotFoundException (org.kitesdk.data.SchemaNotFoundException)1 CSVFileReader (org.kitesdk.data.spi.filesystem.CSVFileReader)1 CSVProperties (org.kitesdk.data.spi.filesystem.CSVProperties)1