Search in sources :

Example 1 with CSVProperties

Use of org.kitesdk.data.spi.filesystem.CSVProperties in the Apache NiFi project.

The onTrigger method of the ConvertCSVToAvro class.

/**
 * Converts the incoming CSV FlowFile to Avro using the Kite SDK.
 * <p>
 * CSV parsing options (charset, delimiter, quote, escape, header handling, lines to
 * skip) are built from processor properties, with expression language evaluated
 * against the incoming FlowFile. Records that fail conversion are tracked and routed
 * on a cloned FlowFile to INCOMPATIBLE (when some records converted) or FAILURE
 * (when none converted).
 */
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingCSV = session.get();
    if (incomingCSV == null) {
        return;
    }
    // Build the Kite CSV parsing options from processor properties, evaluating
    // expression language against the incoming FlowFile's attributes.
    CSVProperties props = new CSVProperties.Builder()
            .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(incomingCSV).getValue())
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(incomingCSV).getValue())
            .quote(context.getProperty(QUOTE).evaluateAttributeExpressions(incomingCSV).getValue())
            .escape(context.getProperty(ESCAPE).evaluateAttributeExpressions(incomingCSV).getValue())
            .hasHeader(context.getProperty(HAS_HEADER).evaluateAttributeExpressions(incomingCSV).asBoolean())
            .linesToSkip(context.getProperty(LINES_TO_SKIP).evaluateAttributeExpressions(incomingCSV).asInteger())
            .build();
    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingCSV).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        // Include the exception as the cause so the stack trace is not lost
        // (the original logged only the message via string concatenation).
        getLogger().error("Cannot find schema: " + schemaProperty, e);
        session.transfer(incomingCSV, FAILURE);
        return;
    }
    try (final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
        writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
        try {
            final AtomicLong written = new AtomicLong(0L);
            final FailureTracker failures = new FailureTracker();
            // Clone up front so the original attributes are preserved for the
            // bad-records FlowFile regardless of how the conversion goes.
            // NOTE(review): if session.write or a transfer below throws, this clone
            // is never removed and must be reconciled at session commit/rollback —
            // confirm the framework rolls it back as expected.
            FlowFile badRecords = session.clone(incomingCSV);
            FlowFile outgoingAvro = session.write(incomingCSV, new StreamCallback() {

                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
                        reader.initialize();
                        try (DataFileWriter<Record> w = writer.create(schema, out)) {
                            while (reader.hasNext()) {
                                try {
                                    Record record = reader.next();
                                    w.append(record);
                                    written.incrementAndGet();
                                } catch (DatasetRecordException e) {
                                    // A single bad record should not abort the whole
                                    // conversion; track it and keep going.
                                    failures.add(e);
                                }
                            }
                        }
                    }
                }
            });
            long errors = failures.count();
            session.adjustCounter("Converted records", written.get(), false);
            session.adjustCounter("Conversion errors", errors, false);
            if (written.get() > 0L) {
                // At least one record converted: the Avro output is a SUCCESS, and
                // the clone carries the error summary to INCOMPATIBLE if needed.
                session.transfer(outgoingAvro, SUCCESS);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors + written.get() });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                    session.transfer(badRecords, INCOMPATIBLE);
                } else {
                    session.remove(badRecords);
                }
            } else {
                // Nothing converted: discard the (empty) Avro output and route the
                // original content to FAILURE with an explanatory attribute.
                session.remove(outgoingAvro);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                } else {
                    badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
                }
                session.transfer(badRecords, FAILURE);
            }
        } catch (ProcessException | DatasetIOException e) {
            getLogger().error("Failed reading or writing", e);
            session.transfer(incomingCSV, FAILURE);
        } catch (DatasetException e) {
            getLogger().error("Failed to read FlowFile", e);
            session.transfer(incomingCSV, FAILURE);
        }
    } catch (final IOException ioe) {
        // DataFileWriter.close() failed; nothing sensible can be routed at this point.
        throw new RuntimeException("Unable to close Avro Writer", ioe);
    }
}
Also used : Schema(org.apache.avro.Schema) OutputStream(java.io.OutputStream) DatasetRecordException(org.kitesdk.data.DatasetRecordException) DatasetException(org.kitesdk.data.DatasetException) Record(org.apache.avro.generic.GenericData.Record) FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) StreamCallback(org.apache.nifi.processor.io.StreamCallback) CSVProperties(org.kitesdk.data.spi.filesystem.CSVProperties) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) CSVFileReader(org.kitesdk.data.spi.filesystem.CSVFileReader) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException) DatasetIOException(org.kitesdk.data.DatasetIOException)

Example 2 with CSVProperties

Use of org.kitesdk.data.spi.filesystem.CSVProperties in the Apache NiFi project.

The inferAvroSchemaFromCSV method of the InferAvroSchema class.

/**
 * Infers the Avro schema from the input FlowFile content. To infer an Avro schema for CSV content a header line is
 * required. You can configure the processor to pull that header line from the first line of the CSV data if it is
 * present OR you can manually supply the desired header line as a property value.
 *
 * @param inputFlowFile
 *  The original input FlowFile containing the CSV content as it entered this processor.
 *
 * @param context
 *  ProcessContext to pull processor configurations.
 *
 * @param session
 *  ProcessSession to transfer FlowFiles
 *
 * @return the inferred Avro schema rendered as a JSON string
 */
private String inferAvroSchemaFromCSV(final FlowFile inputFlowFile, final ProcessContext context, final ProcessSession session) {
    // Determines the header line either from the property input or the first line of the delimited file.
    final AtomicReference<String> header = new AtomicReference<>();
    final AtomicReference<Boolean> hasHeader = new AtomicReference<>();
    final String charsetName = context.getProperty(CHARSET).evaluateAttributeExpressions(inputFlowFile).getValue();
    if (context.getProperty(GET_CSV_HEADER_DEFINITION_FROM_INPUT).asBoolean() == Boolean.TRUE) {
        // Read the first line of the file to get the header value.
        session.read(inputFlowFile, new InputStreamCallback() {

            @Override
            public void process(InputStream in) throws IOException {
                // Decode with the configured charset — the original used the platform
                // default, which could mis-read the header when CHARSET differs.
                // try-with-resources also guarantees the reader is closed even if
                // readLine throws (the original closed it manually).
                try (BufferedReader br = new BufferedReader(
                        new InputStreamReader(in, java.nio.charset.Charset.forName(charsetName)))) {
                    header.set(br.readLine());
                }
            }
        });
        // Set once, after the read completes (the original redundantly set this
        // both inside and after the callback).
        hasHeader.set(Boolean.TRUE);
    } else {
        header.set(context.getProperty(CSV_HEADER_DEFINITION).evaluateAttributeExpressions(inputFlowFile).getValue());
        hasHeader.set(Boolean.FALSE);
    }
    // Prepares the CSVProperties for kite
    CSVProperties props = new CSVProperties.Builder()
            .charset(charsetName)
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(inputFlowFile).getValue())
            .quote(context.getProperty(QUOTE_STRING).evaluateAttributeExpressions(inputFlowFile).getValue())
            .escape(context.getProperty(ESCAPE_STRING).evaluateAttributeExpressions(inputFlowFile).getValue())
            .linesToSkip(context.getProperty(HEADER_LINE_SKIP_COUNT).evaluateAttributeExpressions(inputFlowFile).asInteger())
            .header(header.get())
            .hasHeader(hasHeader.get())
            .build();
    final AtomicReference<String> avroSchema = new AtomicReference<>();
    session.read(inputFlowFile, new InputStreamCallback() {

        @Override
        public void process(InputStream in) throws IOException {
            // Delegate schema inference to Kite; render pretty or compact JSON
            // according to the PRETTY_AVRO_OUTPUT property.
            avroSchema.set(CSVUtil.inferSchema(
                    context.getProperty(RECORD_NAME).evaluateAttributeExpressions(inputFlowFile).getValue(),
                    in, props).toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
        }
    });
    return avroSchema.get();
}
Also used : InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) CSVProperties(org.kitesdk.data.spi.filesystem.CSVProperties) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) BufferedReader(java.io.BufferedReader)

Aggregations

IOException (java.io.IOException)2 InputStream (java.io.InputStream)2 CSVProperties (org.kitesdk.data.spi.filesystem.CSVProperties)2 BufferedReader (java.io.BufferedReader)1 InputStreamReader (java.io.InputStreamReader)1 OutputStream (java.io.OutputStream)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 Schema (org.apache.avro.Schema)1 DataFileWriter (org.apache.avro.file.DataFileWriter)1 Record (org.apache.avro.generic.GenericData.Record)1 FlowFile (org.apache.nifi.flowfile.FlowFile)1 ProcessException (org.apache.nifi.processor.exception.ProcessException)1 InputStreamCallback (org.apache.nifi.processor.io.InputStreamCallback)1 StreamCallback (org.apache.nifi.processor.io.StreamCallback)1 DatasetException (org.kitesdk.data.DatasetException)1 DatasetIOException (org.kitesdk.data.DatasetIOException)1 DatasetRecordException (org.kitesdk.data.DatasetRecordException)1 SchemaNotFoundException (org.kitesdk.data.SchemaNotFoundException)1 CSVFileReader (org.kitesdk.data.spi.filesystem.CSVFileReader)1