
Example 1 with DatasetIOException

Use of org.kitesdk.data.DatasetIOException in project nifi by apache.

From the class ConvertAvroSchema, method onTrigger.

@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingAvro = session.get();
    if (incomingAvro == null) {
        return;
    }
    String inputSchemaProperty = context.getProperty(INPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
    final Schema inputSchema;
    try {
        inputSchema = getSchema(inputSchemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + inputSchemaProperty);
        session.transfer(incomingAvro, FAILURE);
        return;
    }
    String outputSchemaProperty = context.getProperty(OUTPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
    final Schema outputSchema;
    try {
        outputSchema = getSchema(outputSchemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + outputSchemaProperty);
        session.transfer(incomingAvro, FAILURE);
        return;
    }
    final Map<String, String> fieldMapping = new HashMap<>();
    for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
        if (entry.getKey().isDynamic()) {
            fieldMapping.put(entry.getKey().getName(), entry.getValue());
        }
    }
    // Set locale
    final String localeProperty = context.getProperty(LOCALE).getValue();
    final Locale locale = localeProperty.equals(DEFAULT_LOCALE_VALUE) ? Locale.getDefault() : LocaleUtils.toLocale(localeProperty);
    final AvroRecordConverter converter = new AvroRecordConverter(inputSchema, outputSchema, fieldMapping, locale);
    final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
    writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    final DataFileWriter<Record> failureWriter = new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
    failureWriter.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    try {
        final AtomicLong written = new AtomicLong(0L);
        final FailureTracker failures = new FailureTracker();
        final List<Record> badRecords = Lists.newLinkedList();
        FlowFile incomingAvroCopy = session.clone(incomingAvro);
        FlowFile outgoingAvro = session.write(incomingAvro, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileStream<Record> stream = new DataFileStream<Record>(in, new GenericDatumReader<Record>(converter.getInputSchema()))) {
                    try (DataFileWriter<Record> w = writer.create(outputSchema, out)) {
                        for (Record record : stream) {
                            try {
                                Record converted = converter.convert(record);
                                w.append(converted);
                                written.incrementAndGet();
                            } catch (AvroConversionException e) {
                                failures.add(e);
                                getLogger().error("Error converting data: " + e.getMessage());
                                badRecords.add(record);
                            }
                        }
                    }
                }
            }
        });
        FlowFile badOutput = session.write(incomingAvroCopy, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileWriter<Record> w = failureWriter.create(inputSchema, out)) {
                    for (Record record : badRecords) {
                        w.append(record);
                    }
                }
            }
        });
        long errors = failures.count();
        // counters are adjusted only when the session commits successfully (immediate = false)
        session.adjustCounter("Converted records", written.get(), false);
        session.adjustCounter("Conversion errors", errors, false);
        if (written.get() > 0L) {
            session.transfer(outgoingAvro, SUCCESS);
        } else {
            session.remove(outgoingAvro);
        }
        if (errors > 0L) {
            getLogger().warn("Failed to convert {}/{} records between Avro Schemas", new Object[] { errors, errors + written.get() });
            badOutput = session.putAttribute(badOutput, "errors", failures.summary());
            session.transfer(badOutput, FAILURE);
        } else if (written.get() > 0L) {
            // clean conversion: the failure clone is not needed
            session.remove(badOutput);
        } else {
            // nothing was read at all; flag the clone and route it to failure
            badOutput = session.putAttribute(badOutput, "errors", "No incoming records");
            session.transfer(badOutput, FAILURE);
        }
    } catch (ProcessException | DatasetIOException e) {
        getLogger().error("Failed reading or writing", e);
        session.transfer(incomingAvro, FAILURE);
    } catch (DatasetException e) {
        getLogger().error("Failed to read FlowFile", e);
        session.transfer(incomingAvro, FAILURE);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
        try {
            failureWriter.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
    }
}
Also used: Locale(java.util.Locale) HashMap(java.util.HashMap) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) OutputStream(java.io.OutputStream) DatasetException(org.kitesdk.data.DatasetException) Record(org.apache.avro.generic.GenericData.Record) FlowFile(org.apache.nifi.flowfile.FlowFile) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) StreamCallback(org.apache.nifi.processor.io.StreamCallback) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) AvroConversionException(org.apache.nifi.processors.kite.AvroRecordConverter.AvroConversionException) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException) Map(java.util.Map)
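
All four examples catch DatasetIOException next to ProcessException and route the incoming FlowFile to FAILURE. In the Kite SDK this exception is an unchecked wrapper around a checked java.io.IOException, which is why it can propagate out of the StreamCallback above even though the callback only declares IOException. Below is a minimal sketch of that wrapping pattern, assuming the (String, IOException) constructor; the helper method and message are invented for illustration.

import java.io.IOException;
import java.io.OutputStream;
import org.kitesdk.data.DatasetIOException;

public class DatasetIOExceptionSketch {

    // Hypothetical helper: rethrows a checked IOException as Kite's
    // unchecked DatasetIOException so callers such as onTrigger can catch
    // it together with other runtime failures.
    static void flushOrWrap(OutputStream out) {
        try {
            out.flush();
        } catch (IOException e) {
            throw new DatasetIOException("Cannot flush output stream", e);
        }
    }
}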

Example 2 with DatasetIOException

Use of org.kitesdk.data.DatasetIOException in project nifi by apache.

From the class ConvertCSVToAvro, method onTrigger.

@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingCSV = session.get();
    if (incomingCSV == null) {
        return;
    }
    CSVProperties props = new CSVProperties.Builder()
            .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(incomingCSV).getValue())
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(incomingCSV).getValue())
            .quote(context.getProperty(QUOTE).evaluateAttributeExpressions(incomingCSV).getValue())
            .escape(context.getProperty(ESCAPE).evaluateAttributeExpressions(incomingCSV).getValue())
            .hasHeader(context.getProperty(HAS_HEADER).evaluateAttributeExpressions(incomingCSV).asBoolean())
            .linesToSkip(context.getProperty(LINES_TO_SKIP).evaluateAttributeExpressions(incomingCSV).asInteger())
            .build();
    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingCSV).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + schemaProperty);
        session.transfer(incomingCSV, FAILURE);
        return;
    }
    try (final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
        writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
        try {
            final AtomicLong written = new AtomicLong(0L);
            final FailureTracker failures = new FailureTracker();
            FlowFile badRecords = session.clone(incomingCSV);
            FlowFile outgoingAvro = session.write(incomingCSV, new StreamCallback() {

                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
                        reader.initialize();
                        try (DataFileWriter<Record> w = writer.create(schema, out)) {
                            while (reader.hasNext()) {
                                try {
                                    Record record = reader.next();
                                    w.append(record);
                                    written.incrementAndGet();
                                } catch (DatasetRecordException e) {
                                    failures.add(e);
                                }
                            }
                        }
                    }
                }
            });
            long errors = failures.count();
            session.adjustCounter("Converted records", written.get(), false);
            session.adjustCounter("Conversion errors", errors, false);
            if (written.get() > 0L) {
                session.transfer(outgoingAvro, SUCCESS);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors + written.get() });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                    session.transfer(badRecords, INCOMPATIBLE);
                } else {
                    session.remove(badRecords);
                }
            } else {
                session.remove(outgoingAvro);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                } else {
                    badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
                }
                session.transfer(badRecords, FAILURE);
            }
        } catch (ProcessException | DatasetIOException e) {
            getLogger().error("Failed reading or writing", e);
            session.transfer(incomingCSV, FAILURE);
        } catch (DatasetException e) {
            getLogger().error("Failed to read FlowFile", e);
            session.transfer(incomingCSV, FAILURE);
        }
    } catch (final IOException ioe) {
        throw new RuntimeException("Unable to close Avro Writer", ioe);
    }
}
Also used: Schema(org.apache.avro.Schema) OutputStream(java.io.OutputStream) DatasetRecordException(org.kitesdk.data.DatasetRecordException) DatasetException(org.kitesdk.data.DatasetException) Record(org.apache.avro.generic.GenericData.Record) FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) StreamCallback(org.apache.nifi.processor.io.StreamCallback) CSVProperties(org.kitesdk.data.spi.filesystem.CSVProperties) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) CSVFileReader(org.kitesdk.data.spi.filesystem.CSVFileReader) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException)
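
The CSVProperties/CSVFileReader pair from this example also works outside NiFi. Below is a hypothetical standalone sketch using the same initialize/hasNext/next loop, skipping malformed rows via DatasetRecordException just as the FailureTracker branch above does; the schema, file name, and field names are invented.

import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData.Record;
import org.kitesdk.data.DatasetRecordException;
import org.kitesdk.data.spi.filesystem.CSVFileReader;
import org.kitesdk.data.spi.filesystem.CSVProperties;

public class CsvToAvroSketch {

    public static void main(String[] args) throws Exception {
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"long\"},"
                + "{\"name\":\"name\",\"type\":\"string\"}]}");
        CSVProperties props = new CSVProperties.Builder()
                .delimiter(",")
                .hasHeader(true)
                .build();
        try (InputStream in = new FileInputStream("users.csv");
             CSVFileReader<Record> reader =
                     new CSVFileReader<>(in, props, schema, Record.class)) {
            // initialize() must be called before iterating, as in onTrigger
            reader.initialize();
            while (reader.hasNext()) {
                try {
                    Record record = reader.next();
                    System.out.println(record);
                } catch (DatasetRecordException e) {
                    // malformed row; skip it, as the processor's
                    // FailureTracker does
                }
            }
        }
    }
}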

Example 3 with DatasetIOException

Use of org.kitesdk.data.DatasetIOException in project nifi by apache.

From the class ConvertJSONToAvro, method onTrigger.

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingJSON = session.get();
    if (incomingJSON == null) {
        return;
    }
    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingJSON).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + schemaProperty);
        session.transfer(incomingJSON, FAILURE);
        return;
    }
    final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class));
    writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    try {
        final AtomicLong written = new AtomicLong(0L);
        final FailureTracker failures = new FailureTracker();
        FlowFile badRecords = session.clone(incomingJSON);
        FlowFile outgoingAvro = session.write(incomingJSON, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (JSONFileReader<Record> reader = new JSONFileReader<>(in, schema, Record.class)) {
                    reader.initialize();
                    try (DataFileWriter<Record> w = writer.create(schema, out)) {
                        while (reader.hasNext()) {
                            try {
                                Record record = reader.next();
                                w.append(record);
                                written.incrementAndGet();
                            } catch (final DatasetRecordException e) {
                                failures.add(e);
                            }
                        }
                    }
                }
            }
        });
        long errors = failures.count();
        session.adjustCounter("Converted records", written.get(), false);
        session.adjustCounter("Conversion errors", errors, false);
        if (written.get() > 0L) {
            session.transfer(outgoingAvro, SUCCESS);
            if (errors > 0L) {
                getLogger().warn("Failed to convert {}/{} records from JSON to Avro", new Object[] { errors, errors + written.get() });
                badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                session.transfer(badRecords, INCOMPATIBLE);
            } else {
                session.remove(badRecords);
            }
        } else {
            session.remove(outgoingAvro);
            if (errors > 0L) {
                getLogger().warn("Failed to convert {}/{} records from JSON to Avro", new Object[] { errors, errors });
                badRecords = session.putAttribute(badRecords, "errors", failures.summary());
            } else {
                badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
            }
            session.transfer(badRecords, FAILURE);
        }
    } catch (ProcessException | DatasetIOException e) {
        getLogger().error("Failed reading or writing", e);
        session.transfer(incomingJSON, FAILURE);
    } catch (DatasetException e) {
        getLogger().error("Failed to read FlowFile", e);
        session.transfer(incomingJSON, FAILURE);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
    }
}
Also used: FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) OutputStream(java.io.OutputStream) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) StreamCallback(org.apache.nifi.processor.io.StreamCallback) DatasetRecordException(org.kitesdk.data.DatasetRecordException) DatasetException(org.kitesdk.data.DatasetException) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) Record(org.apache.avro.generic.GenericData.Record) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException) JSONFileReader(org.kitesdk.data.spi.filesystem.JSONFileReader)
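
JSONFileReader follows the same reader contract as CSVFileReader. Here is a self-contained sketch of the loop from onTrigger, reading two inline JSON records instead of a FlowFile; the schema and record contents are invented.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData.Record;
import org.kitesdk.data.spi.filesystem.JSONFileReader;

public class JsonToAvroSketch {

    public static void main(String[] args) throws Exception {
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"Event\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"long\"}]}");
        byte[] json = "{\"id\": 1}\n{\"id\": 2}\n".getBytes(StandardCharsets.UTF_8);
        try (InputStream in = new ByteArrayInputStream(json);
             JSONFileReader<Record> reader =
                     new JSONFileReader<>(in, schema, Record.class)) {
            reader.initialize();
            while (reader.hasNext()) {
                System.out.println(reader.next());
            }
        }
    }
}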

Example 4 with DatasetIOException

Use of org.kitesdk.data.DatasetIOException in project nifi by apache.

From the class StoreInKiteDataset, method onTrigger.

@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final View<Record> target = load(context, flowFile);
    final Schema schema = target.getDataset().getDescriptor().getSchema();
    try {
        StopWatch timer = new StopWatch(true);
        session.read(flowFile, new InputStreamCallback() {

            @Override
            public void process(InputStream in) throws IOException {
                try (DataFileStream<Record> stream = new DataFileStream<>(in, AvroUtil.newDatumReader(schema, Record.class))) {
                    IncompatibleSchemaException.check(SchemaValidationUtil.canRead(stream.getSchema(), schema), "Incompatible file schema %s, expected %s", stream.getSchema(), schema);
                    long written = 0L;
                    try (DatasetWriter<Record> writer = target.newWriter()) {
                        for (Record record : stream) {
                            writer.write(record);
                            written += 1;
                        }
                    } finally {
                        session.adjustCounter("Stored records", written, true);
                    }
                }
            }
        });
        timer.stop();
        session.getProvenanceReporter().send(flowFile, target.getUri().toString(), timer.getDuration(TimeUnit.MILLISECONDS), true);
        session.transfer(flowFile, SUCCESS);
    } catch (ProcessException | DatasetIOException e) {
        getLogger().error("Failed to read FlowFile", e);
        session.transfer(flowFile, FAILURE);
    } catch (ValidationException e) {
        getLogger().error(e.getMessage());
        getLogger().debug("Incompatible schema error", e);
        session.transfer(flowFile, INCOMPATIBLE);
    }
}
Also used: FlowFile(org.apache.nifi.flowfile.FlowFile) ValidationException(org.kitesdk.data.ValidationException) InputStream(java.io.InputStream) Schema(org.apache.avro.Schema) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) DatasetWriter(org.kitesdk.data.DatasetWriter) StopWatch(org.apache.nifi.util.StopWatch) ProcessException(org.apache.nifi.processor.exception.ProcessException) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) Record(org.apache.avro.generic.GenericData.Record)
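
Outside a processor, the load-and-write path that StoreInKiteDataset wraps looks roughly like the sketch below. The dataset URI, schema fields, and record contents are hypothetical; Datasets.load is assumed as the entry point for resolving a dataset URI into a View, and the DatasetWriter is closed via try-with-resources exactly as in onTrigger.

import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecordBuilder;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.View;

public class StoreSketch {

    public static void main(String[] args) {
        // hypothetical dataset URI; any Kite-supported scheme works here
        View<Record> target = Datasets.load("dataset:hdfs:/data/users", Record.class);
        // build one record against the dataset's own schema
        Record record = new GenericRecordBuilder(
                target.getDataset().getDescriptor().getSchema())
                .set("id", 1L)
                .build();
        try (DatasetWriter<Record> writer = target.newWriter()) {
            writer.write(record);
        }
    }
}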

Aggregations

IOException (java.io.IOException) 4
InputStream (java.io.InputStream) 4
Schema (org.apache.avro.Schema) 4
Record (org.apache.avro.generic.GenericData.Record) 4
FlowFile (org.apache.nifi.flowfile.FlowFile) 4
ProcessException (org.apache.nifi.processor.exception.ProcessException) 4
DatasetIOException (org.kitesdk.data.DatasetIOException) 4
OutputStream (java.io.OutputStream) 3
AtomicLong (java.util.concurrent.atomic.AtomicLong) 3
DataFileWriter (org.apache.avro.file.DataFileWriter) 3
StreamCallback (org.apache.nifi.processor.io.StreamCallback) 3
DatasetException (org.kitesdk.data.DatasetException) 3
SchemaNotFoundException (org.kitesdk.data.SchemaNotFoundException) 3
DataFileStream (org.apache.avro.file.DataFileStream) 2
DatasetRecordException (org.kitesdk.data.DatasetRecordException) 2
HashMap (java.util.HashMap) 1
Locale (java.util.Locale) 1
Map (java.util.Map) 1
GenericDatumReader (org.apache.avro.generic.GenericDatumReader) 1
PropertyDescriptor (org.apache.nifi.components.PropertyDescriptor) 1