Example 56 with Record

Use of org.apache.avro.generic.GenericData.Record in project nifi by apache.

The class AvroRecordConverter, method convert.

/**
 * Converts one record to another, given input and output schemas plus
 * explicit mappings for certain target fields.
 *
 * @param input
 *            Input record to convert conforming to the inputSchema this
 *            converter was created with.
 * @return Record converted to the outputSchema this converter was created
 *         with.
 * @throws AvroConversionException
 *             When schemas do not match or illegal conversions are
 *             attempted, such as when numeric data fails to parse.
 */
public Record convert(Record input) throws AvroConversionException {
    Record result = new Record(outputSchema);
    for (Field outputField : outputSchema.getFields()) {
        // Default to matching by name
        String inputFieldName = outputField.name();
        if (fieldMapping.containsKey(outputField.name())) {
            inputFieldName = fieldMapping.get(outputField.name());
        }
        IndexedRecord currentRecord = input;
        Schema currentSchema = getNonNullSchema(inputSchema);
        while (inputFieldName.contains(".")) {
            // Recurse down the schema to find the right field.
            int dotIndex = inputFieldName.indexOf('.');
            String entityName = inputFieldName.substring(0, dotIndex);
            // Get the record object
            Object innerRecord = currentRecord.get(currentSchema.getField(entityName).pos());
            if (innerRecord == null) {
                // A nullable record along the path resolved to null; break out
                // so that null is passed to convertData below.
                currentRecord = null;
                break;
            }
            if (!(innerRecord instanceof IndexedRecord)) {
                throw new AvroConversionException(inputFieldName + " stepped through a non-record");
            }
            currentRecord = (IndexedRecord) innerRecord;
            // Get the field's schema; if it is a nullable union, unwrap it to
            // the record branch.
            currentSchema = currentSchema.getField(entityName).schema();
            currentSchema = getNonNullSchema(currentSchema);
            inputFieldName = inputFieldName.substring(dotIndex + 1);
        }
        // currentRecord and currentSchema now point at the record that holds the field to read.
        Field f = currentSchema.getField(inputFieldName);
        if (currentRecord == null) {
            // We may have stepped into a null union type and gotten a null
            // result.
            Schema s = null;
            if (f != null) {
                s = f.schema();
            }
            result.put(outputField.name(), convertData(null, s, outputField.schema()));
        } else {
            result.put(outputField.name(), convertData(currentRecord.get(f.pos()), f.schema(), outputField.schema()));
        }
    }
    return result;
}
Also used: Field(org.apache.avro.Schema.Field) IndexedRecord(org.apache.avro.generic.IndexedRecord) Schema(org.apache.avro.Schema) Record(org.apache.avro.generic.GenericData.Record)
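
A minimal usage sketch of the converter above (hypothetical method; the schemas and the parentId/parent.id mapping are illustrative, the four-argument constructor is the one shown in Example 57, and convertData is assumed to pass values through when input and output types already match):

public Record convertNestedExample() throws AvroConversionException {
    Schema inputSchema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"In\",\"fields\":["
            + "{\"name\":\"parent\",\"type\":{\"type\":\"record\",\"name\":\"Parent\","
            + "\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}}]}");
    Schema outputSchema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Out\",\"fields\":["
            + "{\"name\":\"parentId\",\"type\":\"long\"}]}");
    // Dot notation in the mapping value makes convert() walk into nested records.
    Map<String, String> fieldMapping = Collections.singletonMap("parentId", "parent.id");
    AvroRecordConverter converter = new AvroRecordConverter(inputSchema, outputSchema, fieldMapping, Locale.getDefault());
    Record parent = new Record(inputSchema.getField("parent").schema());
    parent.put("id", 42L);
    Record input = new Record(inputSchema);
    input.put("parent", parent);
    // Expected result: {"parentId": 42}
    return converter.convert(input);
}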

Example 57 with Record

Use of org.apache.avro.generic.GenericData.Record in project nifi by apache.

The class ConvertAvroSchema, method onTrigger.

@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingAvro = session.get();
    if (incomingAvro == null) {
        return;
    }
    String inputSchemaProperty = context.getProperty(INPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
    final Schema inputSchema;
    try {
        inputSchema = getSchema(inputSchemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + inputSchemaProperty);
        session.transfer(incomingAvro, FAILURE);
        return;
    }
    String outputSchemaProperty = context.getProperty(OUTPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
    final Schema outputSchema;
    try {
        outputSchema = getSchema(outputSchemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + outputSchemaProperty);
        session.transfer(incomingAvro, FAILURE);
        return;
    }
    final Map<String, String> fieldMapping = new HashMap<>();
    for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
        if (entry.getKey().isDynamic()) {
            fieldMapping.put(entry.getKey().getName(), entry.getValue());
        }
    }
    // Set locale
    final String localeProperty = context.getProperty(LOCALE).getValue();
    final Locale locale = localeProperty.equals(DEFAULT_LOCALE_VALUE) ? Locale.getDefault() : LocaleUtils.toLocale(localeProperty);
    final AvroRecordConverter converter = new AvroRecordConverter(inputSchema, outputSchema, fieldMapping, locale);
    final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
    writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    final DataFileWriter<Record> failureWriter = new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
    failureWriter.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    try {
        final AtomicLong written = new AtomicLong(0L);
        final FailureTracker failures = new FailureTracker();
        final List<Record> badRecords = Lists.newLinkedList();
        FlowFile incomingAvroCopy = session.clone(incomingAvro);
        FlowFile outgoingAvro = session.write(incomingAvro, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileStream<Record> stream = new DataFileStream<Record>(in, new GenericDatumReader<Record>(converter.getInputSchema()))) {
                    try (DataFileWriter<Record> w = writer.create(outputSchema, out)) {
                        for (Record record : stream) {
                            try {
                                Record converted = converter.convert(record);
                                w.append(converted);
                                written.incrementAndGet();
                            } catch (AvroConversionException e) {
                                failures.add(e);
                                getLogger().error("Error converting data: " + e.getMessage());
                                badRecords.add(record);
                            }
                        }
                    }
                }
            }
        });
        FlowFile badOutput = session.write(incomingAvroCopy, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileWriter<Record> w = failureWriter.create(inputSchema, out)) {
                    for (Record record : badRecords) {
                        w.append(record);
                    }
                }
            }
        });
        long errors = failures.count();
        // counters take effect only when the session commits (immediate flag is false)
        session.adjustCounter("Converted records", written.get(), false);
        session.adjustCounter("Conversion errors", errors, false);
        if (written.get() > 0L) {
            session.transfer(outgoingAvro, SUCCESS);
        } else {
            session.remove(outgoingAvro);
            if (errors == 0L) {
                badOutput = session.putAttribute(badOutput, "errors", "No incoming records");
                session.transfer(badOutput, FAILURE);
            }
        }
        if (errors > 0L) {
            getLogger().warn("Failed to convert {}/{} records between Avro Schemas", new Object[] { errors, errors + written.get() });
            badOutput = session.putAttribute(badOutput, "errors", failures.summary());
            session.transfer(badOutput, FAILURE);
        } else {
            session.remove(badOutput);
        }
    } catch (ProcessException | DatasetIOException e) {
        getLogger().error("Failed reading or writing", e);
        session.transfer(incomingAvro, FAILURE);
    } catch (DatasetException e) {
        getLogger().error("Failed to read FlowFile", e);
        session.transfer(incomingAvro, FAILURE);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
        try {
            failureWriter.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
    }
}
Also used: Locale(java.util.Locale) HashMap(java.util.HashMap) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) OutputStream(java.io.OutputStream) DatasetException(org.kitesdk.data.DatasetException) Record(org.apache.avro.generic.GenericData.Record) FlowFile(org.apache.nifi.flowfile.FlowFile) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) StreamCallback(org.apache.nifi.processor.io.StreamCallback) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) AvroConversionException(org.apache.nifi.processors.kite.AvroRecordConverter.AvroConversionException) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException) Map(java.util.Map)
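
A hedged test sketch for the dynamic-property mapping collected above, assuming NiFi's standard TestRunner harness, that the INPUT_SCHEMA and OUTPUT_SCHEMA descriptors and the SUCCESS relationship are visible to test code, and with inputSchemaJson, outputSchemaJson, and avroBytes as placeholder fixtures:

@Test
public void testNestedFieldMapping() {
    TestRunner runner = TestRunners.newTestRunner(ConvertAvroSchema.class);
    runner.setProperty(ConvertAvroSchema.INPUT_SCHEMA, inputSchemaJson);
    runner.setProperty(ConvertAvroSchema.OUTPUT_SCHEMA, outputSchemaJson);
    // A dynamic property maps an output field name to an input field name,
    // exactly as the fieldMapping loop in onTrigger collects it.
    runner.setProperty("parentId", "parent.id");
    runner.enqueue(avroBytes); // a serialized Avro data file matching the input schema
    runner.run();
    runner.assertTransferCount(ConvertAvroSchema.SUCCESS, 1);
}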

Example 58 with Record

Use of org.apache.avro.generic.GenericData.Record in project nifi by apache.

The class ConvertCSVToAvro, method onTrigger.

@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingCSV = session.get();
    if (incomingCSV == null) {
        return;
    }
    CSVProperties props = new CSVProperties.Builder()
            .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(incomingCSV).getValue())
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(incomingCSV).getValue())
            .quote(context.getProperty(QUOTE).evaluateAttributeExpressions(incomingCSV).getValue())
            .escape(context.getProperty(ESCAPE).evaluateAttributeExpressions(incomingCSV).getValue())
            .hasHeader(context.getProperty(HAS_HEADER).evaluateAttributeExpressions(incomingCSV).asBoolean())
            .linesToSkip(context.getProperty(LINES_TO_SKIP).evaluateAttributeExpressions(incomingCSV).asInteger())
            .build();
    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingCSV).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + schemaProperty);
        session.transfer(incomingCSV, FAILURE);
        return;
    }
    try (final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
        writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
        try {
            final AtomicLong written = new AtomicLong(0L);
            final FailureTracker failures = new FailureTracker();
            FlowFile badRecords = session.clone(incomingCSV);
            FlowFile outgoingAvro = session.write(incomingCSV, new StreamCallback() {

                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
                        reader.initialize();
                        try (DataFileWriter<Record> w = writer.create(schema, out)) {
                            while (reader.hasNext()) {
                                try {
                                    Record record = reader.next();
                                    w.append(record);
                                    written.incrementAndGet();
                                } catch (DatasetRecordException e) {
                                    failures.add(e);
                                }
                            }
                        }
                    }
                }
            });
            long errors = failures.count();
            session.adjustCounter("Converted records", written.get(), false);
            session.adjustCounter("Conversion errors", errors, false);
            if (written.get() > 0L) {
                session.transfer(outgoingAvro, SUCCESS);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors + written.get() });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                    session.transfer(badRecords, INCOMPATIBLE);
                } else {
                    session.remove(badRecords);
                }
            } else {
                session.remove(outgoingAvro);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                } else {
                    badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
                }
                session.transfer(badRecords, FAILURE);
            }
        } catch (ProcessException | DatasetIOException e) {
            getLogger().error("Failed reading or writing", e);
            session.transfer(incomingCSV, FAILURE);
        } catch (DatasetException e) {
            getLogger().error("Failed to read FlowFile", e);
            session.transfer(incomingCSV, FAILURE);
        }
    } catch (final IOException ioe) {
        throw new RuntimeException("Unable to close Avro Writer", ioe);
    }
}
Also used: Schema(org.apache.avro.Schema) OutputStream(java.io.OutputStream) DatasetRecordException(org.kitesdk.data.DatasetRecordException) DatasetException(org.kitesdk.data.DatasetException) Record(org.apache.avro.generic.GenericData.Record) FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) StreamCallback(org.apache.nifi.processor.io.StreamCallback) CSVProperties(org.kitesdk.data.spi.filesystem.CSVProperties) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) CSVFileReader(org.kitesdk.data.spi.filesystem.CSVFileReader) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException)
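
The same Kite CSV-to-Record path can be exercised outside NiFi; a minimal sketch (hypothetical method, illustrative schema and payload) built only from the CSVProperties, CSVFileReader, and DatasetRecordException calls shown above:

public void readCsvExample() throws IOException {
    Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
            + "{\"name\":\"name\",\"type\":\"string\"},{\"name\":\"age\",\"type\":\"int\"}]}");
    CSVProperties props = new CSVProperties.Builder()
            .charset("utf8")
            .delimiter(",")
            .hasHeader(true)
            .build();
    InputStream in = new ByteArrayInputStream(
            "name,age\nalice,30\nbob,notanumber\n".getBytes(StandardCharsets.UTF_8));
    try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
        reader.initialize(); // required before iterating
        while (reader.hasNext()) {
            try {
                Record record = reader.next();
                System.out.println(record); // e.g. {"name": "alice", "age": 30}
            } catch (DatasetRecordException e) {
                // the malformed "notanumber" row surfaces here, mirroring the
                // FailureTracker path above
            }
        }
    }
}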

Example 59 with Record

Use of org.apache.avro.generic.GenericData.Record in project nifi by apache.

The class ConvertJSONToAvro, method onTrigger.

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingJSON = session.get();
    if (incomingJSON == null) {
        return;
    }
    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingJSON).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + schemaProperty);
        session.transfer(incomingJSON, FAILURE);
        return;
    }
    final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class));
    writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    try {
        final AtomicLong written = new AtomicLong(0L);
        final FailureTracker failures = new FailureTracker();
        FlowFile badRecords = session.clone(incomingJSON);
        FlowFile outgoingAvro = session.write(incomingJSON, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (JSONFileReader<Record> reader = new JSONFileReader<>(in, schema, Record.class)) {
                    reader.initialize();
                    try (DataFileWriter<Record> w = writer.create(schema, out)) {
                        while (reader.hasNext()) {
                            try {
                                Record record = reader.next();
                                w.append(record);
                                written.incrementAndGet();
                            } catch (final DatasetRecordException e) {
                                failures.add(e);
                            }
                        }
                    }
                }
            }
        });
        long errors = failures.count();
        session.adjustCounter("Converted records", written.get(), false);
        session.adjustCounter("Conversion errors", errors, false);
        if (written.get() > 0L) {
            session.transfer(outgoingAvro, SUCCESS);
            if (errors > 0L) {
                getLogger().warn("Failed to convert {}/{} records from JSON to Avro", new Object[] { errors, errors + written.get() });
                badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                session.transfer(badRecords, INCOMPATIBLE);
            } else {
                session.remove(badRecords);
            }
        } else {
            session.remove(outgoingAvro);
            if (errors > 0L) {
                getLogger().warn("Failed to convert {}/{} records from JSON to Avro", new Object[] { errors, errors });
                badRecords = session.putAttribute(badRecords, "errors", failures.summary());
            } else {
                badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
            }
            session.transfer(badRecords, FAILURE);
        }
    } catch (ProcessException | DatasetIOException e) {
        getLogger().error("Failed reading or writing", e);
        session.transfer(incomingJSON, FAILURE);
    } catch (DatasetException e) {
        getLogger().error("Failed to read FlowFile", e);
        session.transfer(incomingJSON, FAILURE);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
    }
}
Also used: FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) OutputStream(java.io.OutputStream) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) StreamCallback(org.apache.nifi.processor.io.StreamCallback) DatasetRecordException(org.kitesdk.data.DatasetRecordException) DatasetException(org.kitesdk.data.DatasetException) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) Record(org.apache.avro.generic.GenericData.Record) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException) JSONFileReader(org.kitesdk.data.spi.filesystem.JSONFileReader)
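
The JSON path mirrors the CSV one; a minimal sketch (hypothetical method) that reuses the illustrative User schema from the previous example:

public void readJsonExample(Schema schema) throws IOException {
    InputStream in = new ByteArrayInputStream(
            "{\"name\": \"alice\", \"age\": 30}".getBytes(StandardCharsets.UTF_8));
    try (JSONFileReader<Record> reader = new JSONFileReader<>(in, schema, Record.class)) {
        reader.initialize(); // required before iterating, as with CSVFileReader
        while (reader.hasNext()) {
            try {
                Record record = reader.next();
                System.out.println(record);
            } catch (DatasetRecordException e) {
                // records that do not match the schema surface here
            }
        }
    }
}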

Example 60 with Record

Use of org.apache.avro.generic.GenericData.Record in project nifi by apache.

The class StoreInKiteDataset, method onTrigger.

@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final View<Record> target = load(context, flowFile);
    final Schema schema = target.getDataset().getDescriptor().getSchema();
    try {
        StopWatch timer = new StopWatch(true);
        session.read(flowFile, new InputStreamCallback() {

            @Override
            public void process(InputStream in) throws IOException {
                try (DataFileStream<Record> stream = new DataFileStream<>(in, AvroUtil.newDatumReader(schema, Record.class))) {
                    IncompatibleSchemaException.check(SchemaValidationUtil.canRead(stream.getSchema(), schema), "Incompatible file schema %s, expected %s", stream.getSchema(), schema);
                    long written = 0L;
                    try (DatasetWriter<Record> writer = target.newWriter()) {
                        for (Record record : stream) {
                            writer.write(record);
                            written += 1;
                        }
                    } finally {
                        session.adjustCounter("Stored records", written, true);
                    }
                }
            }
        });
        timer.stop();
        session.getProvenanceReporter().send(flowFile, target.getUri().toString(), timer.getDuration(TimeUnit.MILLISECONDS), true);
        session.transfer(flowFile, SUCCESS);
    } catch (ProcessException | DatasetIOException e) {
        getLogger().error("Failed to read FlowFile", e);
        session.transfer(flowFile, FAILURE);
    } catch (ValidationException e) {
        getLogger().error(e.getMessage());
        getLogger().debug("Incompatible schema error", e);
        session.transfer(flowFile, INCOMPATIBLE);
    }
}
Also used: FlowFile(org.apache.nifi.flowfile.FlowFile) ValidationException(org.kitesdk.data.ValidationException) InputStream(java.io.InputStream) Schema(org.apache.avro.Schema) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) DatasetWriter(org.kitesdk.data.DatasetWriter) StopWatch(org.apache.nifi.util.StopWatch) ProcessException(org.apache.nifi.processor.exception.ProcessException) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) Record(org.apache.avro.generic.GenericData.Record)
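
SchemaValidationUtil.canRead is internal to the NiFi kite bundle; a comparable gate can be sketched with Avro's public org.apache.avro.SchemaCompatibility API, under the assumption that plain reader/writer compatibility is the property being checked:

private static boolean canReadWithDatasetSchema(DataFileStream<Record> stream, View<Record> target) {
    Schema fileSchema = stream.getSchema(); // writer schema embedded in the data file
    Schema datasetSchema = target.getDataset().getDescriptor().getSchema();
    SchemaCompatibility.SchemaPairCompatibility result =
            SchemaCompatibility.checkReaderWriterCompatibility(datasetSchema, fileSchema);
    // COMPATIBLE means every record written with fileSchema can be read using
    // datasetSchema; anything else is routed to INCOMPATIBLE above.
    return result.getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE;
}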

Aggregations

Record (org.apache.avro.generic.GenericData.Record): 96
Test (org.junit.Test): 44
IndexedRecord (org.apache.avro.generic.IndexedRecord): 43
Schema (org.apache.avro.Schema): 33
ArrayList (java.util.ArrayList): 24
GenericRecord (org.apache.avro.generic.GenericRecord): 14
Field (org.apache.avro.Schema.Field): 11
List (java.util.List): 10
GenericData (org.apache.avro.generic.GenericData): 10
TestRunner (org.apache.nifi.util.TestRunner): 8
GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder): 7
JsonObject (com.google.gson.JsonObject): 6
DataFileStream (org.apache.avro.file.DataFileStream): 6
DataFileWriter (org.apache.avro.file.DataFileWriter): 6
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 6
Utf8 (org.apache.avro.util.Utf8): 6
TMarketoOutputProperties (org.talend.components.marketo.tmarketooutput.TMarketoOutputProperties): 6
ActivityRecord (com.marketo.mktows.ActivityRecord): 5
ArrayOfLeadRecord (com.marketo.mktows.ArrayOfLeadRecord): 5
LeadChangeRecord (com.marketo.mktows.LeadChangeRecord): 5