Example 66 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in the Apache NiFi project.

From the class TestConvertAvroToORC, method test_onTrigger_array_of_records:

@Test
public void test_onTrigger_array_of_records() throws Exception {
    final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array_of_records.avsc"));
    List<GenericRecord> innerRecords = new LinkedList<>();
    final GenericRecord outerRecord = new GenericData.Record(schema);
    Schema arraySchema = schema.getField("records").schema();
    Schema innerRecordSchema = arraySchema.getElementType();
    final GenericRecord innerRecord1 = new GenericData.Record(innerRecordSchema);
    innerRecord1.put("name", "Joe");
    innerRecord1.put("age", 42);
    innerRecords.add(innerRecord1);
    final GenericRecord innerRecord2 = new GenericData.Record(innerRecordSchema);
    innerRecord2.put("name", "Mary");
    innerRecord2.put("age", 28);
    innerRecords.add(innerRecord2);
    GenericData.Array<GenericRecord> array = new GenericData.Array<>(arraySchema, innerRecords);
    outerRecord.put("records", array);
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, out);
        dataFileWriter.append(outerRecord);
    }
    out.close();
    // Build a flow file from the Avro record
    Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.FILENAME.key(), "test");
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS org_apache_nifi_outer_record " + "(records ARRAY<STRUCT<name:STRING, age:INT>>)" + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("1", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    try (FileOutputStream fos = new FileOutputStream("target/test1.orc")) {
        fos.write(resultContents);
        fos.flush();
    }
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(schema));
    // Verify the record contains an array
    Object arrayFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("records"));
    assertTrue(arrayFieldObject instanceof ArrayList);
    ArrayList<?> arrayField = (ArrayList<?>) arrayFieldObject;
    assertEquals(2, arrayField.size());
    // Verify the first element. Should be a record with two fields "name" and "age"
    Object element = arrayField.get(0);
    assertTrue(element instanceof OrcStruct);
    StructObjectInspector elementInspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(innerRecordSchema));
    Object nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
    assertTrue(nameObject instanceof Text);
    assertEquals("Joe", nameObject.toString());
    Object ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
    assertTrue(ageObject instanceof IntWritable);
    assertEquals(42, ((IntWritable) ageObject).get());
    // Verify the second element. Should be a record with two fields "name" and "age"
    element = arrayField.get(1);
    assertTrue(element instanceof OrcStruct);
    nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
    assertTrue(nameObject instanceof Text);
    assertEquals("Mary", nameObject.toString());
    ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
    assertTrue(ageObject instanceof IntWritable);
    assertEquals(28, ((IntWritable) ageObject).get());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) ArrayList(java.util.ArrayList) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) OrcStruct(org.apache.hadoop.hive.ql.io.orc.OrcStruct) FileSystem(org.apache.hadoop.fs.FileSystem) GenericRecord(org.apache.avro.generic.GenericRecord) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) DataFileWriter(org.apache.avro.file.DataFileWriter) Text(org.apache.hadoop.io.Text) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericData(org.apache.avro.generic.GenericData) LinkedList(java.util.LinkedList) MockFlowFile(org.apache.nifi.util.MockFlowFile) FileOutputStream(java.io.FileOutputStream) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) File(java.io.File) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
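
For completeness, the Avro bytes built above can be read back with the mirror-image API, DataFileStream. This is a minimal sketch, not part of the NiFi test; it assumes the schema and out variables from the method above, and additionally requires org.apache.avro.io.DatumReader, org.apache.avro.generic.GenericDatumReader, org.apache.avro.file.DataFileStream, and java.io.ByteArrayInputStream.

// Minimal read-back sketch (assumes "schema" and "out" from the test above).
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema);
try (DataFileStream<GenericRecord> stream =
        new DataFileStream<>(new ByteArrayInputStream(out.toByteArray()), datumReader)) {
    for (GenericRecord record : stream) {
        // Expect a single outer record carrying the two-element "records" array.
        System.out.println(record);
    }
}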

Example 67 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in the Apache NiFi project.

From the class TestPutHiveStreaming, method createAvroRecord:

private byte[] createAvroRecord(List<Map<String, Object>> records) throws IOException {
    final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
    List<GenericRecord> users = new LinkedList<>();
    for (Map<String, Object> record : records) {
        final GenericRecord user = new GenericData.Record(schema);
        user.put("name", record.get("name"));
        user.put("favorite_number", record.get("favorite_number"));
        user.put("favorite_color", record.get("favorite_color"));
        users.add(user);
    }
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, out);
        for (final GenericRecord user : users) {
            dataFileWriter.append(user);
        }
    }
    return out.toByteArray();
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) Matchers.anyString(org.mockito.Matchers.anyString) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) LinkedList(java.util.LinkedList) GenericRecord(org.apache.avro.generic.GenericRecord) MockFlowFile(org.apache.nifi.util.MockFlowFile) File(java.io.File)
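
A hypothetical call site for this helper, for orientation only: the field names follow user.avsc as used above, the values are illustrative, and a NiFi TestRunner field named runner is assumed (java.util.Collections is also needed).

// Illustrative only: build one user map and enqueue the resulting Avro bytes.
Map<String, Object> user = new HashMap<>();
user.put("name", "Joe");
user.put("favorite_number", 146);
user.put("favorite_color", "blue");
byte[] avroBytes = createAvroRecord(Collections.singletonList(user));
runner.enqueue(avroBytes);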

Example 68 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in the Apache NiFi project.

From the class ConvertAvroSchema, method onTrigger:

@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingAvro = session.get();
    if (incomingAvro == null) {
        return;
    }
    String inputSchemaProperty = context.getProperty(INPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
    final Schema inputSchema;
    try {
        inputSchema = getSchema(inputSchemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + inputSchemaProperty);
        session.transfer(incomingAvro, FAILURE);
        return;
    }
    String outputSchemaProperty = context.getProperty(OUTPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
    final Schema outputSchema;
    try {
        outputSchema = getSchema(outputSchemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + outputSchemaProperty);
        session.transfer(incomingAvro, FAILURE);
        return;
    }
    final Map<String, String> fieldMapping = new HashMap<>();
    for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
        if (entry.getKey().isDynamic()) {
            fieldMapping.put(entry.getKey().getName(), entry.getValue());
        }
    }
    // Set locale
    final String localeProperty = context.getProperty(LOCALE).getValue();
    final Locale locale = localeProperty.equals(DEFAULT_LOCALE_VALUE) ? Locale.getDefault() : LocaleUtils.toLocale(localeProperty);
    final AvroRecordConverter converter = new AvroRecordConverter(inputSchema, outputSchema, fieldMapping, locale);
    final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
    writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    final DataFileWriter<Record> failureWriter = new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
    failureWriter.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    try {
        final AtomicLong written = new AtomicLong(0L);
        final FailureTracker failures = new FailureTracker();
        final List<Record> badRecords = Lists.newLinkedList();
        FlowFile incomingAvroCopy = session.clone(incomingAvro);
        FlowFile outgoingAvro = session.write(incomingAvro, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileStream<Record> stream = new DataFileStream<Record>(in, new GenericDatumReader<Record>(converter.getInputSchema()))) {
                    try (DataFileWriter<Record> w = writer.create(outputSchema, out)) {
                        for (Record record : stream) {
                            try {
                                Record converted = converter.convert(record);
                                w.append(converted);
                                written.incrementAndGet();
                            } catch (AvroConversionException e) {
                                failures.add(e);
                                getLogger().error("Error converting data: " + e.getMessage());
                                badRecords.add(record);
                            }
                        }
                    }
                }
            }
        });
        FlowFile badOutput = session.write(incomingAvroCopy, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileWriter<Record> w = failureWriter.create(inputSchema, out)) {
                    for (Record record : badRecords) {
                        w.append(record);
                    }
                }
            }
        });
        long errors = failures.count();
    // counters are adjusted only when the session commits, i.e. only if the transfer succeeds
    session.adjustCounter("Converted records", written.get(), false);
    session.adjustCounter("Conversion errors", errors, false);
        if (written.get() > 0L) {
            session.transfer(outgoingAvro, SUCCESS);
        } else {
            session.remove(outgoingAvro);
            if (errors == 0L) {
                badOutput = session.putAttribute(badOutput, "errors", "No incoming records");
                session.transfer(badOutput, FAILURE);
            }
        }
        if (errors > 0L) {
            getLogger().warn("Failed to convert {}/{} records between Avro Schemas", new Object[] { errors, errors + written.get() });
            badOutput = session.putAttribute(badOutput, "errors", failures.summary());
            session.transfer(badOutput, FAILURE);
        } else {
            session.remove(badOutput);
        }
    } catch (ProcessException | DatasetIOException e) {
        getLogger().error("Failed reading or writing", e);
        session.transfer(incomingAvro, FAILURE);
    } catch (DatasetException e) {
        getLogger().error("Failed to read FlowFile", e);
        session.transfer(incomingAvro, FAILURE);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
        try {
            failureWriter.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
    }
}
Also used : Locale(java.util.Locale) HashMap(java.util.HashMap) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) OutputStream(java.io.OutputStream) DatasetException(org.kitesdk.data.DatasetException) Record(org.apache.avro.generic.GenericData.Record) FlowFile(org.apache.nifi.flowfile.FlowFile) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) StreamCallback(org.apache.nifi.processor.io.StreamCallback) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) AvroConversionException(org.apache.nifi.processors.kite.AvroRecordConverter.AvroConversionException) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException) Map(java.util.Map)
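
The processor selects compression through its own getCodecFactory helper; since DataFileWriter.setCodec accepts org.apache.avro.file.CodecFactory, that helper must resolve to one of Avro's standard codec factories. A minimal sketch of the same wiring using core Avro only (the snappy choice and the sketchWriter name are illustrative):

// Equivalent codec wiring with core Avro; snappy is an illustrative choice.
DataFileWriter<Record> sketchWriter =
        new DataFileWriter<>(new GenericDatumWriter<Record>(outputSchema));
sketchWriter.setCodec(CodecFactory.snappyCodec());
// Other standard factories: CodecFactory.nullCodec(), deflateCodec(9), bzip2Codec()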

Example 69 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in the Apache NiFi project.

From the class ConvertCSVToAvro, method onTrigger:

@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingCSV = session.get();
    if (incomingCSV == null) {
        return;
    }
    CSVProperties props = new CSVProperties.Builder()
            .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(incomingCSV).getValue())
            .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(incomingCSV).getValue())
            .quote(context.getProperty(QUOTE).evaluateAttributeExpressions(incomingCSV).getValue())
            .escape(context.getProperty(ESCAPE).evaluateAttributeExpressions(incomingCSV).getValue())
            .hasHeader(context.getProperty(HAS_HEADER).evaluateAttributeExpressions(incomingCSV).asBoolean())
            .linesToSkip(context.getProperty(LINES_TO_SKIP).evaluateAttributeExpressions(incomingCSV).asInteger())
            .build();
    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingCSV).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + schemaProperty);
        session.transfer(incomingCSV, FAILURE);
        return;
    }
    try (final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
        writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
        try {
            final AtomicLong written = new AtomicLong(0L);
            final FailureTracker failures = new FailureTracker();
            FlowFile badRecords = session.clone(incomingCSV);
            FlowFile outgoingAvro = session.write(incomingCSV, new StreamCallback() {

                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                    try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
                        reader.initialize();
                        try (DataFileWriter<Record> w = writer.create(schema, out)) {
                            while (reader.hasNext()) {
                                try {
                                    Record record = reader.next();
                                    w.append(record);
                                    written.incrementAndGet();
                                } catch (DatasetRecordException e) {
                                    failures.add(e);
                                }
                            }
                        }
                    }
                }
            });
            long errors = failures.count();
            session.adjustCounter("Converted records", written.get(), false);
            session.adjustCounter("Conversion errors", errors, false);
            if (written.get() > 0L) {
                session.transfer(outgoingAvro, SUCCESS);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors + written.get() });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                    session.transfer(badRecords, INCOMPATIBLE);
                } else {
                    session.remove(badRecords);
                }
            } else {
                session.remove(outgoingAvro);
                if (errors > 0L) {
                    getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors });
                    badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                } else {
                    badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
                }
                session.transfer(badRecords, FAILURE);
            }
        } catch (ProcessException | DatasetIOException e) {
            getLogger().error("Failed reading or writing", e);
            session.transfer(incomingCSV, FAILURE);
        } catch (DatasetException e) {
            getLogger().error("Failed to read FlowFile", e);
            session.transfer(incomingCSV, FAILURE);
        }
    } catch (final IOException ioe) {
        throw new RuntimeException("Unable to close Avro Writer", ioe);
    }
}
Also used : Schema(org.apache.avro.Schema) OutputStream(java.io.OutputStream) DatasetRecordException(org.kitesdk.data.DatasetRecordException) DatasetException(org.kitesdk.data.DatasetException) Record(org.apache.avro.generic.GenericData.Record) FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) StreamCallback(org.apache.nifi.processor.io.StreamCallback) CSVProperties(org.kitesdk.data.spi.filesystem.CSVProperties) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) CSVFileReader(org.kitesdk.data.spi.filesystem.CSVFileReader) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException)
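
The builder above pulls every option from processor properties; the same builder works standalone. A hedged sketch with hard-coded values for a tab-separated file with a header row (the values are illustrative; only Builder methods exercised by the processor appear here):

// Illustrative standalone CSVProperties for TSV input with a header row.
CSVProperties tsvProps = new CSVProperties.Builder()
        .charset("UTF-8")
        .delimiter("\t")
        .hasHeader(true)
        .linesToSkip(0)
        .build();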

Example 70 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in the Apache NiFi project.

From the class ConvertJSONToAvro, method onTrigger:

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile incomingJSON = session.get();
    if (incomingJSON == null) {
        return;
    }
    String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingJSON).getValue();
    final Schema schema;
    try {
        schema = getSchema(schemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
        getLogger().error("Cannot find schema: " + schemaProperty);
        session.transfer(incomingJSON, FAILURE);
        return;
    }
    final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class));
    writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    try {
        final AtomicLong written = new AtomicLong(0L);
        final FailureTracker failures = new FailureTracker();
        FlowFile badRecords = session.clone(incomingJSON);
        FlowFile outgoingAvro = session.write(incomingJSON, new StreamCallback() {

            @Override
            public void process(InputStream in, OutputStream out) throws IOException {
                try (JSONFileReader<Record> reader = new JSONFileReader<>(in, schema, Record.class)) {
                    reader.initialize();
                    try (DataFileWriter<Record> w = writer.create(schema, out)) {
                        while (reader.hasNext()) {
                            try {
                                Record record = reader.next();
                                w.append(record);
                                written.incrementAndGet();
                            } catch (final DatasetRecordException e) {
                                failures.add(e);
                            }
                        }
                    }
                }
            }
        });
        long errors = failures.count();
        session.adjustCounter("Converted records", written.get(), false);
        session.adjustCounter("Conversion errors", errors, false);
        if (written.get() > 0L) {
            session.transfer(outgoingAvro, SUCCESS);
            if (errors > 0L) {
                getLogger().warn("Failed to convert {}/{} records from JSON to Avro", new Object[] { errors, errors + written.get() });
                badRecords = session.putAttribute(badRecords, "errors", failures.summary());
                session.transfer(badRecords, INCOMPATIBLE);
            } else {
                session.remove(badRecords);
            }
        } else {
            session.remove(outgoingAvro);
            if (errors > 0L) {
                getLogger().warn("Failed to convert {}/{} records from JSON to Avro", new Object[] { errors, errors });
                badRecords = session.putAttribute(badRecords, "errors", failures.summary());
            } else {
                badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
            }
            session.transfer(badRecords, FAILURE);
        }
    } catch (ProcessException | DatasetIOException e) {
        getLogger().error("Failed reading or writing", e);
        session.transfer(incomingJSON, FAILURE);
    } catch (DatasetException e) {
        getLogger().error("Failed to read FlowFile", e);
        session.transfer(incomingJSON, FAILURE);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            getLogger().warn("Unable to close writer ressource", e);
        }
    }
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) OutputStream(java.io.OutputStream) DatasetIOException(org.kitesdk.data.DatasetIOException) IOException(java.io.IOException) StreamCallback(org.apache.nifi.processor.io.StreamCallback) DatasetRecordException(org.kitesdk.data.DatasetRecordException) DatasetException(org.kitesdk.data.DatasetException) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProcessException(org.apache.nifi.processor.exception.ProcessException) Record(org.apache.avro.generic.GenericData.Record) SchemaNotFoundException(org.kitesdk.data.SchemaNotFoundException) JSONFileReader(org.kitesdk.data.spi.filesystem.JSONFileReader)
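
The Kite JSONFileReader used above streams records from JSON input; core Avro offers a related primitive for decoding a single JSON-encoded datum against a schema. A minimal sketch using only org.apache.avro classes, not the processor's code path (requires org.apache.avro.io.Decoder and org.apache.avro.io.DecoderFactory; the JSON literal is illustrative):

// Decode one JSON datum against "schema" using core Avro (illustrative).
String json = "{\"name\": \"Joe\"}";  // must match the schema's fields
GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema);
Decoder decoder = DecoderFactory.get().jsonDecoder(schema, json);
GenericRecord record = datumReader.read(null, decoder);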

Aggregations

Classes most frequently used alongside DataFileWriter across the 102 indexed examples, with usage counts:

DataFileWriter (org.apache.avro.file.DataFileWriter): 102
GenericRecord (org.apache.avro.generic.GenericRecord): 58
Schema (org.apache.avro.Schema): 50
GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 47
File (java.io.File): 38
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 22
IOException (java.io.IOException): 22
GenericData (org.apache.avro.generic.GenericData): 17
FileOutputStream (java.io.FileOutputStream): 15
Test (org.junit.Test): 14
HashMap (java.util.HashMap): 11
InputStream (java.io.InputStream): 10
SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter): 10
ArrayList (java.util.ArrayList): 9
Path (org.apache.hadoop.fs.Path): 9
ByteArrayInputStream (java.io.ByteArrayInputStream): 8
OutputStream (java.io.OutputStream): 8
ByteBuffer (java.nio.ByteBuffer): 7
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 7
MockFlowFile (org.apache.nifi.util.MockFlowFile): 7
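
Taken together, the top entries trace the canonical write pattern: parse a Schema, wrap a GenericDatumWriter in a DataFileWriter, then create and append. A minimal self-contained sketch follows; the inline schema, field value, and output path are illustrative rather than taken from any example above.

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class MinimalAvroWrite {
    public static void main(String[] args) throws Exception {
        // Illustrative inline schema with a single string field.
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":"
                        + "[{\"name\":\"name\",\"type\":\"string\"}]}");
        GenericRecord rec = new GenericData.Record(schema);
        rec.put("name", "Joe");
        // DataFileWriter owns the container format: header, embedded schema, sync markers.
        try (DataFileWriter<GenericRecord> writer =
                new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
            writer.create(schema, new File("target/users.avro"));
            writer.append(rec);
        }
    }
}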