Use of org.apache.avro.file.DataFileWriter in project nifi by apache.
From the class TestConvertAvroToORC, method test_onTrigger_array_of_records.
@Test
public void test_onTrigger_array_of_records() throws Exception {
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array_of_records.avsc"));
List<GenericRecord> innerRecords = new LinkedList<>();
final GenericRecord outerRecord = new GenericData.Record(schema);
Schema arraySchema = schema.getField("records").schema();
Schema innerRecordSchema = arraySchema.getElementType();
final GenericRecord innerRecord1 = new GenericData.Record(innerRecordSchema);
innerRecord1.put("name", "Joe");
innerRecord1.put("age", 42);
innerRecords.add(innerRecord1);
final GenericRecord innerRecord2 = new GenericData.Record(innerRecordSchema);
innerRecord2.put("name", "Mary");
innerRecord2.put("age", 28);
innerRecords.add(innerRecord2);
GenericData.Array<GenericRecord> array = new GenericData.Array<>(arraySchema, innerRecords);
outerRecord.put("records", array);
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
ByteArrayOutputStream out = new ByteArrayOutputStream();
try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
dataFileWriter.create(schema, out);
dataFileWriter.append(outerRecord);
}
out.close();
// Build a flow file from the Avro record
Map<String, String> attributes = new HashMap<String, String>() {
{
put(CoreAttributes.FILENAME.key(), "test");
}
};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
// Write the flow file out to disk, since the ORC Reader needs a path
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS org_apache_nifi_outer_record " + "(records ARRAY<STRUCT<name:STRING, age:INT>>)" + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
assertEquals("1", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
FileOutputStream fos = new FileOutputStream("target/test1.orc");
fos.write(resultContents);
fos.flush();
fos.close();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = reader.rows();
Object o = rows.next(null);
assertNotNull(o);
assertTrue(o instanceof OrcStruct);
StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(schema));
// Verify the record contains an array
Object arrayFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("records"));
assertTrue(arrayFieldObject instanceof ArrayList);
ArrayList<?> arrayField = (ArrayList<?>) arrayFieldObject;
assertEquals(2, arrayField.size());
// Verify the first element. Should be a record with two fields "name" and "age"
Object element = arrayField.get(0);
assertTrue(element instanceof OrcStruct);
StructObjectInspector elementInspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(innerRecordSchema));
Object nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
assertTrue(nameObject instanceof Text);
assertEquals("Joe", nameObject.toString());
Object ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
assertTrue(ageObject instanceof IntWritable);
assertEquals(42, ((IntWritable) ageObject).get());
// Verify the second element. Should also be a record with two fields "name" and "age"
element = arrayField.get(1);
assertTrue(element instanceof OrcStruct);
nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
assertTrue(nameObject instanceof Text);
assertEquals("Mary", nameObject.toString());
ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
assertTrue(ageObject instanceof IntWritable);
assertEquals(28, ((IntWritable) ageObject).get());
}
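The test above parses src/test/resources/array_of_records.avsc, which is not reproduced on this page. Based on the fields the test populates and the Hive DDL it asserts (table org_apache_nifi_outer_record with records ARRAY<STRUCT<name:STRING, age:INT>>), a schema along the following lines would satisfy it; this is a hypothetical reconstruction, and the real file in the NiFi source tree may differ in details such as the inner record's name:

// Hypothetical equivalent of array_of_records.avsc, reconstructed from the
// assertions in the test; parsed inline instead of from a file.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"outer_record\",\"namespace\":\"org.apache.nifi\","
    + "\"fields\":[{\"name\":\"records\",\"type\":{\"type\":\"array\",\"items\":"
    + "{\"type\":\"record\",\"name\":\"inner_record\",\"fields\":["
    + "{\"name\":\"name\",\"type\":\"string\"},{\"name\":\"age\",\"type\":\"int\"}]}}}]}");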
Use of org.apache.avro.file.DataFileWriter in project nifi by apache.
From the class TestPutHiveStreaming, method createAvroRecord.
private byte[] createAvroRecord(List<Map<String, Object>> records) throws IOException {
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
List<GenericRecord> users = new LinkedList<>();
for (Map<String, Object> record : records) {
final GenericRecord user = new GenericData.Record(schema);
user.put("name", record.get("name"));
user.put("favorite_number", record.get("favorite_number"));
user.put("favorite_color", record.get("favorite_color"));
users.add(user);
}
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
ByteArrayOutputStream out = new ByteArrayOutputStream();
try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
dataFileWriter.create(schema, out);
for (final GenericRecord user : users) {
dataFileWriter.append(user);
}
}
return out.toByteArray();
}
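createAvroRecord only builds the Avro container bytes; the TestPutHiveStreaming tests then enqueue them into the test runner. Purely as an illustration (the record values below are made up, and the usual Avro and java.util imports are assumed), the returned bytes can be read back with DataFileStream, the reading counterpart of DataFileWriter:

Map<String, Object> user = new HashMap<>();
user.put("name", "Joe");
user.put("favorite_number", 146);
user.put("favorite_color", "blue");
byte[] avroBytes = createAvroRecord(Collections.singletonList(user));
try (DataFileStream<GenericRecord> stream = new DataFileStream<>(
        new ByteArrayInputStream(avroBytes), new GenericDatumReader<GenericRecord>())) {
    // DataFileWriter.create(schema, out) embedded the writer schema in the
    // container header, so the reader needs no schema up front.
    for (GenericRecord readBack : stream) {
        System.out.println(readBack.get("name") + " -> " + readBack.get("favorite_number"));
    }
}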
Use of org.apache.avro.file.DataFileWriter in project nifi by apache.
From the class ConvertAvroSchema, method onTrigger.
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile incomingAvro = session.get();
if (incomingAvro == null) {
return;
}
String inputSchemaProperty = context.getProperty(INPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
final Schema inputSchema;
try {
inputSchema = getSchema(inputSchemaProperty, DefaultConfiguration.get());
} catch (SchemaNotFoundException e) {
getLogger().error("Cannot find schema: " + inputSchemaProperty);
session.transfer(incomingAvro, FAILURE);
return;
}
String outputSchemaProperty = context.getProperty(OUTPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
final Schema outputSchema;
try {
outputSchema = getSchema(outputSchemaProperty, DefaultConfiguration.get());
} catch (SchemaNotFoundException e) {
getLogger().error("Cannot find schema: " + outputSchemaProperty);
session.transfer(incomingAvro, FAILURE);
return;
}
final Map<String, String> fieldMapping = new HashMap<>();
for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
if (entry.getKey().isDynamic()) {
fieldMapping.put(entry.getKey().getName(), entry.getValue());
}
}
// Set locale
final String localeProperty = context.getProperty(LOCALE).getValue();
final Locale locale = localeProperty.equals(DEFAULT_LOCALE_VALUE) ? Locale.getDefault() : LocaleUtils.toLocale(localeProperty);
final AvroRecordConverter converter = new AvroRecordConverter(inputSchema, outputSchema, fieldMapping, locale);
final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
final DataFileWriter<Record> failureWriter = new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
failureWriter.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
try {
final AtomicLong written = new AtomicLong(0L);
final FailureTracker failures = new FailureTracker();
final List<Record> badRecords = Lists.newLinkedList();
FlowFile incomingAvroCopy = session.clone(incomingAvro);
FlowFile outgoingAvro = session.write(incomingAvro, new StreamCallback() {
@Override
public void process(InputStream in, OutputStream out) throws IOException {
try (DataFileStream<Record> stream = new DataFileStream<Record>(in, new GenericDatumReader<Record>(converter.getInputSchema()))) {
try (DataFileWriter<Record> w = writer.create(outputSchema, out)) {
for (Record record : stream) {
try {
Record converted = converter.convert(record);
w.append(converted);
written.incrementAndGet();
} catch (AvroConversionException e) {
failures.add(e);
getLogger().error("Error converting data: " + e.getMessage());
badRecords.add(record);
}
}
}
}
}
});
FlowFile badOutput = session.write(incomingAvroCopy, new StreamCallback() {
@Override
public void process(InputStream in, OutputStream out) throws IOException {
try (DataFileWriter<Record> w = failureWriter.create(inputSchema, out)) {
for (Record record : badRecords) {
w.append(record);
}
}
}
});
long errors = failures.count();
// Counters are adjusted only when the session commit (i.e. the flow file transfer) succeeds
session.adjustCounter("Converted records", written.get(), false);
session.adjustCounter("Conversion errors", errors, false);
if (written.get() > 0L) {
session.transfer(outgoingAvro, SUCCESS);
} else {
session.remove(outgoingAvro);
if (errors == 0L) {
badOutput = session.putAttribute(badOutput, "errors", "No incoming records");
session.transfer(badOutput, FAILURE);
}
}
if (errors > 0L) {
getLogger().warn("Failed to convert {}/{} records between Avro Schemas", new Object[] { errors, errors + written.get() });
badOutput = session.putAttribute(badOutput, "errors", failures.summary());
session.transfer(badOutput, FAILURE);
} else {
session.remove(badOutput);
}
} catch (ProcessException | DatasetIOException e) {
getLogger().error("Failed reading or writing", e);
session.transfer(incomingAvro, FAILURE);
} catch (DatasetException e) {
getLogger().error("Failed to read FlowFile", e);
session.transfer(incomingAvro, FAILURE);
} finally {
try {
writer.close();
} catch (IOException e) {
getLogger().warn("Unable to close writer ressource", e);
}
try {
failureWriter.close();
} catch (IOException e) {
getLogger().warn("Unable to close writer ressource", e);
}
}
}
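The write callback above implements a per-record split: records that convert cleanly are appended to the primary writer, while failures are collected and later re-serialized against the input schema for the FAILURE relationship. A stripped-down sketch of that split outside of NiFi is shown below; Record is org.apache.avro.generic.GenericData.Record as in the processor, and in, goodOut, inputSchema, outputSchema and convertOrNull are placeholders standing in for the processor's streams, schemas and AvroRecordConverter:

List<Record> badRecords = new LinkedList<>();
long written = 0;
try (DataFileStream<Record> stream = new DataFileStream<>(in, new GenericDatumReader<Record>(inputSchema));
     DataFileWriter<Record> good = new DataFileWriter<>(new GenericDatumWriter<Record>(outputSchema))
             .create(outputSchema, goodOut)) {
    for (Record record : stream) {
        Record converted = convertOrNull(record);   // stand-in for converter.convert(record)
        if (converted != null) {
            good.append(converted);
            written++;
        } else {
            badRecords.add(record);                 // later re-written against the input schema
        }
    }
}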
Use of org.apache.avro.file.DataFileWriter in project nifi by apache.
From the class ConvertCSVToAvro, method onTrigger.
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile incomingCSV = session.get();
if (incomingCSV == null) {
return;
}
CSVProperties props = new CSVProperties.Builder()
.charset(context.getProperty(CHARSET).evaluateAttributeExpressions(incomingCSV).getValue())
.delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(incomingCSV).getValue())
.quote(context.getProperty(QUOTE).evaluateAttributeExpressions(incomingCSV).getValue())
.escape(context.getProperty(ESCAPE).evaluateAttributeExpressions(incomingCSV).getValue())
.hasHeader(context.getProperty(HAS_HEADER).evaluateAttributeExpressions(incomingCSV).asBoolean())
.linesToSkip(context.getProperty(LINES_TO_SKIP).evaluateAttributeExpressions(incomingCSV).asInteger())
.build();
String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingCSV).getValue();
final Schema schema;
try {
schema = getSchema(schemaProperty, DefaultConfiguration.get());
} catch (SchemaNotFoundException e) {
getLogger().error("Cannot find schema: " + schemaProperty);
session.transfer(incomingCSV, FAILURE);
return;
}
try (final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
try {
final AtomicLong written = new AtomicLong(0L);
final FailureTracker failures = new FailureTracker();
FlowFile badRecords = session.clone(incomingCSV);
FlowFile outgoingAvro = session.write(incomingCSV, new StreamCallback() {
@Override
public void process(InputStream in, OutputStream out) throws IOException {
try (CSVFileReader<Record> reader = new CSVFileReader<>(in, props, schema, Record.class)) {
reader.initialize();
try (DataFileWriter<Record> w = writer.create(schema, out)) {
while (reader.hasNext()) {
try {
Record record = reader.next();
w.append(record);
written.incrementAndGet();
} catch (DatasetRecordException e) {
failures.add(e);
}
}
}
}
}
});
long errors = failures.count();
session.adjustCounter("Converted records", written.get(), false);
session.adjustCounter("Conversion errors", errors, false);
if (written.get() > 0L) {
session.transfer(outgoingAvro, SUCCESS);
if (errors > 0L) {
getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors + written.get() });
badRecords = session.putAttribute(badRecords, "errors", failures.summary());
session.transfer(badRecords, INCOMPATIBLE);
} else {
session.remove(badRecords);
}
} else {
session.remove(outgoingAvro);
if (errors > 0L) {
getLogger().warn("Failed to convert {}/{} records from CSV to Avro", new Object[] { errors, errors });
badRecords = session.putAttribute(badRecords, "errors", failures.summary());
} else {
badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
}
session.transfer(badRecords, FAILURE);
}
} catch (ProcessException | DatasetIOException e) {
getLogger().error("Failed reading or writing", e);
session.transfer(incomingCSV, FAILURE);
} catch (DatasetException e) {
getLogger().error("Failed to read FlowFile", e);
session.transfer(incomingCSV, FAILURE);
}
} catch (final IOException ioe) {
throw new RuntimeException("Unable to close Avro Writer", ioe);
}
}
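Both Kite converters configure compression the same way: getCodecFactory maps the COMPRESSION_TYPE property to an Avro CodecFactory, and the codec must be set before create() is called. Stripped of the NiFi plumbing, the underlying Avro calls are just these (schema, record and out are assumed to already exist):

DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));
// Other standard choices include CodecFactory.nullCodec() and CodecFactory.deflateCodec(level).
writer.setCodec(CodecFactory.snappyCodec());
try (DataFileWriter<GenericRecord> w = writer.create(schema, out)) {
    w.append(record);
}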
Use of org.apache.avro.file.DataFileWriter in project nifi by apache.
From the class ConvertJSONToAvro, method onTrigger.
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile incomingJSON = session.get();
if (incomingJSON == null) {
return;
}
String schemaProperty = context.getProperty(SCHEMA).evaluateAttributeExpressions(incomingJSON).getValue();
final Schema schema;
try {
schema = getSchema(schemaProperty, DefaultConfiguration.get());
} catch (SchemaNotFoundException e) {
getLogger().error("Cannot find schema: " + schemaProperty);
session.transfer(incomingJSON, FAILURE);
return;
}
final DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class));
writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
try {
final AtomicLong written = new AtomicLong(0L);
final FailureTracker failures = new FailureTracker();
FlowFile badRecords = session.clone(incomingJSON);
FlowFile outgoingAvro = session.write(incomingJSON, new StreamCallback() {
@Override
public void process(InputStream in, OutputStream out) throws IOException {
try (JSONFileReader<Record> reader = new JSONFileReader<>(in, schema, Record.class)) {
reader.initialize();
try (DataFileWriter<Record> w = writer.create(schema, out)) {
while (reader.hasNext()) {
try {
Record record = reader.next();
w.append(record);
written.incrementAndGet();
} catch (final DatasetRecordException e) {
failures.add(e);
}
}
}
}
}
});
long errors = failures.count();
session.adjustCounter("Converted records", written.get(), false);
session.adjustCounter("Conversion errors", errors, false);
if (written.get() > 0L) {
session.transfer(outgoingAvro, SUCCESS);
if (errors > 0L) {
getLogger().warn("Failed to convert {}/{} records from JSON to Avro", new Object[] { errors, errors + written.get() });
badRecords = session.putAttribute(badRecords, "errors", failures.summary());
session.transfer(badRecords, INCOMPATIBLE);
} else {
session.remove(badRecords);
}
} else {
session.remove(outgoingAvro);
if (errors > 0L) {
getLogger().warn("Failed to convert {}/{} records from JSON to Avro", new Object[] { errors, errors });
badRecords = session.putAttribute(badRecords, "errors", failures.summary());
} else {
badRecords = session.putAttribute(badRecords, "errors", "No incoming records");
}
session.transfer(badRecords, FAILURE);
}
} catch (ProcessException | DatasetIOException e) {
getLogger().error("Failed reading or writing", e);
session.transfer(incomingJSON, FAILURE);
} catch (DatasetException e) {
getLogger().error("Failed to read FlowFile", e);
session.transfer(incomingJSON, FAILURE);
} finally {
try {
writer.close();
} catch (IOException e) {
getLogger().warn("Unable to close writer ressource", e);
}
}
}
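Unlike ConvertCSVToAvro above, this processor constructs its DataFileWriter outside any try-with-resources block and closes it in a finally block instead. A structural sketch of the equivalent try-with-resources shape, with the processor body elided and otherwise unchanged, would be:

try (DataFileWriter<Record> writer = new DataFileWriter<>(AvroUtil.newDatumWriter(schema, Record.class))) {
    writer.setCodec(getCodecFactory(context.getProperty(COMPRESSION_TYPE).getValue()));
    // ... same session.write(...) callback and relationship handling as in onTrigger above ...
} catch (final IOException ioe) {
    // Mirrors how ConvertCSVToAvro surfaces a failure to close the writer.
    throw new RuntimeException("Unable to close Avro Writer", ioe);
}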