Example 1 with OrcFlowFileWriter

Use of org.apache.hadoop.hive.ql.io.orc.OrcFlowFileWriter in the Apache NiFi project.

The onTrigger method of the ConvertAvroToORC processor: it reads Avro records from the incoming FlowFile, converts each record into an ORC struct, streams the result back into the FlowFile content through an OrcFlowFileWriter, and then sets the record count, Hive DDL, MIME type, and filename attributes before routing the FlowFile to success.

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    try {
        long startTime = System.currentTimeMillis();
        final long stripeSize = context.getProperty(STRIPE_SIZE).asDataSize(DataUnit.B).longValue();
        final int bufferSize = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B).intValue();
        final CompressionKind compressionType = CompressionKind.valueOf(context.getProperty(COMPRESSION_TYPE).getValue());
        final AtomicReference<Schema> hiveAvroSchema = new AtomicReference<>(null);
        final AtomicInteger totalRecordCount = new AtomicInteger(0);
        final String fileName = flowFile.getAttribute(CoreAttributes.FILENAME.key());
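        // Rewrite the FlowFile content in a single streaming pass: read Avro records in, write ORC out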
        flowFile = session.write(flowFile, (rawIn, rawOut) -> {
            try (final InputStream in = new BufferedInputStream(rawIn);
                final OutputStream out = new BufferedOutputStream(rawOut);
                final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<>())) {
                // Create ORC schema from Avro schema
                Schema avroSchema = reader.getSchema();
                TypeInfo orcSchema = NiFiOrcUtils.getOrcField(avroSchema);
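                // orcConfig is a Configuration field on the processor; fall back to a default Hadoop Configuration if none was provided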
                if (orcConfig == null) {
                    orcConfig = new Configuration();
                }
                OrcFlowFileWriter orcWriter = NiFiOrcUtils.createWriter(out, new Path(fileName), orcConfig, orcSchema, stripeSize, compressionType, bufferSize);
                try {
                    int recordCount = 0;
                    GenericRecord currRecord = null;
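                        // For each Avro record, map every field value to its ORC representation and add the resulting struct as one row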
                    while (reader.hasNext()) {
                        currRecord = reader.next(currRecord);
                        List<Schema.Field> fields = currRecord.getSchema().getFields();
                        if (fields != null) {
                            Object[] row = new Object[fields.size()];
                            for (int i = 0; i < fields.size(); i++) {
                                Schema.Field field = fields.get(i);
                                Schema fieldSchema = field.schema();
                                Object o = currRecord.get(field.name());
                                try {
                                    row[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), o);
                                } catch (ArrayIndexOutOfBoundsException aioobe) {
                                    getLogger().error("Index out of bounds at record {} for column {}, type {}, and object {}", new Object[] { recordCount, i, fieldSchema.getType().getName(), o.toString() }, aioobe);
                                    throw new IOException(aioobe);
                                }
                            }
                            orcWriter.addRow(NiFiOrcUtils.createOrcStruct(orcSchema, row));
                            recordCount++;
                        }
                    }
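                        // Keep the Avro schema and record count so they remain visible after the write callback returns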
                    hiveAvroSchema.set(avroSchema);
                    totalRecordCount.set(recordCount);
                } finally {
                    // finished writing this record, close the writer (which will flush to the flow file)
                    orcWriter.close();
                }
            }
        });
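        // Derive the Hive table name from the processor property (if set) or from the Avro record's full name, then generate a matching CREATE TABLE statement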
        final String hiveTableName = context.getProperty(HIVE_TABLE_NAME).isSet() ? context.getProperty(HIVE_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue() : NiFiOrcUtils.normalizeHiveTableName(hiveAvroSchema.get().getFullName());
        String hiveDDL = NiFiOrcUtils.generateHiveDDL(hiveAvroSchema.get(), hiveTableName);
        // Add attributes and transfer to success
        flowFile = session.putAttribute(flowFile, RECORD_COUNT_ATTRIBUTE, Integer.toString(totalRecordCount.get()));
        flowFile = session.putAttribute(flowFile, HIVE_DDL_ATTRIBUTE, hiveDDL);
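        // Replace the original file extension (if any) with .orc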
        StringBuilder newFilename = new StringBuilder();
        int extensionIndex = fileName.lastIndexOf(".");
        if (extensionIndex != -1) {
            newFilename.append(fileName.substring(0, extensionIndex));
        } else {
            newFilename.append(fileName);
        }
        newFilename.append(".orc");
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), ORC_MIME_TYPE);
        flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename.toString());
        session.transfer(flowFile, REL_SUCCESS);
        session.getProvenanceReporter().modifyContent(flowFile, "Converted " + totalRecordCount.get() + " records", System.currentTimeMillis() - startTime);
    } catch (final ProcessException pe) {
        getLogger().error("Failed to convert {} from Avro to ORC due to {}; transferring to failure", new Object[] { flowFile, pe });
        session.transfer(flowFile, REL_FAILURE);
    }
}
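For context, below is a minimal sketch of how this onTrigger logic could be exercised with NiFi's mock framework (TestRunner). The Avro schema and field names are illustrative assumptions, the processor is assumed to run with its default property values, and the REL_SUCCESS, RECORD_COUNT_ATTRIBUTE, and HIVE_DDL_ATTRIBUTE members of ConvertAvroToORC are assumed to be publicly accessible; this is not taken from the processor's actual test suite.

// Minimal sketch: build a tiny Avro container file in memory and run it through ConvertAvroToORC.
// The schema, field names, and package of ConvertAvroToORC are assumptions for illustration.
import java.io.ByteArrayOutputStream;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.nifi.processors.hive.ConvertAvroToORC;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;

public class ConvertAvroToORCSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical two-field record schema
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"test_record\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"int\"},"
                + "{\"name\":\"name\",\"type\":\"string\"}]}");

        // Write one record into an in-memory Avro container file
        ByteArrayOutputStream avroOut = new ByteArrayOutputStream();
        try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
            writer.create(schema, avroOut);
            GenericRecord record = new GenericData.Record(schema);
            record.put("id", 1);
            record.put("name", "example");
            writer.append(record);
        }

        // Feed the Avro bytes to the processor and check routing and attributes
        TestRunner runner = TestRunners.newTestRunner(new ConvertAvroToORC());
        runner.enqueue(avroOut.toByteArray());
        runner.run();

        runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
        MockFlowFile out = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
        out.assertAttributeEquals(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE, "1");
        out.assertAttributeExists(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE);
    }
}

Note that the conversion happens in one streaming pass over the FlowFile content, so memory use is governed by the configured ORC stripe and buffer sizes rather than by the total number of records.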