
Example 1 with CompressionKind

Use of org.apache.hadoop.hive.ql.io.orc.CompressionKind in project nifi by apache.

The class ConvertAvroToORC, method onTrigger:

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    try {
        long startTime = System.currentTimeMillis();
        final long stripeSize = context.getProperty(STRIPE_SIZE).asDataSize(DataUnit.B).longValue();
        final int bufferSize = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B).intValue();
        final CompressionKind compressionType = CompressionKind.valueOf(context.getProperty(COMPRESSION_TYPE).getValue());
        final AtomicReference<Schema> hiveAvroSchema = new AtomicReference<>(null);
        final AtomicInteger totalRecordCount = new AtomicInteger(0);
        final String fileName = flowFile.getAttribute(CoreAttributes.FILENAME.key());
        flowFile = session.write(flowFile, (rawIn, rawOut) -> {
            try (final InputStream in = new BufferedInputStream(rawIn);
                final OutputStream out = new BufferedOutputStream(rawOut);
                final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<>())) {
                // Create ORC schema from Avro schema
                Schema avroSchema = reader.getSchema();
                TypeInfo orcSchema = NiFiOrcUtils.getOrcField(avroSchema);
                if (orcConfig == null) {
                    orcConfig = new Configuration();
                }
                OrcFlowFileWriter orcWriter = NiFiOrcUtils.createWriter(out, new Path(fileName), orcConfig, orcSchema, stripeSize, compressionType, bufferSize);
                try {
                    int recordCount = 0;
                    GenericRecord currRecord = null;
                    while (reader.hasNext()) {
                        currRecord = reader.next(currRecord);
                        List<Schema.Field> fields = currRecord.getSchema().getFields();
                        if (fields != null) {
                            Object[] row = new Object[fields.size()];
                            for (int i = 0; i < fields.size(); i++) {
                                Schema.Field field = fields.get(i);
                                Schema fieldSchema = field.schema();
                                Object o = currRecord.get(field.name());
                                try {
                                    row[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), o);
                                } catch (ArrayIndexOutOfBoundsException aioobe) {
                                    getLogger().error("Index out of bounds at record {} for column {}, type {}, and object {}", new Object[] { recordCount, i, fieldSchema.getType().getName(), o.toString() }, aioobe);
                                    throw new IOException(aioobe);
                                }
                            }
                            orcWriter.addRow(NiFiOrcUtils.createOrcStruct(orcSchema, row));
                            recordCount++;
                        }
                    }
                    hiveAvroSchema.set(avroSchema);
                    totalRecordCount.set(recordCount);
                } finally {
                    // finished writing this record, close the writer (which will flush to the flow file)
                    orcWriter.close();
                }
            }
        });
        final String hiveTableName = context.getProperty(HIVE_TABLE_NAME).isSet() ? context.getProperty(HIVE_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue() : NiFiOrcUtils.normalizeHiveTableName(hiveAvroSchema.get().getFullName());
        String hiveDDL = NiFiOrcUtils.generateHiveDDL(hiveAvroSchema.get(), hiveTableName);
        // Add attributes and transfer to success
        flowFile = session.putAttribute(flowFile, RECORD_COUNT_ATTRIBUTE, Integer.toString(totalRecordCount.get()));
        flowFile = session.putAttribute(flowFile, HIVE_DDL_ATTRIBUTE, hiveDDL);
        StringBuilder newFilename = new StringBuilder();
        int extensionIndex = fileName.lastIndexOf(".");
        if (extensionIndex != -1) {
            newFilename.append(fileName.substring(0, extensionIndex));
        } else {
            newFilename.append(fileName);
        }
        newFilename.append(".orc");
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), ORC_MIME_TYPE);
        flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename.toString());
        session.transfer(flowFile, REL_SUCCESS);
        session.getProvenanceReporter().modifyContent(flowFile, "Converted " + totalRecordCount.get() + " records", System.currentTimeMillis() - startTime);
    } catch (final ProcessException pe) {
        getLogger().error("Failed to convert {} from Avro to ORC due to {}; transferring to failure", new Object[] { flowFile, pe });
        session.transfer(flowFile, REL_FAILURE);
    }
}
Also used : StandardValidators(org.apache.nifi.processor.util.StandardValidators) BufferedInputStream(java.io.BufferedInputStream) CapabilityDescription(org.apache.nifi.annotation.documentation.CapabilityDescription) SideEffectFree(org.apache.nifi.annotation.behavior.SideEffectFree) AtomicReference(java.util.concurrent.atomic.AtomicReference) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) ProcessException(org.apache.nifi.processor.exception.ProcessException) NiFiOrcUtils(org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils) BufferedOutputStream(java.io.BufferedOutputStream) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) WritesAttributes(org.apache.nifi.annotation.behavior.WritesAttributes) Relationship(org.apache.nifi.processor.Relationship) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) CompressionKind(org.apache.hadoop.hive.ql.io.orc.CompressionKind) HiveJdbcCommon(org.apache.nifi.util.hive.HiveJdbcCommon) OutputStream(java.io.OutputStream) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) FlowFile(org.apache.nifi.flowfile.FlowFile) ProcessContext(org.apache.nifi.processor.ProcessContext) DataFileStream(org.apache.avro.file.DataFileStream) Set(java.util.Set) OrcFlowFileWriter(org.apache.hadoop.hive.ql.io.orc.OrcFlowFileWriter) ProcessSession(org.apache.nifi.processor.ProcessSession) IOException(java.io.IOException) WritesAttribute(org.apache.nifi.annotation.behavior.WritesAttribute) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) HiveUtils(org.apache.nifi.util.hive.HiveUtils) InputRequirement(org.apache.nifi.annotation.behavior.InputRequirement) OnScheduled(org.apache.nifi.annotation.lifecycle.OnScheduled) List(java.util.List) SupportsBatching(org.apache.nifi.annotation.behavior.SupportsBatching) AbstractProcessor(org.apache.nifi.processor.AbstractProcessor) Tags(org.apache.nifi.annotation.documentation.Tags) DataUnit(org.apache.nifi.processor.DataUnit) CoreAttributes(org.apache.nifi.flowfile.attributes.CoreAttributes) Collections(java.util.Collections) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) InputStream(java.io.InputStream)
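
For reference, a minimal sketch of how the COMPRESSION_TYPE property read in onTrigger could be declared; the property name, description, and default below are assumptions, but the allowable values are the names of the Hive ORC CompressionKind enum, which keeps CompressionKind.valueOf(...) from throwing:

// Sketch only, not the processor's actual property definition.
public static final PropertyDescriptor COMPRESSION_TYPE = new PropertyDescriptor.Builder()
    .name("orc-compression-type")                        // assumed property name
    .displayName("Compression Type")
    .description("Compression codec to use when writing the ORC file")
    .allowableValues("NONE", "ZLIB", "SNAPPY", "LZO")    // CompressionKind enum names
    .defaultValue("NONE")
    .required(true)
    .build();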

Example 2 with CompressionKind

Use of org.apache.hadoop.hive.ql.io.orc.CompressionKind in project cdap by caskdata.

The class PartitionConcatenateTest, method writeSmallOrcFiles:

private List<String> writeSmallOrcFiles(Location baseLocation, int numInputFiles) throws IOException {
    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("struct<key:string>");
    ObjectInspector objectInspector = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
    Configuration hConf = new Configuration();
    FileSystem fileSystem = FileSystem.get(hConf);
    long stripeSize = HiveConf.getLongVar(hConf, HiveConf.ConfVars.HIVE_ORC_DEFAULT_STRIPE_SIZE);
    CompressionKind compressionKind = CompressionKind.valueOf(HiveConf.getVar(hConf, HiveConf.ConfVars.HIVE_ORC_DEFAULT_COMPRESS));
    int bufferSize = HiveConf.getIntVar(hConf, HiveConf.ConfVars.HIVE_ORC_DEFAULT_BUFFER_SIZE);
    int rowIndexStride = HiveConf.getIntVar(hConf, HiveConf.ConfVars.HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE);
    List<String> writtenData = new ArrayList<>();
    for (int i = 0; i < numInputFiles; i++) {
        Location childFile = baseLocation.append("child_" + i);
        Writer orcWriter = OrcFile.createWriter(fileSystem, new Path(childFile.toURI()), hConf, objectInspector, stripeSize, compressionKind, bufferSize, rowIndexStride);
        try {
            String toWrite = "outputData" + i;
            orcWriter.addRow(Collections.singletonList(toWrite));
            writtenData.add(toWrite);
        } finally {
            orcWriter.close();
        }
    }
    Collections.sort(writtenData);
    return writtenData;
}
Also used : Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) CompressionKind(org.apache.hadoop.hive.ql.io.orc.CompressionKind) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) FileSystem(org.apache.hadoop.fs.FileSystem) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) Location(org.apache.twill.filesystem.Location)
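
As a quick sanity check, a minimal sketch of a helper that reads one of the small files back; this helper is not part of the original test, and it assumes the same org.apache.hadoop.hive.ql.io.orc reader API (OrcFile.createReader, Reader, RecordReader) that pairs with the writer used above:

// Sketch only: read an ORC file written by writeSmallOrcFiles and return its rows as strings.
private List<String> readOrcFile(FileSystem fileSystem, Path path) throws IOException {
    Reader orcReader = OrcFile.createReader(fileSystem, path);
    RecordReader rows = orcReader.rows();
    List<String> result = new ArrayList<>();
    Object row = null;
    while (rows.hasNext()) {
        row = rows.next(row);
        // each row is an OrcStruct with a single string column named "key"
        result.add(String.valueOf(row));
    }
    rows.close();
    return result;
}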

Aggregations

ArrayList (java.util.ArrayList) 2
Configuration (org.apache.hadoop.conf.Configuration) 2
Path (org.apache.hadoop.fs.Path) 2
CompressionKind (org.apache.hadoop.hive.ql.io.orc.CompressionKind) 2
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) 2
BufferedInputStream (java.io.BufferedInputStream) 1
BufferedOutputStream (java.io.BufferedOutputStream) 1
IOException (java.io.IOException) 1
InputStream (java.io.InputStream) 1
OutputStream (java.io.OutputStream) 1
Collections (java.util.Collections) 1
HashSet (java.util.HashSet) 1
List (java.util.List) 1
Set (java.util.Set) 1
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 1
AtomicReference (java.util.concurrent.atomic.AtomicReference) 1
Schema (org.apache.avro.Schema) 1
DataFileStream (org.apache.avro.file.DataFileStream) 1
GenericDatumReader (org.apache.avro.generic.GenericDatumReader) 1
GenericRecord (org.apache.avro.generic.GenericRecord) 1