Examples with HiveOptions - org.apache.nifi.util.hive.HiveOptions

Example 1 with HiveOptions

use of org.apache.nifi.util.hive.HiveOptions in project nifi by apache.

the class PutHiveStreaming method onTrigger.

private void onTrigger(ProcessContext context, ProcessSession session, FunctionContext functionContext) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final String dbName = context.getProperty(DB_NAME).evaluateAttributeExpressions(flowFile).getValue();
    final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
    // Only allow one thread to work on a DB/table at a time
    final Semaphore newSemaphore = new Semaphore(1);
    Semaphore semaphore = tableSemaphoreMap.putIfAbsent(dbName + "." + tableName, newSemaphore);
    if (semaphore == null) {
        semaphore = newSemaphore;
    }
    boolean gotSemaphore = false;
    try {
        gotSemaphore = semaphore.tryAcquire(0, TimeUnit.SECONDS);
    } catch (InterruptedException ie) {
    // Nothing to do, gotSemaphore defaults to false
    }
    if (!gotSemaphore) {
        // We didn't get a chance to acquire, so rollback the session and try again next time
        session.rollback();
        return;
    }
    final ComponentLog log = getLogger();
    final String metastoreUri = context.getProperty(METASTORE_URI).evaluateAttributeExpressions(flowFile).getValue();
    final boolean autoCreatePartitions = context.getProperty(AUTOCREATE_PARTITIONS).asBoolean();
    final Integer maxConnections = context.getProperty(MAX_OPEN_CONNECTIONS).asInteger();
    final Integer heartbeatInterval = context.getProperty(HEARTBEAT_INTERVAL).evaluateAttributeExpressions().asInteger();
    final Integer txnsPerBatch = context.getProperty(TXNS_PER_BATCH).evaluateAttributeExpressions(flowFile).asInteger();
    final Integer recordsPerTxn = context.getProperty(RECORDS_PER_TXN).evaluateAttributeExpressions(flowFile).asInteger();
    final Map<HiveEndPoint, HiveWriter> myWriters = new ConcurrentHashMap<>();
    threadWriterList.add(myWriters);
    HiveOptions o = new HiveOptions(metastoreUri, dbName, tableName).withTxnsPerBatch(txnsPerBatch).withAutoCreatePartitions(autoCreatePartitions).withMaxOpenConnections(maxConnections).withHeartBeatInterval(heartbeatInterval).withCallTimeout(callTimeout);
    if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
        final String explicitPrincipal = context.getProperty(kerberosProperties.getKerberosPrincipal()).evaluateAttributeExpressions().getValue();
        final String explicitKeytab = context.getProperty(kerberosProperties.getKerberosKeytab()).evaluateAttributeExpressions().getValue();
        final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
        final String resolvedPrincipal;
        final String resolvedKeytab;
        if (credentialsService == null) {
            resolvedPrincipal = explicitPrincipal;
            resolvedKeytab = explicitKeytab;
        } else {
            resolvedPrincipal = credentialsService.getPrincipal();
            resolvedKeytab = credentialsService.getKeytab();
        }
        o = o.withKerberosPrincipal(resolvedPrincipal).withKerberosKeytab(resolvedKeytab);
    }
    final HiveOptions options = o;
    // Store the original class loader, then explicitly set it to this class's classloader (for use by the Hive Metastore)
    ClassLoader originalClassloader = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
    final List<String> partitionColumnList;
    final String partitionColumns = context.getProperty(PARTITION_COLUMNS).evaluateAttributeExpressions().getValue();
    if (partitionColumns == null || partitionColumns.isEmpty()) {
        partitionColumnList = Collections.emptyList();
    } else {
        String[] partitionCols = partitionColumns.split(",");
        partitionColumnList = new ArrayList<>(partitionCols.length);
        for (String col : partitionCols) {
            partitionColumnList.add(col.trim());
        }
    }
    final AtomicReference<List<HiveStreamingRecord>> successfulRecords = new AtomicReference<>();
    successfulRecords.set(new ArrayList<>());
    final FlowFile inputFlowFile = flowFile;
    final RoutingResult result = new RoutingResult();
    final ExceptionHandler<FunctionContext> exceptionHandler = new ExceptionHandler<>();
    exceptionHandler.mapException(s -> {
        try {
            if (s == null) {
                return ErrorTypes.PersistentFailure;
            }
            throw s;
        } catch (IllegalArgumentException | HiveWriter.WriteFailure | SerializationError inputError) {
            return ErrorTypes.InvalidInput;
        } catch (HiveWriter.CommitFailure | HiveWriter.TxnBatchFailure | HiveWriter.TxnFailure writerTxError) {
            return ErrorTypes.TemporalInputFailure;
        } catch (ConnectionError | HiveWriter.ConnectFailure connectionError) {
            // Can't connect to Hive endpoint.
            log.error("Error connecting to Hive endpoint: table {} at {}", new Object[] { options.getTableName(), options.getMetaStoreURI() });
            return ErrorTypes.TemporalFailure;
        } catch (IOException | InterruptedException tempError) {
            return ErrorTypes.TemporalFailure;
        } catch (Exception t) {
            return ErrorTypes.UnknownFailure;
        }
    });
    final BiFunction<FunctionContext, ErrorTypes, ErrorTypes.Result> adjustError = RollbackOnFailure.createAdjustError(getLogger());
    exceptionHandler.adjustError(adjustError);
    // Create output flow files and their Avro writers
    functionContext.setFlowFiles(session.create(inputFlowFile), session.create(inputFlowFile));
    try {
        session.read(inputFlowFile, new InputStreamCallback() {

            @Override
            public void process(InputStream in) throws IOException {
                try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
                    GenericRecord currRecord = null;
                    // Copy codec and schema information to all writers
                    final String codec = reader.getMetaString(DataFileConstants.CODEC) == null ? DataFileConstants.NULL_CODEC : reader.getMetaString(DataFileConstants.CODEC);
                    functionContext.initAvroWriters(session, codec, reader);
                    Runnable flushSuccessfulRecords = () -> {
                        // Now send the records to the successful FlowFile and update the success count
                        functionContext.appendRecordsToSuccess(session, successfulRecords.get());
                        // Clear the list of successful records, we'll use it at the end when we flush whatever records are left
                        successfulRecords.set(new ArrayList<>());
                    };
                    while (reader.hasNext()) {
                        // We can NOT reuse currRecord here, because currRecord is accumulated in successful records.
                        // If we use the same GenericRecord instance, every record ends up having the same contents.
                        // To avoid this, we need to create a brand new GenericRecord instance here each time.
                        currRecord = reader.next();
                        functionContext.recordCount.incrementAndGet();
                        // Extract the partition values (they must be put separately into the Hive Streaming API)
                        List<String> partitionValues = new ArrayList<>();
                        if (!exceptionHandler.execute(functionContext, currRecord, input -> {
                            for (String partition : partitionColumnList) {
                                Object partitionValue = input.get(partition);
                                if (partitionValue == null) {
                                    throw new IllegalArgumentException("Partition column '" + partition + "' not found in Avro record");
                                }
                                partitionValues.add(partitionValue.toString());
                            }
                        }, onRecordError(context, session, myWriters))) {
                            continue;
                        }
                        final HiveStreamingRecord record = new HiveStreamingRecord(partitionValues, currRecord);
                        final AtomicReference<HiveWriter> hiveWriterRef = new AtomicReference<>();
                        // Write record to Hive streaming
                        if (!exceptionHandler.execute(functionContext, record, input -> {
                            final HiveEndPoint endPoint = makeHiveEndPoint(record.getPartitionValues(), options);
                            final HiveWriter hiveWriter = getOrCreateWriter(myWriters, options, endPoint);
                            hiveWriterRef.set(hiveWriter);
                            hiveWriter.write(record.getRecord().toString().getBytes(StandardCharsets.UTF_8));
                            successfulRecords.get().add(record);
                        }, onHiveRecordError(context, session, myWriters))) {
                            continue;
                        }
                        // If we've reached the records-per-transaction limit, flush the Hive Writer and update the Avro Writer for successful records
                        final HiveWriter hiveWriter = hiveWriterRef.get();
                        if (hiveWriter.getTotalRecords() >= recordsPerTxn) {
                            exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
                                hiveWriter.flush(true);
                                // Proceed function context. Process session can't be rollback anymore.
                                functionContext.proceed();
                                // Now send the records to the success relationship and update the success count
                                flushSuccessfulRecords.run();
                            }, onHiveRecordsError(context, session, myWriters).andThen((fc, input, res, commitException) -> {
                                // Reset hiveWriter for succeeding records.
                                switch(res.destination()) {
                                    case Retry:
                                    case Failure:
                                        try {
                                            // Abort current tx and move to next.
                                            hiveWriter.abort();
                                        } catch (Exception e) {
                                            // Can't even abort properly, throw a process exception
                                            throw new ProcessException(e);
                                        }
                                }
                            }));
                        }
                    }
                    exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
                        // Finish any transactions
                        flushAllWriters(myWriters, true);
                        closeAllWriters(myWriters);
                        // Now send any remaining records to the success relationship and update the count
                        flushSuccessfulRecords.run();
                    // Append successfulRecords on failure.
                    }, onHiveRecordsError(context, session, myWriters));
                } catch (IOException ioe) {
                    // The Avro file is invalid (or may not be an Avro file at all), send it to failure
                    final ErrorTypes.Result adjusted = adjustError.apply(functionContext, ErrorTypes.InvalidInput);
                    final String msg = "The incoming flow file can not be read as an Avro file";
                    switch(adjusted.destination()) {
                        case Failure:
                            log.error(msg, ioe);
                            result.routeTo(inputFlowFile, REL_FAILURE);
                            break;
                        case ProcessException:
                            throw new ProcessException(msg, ioe);
                    }
                }
            }
        });
        // If we got here, we've processed the outgoing flow files correctly, so remove the incoming one if necessary
        if (result.getRoutedFlowFiles().values().stream().noneMatch(routed -> routed.contains(inputFlowFile))) {
            session.remove(inputFlowFile);
        }
    } catch (DiscontinuedException e) {
        // The input FlowFile processing is discontinued. Keep it in the input queue.
        getLogger().warn("Discontinued processing for {} due to {}", new Object[] { flowFile, e }, e);
        result.routeTo(flowFile, Relationship.SELF);
    } catch (ShouldRetryException e) {
        // This exception is already a result of adjusting an error, so simply transfer the FlowFile to retry.
        getLogger().error(e.getMessage(), e);
        flowFile = session.penalize(flowFile);
        result.routeTo(flowFile, REL_RETRY);
    } finally {
        threadWriterList.remove(myWriters);
        functionContext.transferFlowFiles(session, result, options);
        // Restore original class loader, might not be necessary but is good practice since the processor task changed it
        Thread.currentThread().setContextClassLoader(originalClassloader);
        semaphore.release();
    }
}

Also used : StandardValidators(org.apache.nifi.processor.util.StandardValidators) ConnectionError(org.apache.hive.hcatalog.streaming.ConnectionError) BiFunction(java.util.function.BiFunction) Timer(java.util.Timer) StreamingException(org.apache.hive.hcatalog.streaming.StreamingException) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) ErrorTypes(org.apache.nifi.processor.util.pattern.ErrorTypes) HiveEndPoint(org.apache.hive.hcatalog.streaming.HiveEndPoint) Snappy(org.xerial.snappy.Snappy) RoutingResult(org.apache.nifi.processor.util.pattern.RoutingResult) DiscontinuedException(org.apache.nifi.processor.util.pattern.DiscontinuedException) WritesAttributes(org.apache.nifi.annotation.behavior.WritesAttributes) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) AuthenticationFailedException(org.apache.nifi.util.hive.AuthenticationFailedException) Map(java.util.Map) ExceptionHandler(org.apache.nifi.processor.util.pattern.ExceptionHandler) CodecFactory(org.apache.avro.file.CodecFactory) TimerTask(java.util.TimerTask) DataFileConstants(org.apache.avro.file.DataFileConstants) HiveConfigurator(org.apache.nifi.util.hive.HiveConfigurator) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) SecurityUtil(org.apache.nifi.hadoop.SecurityUtil) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) RequiresInstanceClassLoading(org.apache.nifi.annotation.behavior.RequiresInstanceClassLoading) FlowFile(org.apache.nifi.flowfile.FlowFile) KerberosProperties(org.apache.nifi.hadoop.KerberosProperties) SerializationError(org.apache.hive.hcatalog.streaming.SerializationError) Collection(java.util.Collection) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) DataFileWriter(org.apache.avro.file.DataFileWriter) WritesAttribute(org.apache.nifi.annotation.behavior.WritesAttribute) StandardCharsets(java.nio.charset.StandardCharsets) Executors(java.util.concurrent.Executors) List(java.util.List) SeekableByteArrayInput(org.apache.avro.file.SeekableByteArrayInput) Tags(org.apache.nifi.annotation.documentation.Tags) Pattern(java.util.regex.Pattern) ValidationResources(org.apache.nifi.util.hive.ValidationResources) ProcessorInitializationContext(org.apache.nifi.processor.ProcessorInitializationContext) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) RollbackOnFailure(org.apache.nifi.processor.util.pattern.RollbackOnFailure) CapabilityDescription(org.apache.nifi.annotation.documentation.CapabilityDescription) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ValidationContext(org.apache.nifi.components.ValidationContext) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) HashMap(java.util.HashMap) ComponentLog(org.apache.nifi.logging.ComponentLog) AtomicReference(java.util.concurrent.atomic.AtomicReference) ProcessException(org.apache.nifi.processor.exception.ProcessException) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Relationship(org.apache.nifi.processor.Relationship) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) AbstractSessionFactoryProcessor(org.apache.nifi.processor.AbstractSessionFactoryProcessor) ValidationResult(org.apache.nifi.components.ValidationResult) ExecutorService(java.util.concurrent.ExecutorService) GenericRecord(org.apache.avro.generic.GenericRecord) Validator(org.apache.nifi.components.Validator) Semaphore(java.util.concurrent.Semaphore) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ProcessContext(org.apache.nifi.processor.ProcessContext) DataFileStream(org.apache.avro.file.DataFileStream) ProcessSession(org.apache.nifi.processor.ProcessSession) IOException(java.io.IOException) ProcessSessionFactory(org.apache.nifi.processor.ProcessSessionFactory) File(java.io.File) TimeUnit(java.util.concurrent.TimeUnit) HiveUtils(org.apache.nifi.util.hive.HiveUtils) OnScheduled(org.apache.nifi.annotation.lifecycle.OnScheduled) KerberosCredentialsService(org.apache.nifi.kerberos.KerberosCredentialsService) HiveOptions(org.apache.nifi.util.hive.HiveOptions) HiveWriter(org.apache.nifi.util.hive.HiveWriter) OnStopped(org.apache.nifi.annotation.lifecycle.OnStopped) Collections(java.util.Collections) InputStream(java.io.InputStream) RoutingResult(org.apache.nifi.processor.util.pattern.RoutingResult) ValidationResult(org.apache.nifi.components.ValidationResult) RoutingResult(org.apache.nifi.processor.util.pattern.RoutingResult) ExceptionHandler(org.apache.nifi.processor.util.pattern.ExceptionHandler) List(java.util.List) ArrayList(java.util.ArrayList) FlowFile(org.apache.nifi.flowfile.FlowFile) ComponentLog(org.apache.nifi.logging.ComponentLog) SerializationError(org.apache.hive.hcatalog.streaming.SerializationError) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Semaphore(java.util.concurrent.Semaphore) ErrorTypes(org.apache.nifi.processor.util.pattern.ErrorTypes) HiveEndPoint(org.apache.hive.hcatalog.streaming.HiveEndPoint) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) GenericRecord(org.apache.avro.generic.GenericRecord) HiveWriter(org.apache.nifi.util.hive.HiveWriter) InputStream(java.io.InputStream) ConnectionError(org.apache.hive.hcatalog.streaming.ConnectionError) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) StreamingException(org.apache.hive.hcatalog.streaming.StreamingException) DiscontinuedException(org.apache.nifi.processor.util.pattern.DiscontinuedException) AuthenticationFailedException(org.apache.nifi.util.hive.AuthenticationFailedException) ProcessException(org.apache.nifi.processor.exception.ProcessException) IOException(java.io.IOException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ProcessException(org.apache.nifi.processor.exception.ProcessException) KerberosCredentialsService(org.apache.nifi.kerberos.KerberosCredentialsService) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) DiscontinuedException(org.apache.nifi.processor.util.pattern.DiscontinuedException) HiveOptions(org.apache.nifi.util.hive.HiveOptions)

Aggregations

ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 File (java.io.File)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 StandardCharsets (java.nio.charset.StandardCharsets)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Map (java.util.Map)1 Set (java.util.Set)1 Timer (java.util.Timer)1 TimerTask (java.util.TimerTask)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue)1 ExecutorService (java.util.concurrent.ExecutorService)1 Executors (java.util.concurrent.Executors)1