Search in sources :

Example 1 with SerializationError

use of org.apache.hive.hcatalog.streaming.SerializationError in project nifi by apache.

the class PutHiveStreaming method onTrigger.

private void onTrigger(ProcessContext context, ProcessSession session, FunctionContext functionContext) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final String dbName = context.getProperty(DB_NAME).evaluateAttributeExpressions(flowFile).getValue();
    final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
    // Only allow one thread to work on a DB/table at a time
    final Semaphore newSemaphore = new Semaphore(1);
    Semaphore semaphore = tableSemaphoreMap.putIfAbsent(dbName + "." + tableName, newSemaphore);
    if (semaphore == null) {
        semaphore = newSemaphore;
    }
    boolean gotSemaphore = false;
    try {
        gotSemaphore = semaphore.tryAcquire(0, TimeUnit.SECONDS);
    } catch (InterruptedException ie) {
    // Nothing to do, gotSemaphore defaults to false
    }
    if (!gotSemaphore) {
        // We didn't get a chance to acquire, so rollback the session and try again next time
        session.rollback();
        return;
    }
    final ComponentLog log = getLogger();
    final String metastoreUri = context.getProperty(METASTORE_URI).evaluateAttributeExpressions(flowFile).getValue();
    final boolean autoCreatePartitions = context.getProperty(AUTOCREATE_PARTITIONS).asBoolean();
    final Integer maxConnections = context.getProperty(MAX_OPEN_CONNECTIONS).asInteger();
    final Integer heartbeatInterval = context.getProperty(HEARTBEAT_INTERVAL).evaluateAttributeExpressions().asInteger();
    final Integer txnsPerBatch = context.getProperty(TXNS_PER_BATCH).evaluateAttributeExpressions(flowFile).asInteger();
    final Integer recordsPerTxn = context.getProperty(RECORDS_PER_TXN).evaluateAttributeExpressions(flowFile).asInteger();
    final Map<HiveEndPoint, HiveWriter> myWriters = new ConcurrentHashMap<>();
    threadWriterList.add(myWriters);
    HiveOptions o = new HiveOptions(metastoreUri, dbName, tableName).withTxnsPerBatch(txnsPerBatch).withAutoCreatePartitions(autoCreatePartitions).withMaxOpenConnections(maxConnections).withHeartBeatInterval(heartbeatInterval).withCallTimeout(callTimeout);
    if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
        final String explicitPrincipal = context.getProperty(kerberosProperties.getKerberosPrincipal()).evaluateAttributeExpressions().getValue();
        final String explicitKeytab = context.getProperty(kerberosProperties.getKerberosKeytab()).evaluateAttributeExpressions().getValue();
        final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
        final String resolvedPrincipal;
        final String resolvedKeytab;
        if (credentialsService == null) {
            resolvedPrincipal = explicitPrincipal;
            resolvedKeytab = explicitKeytab;
        } else {
            resolvedPrincipal = credentialsService.getPrincipal();
            resolvedKeytab = credentialsService.getKeytab();
        }
        o = o.withKerberosPrincipal(resolvedPrincipal).withKerberosKeytab(resolvedKeytab);
    }
    final HiveOptions options = o;
    // Store the original class loader, then explicitly set it to this class's classloader (for use by the Hive Metastore)
    ClassLoader originalClassloader = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
    final List<String> partitionColumnList;
    final String partitionColumns = context.getProperty(PARTITION_COLUMNS).evaluateAttributeExpressions().getValue();
    if (partitionColumns == null || partitionColumns.isEmpty()) {
        partitionColumnList = Collections.emptyList();
    } else {
        String[] partitionCols = partitionColumns.split(",");
        partitionColumnList = new ArrayList<>(partitionCols.length);
        for (String col : partitionCols) {
            partitionColumnList.add(col.trim());
        }
    }
    final AtomicReference<List<HiveStreamingRecord>> successfulRecords = new AtomicReference<>();
    successfulRecords.set(new ArrayList<>());
    final FlowFile inputFlowFile = flowFile;
    final RoutingResult result = new RoutingResult();
    final ExceptionHandler<FunctionContext> exceptionHandler = new ExceptionHandler<>();
    exceptionHandler.mapException(s -> {
        try {
            if (s == null) {
                return ErrorTypes.PersistentFailure;
            }
            throw s;
        } catch (IllegalArgumentException | HiveWriter.WriteFailure | SerializationError inputError) {
            return ErrorTypes.InvalidInput;
        } catch (HiveWriter.CommitFailure | HiveWriter.TxnBatchFailure | HiveWriter.TxnFailure writerTxError) {
            return ErrorTypes.TemporalInputFailure;
        } catch (ConnectionError | HiveWriter.ConnectFailure connectionError) {
            // Can't connect to Hive endpoint.
            log.error("Error connecting to Hive endpoint: table {} at {}", new Object[] { options.getTableName(), options.getMetaStoreURI() });
            return ErrorTypes.TemporalFailure;
        } catch (IOException | InterruptedException tempError) {
            return ErrorTypes.TemporalFailure;
        } catch (Exception t) {
            return ErrorTypes.UnknownFailure;
        }
    });
    final BiFunction<FunctionContext, ErrorTypes, ErrorTypes.Result> adjustError = RollbackOnFailure.createAdjustError(getLogger());
    exceptionHandler.adjustError(adjustError);
    // Create output flow files and their Avro writers
    functionContext.setFlowFiles(session.create(inputFlowFile), session.create(inputFlowFile));
    try {
        session.read(inputFlowFile, new InputStreamCallback() {

            @Override
            public void process(InputStream in) throws IOException {
                try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
                    GenericRecord currRecord = null;
                    // Copy codec and schema information to all writers
                    final String codec = reader.getMetaString(DataFileConstants.CODEC) == null ? DataFileConstants.NULL_CODEC : reader.getMetaString(DataFileConstants.CODEC);
                    functionContext.initAvroWriters(session, codec, reader);
                    Runnable flushSuccessfulRecords = () -> {
                        // Now send the records to the successful FlowFile and update the success count
                        functionContext.appendRecordsToSuccess(session, successfulRecords.get());
                        // Clear the list of successful records, we'll use it at the end when we flush whatever records are left
                        successfulRecords.set(new ArrayList<>());
                    };
                    while (reader.hasNext()) {
                        // We can NOT reuse currRecord here, because currRecord is accumulated in successful records.
                        // If we use the same GenericRecord instance, every record ends up having the same contents.
                        // To avoid this, we need to create a brand new GenericRecord instance here each time.
                        currRecord = reader.next();
                        functionContext.recordCount.incrementAndGet();
                        // Extract the partition values (they must be put separately into the Hive Streaming API)
                        List<String> partitionValues = new ArrayList<>();
                        if (!exceptionHandler.execute(functionContext, currRecord, input -> {
                            for (String partition : partitionColumnList) {
                                Object partitionValue = input.get(partition);
                                if (partitionValue == null) {
                                    throw new IllegalArgumentException("Partition column '" + partition + "' not found in Avro record");
                                }
                                partitionValues.add(partitionValue.toString());
                            }
                        }, onRecordError(context, session, myWriters))) {
                            continue;
                        }
                        final HiveStreamingRecord record = new HiveStreamingRecord(partitionValues, currRecord);
                        final AtomicReference<HiveWriter> hiveWriterRef = new AtomicReference<>();
                        // Write record to Hive streaming
                        if (!exceptionHandler.execute(functionContext, record, input -> {
                            final HiveEndPoint endPoint = makeHiveEndPoint(record.getPartitionValues(), options);
                            final HiveWriter hiveWriter = getOrCreateWriter(myWriters, options, endPoint);
                            hiveWriterRef.set(hiveWriter);
                            hiveWriter.write(record.getRecord().toString().getBytes(StandardCharsets.UTF_8));
                            successfulRecords.get().add(record);
                        }, onHiveRecordError(context, session, myWriters))) {
                            continue;
                        }
                        // If we've reached the records-per-transaction limit, flush the Hive Writer and update the Avro Writer for successful records
                        final HiveWriter hiveWriter = hiveWriterRef.get();
                        if (hiveWriter.getTotalRecords() >= recordsPerTxn) {
                            exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
                                hiveWriter.flush(true);
                                // Proceed function context. Process session can't be rollback anymore.
                                functionContext.proceed();
                                // Now send the records to the success relationship and update the success count
                                flushSuccessfulRecords.run();
                            }, onHiveRecordsError(context, session, myWriters).andThen((fc, input, res, commitException) -> {
                                // Reset hiveWriter for succeeding records.
                                switch(res.destination()) {
                                    case Retry:
                                    case Failure:
                                        try {
                                            // Abort current tx and move to next.
                                            hiveWriter.abort();
                                        } catch (Exception e) {
                                            // Can't even abort properly, throw a process exception
                                            throw new ProcessException(e);
                                        }
                                }
                            }));
                        }
                    }
                    exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
                        // Finish any transactions
                        flushAllWriters(myWriters, true);
                        closeAllWriters(myWriters);
                        // Now send any remaining records to the success relationship and update the count
                        flushSuccessfulRecords.run();
                    // Append successfulRecords on failure.
                    }, onHiveRecordsError(context, session, myWriters));
                } catch (IOException ioe) {
                    // The Avro file is invalid (or may not be an Avro file at all), send it to failure
                    final ErrorTypes.Result adjusted = adjustError.apply(functionContext, ErrorTypes.InvalidInput);
                    final String msg = "The incoming flow file can not be read as an Avro file";
                    switch(adjusted.destination()) {
                        case Failure:
                            log.error(msg, ioe);
                            result.routeTo(inputFlowFile, REL_FAILURE);
                            break;
                        case ProcessException:
                            throw new ProcessException(msg, ioe);
                    }
                }
            }
        });
        // If we got here, we've processed the outgoing flow files correctly, so remove the incoming one if necessary
        if (result.getRoutedFlowFiles().values().stream().noneMatch(routed -> routed.contains(inputFlowFile))) {
            session.remove(inputFlowFile);
        }
    } catch (DiscontinuedException e) {
        // The input FlowFile processing is discontinued. Keep it in the input queue.
        getLogger().warn("Discontinued processing for {} due to {}", new Object[] { flowFile, e }, e);
        result.routeTo(flowFile, Relationship.SELF);
    } catch (ShouldRetryException e) {
        // This exception is already a result of adjusting an error, so simply transfer the FlowFile to retry.
        getLogger().error(e.getMessage(), e);
        flowFile = session.penalize(flowFile);
        result.routeTo(flowFile, REL_RETRY);
    } finally {
        threadWriterList.remove(myWriters);
        functionContext.transferFlowFiles(session, result, options);
        // Restore original class loader, might not be necessary but is good practice since the processor task changed it
        Thread.currentThread().setContextClassLoader(originalClassloader);
        semaphore.release();
    }
}
Also used : StandardValidators(org.apache.nifi.processor.util.StandardValidators) ConnectionError(org.apache.hive.hcatalog.streaming.ConnectionError) BiFunction(java.util.function.BiFunction) Timer(java.util.Timer) StreamingException(org.apache.hive.hcatalog.streaming.StreamingException) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) ErrorTypes(org.apache.nifi.processor.util.pattern.ErrorTypes) HiveEndPoint(org.apache.hive.hcatalog.streaming.HiveEndPoint) Snappy(org.xerial.snappy.Snappy) RoutingResult(org.apache.nifi.processor.util.pattern.RoutingResult) DiscontinuedException(org.apache.nifi.processor.util.pattern.DiscontinuedException) WritesAttributes(org.apache.nifi.annotation.behavior.WritesAttributes) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) AuthenticationFailedException(org.apache.nifi.util.hive.AuthenticationFailedException) Map(java.util.Map) ExceptionHandler(org.apache.nifi.processor.util.pattern.ExceptionHandler) CodecFactory(org.apache.avro.file.CodecFactory) TimerTask(java.util.TimerTask) DataFileConstants(org.apache.avro.file.DataFileConstants) HiveConfigurator(org.apache.nifi.util.hive.HiveConfigurator) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) SecurityUtil(org.apache.nifi.hadoop.SecurityUtil) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) RequiresInstanceClassLoading(org.apache.nifi.annotation.behavior.RequiresInstanceClassLoading) FlowFile(org.apache.nifi.flowfile.FlowFile) KerberosProperties(org.apache.nifi.hadoop.KerberosProperties) SerializationError(org.apache.hive.hcatalog.streaming.SerializationError) Collection(java.util.Collection) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) DataFileWriter(org.apache.avro.file.DataFileWriter) WritesAttribute(org.apache.nifi.annotation.behavior.WritesAttribute) StandardCharsets(java.nio.charset.StandardCharsets) Executors(java.util.concurrent.Executors) List(java.util.List) SeekableByteArrayInput(org.apache.avro.file.SeekableByteArrayInput) Tags(org.apache.nifi.annotation.documentation.Tags) Pattern(java.util.regex.Pattern) ValidationResources(org.apache.nifi.util.hive.ValidationResources) ProcessorInitializationContext(org.apache.nifi.processor.ProcessorInitializationContext) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) RollbackOnFailure(org.apache.nifi.processor.util.pattern.RollbackOnFailure) CapabilityDescription(org.apache.nifi.annotation.documentation.CapabilityDescription) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ValidationContext(org.apache.nifi.components.ValidationContext) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) HashMap(java.util.HashMap) ComponentLog(org.apache.nifi.logging.ComponentLog) AtomicReference(java.util.concurrent.atomic.AtomicReference) ProcessException(org.apache.nifi.processor.exception.ProcessException) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Relationship(org.apache.nifi.processor.Relationship) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) AbstractSessionFactoryProcessor(org.apache.nifi.processor.AbstractSessionFactoryProcessor) ValidationResult(org.apache.nifi.components.ValidationResult) ExecutorService(java.util.concurrent.ExecutorService) GenericRecord(org.apache.avro.generic.GenericRecord) Validator(org.apache.nifi.components.Validator) Semaphore(java.util.concurrent.Semaphore) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ProcessContext(org.apache.nifi.processor.ProcessContext) DataFileStream(org.apache.avro.file.DataFileStream) ProcessSession(org.apache.nifi.processor.ProcessSession) IOException(java.io.IOException) ProcessSessionFactory(org.apache.nifi.processor.ProcessSessionFactory) File(java.io.File) TimeUnit(java.util.concurrent.TimeUnit) HiveUtils(org.apache.nifi.util.hive.HiveUtils) OnScheduled(org.apache.nifi.annotation.lifecycle.OnScheduled) KerberosCredentialsService(org.apache.nifi.kerberos.KerberosCredentialsService) HiveOptions(org.apache.nifi.util.hive.HiveOptions) HiveWriter(org.apache.nifi.util.hive.HiveWriter) OnStopped(org.apache.nifi.annotation.lifecycle.OnStopped) Collections(java.util.Collections) InputStream(java.io.InputStream) RoutingResult(org.apache.nifi.processor.util.pattern.RoutingResult) ValidationResult(org.apache.nifi.components.ValidationResult) RoutingResult(org.apache.nifi.processor.util.pattern.RoutingResult) ExceptionHandler(org.apache.nifi.processor.util.pattern.ExceptionHandler) List(java.util.List) ArrayList(java.util.ArrayList) FlowFile(org.apache.nifi.flowfile.FlowFile) ComponentLog(org.apache.nifi.logging.ComponentLog) SerializationError(org.apache.hive.hcatalog.streaming.SerializationError) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Semaphore(java.util.concurrent.Semaphore) ErrorTypes(org.apache.nifi.processor.util.pattern.ErrorTypes) HiveEndPoint(org.apache.hive.hcatalog.streaming.HiveEndPoint) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) GenericRecord(org.apache.avro.generic.GenericRecord) HiveWriter(org.apache.nifi.util.hive.HiveWriter) InputStream(java.io.InputStream) ConnectionError(org.apache.hive.hcatalog.streaming.ConnectionError) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) StreamingException(org.apache.hive.hcatalog.streaming.StreamingException) DiscontinuedException(org.apache.nifi.processor.util.pattern.DiscontinuedException) AuthenticationFailedException(org.apache.nifi.util.hive.AuthenticationFailedException) ProcessException(org.apache.nifi.processor.exception.ProcessException) IOException(java.io.IOException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ProcessException(org.apache.nifi.processor.exception.ProcessException) KerberosCredentialsService(org.apache.nifi.kerberos.KerberosCredentialsService) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) DiscontinuedException(org.apache.nifi.processor.util.pattern.DiscontinuedException) HiveOptions(org.apache.nifi.util.hive.HiveOptions)

Example 2 with SerializationError

use of org.apache.hive.hcatalog.streaming.SerializationError in project nifi by apache.

the class HiveWriter method write.

/**
 * Write the record data to Hive
 *
 * @throws IOException if an error occurs during the write
 * @throws InterruptedException if the write operation is interrupted
 */
public synchronized void write(final byte[] record) throws WriteFailure, SerializationError, InterruptedException {
    if (closed) {
        throw new IllegalStateException("This hive streaming writer was closed " + "and thus no longer able to write : " + endPoint);
    }
    // write the tuple
    try {
        LOG.debug("Writing event to {}", endPoint);
        callWithTimeout(new CallRunner<Void>() {

            @Override
            public Void call() throws StreamingException, InterruptedException {
                txnBatch.write(record);
                totalRecords++;
                return null;
            }
        });
    } catch (SerializationError se) {
        throw new SerializationError(endPoint.toString() + " SerializationError", se);
    } catch (StreamingException | TimeoutException e) {
        throw new WriteFailure(endPoint, txnBatch.getCurrentTxnId(), e);
    }
}
Also used : StreamingException(org.apache.hive.hcatalog.streaming.StreamingException) SerializationError(org.apache.hive.hcatalog.streaming.SerializationError) TimeoutException(java.util.concurrent.TimeoutException)

Example 3 with SerializationError

use of org.apache.hive.hcatalog.streaming.SerializationError in project storm by apache.

the class HiveWriter method write.

/**
 * Write data.
 */
public synchronized void write(final byte[] record) throws WriteFailure, SerializationError, InterruptedException {
    if (closed) {
        throw new IllegalStateException("This hive streaming writer was closed " + "and thus no longer able to write : " + endPoint);
    }
    // write the tuple
    try {
        LOG.debug("Writing event to {}", endPoint);
        callWithTimeout(new CallRunner<Void>() {

            @Override
            public Void call() throws StreamingException, InterruptedException {
                txnBatch.write(record);
                totalRecords++;
                return null;
            }
        });
    } catch (SerializationError se) {
        throw new SerializationError(endPoint.toString() + " SerializationError", se);
    } catch (StreamingException e) {
        throw new WriteFailure(endPoint, txnBatch.getCurrentTxnId(), e);
    } catch (TimeoutException e) {
        throw new WriteFailure(endPoint, txnBatch.getCurrentTxnId(), e);
    }
}
Also used : StreamingException(org.apache.hive.hcatalog.streaming.StreamingException) SerializationError(org.apache.hive.hcatalog.streaming.SerializationError) TimeoutException(java.util.concurrent.TimeoutException)

Example 4 with SerializationError

use of org.apache.hive.hcatalog.streaming.SerializationError in project storm by apache.

the class HiveBolt method execute.

@Override
public void execute(Tuple tuple) {
    try {
        if (batchHelper.shouldHandle(tuple)) {
            List<String> partitionVals = options.getMapper().mapPartitions(tuple);
            HiveEndPoint endPoint = HiveUtils.makeEndPoint(partitionVals, options);
            HiveWriter writer = getOrCreateWriter(endPoint);
            writer.write(options.getMapper().mapRecord(tuple));
            batchHelper.addBatch(tuple);
        }
        if (batchHelper.shouldFlush()) {
            flushAllWriters(true);
            LOG.info("acknowledging tuples after writers flushed ");
            batchHelper.ack();
        }
        if (TupleUtils.isTick(tuple)) {
            retireIdleWriters();
        }
    } catch (SerializationError se) {
        LOG.info("Serialization exception occurred, tuple is acknowledged but not written to Hive.", tuple);
        this.collector.reportError(se);
        collector.ack(tuple);
    } catch (Exception e) {
        batchHelper.fail(e);
        abortAndCloseWriters();
    }
}
Also used : SerializationError(org.apache.hive.hcatalog.streaming.SerializationError) HiveWriter(org.apache.storm.hive.common.HiveWriter) HiveEndPoint(org.apache.hive.hcatalog.streaming.HiveEndPoint) StreamingException(org.apache.hive.hcatalog.streaming.StreamingException) IOException(java.io.IOException)

Aggregations

SerializationError (org.apache.hive.hcatalog.streaming.SerializationError)4 StreamingException (org.apache.hive.hcatalog.streaming.StreamingException)4 IOException (java.io.IOException)2 TimeoutException (java.util.concurrent.TimeoutException)2 HiveEndPoint (org.apache.hive.hcatalog.streaming.HiveEndPoint)2 ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 File (java.io.File)1 InputStream (java.io.InputStream)1 StandardCharsets (java.nio.charset.StandardCharsets)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Map (java.util.Map)1 Set (java.util.Set)1 Timer (java.util.Timer)1 TimerTask (java.util.TimerTask)1