Use of org.apache.nifi.util.hive.HiveWriter in the Apache NiFi project.
Class PutHiveStreaming, method cleanup.
/**
 * Cleans up the Hive writers and the call-timeout pool used by this processor.
 *
 * <p>Visits every writer in every map of {@code threadWriterList}, waits for
 * {@code callTimeoutPool} to report termination, and clears the
 * {@code callTimeoutPool} and {@code ugi} references.
 *
 * <p>NOTE(review): this listing appears to have lost lines (closing braces and the
 * statements that use {@code w}); confirm the exact nesting against the full source file.
 */
public void cleanup() {
// trigger re-validation of resources
ComponentLog log = getLogger();
// Outer loop: one writer map per entry of threadWriterList; inner loop: each end point / writer pair.
for (Map<HiveEndPoint, HiveWriter> allWriters : threadWriterList) {
for (Map.Entry<HiveEndPoint, HiveWriter> entry : allWriters.entrySet()) {
try {
// NOTE(review): w is fetched but no call on it is visible here; the warning below
// suggests the writer is closed at this point - confirm.
HiveWriter w = entry.getValue();
} catch (Exception ex) {
// A failure for one writer is logged with its end point.
log.warn("Error while closing writer to " + entry.getKey() + ". Exception follows.", ex);
// Interruption is singled out from other failures; how it is handled is not visible in this listing.
if (ex instanceof InterruptedException) {
if (callTimeoutPool != null) {
try {
// Keep waiting, callTimeout milliseconds at a time, until the pool reports that it has terminated.
while (!callTimeoutPool.isTerminated()) {
callTimeoutPool.awaitTermination(callTimeout, TimeUnit.MILLISECONDS);
} catch (Throwable t) {
log.warn("shutdown interrupted on " + callTimeoutPool, t);
// Drop the references to the pool and to ugi.
callTimeoutPool = null;
ugi = null;
Use of org.apache.nifi.util.hive.HiveWriter in the Apache NiFi project.
Class PutHiveStreaming, method getOrCreateWriter.
/**
 * Returns the writer cached for {@code endPoint}, creating and caching a new one when none exists.
 *
 * <p>When the cache already holds at least {@code options.getMaxOpenConnections()} writers,
 * {@code retireIdleWriters} is called before the new writer is stored.
 *
 * @param writers cache of writers keyed by end point; read and updated by this method
 * @param options supplies the max-open-connections limit and the idle timeout, and is passed to {@code makeHiveWriter}
 * @param endPoint end point whose writer is requested
 * @return the cached or newly created writer
 * @throws HiveWriter.ConnectFailure logged at error level and rethrown
 * @throws InterruptedException declared by the signature; the statement that raises it is not visible in this listing
 */
private HiveWriter getOrCreateWriter(Map<HiveEndPoint, HiveWriter> writers, HiveOptions options, HiveEndPoint endPoint) throws HiveWriter.ConnectFailure, InterruptedException {
ComponentLog log = getLogger();
try {
HiveWriter writer = writers.get(endPoint);
// Cache miss: build a new writer for this end point.
if (writer == null) {
log.debug("Creating Writer to Hive end point : " + endPoint);
writer = makeHiveWriter(endPoint, callTimeoutPool, ugi, options);
// size > max - 1 is the same as size >= max: the cache is full before the new writer is added.
if (writers.size() > (options.getMaxOpenConnections() - 1)) {"cached HiveEndPoint size {} exceeded maxOpenConnections {} ", new Object[] { writers.size(), options.getMaxOpenConnections() });
// First try to make room by retiring writers that have been idle longer than the configured timeout.
int retired = retireIdleWriters(writers, options.getIdleTimeout());
// NOTE(review): the action taken when nothing was idle is not visible in this listing.
if (retired == 0) {
writers.put(endPoint, writer);
return writer;
} catch (HiveWriter.ConnectFailure e) {
log.error("Failed to create HiveWriter for endpoint: " + endPoint, e);
throw e;
Use of org.apache.nifi.util.hive.HiveWriter in the Apache NiFi project.
Class PutHiveStreaming, method onTrigger.
/**
 * Processes one incoming FlowFile: reads it as an Avro data file and streams its records to Hive.
 *
 * <p>Steps visible in this listing: resolve the database/table names and connection settings from
 * processor properties, guard the database/table pair with a semaphore, build the {@code HiveOptions}
 * (adding Kerberos principal and keytab when security is enabled), then read the records one by one,
 * extract their partition values, obtain a {@code HiveWriter} for each record's end point, and finally
 * hand the routing result to {@code functionContext.transferFlowFiles}.
 *
 * <p>NOTE(review): this listing appears to have lost lines (closing braces and several statements);
 * confirm the exact nesting and the missing calls against the full source file.
 *
 * @param context source of the processor property values
 * @param session used to fetch the incoming FlowFile and to create the output FlowFiles
 * @param functionContext receives the output FlowFiles, the Avro writers and the final transfer call
 * @throws ProcessException when a transaction cannot be aborted, or when an unreadable input is mapped
 *         to the {@code ProcessException} destination
 */
private void onTrigger(ProcessContext context, ProcessSession session, FunctionContext functionContext) throws ProcessException {
FlowFile flowFile = session.get();
// NOTE(review): the body of this null check is not visible in this listing.
if (flowFile == null) {
final String dbName = context.getProperty(DB_NAME).evaluateAttributeExpressions(flowFile).getValue();
final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
// Only allow one thread to work on a DB/table at a time
final Semaphore newSemaphore = new Semaphore(1);
// A null return means no semaphore was registered under "db.table" yet, so the new one is used.
Semaphore semaphore = tableSemaphoreMap.putIfAbsent(dbName + "." + tableName, newSemaphore);
if (semaphore == null) {
semaphore = newSemaphore;
boolean gotSemaphore = false;
try {
// Zero timeout: do not wait when the permit is held elsewhere.
gotSemaphore = semaphore.tryAcquire(0, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
// Nothing to do, gotSemaphore defaults to false
if (!gotSemaphore) {
// We didn't get a chance to acquire, so rollback the session and try again next time
final ComponentLog log = getLogger();
// Connection and batching settings read from the processor properties.
final String metastoreUri = context.getProperty(METASTORE_URI).evaluateAttributeExpressions(flowFile).getValue();
final boolean autoCreatePartitions = context.getProperty(AUTOCREATE_PARTITIONS).asBoolean();
final Integer maxConnections = context.getProperty(MAX_OPEN_CONNECTIONS).asInteger();
final Integer heartbeatInterval = context.getProperty(HEARTBEAT_INTERVAL).evaluateAttributeExpressions().asInteger();
final Integer txnsPerBatch = context.getProperty(TXNS_PER_BATCH).evaluateAttributeExpressions(flowFile).asInteger();
final Integer recordsPerTxn = context.getProperty(RECORDS_PER_TXN).evaluateAttributeExpressions(flowFile).asInteger();
// Writer cache created for this invocation, keyed by end point.
final Map<HiveEndPoint, HiveWriter> myWriters = new ConcurrentHashMap<>();
HiveOptions o = new HiveOptions(metastoreUri, dbName, tableName).withTxnsPerBatch(txnsPerBatch).withAutoCreatePartitions(autoCreatePartitions).withMaxOpenConnections(maxConnections).withHeartBeatInterval(heartbeatInterval).withCallTimeout(callTimeout);
// Kerberos settings are only resolved when security is enabled in hiveConfig.
if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
final String explicitPrincipal = context.getProperty(kerberosProperties.getKerberosPrincipal()).evaluateAttributeExpressions().getValue();
final String explicitKeytab = context.getProperty(kerberosProperties.getKerberosKeytab()).evaluateAttributeExpressions().getValue();
final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
final String resolvedPrincipal;
final String resolvedKeytab;
// The explicit principal/keytab properties are used only when no credentials service is configured.
if (credentialsService == null) {
resolvedPrincipal = explicitPrincipal;
resolvedKeytab = explicitKeytab;
} else {
resolvedPrincipal = credentialsService.getPrincipal();
resolvedKeytab = credentialsService.getKeytab();
o = o.withKerberosPrincipal(resolvedPrincipal).withKerberosKeytab(resolvedKeytab);
// Final alias so the options can be referenced from the lambdas and the anonymous class below.
final HiveOptions options = o;
// Store the original class loader, then explicitly set it to this class's classloader (for use by the Hive Metastore)
ClassLoader originalClassloader = Thread.currentThread().getContextClassLoader();
final List<String> partitionColumnList;
final String partitionColumns = context.getProperty(PARTITION_COLUMNS).evaluateAttributeExpressions().getValue();
// No partition columns configured: use an empty list; otherwise split the comma-separated value.
if (partitionColumns == null || partitionColumns.isEmpty()) {
partitionColumnList = Collections.emptyList();
} else {
String[] partitionCols = partitionColumns.split(",");
partitionColumnList = new ArrayList<>(partitionCols.length);
// NOTE(review): the body of this loop is not visible in this listing.
for (String col : partitionCols) {
// Held in an AtomicReference so the list can be replaced from inside the lambdas below.
final AtomicReference<List<HiveStreamingRecord>> successfulRecords = new AtomicReference<>();
successfulRecords.set(new ArrayList<>());
final FlowFile inputFlowFile = flowFile;
final RoutingResult result = new RoutingResult();
final ExceptionHandler<FunctionContext> exceptionHandler = new ExceptionHandler<>();
// Maps an exception to an ErrorTypes value by rethrowing it and catching it by type.
exceptionHandler.mapException(s -> {
try {
if (s == null) {
return ErrorTypes.PersistentFailure;
throw s;
} catch (IllegalArgumentException | HiveWriter.WriteFailure | SerializationError inputError) {
return ErrorTypes.InvalidInput;
} catch (HiveWriter.CommitFailure | HiveWriter.TxnBatchFailure | HiveWriter.TxnFailure writerTxError) {
return ErrorTypes.TemporalInputFailure;
} catch (ConnectionError | HiveWriter.ConnectFailure connectionError) {
// Can't connect to Hive endpoint.
log.error("Error connecting to Hive endpoint: table {} at {}", new Object[] { options.getTableName(), options.getMetaStoreURI() });
return ErrorTypes.TemporalFailure;
} catch (IOException | InterruptedException tempError) {
return ErrorTypes.TemporalFailure;
} catch (Exception t) {
return ErrorTypes.UnknownFailure;
final BiFunction<FunctionContext, ErrorTypes, ErrorTypes.Result> adjustError = RollbackOnFailure.createAdjustError(getLogger());
// Create output flow files and their Avro writers
functionContext.setFlowFiles(session.create(inputFlowFile), session.create(inputFlowFile));
// NOTE(review): the call that receives this InputStreamCallback is not visible in this listing.
try {, new InputStreamCallback() {
public void process(InputStream in) throws IOException {
// The reader is opened in try-with-resources, so it is closed when the block exits.
try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
GenericRecord currRecord = null;
// Copy codec and schema information to all writers
final String codec = reader.getMetaString(DataFileConstants.CODEC) == null ? DataFileConstants.NULL_CODEC : reader.getMetaString(DataFileConstants.CODEC);
functionContext.initAvroWriters(session, codec, reader);
Runnable flushSuccessfulRecords = () -> {
// Now send the records to the successful FlowFile and update the success count
functionContext.appendRecordsToSuccess(session, successfulRecords.get());
// Clear the list of successful records, we'll use it at the end when we flush whatever records are left
successfulRecords.set(new ArrayList<>());
while (reader.hasNext()) {
// We can NOT reuse currRecord here, because currRecord is accumulated in successful records.
// If we use the same GenericRecord instance, every record ends up having the same contents.
// To avoid this, we need to create a brand new GenericRecord instance here each time.
// NOTE(review): the right-hand side of this assignment is not visible in this listing.
currRecord =;
// Extract the partition values (they must be put separately into the Hive Streaming API)
List<String> partitionValues = new ArrayList<>();
// Partition extraction runs through the exception handler; a null value for a partition column
// raises IllegalArgumentException.
if (!exceptionHandler.execute(functionContext, currRecord, input -> {
for (String partition : partitionColumnList) {
Object partitionValue = input.get(partition);
if (partitionValue == null) {
throw new IllegalArgumentException("Partition column '" + partition + "' not found in Avro record");
}, onRecordError(context, session, myWriters))) {
final HiveStreamingRecord record = new HiveStreamingRecord(partitionValues, currRecord);
final AtomicReference<HiveWriter> hiveWriterRef = new AtomicReference<>();
// Write record to Hive streaming
if (!exceptionHandler.execute(functionContext, record, input -> {
final HiveEndPoint endPoint = makeHiveEndPoint(record.getPartitionValues(), options);
// NOTE(review): the write call and the assignment to hiveWriterRef are not visible in this listing.
final HiveWriter hiveWriter = getOrCreateWriter(myWriters, options, endPoint);
}, onHiveRecordError(context, session, myWriters))) {
// If we've reached the records-per-transaction limit, flush the Hive Writer and update the Avro Writer for successful records
final HiveWriter hiveWriter = hiveWriterRef.get();
if (hiveWriter.getTotalRecords() >= recordsPerTxn) {
exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
// Proceed function context. Process session can't be rollback anymore.
// Now send the records to the success relationship and update the success count;
}, onHiveRecordsError(context, session, myWriters).andThen((fc, input, res, commitException) -> {
// Reset hiveWriter for succeeding records.
switch(res.destination()) {
case Retry:
case Failure:
try {
// Abort current tx and move to next.
} catch (Exception e) {
// Can't even abort properly, throw a process exception
throw new ProcessException(e);
// After the last record: flush every writer and handle the records still pending.
exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
// Finish any transactions
flushAllWriters(myWriters, true);
// Now send any remaining records to the success relationship and update the count;
// Append successfulRecords on failure.
}, onHiveRecordsError(context, session, myWriters));
} catch (IOException ioe) {
// The Avro file is invalid (or may not be an Avro file at all), send it to failure
final ErrorTypes.Result adjusted = adjustError.apply(functionContext, ErrorTypes.InvalidInput);
final String msg = "The incoming flow file can not be read as an Avro file";
switch(adjusted.destination()) {
case Failure:
log.error(msg, ioe);
result.routeTo(inputFlowFile, REL_FAILURE);
case ProcessException:
throw new ProcessException(msg, ioe);
// If we got here, we've processed the outgoing flow files correctly, so remove the incoming one if necessary
if (result.getRoutedFlowFiles().values().stream().noneMatch(routed -> routed.contains(inputFlowFile))) {
} catch (DiscontinuedException e) {
// The input FlowFile processing is discontinued. Keep it in the input queue.
getLogger().warn("Discontinued processing for {} due to {}", new Object[] { flowFile, e }, e);
result.routeTo(flowFile, Relationship.SELF);
} catch (ShouldRetryException e) {
// This exception is already a result of adjusting an error, so simply transfer the FlowFile to retry.
getLogger().error(e.getMessage(), e);
flowFile = session.penalize(flowFile);
result.routeTo(flowFile, REL_RETRY);
} finally {
// Runs on every exit path of the try block above.
functionContext.transferFlowFiles(session, result, options);
// Restore original class loader, might not be necessary but is good practice since the processor task changed it
Use of org.apache.nifi.util.hive.HiveWriter in the Apache NiFi project.
Class PutHiveStreaming, method retireIdleWriters.
/**
 * Locate all writers past idle timeout and retire them.
 *
 * @return number of writers retired
 */
private int retireIdleWriters(Map<HiveEndPoint, HiveWriter> writers, int idleTimeout) {
// writers is the cache to scan; idleTimeout is compared against a difference of
// System.currentTimeMillis() and getLastUsed() values.
// NOTE(review): this listing appears to have lost lines (closing braces, the logging call
// prefixes, and the statements that update count and retirees); confirm against the full source.
ComponentLog log = getLogger();"Attempting to close idle HiveWriters");
int count = 0;
long now = System.currentTimeMillis();
// End points are collected first and retired in a second loop.
ArrayList<HiveEndPoint> retirees = new ArrayList<>();
// 1) Find retirement candidates
for (Map.Entry<HiveEndPoint, HiveWriter> entry : writers.entrySet()) {
// A writer is a candidate when the time since getLastUsed() exceeds idleTimeout.
if (now - entry.getValue().getLastUsed() > idleTimeout) {
// 2) Retire them
for (HiveEndPoint ep : retirees) {
try {"Closing idle Writer to Hive end point : {}", new Object[] { ep });
} catch (IOException e) {
// NOTE(review): the "{}" placeholder is combined with string concatenation, so ep is appended
// after "Error: " instead of being passed as an argument for the placeholder.
log.warn("Failed to close HiveWriter for end point: {}. Error: " + ep, e);
} catch (InterruptedException e) {
log.warn("Interrupted when attempting to close HiveWriter for end point: " + ep, e);
} catch (Exception e) {
// NOTE(review): this handler catches any Exception but reuses the "Interrupted" message.
log.warn("Interrupted when attempting to close HiveWriter for end point: " + ep, e);
return count;
Use of org.apache.nifi.util.hive.HiveWriter in the Apache NiFi project.
Class PutHiveStreaming, method retireEldestWriter.
/**
 * Locate writer that has not been used for longest time and retire it.
 */
private void retireEldestWriter(Map<HiveEndPoint, HiveWriter> writers) {
// Scans writers for the entry with the smallest getLastUsed() value. The log messages indicate
// that this writer is then closed; the close call itself is not visible in this listing.
// NOTE(review): this listing appears to have lost lines (closing braces and the logging call
// prefixes); confirm against the full source.
ComponentLog log = getLogger();"Attempting close eldest writers");
// Starting from the current time means only a writer last used before now can be selected.
long oldestTimeStamp = System.currentTimeMillis();
// Stays null when no writer has a getLastUsed() value below the starting timestamp.
HiveEndPoint eldest = null;
for (Map.Entry<HiveEndPoint, HiveWriter> entry : writers.entrySet()) {
// Track the entry with the smallest last-used timestamp seen so far.
if (entry.getValue().getLastUsed() < oldestTimeStamp) {
eldest = entry.getKey();
oldestTimeStamp = entry.getValue().getLastUsed();
try {"Closing least used Writer to Hive end point : " + eldest);
} catch (IOException e) {
log.warn("Failed to close writer for end point: " + eldest, e);
} catch (InterruptedException e) {
log.warn("Interrupted when attempting to close writer for end point: " + eldest, e);
} catch (Exception e) {
// NOTE(review): this handler catches any Exception but reuses the "Interrupted" message.
log.warn("Interrupted when attempting to close writer for end point: " + eldest, e);