use of org.apache.nifi.util.hive.HiveWriter in project nifi by apache.
the class PutHiveStreaming method cleanup.
/**
 * Releases the resources held by this processor once it has been stopped.
 *
 * <p>In order, this method: clears the cached validation resources so they are re-validated,
 * turns the heartbeat flag off, flushes and closes every writer registered in
 * {@code threadWriterList}, shuts down {@code callTimeoutPool} and waits for it to terminate,
 * and finally drops the reference held in {@code ugi}.</p>
 *
 * <p>Failures while closing an individual writer are logged and do not stop the remaining
 * writers from being closed. If the calling thread is interrupted at any point, its interrupt
 * status is restored so that the caller can still observe it.</p>
 */
@OnStopped
public void cleanup() {
    // trigger re-validation of resources
    validationResourceHolder.set(null);
    ComponentLog log = getLogger();
    sendHeartBeat.set(false);
    for (Map<HiveEndPoint, HiveWriter> allWriters : threadWriterList) {
        for (Map.Entry<HiveEndPoint, HiveWriter> entry : allWriters.entrySet()) {
            try {
                HiveWriter w = entry.getValue();
                w.flushAndClose();
            } catch (Exception ex) {
                log.warn("Error while closing writer to " + entry.getKey() + ". Exception follows.", ex);
                if (ex instanceof InterruptedException) {
                    Thread.currentThread().interrupt();
                }
            }
        }
        allWriters.clear();
    }
    if (callTimeoutPool != null) {
        callTimeoutPool.shutdown();
        try {
            // Keep waiting in callTimeout-sized slices until every submitted task has finished
            while (!callTimeoutPool.isTerminated()) {
                callTimeoutPool.awaitTermination(callTimeout, TimeUnit.MILLISECONDS);
            }
        } catch (Throwable t) {
            log.warn("shutdown interrupted on " + callTimeoutPool, t);
            // Same policy as the writer loop above: never swallow an interrupt silently
            if (t instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
        }
        callTimeoutPool = null;
    }
    ugi = null;
}
use of org.apache.nifi.util.hive.HiveWriter in project nifi by apache.
the class PutHiveStreaming method getOrCreateWriter.
/**
 * Returns the writer cached for {@code endPoint}, creating and caching a new one when the
 * cache has no entry for it.
 *
 * <p>A new writer is created before any eviction takes place. If the cache already holds
 * {@code maxOpenConnections} entries (or more), idle writers are retired first; when none
 * qualified as idle, the least recently used writer is retired instead.</p>
 *
 * @param writers  cache of open writers keyed by end point; updated in place
 * @param options  supplies the connection limit and idle timeout, and is passed on to writer creation
 * @param endPoint the Hive end point a writer is needed for
 * @return the cached or newly created writer
 * @throws HiveWriter.ConnectFailure if a new writer could not be created; logged before being rethrown
 * @throws InterruptedException      propagated unchanged from writer creation
 */
private HiveWriter getOrCreateWriter(Map<HiveEndPoint, HiveWriter> writers, HiveOptions options, HiveEndPoint endPoint) throws HiveWriter.ConnectFailure, InterruptedException {
    final ComponentLog logger = getLogger();
    try {
        final HiveWriter cached = writers.get(endPoint);
        if (cached != null) {
            return cached;
        }

        logger.debug("Creating Writer to Hive end point : " + endPoint);
        final HiveWriter created = makeHiveWriter(endPoint, callTimeoutPool, ugi, options);

        // Make room for the new writer if the cache is already at its limit
        final boolean cacheIsFull = writers.size() > (options.getMaxOpenConnections() - 1);
        if (cacheIsFull) {
            logger.info("cached HiveEndPoint size {} exceeded maxOpenConnections {} ", new Object[] { writers.size(), options.getMaxOpenConnections() });
            final int retiredCount = retireIdleWriters(writers, options.getIdleTimeout());
            if (retiredCount == 0) {
                retireEldestWriter(writers);
            }
        }

        writers.put(endPoint, created);
        HiveUtils.logAllHiveEndPoints(writers);
        return created;
    } catch (HiveWriter.ConnectFailure connectFailure) {
        logger.error("Failed to create HiveWriter for endpoint: " + endPoint, connectFailure);
        throw connectFailure;
    }
}
use of org.apache.nifi.util.hive.HiveWriter in project nifi by apache.
the class PutHiveStreaming method onTrigger.
/**
 * Streams the Avro records of a single incoming FlowFile into Hive.
 *
 * <p>Outline of the processing:</p>
 * <ol>
 *   <li>Take one FlowFile from the session; return immediately if there is none.</li>
 *   <li>Try to acquire the semaphore registered for {@code dbName.tableName} without waiting.
 *       If it cannot be acquired, the session is rolled back and the method returns.</li>
 *   <li>Build the {@link HiveOptions} from the processor properties (adding the Kerberos
 *       principal/keytab when security is enabled) and switch the thread's context class loader
 *       to this class's loader.</li>
 *   <li>Read the FlowFile as an Avro data file. For each record, extract the partition values,
 *       write the record to the writer for the matching end point, and flush the writer once
 *       it reports at least {@code RECORDS_PER_TXN} records.</li>
 *   <li>After the last record, flush and close all writers and append the remaining successful
 *       records to the success FlowFile.</li>
 *   <li>In all cases: unregister this invocation's writer map, transfer the FlowFiles, restore
 *       the original context class loader and release the semaphore.</li>
 * </ol>
 *
 * @param context         supplies the processor property values
 * @param session         session used to read the input and to create and route output FlowFiles
 * @param functionContext per-invocation state (output FlowFiles, Avro writers, record count)
 * @throws ProcessException if the input cannot be read as Avro and the adjusted error destination
 *                          is {@code ProcessException}, or if a writer transaction cannot be aborted
 */
private void onTrigger(ProcessContext context, ProcessSession session, FunctionContext functionContext) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final String dbName = context.getProperty(DB_NAME).evaluateAttributeExpressions(flowFile).getValue();
    final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
    // Only allow one thread to work on a DB/table at a time
    final Semaphore newSemaphore = new Semaphore(1);
    Semaphore semaphore = tableSemaphoreMap.putIfAbsent(dbName + "." + tableName, newSemaphore);
    if (semaphore == null) {
        semaphore = newSemaphore;
    }
    boolean gotSemaphore = false;
    try {
        gotSemaphore = semaphore.tryAcquire(0, TimeUnit.SECONDS);
    } catch (InterruptedException ie) {
        // gotSemaphore stays false, so the session is rolled back below.
        // tryAcquire cleared the interrupt status when it threw; restore it for the caller.
        Thread.currentThread().interrupt();
    }
    if (!gotSemaphore) {
        // We didn't get a chance to acquire, so rollback the session and try again next time
        session.rollback();
        return;
    }
    // NOTE(review): the semaphore is held from here on, but it is only released in the finally
    // block of the try statement further down. An exception thrown by any statement in between
    // (property evaluation, session.create, ...) would skip the release, the class loader
    // restore and the threadWriterList cleanup. Confirm whether that can happen in practice.
    final ComponentLog log = getLogger();
    final String metastoreUri = context.getProperty(METASTORE_URI).evaluateAttributeExpressions(flowFile).getValue();
    final boolean autoCreatePartitions = context.getProperty(AUTOCREATE_PARTITIONS).asBoolean();
    final Integer maxConnections = context.getProperty(MAX_OPEN_CONNECTIONS).asInteger();
    final Integer heartbeatInterval = context.getProperty(HEARTBEAT_INTERVAL).evaluateAttributeExpressions().asInteger();
    final Integer txnsPerBatch = context.getProperty(TXNS_PER_BATCH).evaluateAttributeExpressions(flowFile).asInteger();
    final Integer recordsPerTxn = context.getProperty(RECORDS_PER_TXN).evaluateAttributeExpressions(flowFile).asInteger();
    // Writers opened by this invocation; registered so that cleanup() can close them as well
    final Map<HiveEndPoint, HiveWriter> myWriters = new ConcurrentHashMap<>();
    threadWriterList.add(myWriters);
    HiveOptions o = new HiveOptions(metastoreUri, dbName, tableName)
            .withTxnsPerBatch(txnsPerBatch)
            .withAutoCreatePartitions(autoCreatePartitions)
            .withMaxOpenConnections(maxConnections)
            .withHeartBeatInterval(heartbeatInterval)
            .withCallTimeout(callTimeout);
    if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
        final String explicitPrincipal = context.getProperty(kerberosProperties.getKerberosPrincipal()).evaluateAttributeExpressions().getValue();
        final String explicitKeytab = context.getProperty(kerberosProperties.getKerberosKeytab()).evaluateAttributeExpressions().getValue();
        final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
        final String resolvedPrincipal;
        final String resolvedKeytab;
        // A configured credentials service takes precedence over the explicit properties
        if (credentialsService == null) {
            resolvedPrincipal = explicitPrincipal;
            resolvedKeytab = explicitKeytab;
        } else {
            resolvedPrincipal = credentialsService.getPrincipal();
            resolvedKeytab = credentialsService.getKeytab();
        }
        o = o.withKerberosPrincipal(resolvedPrincipal).withKerberosKeytab(resolvedKeytab);
    }
    // Effectively-final copy for use inside the lambdas below
    final HiveOptions options = o;
    // Store the original class loader, then explicitly set it to this class's classloader (for use by the Hive Metastore)
    ClassLoader originalClassloader = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
    final List<String> partitionColumnList;
    final String partitionColumns = context.getProperty(PARTITION_COLUMNS).evaluateAttributeExpressions().getValue();
    if (partitionColumns == null || partitionColumns.isEmpty()) {
        partitionColumnList = Collections.emptyList();
    } else {
        String[] partitionCols = partitionColumns.split(",");
        partitionColumnList = new ArrayList<>(partitionCols.length);
        for (String col : partitionCols) {
            partitionColumnList.add(col.trim());
        }
    }
    final AtomicReference<List<HiveStreamingRecord>> successfulRecords = new AtomicReference<>();
    successfulRecords.set(new ArrayList<>());
    final FlowFile inputFlowFile = flowFile;
    final RoutingResult result = new RoutingResult();
    final ExceptionHandler<FunctionContext> exceptionHandler = new ExceptionHandler<>();
    // Classify an exception by rethrowing it and letting the catch clauses pick the error type
    exceptionHandler.mapException(s -> {
        try {
            if (s == null) {
                return ErrorTypes.PersistentFailure;
            }
            throw s;
        } catch (IllegalArgumentException | HiveWriter.WriteFailure | SerializationError inputError) {
            return ErrorTypes.InvalidInput;
        } catch (HiveWriter.CommitFailure | HiveWriter.TxnBatchFailure | HiveWriter.TxnFailure writerTxError) {
            return ErrorTypes.TemporalInputFailure;
        } catch (ConnectionError | HiveWriter.ConnectFailure connectionError) {
            // Can't connect to Hive endpoint.
            log.error("Error connecting to Hive endpoint: table {} at {}", new Object[] { options.getTableName(), options.getMetaStoreURI() });
            return ErrorTypes.TemporalFailure;
        } catch (IOException | InterruptedException tempError) {
            return ErrorTypes.TemporalFailure;
        } catch (Exception t) {
            return ErrorTypes.UnknownFailure;
        }
    });
    final BiFunction<FunctionContext, ErrorTypes, ErrorTypes.Result> adjustError = RollbackOnFailure.createAdjustError(getLogger());
    exceptionHandler.adjustError(adjustError);
    // Create output flow files and their Avro writers
    functionContext.setFlowFiles(session.create(inputFlowFile), session.create(inputFlowFile));
    try {
        session.read(inputFlowFile, new InputStreamCallback() {
            @Override
            public void process(InputStream in) throws IOException {
                try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
                    GenericRecord currRecord = null;
                    // Copy codec and schema information to all writers
                    final String codec = reader.getMetaString(DataFileConstants.CODEC) == null ? DataFileConstants.NULL_CODEC : reader.getMetaString(DataFileConstants.CODEC);
                    functionContext.initAvroWriters(session, codec, reader);
                    Runnable flushSuccessfulRecords = () -> {
                        // Now send the records to the successful FlowFile and update the success count
                        functionContext.appendRecordsToSuccess(session, successfulRecords.get());
                        // Clear the list of successful records, we'll use it at the end when we flush whatever records are left
                        successfulRecords.set(new ArrayList<>());
                    };
                    while (reader.hasNext()) {
                        // We can NOT reuse currRecord here, because currRecord is accumulated in successful records.
                        // If we use the same GenericRecord instance, every record ends up having the same contents.
                        // To avoid this, we need to create a brand new GenericRecord instance here each time.
                        currRecord = reader.next();
                        functionContext.recordCount.incrementAndGet();
                        // Extract the partition values (they must be put separately into the Hive Streaming API)
                        List<String> partitionValues = new ArrayList<>();
                        if (!exceptionHandler.execute(functionContext, currRecord, input -> {
                            for (String partition : partitionColumnList) {
                                Object partitionValue = input.get(partition);
                                if (partitionValue == null) {
                                    throw new IllegalArgumentException("Partition column '" + partition + "' not found in Avro record");
                                }
                                partitionValues.add(partitionValue.toString());
                            }
                        }, onRecordError(context, session, myWriters))) {
                            continue;
                        }
                        final HiveStreamingRecord record = new HiveStreamingRecord(partitionValues, currRecord);
                        // Carries the writer chosen inside the lambda out to the flush check below
                        final AtomicReference<HiveWriter> hiveWriterRef = new AtomicReference<>();
                        // Write record to Hive streaming
                        if (!exceptionHandler.execute(functionContext, record, input -> {
                            final HiveEndPoint endPoint = makeHiveEndPoint(record.getPartitionValues(), options);
                            final HiveWriter hiveWriter = getOrCreateWriter(myWriters, options, endPoint);
                            hiveWriterRef.set(hiveWriter);
                            hiveWriter.write(record.getRecord().toString().getBytes(StandardCharsets.UTF_8));
                            successfulRecords.get().add(record);
                        }, onHiveRecordError(context, session, myWriters))) {
                            continue;
                        }
                        // If we've reached the records-per-transaction limit, flush the Hive Writer and update the Avro Writer for successful records
                        final HiveWriter hiveWriter = hiveWriterRef.get();
                        if (hiveWriter.getTotalRecords() >= recordsPerTxn) {
                            exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
                                hiveWriter.flush(true);
                                // Proceed function context. Process session can't be rollback anymore.
                                functionContext.proceed();
                                // Now send the records to the success relationship and update the success count
                                flushSuccessfulRecords.run();
                            }, onHiveRecordsError(context, session, myWriters).andThen((fc, input, res, commitException) -> {
                                // Reset hiveWriter for succeeding records.
                                switch (res.destination()) {
                                    case Retry:
                                    case Failure:
                                        try {
                                            // Abort current tx and move to next.
                                            hiveWriter.abort();
                                        } catch (Exception e) {
                                            // Can't even abort properly, throw a process exception
                                            throw new ProcessException(e);
                                        }
                                }
                            }));
                        }
                    }
                    exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
                        // Finish any transactions
                        flushAllWriters(myWriters, true);
                        closeAllWriters(myWriters);
                        // Now send any remaining records to the success relationship and update the count
                        flushSuccessfulRecords.run();
                        // Append successfulRecords on failure.
                    }, onHiveRecordsError(context, session, myWriters));
                } catch (IOException ioe) {
                    // The Avro file is invalid (or may not be an Avro file at all), send it to failure
                    final ErrorTypes.Result adjusted = adjustError.apply(functionContext, ErrorTypes.InvalidInput);
                    final String msg = "The incoming flow file can not be read as an Avro file";
                    switch (adjusted.destination()) {
                        case Failure:
                            log.error(msg, ioe);
                            result.routeTo(inputFlowFile, REL_FAILURE);
                            break;
                        case ProcessException:
                            throw new ProcessException(msg, ioe);
                    }
                }
            }
        });
        // If we got here, we've processed the outgoing flow files correctly, so remove the incoming one if necessary
        if (result.getRoutedFlowFiles().values().stream().noneMatch(routed -> routed.contains(inputFlowFile))) {
            session.remove(inputFlowFile);
        }
    } catch (DiscontinuedException e) {
        // The input FlowFile processing is discontinued. Keep it in the input queue.
        getLogger().warn("Discontinued processing for {} due to {}", new Object[] { flowFile, e }, e);
        result.routeTo(flowFile, Relationship.SELF);
    } catch (ShouldRetryException e) {
        // This exception is already a result of adjusting an error, so simply transfer the FlowFile to retry.
        getLogger().error(e.getMessage(), e);
        flowFile = session.penalize(flowFile);
        result.routeTo(flowFile, REL_RETRY);
    } finally {
        threadWriterList.remove(myWriters);
        functionContext.transferFlowFiles(session, result, options);
        // Restore original class loader, might not be necessary but is good practice since the processor task changed it
        Thread.currentThread().setContextClassLoader(originalClassloader);
        semaphore.release();
    }
}
use of org.apache.nifi.util.hive.HiveWriter in project nifi by apache.
the class PutHiveStreaming method retireIdleWriters.
/**
 * Locate all writers past idle timeout and retire them.
 *
 * <p>A writer is a retirement candidate when the time elapsed since its last use, measured
 * against {@link System#currentTimeMillis()}, is greater than {@code idleTimeout}. Each
 * candidate is removed from {@code writers} and then flushed and closed. A failure to close
 * one writer is logged and does not prevent the others from being retired.</p>
 *
 * @param writers     cache of open writers keyed by end point; candidates are removed from it
 * @param idleTimeout maximum allowed time since last use, in the same unit as
 *                    {@code System.currentTimeMillis()}
 * @return number of writers selected for retirement (including any whose close attempt failed)
 */
private int retireIdleWriters(Map<HiveEndPoint, HiveWriter> writers, int idleTimeout) {
    ComponentLog log = getLogger();
    log.info("Attempting to close idle HiveWriters");
    int count = 0;
    long now = System.currentTimeMillis();
    List<HiveEndPoint> retirees = new ArrayList<>();
    // 1) Find retirement candidates
    for (Map.Entry<HiveEndPoint, HiveWriter> entry : writers.entrySet()) {
        if (now - entry.getValue().getLastUsed() > idleTimeout) {
            ++count;
            retirees.add(entry.getKey());
        }
    }
    // 2) Retire them
    for (HiveEndPoint ep : retirees) {
        try {
            log.info("Closing idle Writer to Hive end point : {}", new Object[] { ep });
            HiveWriter retired = writers.remove(ep);
            // The entry may already be gone if the map was modified since step 1
            if (retired != null) {
                retired.flushAndClose();
            }
        } catch (InterruptedException e) {
            log.warn("Interrupted when attempting to close HiveWriter for end point: " + ep, e);
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            // Covers IOException and any other failure; these are not interruptions
            log.warn("Failed to close HiveWriter for end point: " + ep, e);
        }
    }
    return count;
}
use of org.apache.nifi.util.hive.HiveWriter in project nifi by apache.
the class PutHiveStreaming method retireEldestWriter.
/**
 * Locate writer that has not been used for longest time and retire it.
 *
 * <p>The writer with the smallest last-used timestamp is removed from {@code writers} and then
 * flushed and closed. When {@code writers} is empty nothing is done. A failure to close the
 * writer is logged; the writer stays removed from the map.</p>
 *
 * @param writers cache of open writers keyed by end point; the eldest entry is removed from it
 */
private void retireEldestWriter(Map<HiveEndPoint, HiveWriter> writers) {
    ComponentLog log = getLogger();
    log.info("Attempting close eldest writers");
    // Start from "no candidate" instead of the current time, so that a writer last used in
    // this very millisecond can still be selected
    long oldestTimeStamp = Long.MAX_VALUE;
    HiveEndPoint eldest = null;
    for (Map.Entry<HiveEndPoint, HiveWriter> entry : writers.entrySet()) {
        long lastUsed = entry.getValue().getLastUsed();
        if (eldest == null || lastUsed < oldestTimeStamp) {
            eldest = entry.getKey();
            oldestTimeStamp = lastUsed;
        }
    }
    if (eldest == null) {
        // Nothing is cached, so there is nothing to retire
        return;
    }
    try {
        log.info("Closing least used Writer to Hive end point : " + eldest);
        HiveWriter retired = writers.remove(eldest);
        // The entry may already be gone if the map was modified since the scan above
        if (retired != null) {
            retired.flushAndClose();
        }
    } catch (InterruptedException e) {
        log.warn("Interrupted when attempting to close writer for end point: " + eldest, e);
        Thread.currentThread().interrupt();
    } catch (Exception e) {
        // Covers IOException and any other failure; these are not interruptions
        log.warn("Failed to close writer for end point: " + eldest, e);
    }
}
Aggregations