Use of org.apache.hive.hcatalog.streaming.SerializationError in the Apache NiFi project.
Class PutHiveStreaming, method onTrigger.
/**
 * Streams the Avro records of one incoming FlowFile into a Hive table through the Hive Streaming API.
 *
 * <p>Outline of the processing:
 * <ol>
 *   <li>Take one FlowFile from the session and resolve the database/table names from its attributes.</li>
 *   <li>Try (without waiting) to acquire the single-permit semaphore registered for that "db.table" key;
 *       if it is not available, roll the session back and return so the FlowFile is retried later.</li>
 *   <li>Build the {@link HiveOptions} (including Kerberos principal/keytab when security is enabled) and
 *       switch the thread context classloader to this class's classloader.</li>
 *   <li>Read the FlowFile as an Avro data file; for each record extract the partition values, write the
 *       record to the matching {@link HiveWriter}, and flush once the writer has reached the configured
 *       records-per-transaction count.</li>
 *   <li>Flush and close all writers, then route the FlowFiles according to the accumulated
 *       {@link RoutingResult}.</li>
 * </ol>
 *
 * <p>The semaphore release and the classloader restore are performed in an outer {@code finally} that covers
 * everything executed after the semaphore was acquired, so a failure while reading properties or creating the
 * output FlowFiles cannot leave the table permanently locked.
 *
 * @param context         the process context used to read the processor properties
 * @param session         the session providing the input FlowFile and receiving the output FlowFiles
 * @param functionContext per-invocation state (output FlowFiles, Avro writers, record count)
 * @throws ProcessException if a Hive transaction cannot be aborted after a failed flush, or if the adjusted
 *                          error handling for an unreadable Avro file asks for a ProcessException
 */
private void onTrigger(ProcessContext context, ProcessSession session, FunctionContext functionContext) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final String dbName = context.getProperty(DB_NAME).evaluateAttributeExpressions(flowFile).getValue();
    final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
    // Only allow one thread to work on a DB/table at a time
    final Semaphore newSemaphore = new Semaphore(1);
    Semaphore semaphore = tableSemaphoreMap.putIfAbsent(dbName + "." + tableName, newSemaphore);
    if (semaphore == null) {
        semaphore = newSemaphore;
    }
    boolean gotSemaphore = false;
    try {
        gotSemaphore = semaphore.tryAcquire(0, TimeUnit.SECONDS);
    } catch (InterruptedException ie) {
        // gotSemaphore stays false; restore the interrupt status so callers can still observe it
        Thread.currentThread().interrupt();
    }
    if (!gotSemaphore) {
        // We didn't get a chance to acquire, so rollback the session and try again next time
        session.rollback();
        return;
    }
    // Store the original class loader up front so the outer finally can always restore it
    final ClassLoader originalClassloader = Thread.currentThread().getContextClassLoader();
    try {
        final ComponentLog log = getLogger();
        final String metastoreUri = context.getProperty(METASTORE_URI).evaluateAttributeExpressions(flowFile).getValue();
        final boolean autoCreatePartitions = context.getProperty(AUTOCREATE_PARTITIONS).asBoolean();
        final Integer maxConnections = context.getProperty(MAX_OPEN_CONNECTIONS).asInteger();
        final Integer heartbeatInterval = context.getProperty(HEARTBEAT_INTERVAL).evaluateAttributeExpressions().asInteger();
        final Integer txnsPerBatch = context.getProperty(TXNS_PER_BATCH).evaluateAttributeExpressions(flowFile).asInteger();
        final Integer recordsPerTxn = context.getProperty(RECORDS_PER_TXN).evaluateAttributeExpressions(flowFile).asInteger();
        final Map<HiveEndPoint, HiveWriter> myWriters = new ConcurrentHashMap<>();
        threadWriterList.add(myWriters);
        HiveOptions o = new HiveOptions(metastoreUri, dbName, tableName).withTxnsPerBatch(txnsPerBatch).withAutoCreatePartitions(autoCreatePartitions).withMaxOpenConnections(maxConnections).withHeartBeatInterval(heartbeatInterval).withCallTimeout(callTimeout);
        if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
            final String explicitPrincipal = context.getProperty(kerberosProperties.getKerberosPrincipal()).evaluateAttributeExpressions().getValue();
            final String explicitKeytab = context.getProperty(kerberosProperties.getKerberosKeytab()).evaluateAttributeExpressions().getValue();
            final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
            final String resolvedPrincipal;
            final String resolvedKeytab;
            // A configured credentials service takes precedence over the explicit principal/keytab properties
            if (credentialsService == null) {
                resolvedPrincipal = explicitPrincipal;
                resolvedKeytab = explicitKeytab;
            } else {
                resolvedPrincipal = credentialsService.getPrincipal();
                resolvedKeytab = credentialsService.getKeytab();
            }
            o = o.withKerberosPrincipal(resolvedPrincipal).withKerberosKeytab(resolvedKeytab);
        }
        final HiveOptions options = o;
        // Explicitly set the context class loader to this class's classloader (for use by the Hive Metastore)
        Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
        final List<String> partitionColumnList;
        final String partitionColumns = context.getProperty(PARTITION_COLUMNS).evaluateAttributeExpressions().getValue();
        if (partitionColumns == null || partitionColumns.isEmpty()) {
            partitionColumnList = Collections.emptyList();
        } else {
            String[] partitionCols = partitionColumns.split(",");
            partitionColumnList = new ArrayList<>(partitionCols.length);
            for (String col : partitionCols) {
                partitionColumnList.add(col.trim());
            }
        }
        final AtomicReference<List<HiveStreamingRecord>> successfulRecords = new AtomicReference<>();
        successfulRecords.set(new ArrayList<>());
        final FlowFile inputFlowFile = flowFile;
        final RoutingResult result = new RoutingResult();
        final ExceptionHandler<FunctionContext> exceptionHandler = new ExceptionHandler<>();
        // Classify exceptions by rethrowing them and letting the catch clauses pick the error type
        exceptionHandler.mapException(s -> {
            try {
                if (s == null) {
                    return ErrorTypes.PersistentFailure;
                }
                throw s;
            } catch (IllegalArgumentException | HiveWriter.WriteFailure | SerializationError inputError) {
                return ErrorTypes.InvalidInput;
            } catch (HiveWriter.CommitFailure | HiveWriter.TxnBatchFailure | HiveWriter.TxnFailure writerTxError) {
                return ErrorTypes.TemporalInputFailure;
            } catch (ConnectionError | HiveWriter.ConnectFailure connectionError) {
                // Can't connect to Hive endpoint.
                log.error("Error connecting to Hive endpoint: table {} at {}", new Object[] { options.getTableName(), options.getMetaStoreURI() });
                return ErrorTypes.TemporalFailure;
            } catch (IOException | InterruptedException tempError) {
                return ErrorTypes.TemporalFailure;
            } catch (Exception t) {
                return ErrorTypes.UnknownFailure;
            }
        });
        final BiFunction<FunctionContext, ErrorTypes, ErrorTypes.Result> adjustError = RollbackOnFailure.createAdjustError(getLogger());
        exceptionHandler.adjustError(adjustError);
        // Create output flow files and their Avro writers
        functionContext.setFlowFiles(session.create(inputFlowFile), session.create(inputFlowFile));
        try {
            session.read(inputFlowFile, new InputStreamCallback() {
                @Override
                public void process(InputStream in) throws IOException {
                    try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
                        GenericRecord currRecord = null;
                        // Copy codec and schema information to all writers
                        final String codec = reader.getMetaString(DataFileConstants.CODEC) == null ? DataFileConstants.NULL_CODEC : reader.getMetaString(DataFileConstants.CODEC);
                        functionContext.initAvroWriters(session, codec, reader);
                        Runnable flushSuccessfulRecords = () -> {
                            // Now send the records to the successful FlowFile and update the success count
                            functionContext.appendRecordsToSuccess(session, successfulRecords.get());
                            // Clear the list of successful records, we'll use it at the end when we flush whatever records are left
                            successfulRecords.set(new ArrayList<>());
                        };
                        while (reader.hasNext()) {
                            // We can NOT reuse currRecord here, because currRecord is accumulated in successful records.
                            // If we use the same GenericRecord instance, every record ends up having the same contents.
                            // To avoid this, we need to create a brand new GenericRecord instance here each time.
                            currRecord = reader.next();
                            functionContext.recordCount.incrementAndGet();
                            // Extract the partition values (they must be put separately into the Hive Streaming API)
                            List<String> partitionValues = new ArrayList<>();
                            if (!exceptionHandler.execute(functionContext, currRecord, input -> {
                                for (String partition : partitionColumnList) {
                                    Object partitionValue = input.get(partition);
                                    if (partitionValue == null) {
                                        throw new IllegalArgumentException("Partition column '" + partition + "' not found in Avro record");
                                    }
                                    partitionValues.add(partitionValue.toString());
                                }
                            }, onRecordError(context, session, myWriters))) {
                                continue;
                            }
                            final HiveStreamingRecord record = new HiveStreamingRecord(partitionValues, currRecord);
                            final AtomicReference<HiveWriter> hiveWriterRef = new AtomicReference<>();
                            // Write record to Hive streaming
                            if (!exceptionHandler.execute(functionContext, record, input -> {
                                final HiveEndPoint endPoint = makeHiveEndPoint(record.getPartitionValues(), options);
                                final HiveWriter hiveWriter = getOrCreateWriter(myWriters, options, endPoint);
                                hiveWriterRef.set(hiveWriter);
                                hiveWriter.write(record.getRecord().toString().getBytes(StandardCharsets.UTF_8));
                                successfulRecords.get().add(record);
                            }, onHiveRecordError(context, session, myWriters))) {
                                continue;
                            }
                            // If we've reached the records-per-transaction limit, flush the Hive Writer and update the Avro Writer for successful records
                            final HiveWriter hiveWriter = hiveWriterRef.get();
                            if (hiveWriter.getTotalRecords() >= recordsPerTxn) {
                                exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
                                    hiveWriter.flush(true);
                                    // Proceed function context. Process session can't be rollback anymore.
                                    functionContext.proceed();
                                    // Now send the records to the success relationship and update the success count
                                    flushSuccessfulRecords.run();
                                }, onHiveRecordsError(context, session, myWriters).andThen((fc, input, res, commitException) -> {
                                    // Reset hiveWriter for succeeding records.
                                    switch(res.destination()) {
                                        case Retry:
                                        case Failure:
                                            try {
                                                // Abort current tx and move to next.
                                                hiveWriter.abort();
                                            } catch (Exception e) {
                                                // Can't even abort properly, throw a process exception
                                                throw new ProcessException(e);
                                            }
                                    }
                                }));
                            }
                        }
                        exceptionHandler.execute(functionContext, successfulRecords.get(), input -> {
                            // Finish any transactions
                            flushAllWriters(myWriters, true);
                            closeAllWriters(myWriters);
                            // Now send any remaining records to the success relationship and update the count
                            flushSuccessfulRecords.run();
                            // Append successfulRecords on failure.
                        }, onHiveRecordsError(context, session, myWriters));
                    } catch (IOException ioe) {
                        // The Avro file is invalid (or may not be an Avro file at all), send it to failure
                        final ErrorTypes.Result adjusted = adjustError.apply(functionContext, ErrorTypes.InvalidInput);
                        final String msg = "The incoming flow file can not be read as an Avro file";
                        switch(adjusted.destination()) {
                            case Failure:
                                log.error(msg, ioe);
                                result.routeTo(inputFlowFile, REL_FAILURE);
                                break;
                            case ProcessException:
                                throw new ProcessException(msg, ioe);
                        }
                    }
                }
            });
            // If we got here, we've processed the outgoing flow files correctly, so remove the incoming one if necessary
            if (result.getRoutedFlowFiles().values().stream().noneMatch(routed -> routed.contains(inputFlowFile))) {
                session.remove(inputFlowFile);
            }
        } catch (DiscontinuedException e) {
            // The input FlowFile processing is discontinued. Keep it in the input queue.
            getLogger().warn("Discontinued processing for {} due to {}", new Object[] { flowFile, e }, e);
            result.routeTo(flowFile, Relationship.SELF);
        } catch (ShouldRetryException e) {
            // This exception is already a result of adjusting an error, so simply transfer the FlowFile to retry.
            getLogger().error(e.getMessage(), e);
            flowFile = session.penalize(flowFile);
            result.routeTo(flowFile, REL_RETRY);
        } finally {
            threadWriterList.remove(myWriters);
            functionContext.transferFlowFiles(session, result, options);
        }
    } finally {
        // Restore original class loader, might not be necessary but is good practice since the processor task changed it
        Thread.currentThread().setContextClassLoader(originalClassloader);
        // Always release the per-table permit, even when setup or the FlowFile transfer above failed
        semaphore.release();
    }
}
Use of org.apache.hive.hcatalog.streaming.SerializationError in the Apache NiFi project.
Class HiveWriter, method write.
/**
 * Writes one serialized record to the current Hive transaction batch.
 *
 * @param record the record bytes passed to the transaction batch
 * @throws WriteFailure if the streaming write fails or the call times out
 * @throws SerializationError if the write is rejected with a serialization error
 * @throws InterruptedException if the write operation is interrupted
 * @throws IllegalStateException if this writer has already been closed
 */
public synchronized void write(final byte[] record) throws WriteFailure, SerializationError, InterruptedException {
    if (closed) {
        throw new IllegalStateException("This hive streaming writer was closed " + "and thus no longer able to write : " + endPoint);
    }
    // The actual write, executed through callWithTimeout below
    final CallRunner<Void> writeCall = new CallRunner<Void>() {
        @Override
        public Void call() throws StreamingException, InterruptedException {
            txnBatch.write(record);
            totalRecords++;
            return null;
        }
    };
    try {
        LOG.debug("Writing event to {}", endPoint);
        callWithTimeout(writeCall);
    } catch (SerializationError serializationError) {
        // Re-wrap so the message names the end point that rejected the record
        throw new SerializationError(endPoint.toString() + " SerializationError", serializationError);
    } catch (StreamingException streamingFailure) {
        throw new WriteFailure(endPoint, txnBatch.getCurrentTxnId(), streamingFailure);
    } catch (TimeoutException timedOut) {
        throw new WriteFailure(endPoint, txnBatch.getCurrentTxnId(), timedOut);
    }
}
Use of org.apache.hive.hcatalog.streaming.SerializationError in the Apache Storm project.
Class HiveWriter, method write.
/**
 * Writes one serialized record to the current Hive transaction batch.
 *
 * @param record the record bytes passed to the transaction batch
 * @throws WriteFailure if the streaming write fails or the call times out
 * @throws SerializationError if the write is rejected with a serialization error
 * @throws InterruptedException if the write operation is interrupted
 * @throws IllegalStateException if this writer has already been closed
 */
public synchronized void write(final byte[] record) throws WriteFailure, SerializationError, InterruptedException {
    if (closed) {
        throw new IllegalStateException("This hive streaming writer was closed " + "and thus no longer able to write : " + endPoint);
    }
    try {
        LOG.debug("Writing event to {}", endPoint);
        // Perform the write through callWithTimeout
        callWithTimeout(new CallRunner<Void>() {
            @Override
            public Void call() throws StreamingException, InterruptedException {
                txnBatch.write(record);
                totalRecords++;
                return null;
            }
        });
    } catch (SerializationError badRecord) {
        // Re-wrap so the message names the end point that rejected the record
        throw new SerializationError(endPoint.toString() + " SerializationError", badRecord);
    } catch (StreamingException | TimeoutException writeProblem) {
        // Both failure kinds are reported identically, tagged with the current transaction id
        throw new WriteFailure(endPoint, txnBatch.getCurrentTxnId(), writeProblem);
    }
}
Use of org.apache.hive.hcatalog.streaming.SerializationError in the Apache Storm project.
Class HiveBolt, method execute.
/**
 * Processes one incoming tuple.
 *
 * <p>If the batch helper accepts the tuple, its partition values and record are mapped, the record is written
 * to the writer for the matching Hive end point, and the tuple is added to the current batch. When the batch
 * helper signals a flush, all writers are flushed and the batched tuples are acknowledged. Tick tuples
 * additionally trigger retirement of idle writers.
 *
 * <p>A {@link SerializationError} is reported to the collector and the tuple is acknowledged without being
 * written. Any other exception fails the current batch and aborts and closes all writers.
 *
 * @param tuple the tuple to process
 */
@Override
public void execute(Tuple tuple) {
    try {
        if (batchHelper.shouldHandle(tuple)) {
            List<String> partitionVals = options.getMapper().mapPartitions(tuple);
            HiveEndPoint endPoint = HiveUtils.makeEndPoint(partitionVals, options);
            HiveWriter writer = getOrCreateWriter(endPoint);
            writer.write(options.getMapper().mapRecord(tuple));
            batchHelper.addBatch(tuple);
        }
        if (batchHelper.shouldFlush()) {
            flushAllWriters(true);
            LOG.info("acknowledging tuples after writers flushed ");
            batchHelper.ack();
        }
        if (TupleUtils.isTick(tuple)) {
            retireIdleWriters();
        }
    } catch (SerializationError se) {
        // The "{}" placeholder is required; without it the tuple argument is not rendered in the log message
        LOG.info("Serialization exception occurred, tuple is acknowledged but not written to Hive. Tuple: {}", tuple);
        collector.reportError(se);
        collector.ack(tuple);
    } catch (Exception e) {
        batchHelper.fail(e);
        abortAndCloseWriters();
    }
}
Aggregations