use of com.ibm.streamsx.kafka.KafkaOperatorResetFailedException in project streamsx.kafka by IBMStreams.
the class AbstractKafkaConsumerOperator method reset.
/**
 * Resets the operator to the state stored in the given checkpoint.
 * <p>
 * The first value read from the checkpoint is the "magic" number of the consumer client
 * implementation that wrote the checkpoint. When it differs from the magic of the current
 * consumer client, the current client is shut down (if it is processing) and replaced by a
 * client created from the {@code ConsumerClientBuilder} registered for that magic.
 * Afterwards the reset is delegated to the consumer client, provided that it is processing.
 *
 * @param checkpoint the checkpoint to reset to
 * @throws KafkaOperatorResetFailedException if no client builder is registered for the magic
 *         read from the checkpoint, or if the consumer client could not be replaced
 * @throws Exception on any other failure, for example when reading from the checkpoint fails
 */
@Override
public void reset(Checkpoint checkpoint) throws Exception {
    final int attempt = crContext == null ? -1 : crContext.getResetAttempt();
    final long sequenceId = checkpoint.getSequenceId();
    logger.log(DEBUG_LEVEL, MsgFormatter.format(">>> RESET (ckpt id/attempt={0,number,#}/{1})", sequenceId, (crContext == null ? "-" : "" + attempt)));
    final long before = System.currentTimeMillis();
    try {
        final ObjectInputStream inputStream = checkpoint.getInputStream();
        final int chkptMagic = inputStream.readInt();
        logger.info("magic read from checkpoint: " + chkptMagic);
        ConsumerClient consumer = consumerRef.get();
        if (chkptMagic == consumer.getImplementationMagic()) {
            logger.info("checkpoint fits current ConsumerClient implementation.");
        } else {
            logger.info("checkpoint does not fit current ConsumerClient implementation. Building matching client ...");
            // Look up the builder first: an unknown magic must fail with a meaningful error
            // (not a NullPointerException) and before the current client is torn down.
            final ConsumerClientBuilder builder = magics.get(chkptMagic);
            if (builder == null) {
                final String msg = "no consumer client builder available for magic read from checkpoint: " + chkptMagic;
                logger.error(msg);
                throw new KafkaOperatorResetFailedException(msg);
            }
            if (consumer.isProcessing()) {
                consumer.onShutdown(SHUTDOWN_TIMEOUT, SHUTDOWN_TIMEOUT_TIMEUNIT);
            }
            final ConsumerClient newClient = builder.build();
            if (consumerRef.compareAndSet(consumer, newClient)) {
                try {
                    newClient.startConsumer();
                    logger.info(MsgFormatter.format("consumer client implementation {0} replaced by {1}", consumer.getClass().getName(), newClient.getClass().getName()));
                } catch (KafkaClientInitializationException e) {
                    logger.error(e.getLocalizedMessage(), e);
                    logger.error("root cause: " + e.getRootCause());
                    throw new KafkaOperatorResetFailedException("consumer client replacement failed", e);
                }
            } else {
                // The reference was changed concurrently. This is only acceptable when the
                // client that is referenced now matches the magic of the checkpoint.
                if (consumerRef.get().getImplementationMagic() != chkptMagic) {
                    logger.warn(MsgFormatter.format("consumer client replacement failed"));
                    throw new KafkaOperatorResetFailedException("consumer client replacement failed");
                }
            }
        }
        consumer = consumerRef.get();
        if (consumer.isProcessing()) {
            // it is up to the consumer client implementation to stop polling.
            consumer.onReset(checkpoint);
        }
    } catch (InterruptedException e) {
        // NOTE(review): the interruption is logged and swallowed, the interrupt status of the
        // thread is not restored - confirm that this is intended.
        logger.log(DEBUG_LEVEL, "RESET interrupted)");
        return;
    } finally {
        // resettingLatch can be null; the original note refers to a reset caused
        // by another PE, i.e. when relaunch count == 0 in initialize(context)
        if (resettingLatch != null) {
            resettingLatch.countDown();
        }
        final long after = System.currentTimeMillis();
        final long duration = after - before;
        logger.log(DEBUG_LEVEL, MsgFormatter.format(">>> RESET took {0,number,#} ms (ckpt id/attempt={1,number,#}/{2,number,#})", duration, sequenceId, attempt));
    }
}
use of com.ibm.streamsx.kafka.KafkaOperatorResetFailedException in project streamsx.kafka by IBMStreams.
the class CrKafkaConsumerGroupClient method createSeekOffsetMap.
/**
 * Creates the seek offsets from the checkpoint with the following algorithm:
 * <ul>
 * <li>read the contributing operator names from the checkpoint (operator names of the consumer group)</li>
 * <li>read the seek offsets from the checkpoint.
 *     These are the offsets of only those partitions the consumer was assigned at checkpoint time.</li>
 * <li>send the offsets of the prior partitions together with the number of operators and the own operator name
 *     to the CrGroupCoordinator MXBean.
 *     The other consumer operators will also send their prior partition-to-offset mappings and their distinct operator name.</li>
 * <li>wait for the JMX notification that the partition-to-offset map has been merged</li>
 * <li>take the merged map received for this checkpoint sequence ID and reset attempt, so that the operator has
 *     the seek offsets of all partitions of all topics (the group's view), and store it in the
 *     'seekOffsetMap' member variable.</li>
 * </ul>
 * When the checkpoint names only one contributing operator, the merge via the MXBean is skipped
 * and the offsets read from the checkpoint are stored directly.
 * <p>
 * An interruption that occurs inside this method is caught and logged; the method then returns
 * without completing the seek offset map.
 *
 * @param checkpoint the checkpoint from which operator names and offsets are read
 * @throws InterruptedException declared by the signature; interruptions raised inside the method are
 *         caught and lead to an early return
 * @throws KafkaOperatorResetFailedException if this operator is not among the contributing operators,
 *         if the JMX notification is not received within the timeout, or if the checkpoint cannot be read
 */
@SuppressWarnings("unchecked")
private void createSeekOffsetMap(Checkpoint checkpoint) throws InterruptedException {
    final String operatorName = getOperatorContext().getName();
    long chkptSeqId = checkpoint.getSequenceId();
    int resetAttempt = getCrContext().getResetAttempt();
    MergeKey key = new MergeKey(chkptSeqId, resetAttempt);
    trace.info(MsgFormatter.format("createSeekOffsetMap() [{0}] - entering. chkptSeqId = {1,number,#}, resetAttempt = {2}", state, chkptSeqId, resetAttempt));
    try {
        final ObjectInputStream inputStream = checkpoint.getInputStream();
        // the read order must match the order in which the checkpoint was written
        final String myOperatorNameInCkpt = (String) inputStream.readObject();
        Set<String> contributingOperators = (Set<String>) inputStream.readObject();
        OffsetManager offsMgr = (OffsetManager) inputStream.readObject();
        trace.info(MsgFormatter.format("createSeekOffsetMap() - merging {0} operator checkpoints into a single group checkpoint", contributingOperators.size()));
        if (trace.isEnabledFor(DEBUG_LEVEL)) {
            trace.log(DEBUG_LEVEL, MsgFormatter.format("createSeekOffsetMap(): myOperatorName read from checkpoint: {0}", myOperatorNameInCkpt));
            trace.log(DEBUG_LEVEL, MsgFormatter.format("createSeekOffsetMap(): contributingOperators read from checkpoint: {0}", contributingOperators));
            trace.log(DEBUG_LEVEL, MsgFormatter.format("createSeekOffsetMap(): offset manager read from checkpoint: {0}", offsMgr));
        }
        if (!operatorName.equals(myOperatorNameInCkpt)) {
            trace.warn(MsgFormatter.format("Operator name in checkpoint ({0}) differs from current operator name: {1}", myOperatorNameInCkpt, operatorName));
        }
        if (!contributingOperators.contains(operatorName)) {
            final String msg = MsgFormatter.format("This operator''s name ({0}) not found in contributing operator names: {1}", operatorName, contributingOperators);
            trace.error(msg);
            throw new KafkaOperatorResetFailedException(msg);
        }
        trace.info(MsgFormatter.format("contributing {0} partition => offset mappings to the group''s checkpoint.", offsMgr.size()));
        if (contributingOperators.size() == 1) {
            trace.info("this single operator participated in consumer group at checkpoint time. Checkpoint merge and distribution via MXBean disabled.");
            assert (contributingOperators.contains(operatorName));
            initSeekOffsetMap();
            for (TopicPartition tp : offsMgr.getMappedTopicPartitions()) {
                final String topic = tp.topic();
                final int partition = tp.partition();
                final Long offset = offsMgr.getOffset(topic, partition);
                this.seekOffsetMap.put(tp, offset);
            }
        } else {
            // send checkpoint data to CrGroupCoordinator MXBean and wait for the notification
            // to fetch the group's complete checkpoint. Then, process the group's checkpoint.
            Map<CrConsumerGroupCoordinator.TP, Long> partialOffsetMap = new HashMap<>();
            for (TopicPartition tp : offsMgr.getMappedTopicPartitions()) {
                final String topic = tp.topic();
                final int partition = tp.partition();
                final Long offset = offsMgr.getOffset(topic, partition);
                partialOffsetMap.put(new TP(topic, partition), offset);
            }
            trace.info(MsgFormatter.format("Merging my group''s checkpoint contribution: partialOffsetMap = {0}, myOperatorName = {1}", partialOffsetMap, operatorName));
            this.crGroupCoordinatorMxBean.mergeConsumerCheckpoint(chkptSeqId, resetAttempt, contributingOperators.size(), partialOffsetMap, operatorName);
            // The timeout is the configured value; it is not scaled with the reset attempt.
            final long timeoutMillis = timeouts.getJmxResetNotificationTimeout();
            long timeElapsed = 0;
            CrConsumerGroupCoordinator.CheckpointMerge merge;
            // check JMX notification and wait for notification
            jmxNotificationConditionLock.lock();
            try {
                // everything between lock() and unlock() is guarded by try/finally, so that the lock
                // is released also when await() is interrupted or anything else fails while waiting
                final long waitStartTime = System.currentTimeMillis();
                boolean waitTimeLeft = true;
                int nWaits = 0;
                trace.log(DEBUG_LEVEL, MsgFormatter.format("checking receiption of JMX notification {0} for sequenceId {1}. timeout = {2,number,#} ms.", CrConsumerGroupCoordinatorMXBean.MERGE_COMPLETE_NTF_TYPE, key, timeoutMillis));
                while (!jmxMergeCompletedNotifMap.containsKey(key) && waitTimeLeft) {
                    long remainingTime = timeoutMillis - timeElapsed;
                    waitTimeLeft = remainingTime > 0;
                    if (waitTimeLeft) {
                        // trace only every 50th wait cycle to avoid flooding the trace
                        if (nWaits++ % 50 == 0) {
                            trace.log(DEBUG_LEVEL, MsgFormatter.format("waiting for JMX notification {0} for sequenceId {1}. Remaining time = {2,number,#} of {3,number,#} ms", CrConsumerGroupCoordinatorMXBean.MERGE_COMPLETE_NTF_TYPE, key, remainingTime, timeoutMillis));
                        }
                        // wait in short slices; the loop condition re-checks the notification map
                        jmxNotificationCondition.await(100, TimeUnit.MILLISECONDS);
                    }
                    timeElapsed = System.currentTimeMillis() - waitStartTime;
                }
                merge = jmxMergeCompletedNotifMap.get(key);
            } finally {
                jmxNotificationConditionLock.unlock();
            }
            if (merge == null) {
                final String msg = MsgFormatter.format("timeout receiving {0} JMX notification for {1} from MXBean {2} in JCP. Current timeout is {3,number,#} milliseconds.", CrConsumerGroupCoordinatorMXBean.MERGE_COMPLETE_NTF_TYPE, key, crGroupCoordinatorMXBeanName, timeoutMillis);
                trace.error(msg);
                throw new KafkaOperatorResetFailedException(msg);
            } else {
                // '{1,number,#}' keeps the milliseconds free of locale specific grouping separators
                trace.info(MsgFormatter.format("waiting for JMX notification for sequenceId {0} took {1,number,#} ms", key, timeElapsed));
            }
            Map<TP, Long> mergedOffsetMap = merge.getConsolidatedOffsetMap();
            trace.info("reset offsets (group's checkpoint) received from MXBean: " + mergedOffsetMap);
            initSeekOffsetMap();
            mergedOffsetMap.forEach((tp, offset) -> {
                this.seekOffsetMap.put(new TopicPartition(tp.getTopic(), tp.getPartition()), offset);
            });
        }
    } catch (InterruptedException e) {
        // NOTE(review): the interruption is swallowed although the signature declares
        // InterruptedException - confirm that callers rely on the silent return.
        trace.log(DEBUG_LEVEL, "createSeekOffsetMap(): interrupted waiting for the JMX notification");
        return;
    } catch (IOException | ClassNotFoundException e) {
        trace.error("reset failed: " + e.getLocalizedMessage());
        // '{1,number,#}' prints the sequence ID without grouping separators
        throw new KafkaOperatorResetFailedException(MsgFormatter.format("resetting operator {0} to checkpoint sequence ID {1,number,#} failed: {2}", getOperatorContext().getName(), chkptSeqId, e.getLocalizedMessage()), e);
    }
    trace.log(DEBUG_LEVEL, "createSeekOffsetMap(): seekOffsetMap = " + this.seekOffsetMap);
}
use of com.ibm.streamsx.kafka.KafkaOperatorResetFailedException in project streamsx.kafka by IBMStreams.
the class NonCrKafkaConsumerGroupClient method processResetEvent.
/**
 * Handles a reset event by subscribing again to the topics stored in the checkpoint.
 * The set of topics is read from the checkpoint and handed to {@code subscribe(topics, this)}.
 *
 * @param checkpoint the checkpoint from which the subscribed topics are read
 * @throws KafkaOperatorResetFailedException when reading the checkpoint or subscribing fails
 * @see com.ibm.streamsx.kafka.clients.consumer.AbstractKafkaConsumerClient#processResetEvent(Checkpoint)
 */
@Override
@SuppressWarnings("unchecked")
protected void processResetEvent(Checkpoint checkpoint) {
    final long sequenceId = checkpoint.getSequenceId();
    trace.log(DEBUG_LEVEL, "processResetEvent() - entering. seq = " + sequenceId);
    try {
        final Object restored = checkpoint.getInputStream().readObject();
        final Set<String> subscription = (Set<String>) restored;
        trace.info("topics from checkpoint = " + subscription);
        // subscribe only; the fetch offset is the last committed offset
        subscribe(subscription, this);
    } catch (IllegalStateException | ClassNotFoundException | IOException failure) {
        trace.error("reset failed: " + failure.getLocalizedMessage());
        final String reason = MsgFormatter.format(
                "resetting operator {0} to checkpoint sequence ID {1,number,#} failed: {2}",
                getOperatorContext().getName(),
                sequenceId,
                failure.getLocalizedMessage());
        throw new KafkaOperatorResetFailedException(reason, failure);
    }
}
use of com.ibm.streamsx.kafka.KafkaOperatorResetFailedException in project streamsx.kafka by IBMStreams.
the class NonCrKafkaConsumerClient method processResetEvent.
/**
 * Restores the partition assignment from a checkpoint; applies when there is a control port.
 * The set of topic partitions is read from the checkpoint and handed to {@code assign(partitions)}.
 *
 * @param checkpoint the checkpoint from which the assigned topic partitions are read
 * @throws KafkaOperatorResetFailedException when reading the checkpoint or assigning fails
 * @see com.ibm.streamsx.kafka.clients.consumer.AbstractNonCrKafkaConsumerClient#processResetEvent(com.ibm.streams.operator.state.Checkpoint)
 */
@Override
@SuppressWarnings("unchecked")
protected void processResetEvent(Checkpoint checkpoint) {
    final long sequenceId = checkpoint.getSequenceId();
    trace.log(DEBUG_LEVEL, "processResetEvent() - entering. seq = " + sequenceId);
    try {
        final Object restored = checkpoint.getInputStream().readObject();
        final Set<TopicPartition> assignment = (Set<TopicPartition>) restored;
        trace.info("topic partitions from checkpoint = " + assignment);
        // assign only; the fetch offset is the last committed offset
        assign(assignment);
    } catch (IllegalStateException | ClassNotFoundException | IOException failure) {
        trace.error("reset failed: " + failure.getLocalizedMessage());
        final String reason = MsgFormatter.format(
                "resetting operator {0} to checkpoint sequence ID {1,number,#} failed: {2}",
                getOperatorContext().getName(),
                sequenceId,
                failure.getLocalizedMessage());
        throw new KafkaOperatorResetFailedException(reason, failure);
    }
}
Aggregations