Use of org.apache.helix.ZNRecord in project pinot by linkedin.
From class PinotLLCRealtimeSegmentManager, method createConsumingSegment:
/**
 * Create a consuming segment for the kafka partitions that are missing one.
 *
 * @param realtimeTableName is the name of the realtime table (e.g. "table_REALTIME")
 * @param nonConsumingPartitions is a set of integers (kafka partitions that do not have a consuming segment)
 * @param llcSegments is a list of segment names in the ideal state, as last observed
 * @param tableConfig is the table configuration, from which the kafka consumer properties are read
 */
public void createConsumingSegment(final String realtimeTableName, final Set<Integer> nonConsumingPartitions,
    final List<String> llcSegments, final AbstractTableConfig tableConfig) {
  final KafkaStreamMetadata kafkaStreamMetadata = new KafkaStreamMetadata(tableConfig.getIndexingConfig().getStreamConfigs());
  final ZNRecord partitionAssignment = getKafkaPartitionAssignment(realtimeTableName);
  final HashMap<Integer, LLCSegmentName> ncPartitionToLatestSegment = new HashMap<>(nonConsumingPartitions.size());
  // Number of replicas (should be the same for all partitions)
  final int nReplicas = partitionAssignment.getListField("0").size();

  // Find the latest segment in each non-consuming partition (null if there is none).
  for (String segmentId : llcSegments) {
    LLCSegmentName segmentName = new LLCSegmentName(segmentId);
    int partitionId = segmentName.getPartitionId();
    if (nonConsumingPartitions.contains(partitionId)) {
      LLCSegmentName hashedSegName = ncPartitionToLatestSegment.get(partitionId);
      if (hashedSegName == null || hashedSegName.getSequenceNumber() < segmentName.getSequenceNumber()) {
        ncPartitionToLatestSegment.put(partitionId, segmentName);
      }
    }
  }

  // Create a new CONSUMING segment for each non-consuming partition. The start offset is either the end offset
  // of the latest segment (if the partition had a prior segment and it completed), or the offset derived from
  // the table configuration (smallest/largest).
  for (int partition : nonConsumingPartitions) {
    try {
      LLCSegmentName latestSegment = ncPartitionToLatestSegment.get(partition);
      long startOffset;
      int nextSeqNum;
      List<String> instances = partitionAssignment.getListField(Integer.toString(partition));
      if (latestSegment == null) {
        // No segment yet in this partition; create a new one with the starting offset specified in the table config.
        nextSeqNum = STARTING_SEQUENCE_NUMBER;
        LOGGER.info("Creating CONSUMING segment for {} partition {} with seq {}", realtimeTableName, partition, nextSeqNum);
        String consumerStartOffsetSpec = kafkaStreamMetadata.getKafkaConsumerProperties()
            .get(CommonConstants.Helix.DataSource.Realtime.Kafka.AUTO_OFFSET_RESET);
        startOffset = getKafkaPartitionOffset(kafkaStreamMetadata, consumerStartOffsetSpec, partition);
        LOGGER.info("Found kafka offset {} for table {} for partition {}", startOffset, realtimeTableName, partition);
      } else {
        nextSeqNum = latestSegment.getSequenceNumber() + 1;
        LOGGER.info("Creating CONSUMING segment for {} partition {} with seq {}", realtimeTableName, partition, nextSeqNum);
        // To begin with, set startOffset to the oldest available offset in kafka. Fix it to be the one we want,
        // depending on what the previous segment had.
        startOffset = getKafkaPartitionOffset(kafkaStreamMetadata, "smallest", partition);
        LOGGER.info("Found kafka offset {} for table {} for partition {}", startOffset, realtimeTableName, partition);
        startOffset = getBetterStartOffsetIfNeeded(realtimeTableName, partition, latestSegment, startOffset, nextSeqNum);
      }
      createSegment(realtimeTableName, nReplicas, partition, nextSeqNum, instances, startOffset, partitionAssignment);
    } catch (Exception e) {
      LOGGER.error("Exception creating CONSUMING segment for {} partition {}", realtimeTableName, partition, e);
    }
  }
}
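The first loop's latest-segment-per-partition scan is the core of the repair logic. Below is a minimal, self-contained sketch of the same pattern; the SegName class is an illustrative stand-in for LLCSegmentName, not Pinot's actual class.

import java.util.*;

public class LatestSegmentScan {
  // Stand-in for LLCSegmentName: just the two fields the scan needs.
  static final class SegName {
    final int partitionId;
    final int sequenceNumber;
    SegName(int partitionId, int sequenceNumber) {
      this.partitionId = partitionId;
      this.sequenceNumber = sequenceNumber;
    }
  }

  // For each partition of interest, keep only the segment with the highest sequence number.
  static Map<Integer, SegName> latestPerPartition(List<SegName> segments, Set<Integer> partitionsOfInterest) {
    Map<Integer, SegName> latest = new HashMap<>();
    for (SegName seg : segments) {
      if (!partitionsOfInterest.contains(seg.partitionId)) {
        continue;
      }
      SegName current = latest.get(seg.partitionId);
      if (current == null || current.sequenceNumber < seg.sequenceNumber) {
        latest.put(seg.partitionId, seg);
      }
    }
    return latest;
  }

  public static void main(String[] args) {
    List<SegName> segments = Arrays.asList(new SegName(3, 0), new SegName(3, 1), new SegName(5, 0));
    Map<Integer, SegName> latest = latestPerPartition(segments, new HashSet<>(Arrays.asList(3, 4, 5)));
    System.out.println(latest.get(3).sequenceNumber); // 1: the highest sequence number in partition 3
    System.out.println(latest.get(4));                // null: partition 4 has no segment yet
  }
}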
Use of org.apache.helix.ZNRecord in project pinot by linkedin.
From class PinotLLCRealtimeSegmentManager, method completeCommittingSegments:
protected void completeCommittingSegments(String realtimeTableName) {
  List<ZNRecord> segmentMetadataList = getExistingSegmentMetadata(realtimeTableName);
  if (segmentMetadataList == null || segmentMetadataList.isEmpty()) {
    return;
  }
  final List<String> segmentIds = new ArrayList<>(segmentMetadataList.size());
  for (ZNRecord segment : segmentMetadataList) {
    if (SegmentName.isLowLevelConsumerSegmentName(segment.getId())) {
      segmentIds.add(segment.getId());
    }
  }
  if (segmentIds.isEmpty()) {
    return;
  }
  completeCommittingSegments(realtimeTableName, segmentIds);
}
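This method is a thin filter-and-delegate: guard against a null or empty metadata listing, keep only the ids that are low-level consumer segment names, then hand off to the overload that does the work. A self-contained sketch of that shape follows; the LOOKS_LIKE_LLC_NAME predicate is a hypothetical stand-in for SegmentName.isLowLevelConsumerSegmentName, whose real parsing rules are not shown on this page.

import java.util.*;
import java.util.function.Consumer;
import java.util.function.Predicate;

public class FilterAndDelegate {
  // Hypothetical stand-in for SegmentName.isLowLevelConsumerSegmentName. Pinot's real check parses the
  // segment name format; here we only illustrate the guard-filter-delegate shape.
  static final Predicate<String> LOOKS_LIKE_LLC_NAME = id -> id.split("__").length >= 3;

  static void completeCommitting(List<String> segmentIds, Consumer<List<String>> delegate) {
    if (segmentIds == null || segmentIds.isEmpty()) {
      return; // Nothing to examine.
    }
    List<String> llcIds = new ArrayList<>();
    for (String id : segmentIds) {
      if (LOOKS_LIKE_LLC_NAME.test(id)) {
        llcIds.add(id);
      }
    }
    if (llcIds.isEmpty()) {
      return; // No LLC segments; avoid invoking the heavier overload for nothing.
    }
    delegate.accept(llcIds);
  }

  public static void main(String[] args) {
    completeCommitting(Arrays.asList("table__3__0__20170101T0000Z", "table_REALTIME_12345_0"),
        ids -> System.out.println("Would complete: " + ids)); // Only the first id passes the filter.
  }
}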
Use of org.apache.helix.ZNRecord in project pinot by linkedin.
From class PinotHelixResourceManager, method createHelixEntriesForHighLevelConsumer:
private void createHelixEntriesForHighLevelConsumer(AbstractTableConfig config, String realtimeTableName,
    IdealState idealState) {
  if (idealState == null) {
    idealState = PinotTableIdealStateBuilder.buildInitialHighLevelRealtimeIdealStateFor(realtimeTableName, config,
        _helixAdmin, _helixClusterName, _propertyStore);
    LOGGER.info("Adding helix resource with empty HLC IdealState for {}", realtimeTableName);
    _helixAdmin.addResource(_helixClusterName, realtimeTableName, idealState);
  } else {
    // TODO jfim: We get in this block if we're trying to add a HLC or it already exists. If it doesn't already
    // exist, we need to set instance configs properly (which is done in buildInitialHighLevelRealtimeIdealState,
    // surprisingly enough). For now, do nothing.
    LOGGER.info("Not reconfiguring HLC for table {}", realtimeTableName);
  }
  LOGGER.info("Successfully created empty ideal state for high level consumer for {}", realtimeTableName);
  // Finally, create the propertystore entry that will trigger watchers to create segments
  String tablePropertyStorePath = ZKMetadataProvider.constructPropertyStorePathForResource(realtimeTableName);
  if (!_propertyStore.exists(tablePropertyStorePath, AccessOption.PERSISTENT)) {
    _propertyStore.create(tablePropertyStorePath, new ZNRecord(realtimeTableName), AccessOption.PERSISTENT);
  }
}
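Since this page collects ZNRecord usages: the trigger znode written above is simply an empty ZNRecord keyed by the table name. A minimal sketch of how a ZNRecord carries data, using only the org.apache.helix.ZNRecord API (no property store involved); the field values are illustrative.

import java.util.Arrays;
import org.apache.helix.ZNRecord;

public class ZnRecordSketch {
  public static void main(String[] args) {
    // A ZNRecord is a map-backed structure keyed by an id; Helix serializes it to JSON in ZooKeeper.
    ZNRecord record = new ZNRecord("table_REALTIME");

    // Simple fields: flat string key/value pairs.
    record.setSimpleField("tableType", "REALTIME");

    // List fields: this is how the kafka partition assignment above maps a partition to its replica instances.
    record.setListField("0", Arrays.asList("Server_host1_8098", "Server_host2_8098"));

    System.out.println(record.getId());                  // table_REALTIME
    System.out.println(record.getListField("0").size()); // 2 replicas for partition "0"
  }
}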
Use of org.apache.helix.ZNRecord in project pinot by linkedin.
From class PinotLLCRealtimeSegmentManagerTest, method testAutoReplaceConsumingSegment:
public void testAutoReplaceConsumingSegment(final String tableConfigStartOffset) throws Exception {
  FakePinotLLCRealtimeSegmentManager segmentManager = new FakePinotLLCRealtimeSegmentManager(true, null);
  final int nPartitions = 8;
  final int nInstances = 3;
  final int nReplicas = 2;
  final String topic = "someTopic";
  final String rtTableName = "table_REALTIME";
  List<String> instances = getInstanceList(nInstances);
  final String startOffset = KAFKA_OFFSET;

  IdealState idealState = PinotTableIdealStateBuilder.buildEmptyKafkaConsumerRealtimeIdealStateFor(rtTableName, nReplicas);
  segmentManager.setupHelixEntries(topic, rtTableName, nPartitions, instances, nReplicas, startOffset, DUMMY_HOST,
      idealState, false, 10000);

  // Add another segment for each partition
  long now = System.currentTimeMillis();
  List<String> existingSegments = new ArrayList<>(segmentManager._idealStateEntries.keySet());
  final int partitionToBeFixed = 3;
  final int partitionWithHigherOffset = 4;
  final int emptyPartition = 5;
  final long smallestPartitionOffset = 0x259080984568L;
  final long largestPartitionOffset = smallestPartitionOffset + 100000;
  final long higherOffset = smallestPartitionOffset + 100;
  for (String segmentNameStr : existingSegments) {
    LLCSegmentName segmentName = new LLCSegmentName(segmentNameStr);
    switch (segmentName.getPartitionId()) {
      case partitionToBeFixed:
        // Do nothing; we will test adding a new segment for this partition when there is only one segment in it.
        break;
      case emptyPartition:
        // Remove the existing segment, so we can test adding a new segment for this partition when none exists.
        segmentManager._idealStateEntries.remove(segmentNameStr);
        break;
      case partitionWithHigherOffset:
        // Set segment metadata for this segment such that its end offset is higher than the startOffset we get
        // from kafka. In that case, we should choose the new segment offset as this one rather than the one
        // kafka hands us.
        LLCRealtimeSegmentZKMetadata metadata = new LLCRealtimeSegmentZKMetadata();
        metadata.setSegmentName(segmentName.getSegmentName());
        metadata.setEndOffset(higherOffset);
        metadata.setStatus(CommonConstants.Segment.Realtime.Status.DONE);
        segmentManager._metadataMap.put(segmentName.getSegmentName(), metadata);
        break;
      default:
        // Add a second segment for this partition. It will not be repaired.
        LLCSegmentName newSegmentName = new LLCSegmentName(segmentName.getTableName(), segmentName.getPartitionId(),
            segmentName.getSequenceNumber() + 1, now);
        List<String> hosts = segmentManager._idealStateEntries.get(segmentNameStr);
        segmentManager._idealStateEntries.put(newSegmentName.getSegmentName(), hosts);
        break;
    }
  }

  Map<String, String> streamPropMap = new HashMap<>(1);
  streamPropMap.put(StringUtil.join(".", CommonConstants.Helix.DataSource.STREAM_PREFIX,
      CommonConstants.Helix.DataSource.Realtime.Kafka.CONSUMER_TYPE), "simple");
  streamPropMap.put(StringUtil.join(".", CommonConstants.Helix.DataSource.STREAM_PREFIX,
      CommonConstants.Helix.DataSource.Realtime.Kafka.KAFKA_CONSUMER_PROPS_PREFIX,
      CommonConstants.Helix.DataSource.Realtime.Kafka.AUTO_OFFSET_RESET), tableConfigStartOffset);
  KafkaStreamMetadata kafkaStreamMetadata = new KafkaStreamMetadata(streamPropMap);
  AbstractTableConfig tableConfig = mock(AbstractTableConfig.class);
  IndexingConfig indexingConfig = mock(IndexingConfig.class);
  when(indexingConfig.getStreamConfigs()).thenReturn(streamPropMap);
  when(tableConfig.getIndexingConfig()).thenReturn(indexingConfig);

  Set<Integer> nonConsumingPartitions = new HashSet<>(1);
  nonConsumingPartitions.add(partitionToBeFixed);
  nonConsumingPartitions.add(partitionWithHigherOffset);
  nonConsumingPartitions.add(emptyPartition);
  segmentManager._kafkaSmallestOffsetToReturn = smallestPartitionOffset;
  segmentManager._kafkaLargestOffsetToReturn = largestPartitionOffset;
  existingSegments = new ArrayList<>(segmentManager._idealStateEntries.keySet());
  segmentManager._paths.clear();
  segmentManager._records.clear();
  segmentManager.createConsumingSegment(rtTableName, nonConsumingPartitions, existingSegments, tableConfig);
  Assert.assertEquals(segmentManager._paths.size(), 3);
  Assert.assertEquals(segmentManager._records.size(), 3);
  Assert.assertEquals(segmentManager._oldSegmentNameStr.size(), 3);
  Assert.assertEquals(segmentManager._newSegmentNameStr.size(), 3);

  int found = 0;
  int index = 0;
  while (index < segmentManager._paths.size()) {
    String znodePath = segmentManager._paths.get(index);
    int slash = znodePath.lastIndexOf('/');
    String segmentNameStr = znodePath.substring(slash + 1);
    LLCSegmentName segmentName = new LLCSegmentName(segmentNameStr);
    ZNRecord znRecord;
    LLCRealtimeSegmentZKMetadata metadata;
    switch (segmentName.getPartitionId()) {
      case partitionToBeFixed:
        // We had left this partition with one segment. So, a second one should be created with a sequence number
        // one higher than the starting sequence number. Its start offset should be what kafka returns.
        found++;
        Assert.assertEquals(segmentName.getSequenceNumber(), PinotLLCRealtimeSegmentManager.STARTING_SEQUENCE_NUMBER + 1);
        znRecord = segmentManager._records.get(index);
        metadata = new LLCRealtimeSegmentZKMetadata(znRecord);
        Assert.assertEquals(metadata.getNumReplicas(), 2);
        Assert.assertEquals(metadata.getStartOffset(), smallestPartitionOffset);
        break;
      case emptyPartition:
        // We had removed all segments in this partition. A new one should be created with the offset as returned
        // by kafka and with the starting sequence number.
        found++;
        Assert.assertEquals(segmentName.getSequenceNumber(), PinotLLCRealtimeSegmentManager.STARTING_SEQUENCE_NUMBER);
        znRecord = segmentManager._records.get(index);
        metadata = new LLCRealtimeSegmentZKMetadata(znRecord);
        Assert.assertEquals(metadata.getNumReplicas(), 2);
        if (tableConfigStartOffset.equals("smallest")) {
          Assert.assertEquals(metadata.getStartOffset(), smallestPartitionOffset);
        } else {
          Assert.assertEquals(metadata.getStartOffset(), largestPartitionOffset);
        }
        break;
      case partitionWithHigherOffset:
        // We had left this partition with one segment whose end offset we set to a value higher than the one
        // returned by kafka. So, a second segment should be created with a sequence number one higher than the
        // first, and its start offset should equal the end offset of the first one.
        found++;
        Assert.assertEquals(segmentName.getSequenceNumber(), PinotLLCRealtimeSegmentManager.STARTING_SEQUENCE_NUMBER + 1);
        znRecord = segmentManager._records.get(index);
        metadata = new LLCRealtimeSegmentZKMetadata(znRecord);
        Assert.assertEquals(metadata.getNumReplicas(), 2);
        Assert.assertEquals(metadata.getStartOffset(), higherOffset);
        break;
    }
    index++;
  }
  // We should see all three cases here.
  Assert.assertEquals(3, found);

  // Now, if we make 'partitionToBeFixed' a non-consuming partition again, a second segment should get added with
  // the same start offset as the first one, since the kafka offset to return has not changed.
  Set<Integer> ncPartitions = new HashSet<>(1);
  ncPartitions.add(partitionToBeFixed);
  segmentManager.createConsumingSegment(rtTableName, ncPartitions, segmentManager.getExistingSegments(rtTableName), tableConfig);
  Assert.assertEquals(segmentManager._paths.size(), 4);
  Assert.assertEquals(segmentManager._records.size(), 4);
  Assert.assertEquals(segmentManager._oldSegmentNameStr.size(), 4);
  Assert.assertEquals(segmentManager._newSegmentNameStr.size(), 4);
  // The latest zn record should be that of the new one we added.
  ZNRecord znRecord = segmentManager._records.get(3);
  LLCRealtimeSegmentZKMetadata metadata = new LLCRealtimeSegmentZKMetadata(znRecord);
  Assert.assertEquals(metadata.getNumReplicas(), 2);
  Assert.assertEquals(metadata.getStartOffset(), smallestPartitionOffset);
  LLCSegmentName llcSegmentName = new LLCSegmentName(metadata.getSegmentName());
  Assert.assertEquals(llcSegmentName.getSequenceNumber(), PinotLLCRealtimeSegmentManager.STARTING_SEQUENCE_NUMBER + 2);
  Assert.assertEquals(llcSegmentName.getPartitionId(), partitionToBeFixed);

  // Now pretend the previous segment completed successfully, and set its end offset.
  metadata.setEndOffset(metadata.getStartOffset() + 10);
  metadata.setStatus(CommonConstants.Segment.Realtime.Status.DONE);
  segmentManager._records.remove(3);
  segmentManager._records.add(metadata.toZNRecord());
  segmentManager._metadataMap.put(metadata.getSegmentName(), metadata);
  segmentManager._kafkaLargestOffsetToReturn *= 2;
  segmentManager._kafkaSmallestOffsetToReturn *= 2;
  ncPartitions.clear();
  ncPartitions.add(partitionToBeFixed);
  segmentManager.createConsumingSegment(rtTableName, ncPartitions, segmentManager.getExistingSegments(rtTableName), tableConfig);
  Assert.assertEquals(segmentManager._paths.size(), 5);
  Assert.assertEquals(segmentManager._records.size(), 5);
  Assert.assertEquals(segmentManager._oldSegmentNameStr.size(), 5);
  Assert.assertEquals(segmentManager._newSegmentNameStr.size(), 5);
  znRecord = segmentManager._records.get(4);
  metadata = new LLCRealtimeSegmentZKMetadata(znRecord);
  Assert.assertEquals(metadata.getNumReplicas(), 2);
  // In this case, since we have data loss, we will always start from the smallest kafka offset available.
  Assert.assertEquals(metadata.getStartOffset(), segmentManager.getKafkaPartitionOffset(null, "smallest", partitionToBeFixed));
  llcSegmentName = new LLCSegmentName(metadata.getSegmentName());
  Assert.assertEquals(llcSegmentName.getSequenceNumber(), PinotLLCRealtimeSegmentManager.STARTING_SEQUENCE_NUMBER + 3);
  Assert.assertEquals(llcSegmentName.getPartitionId(), partitionToBeFixed);
}
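The assertions in this test all reduce to one offset-selection rule: resume where the previous completed segment ended, unless kafka's oldest retained offset has already moved past it (data loss), in which case fall back to kafka's smallest. A hedged sketch of that rule as a pure function; chooseStartOffset is an illustrative name and signature, not Pinot's actual getBetterStartOffsetIfNeeded.

public final class StartOffsetRule {
  /**
   * kafkaSmallestOffset: oldest offset kafka still retains for the partition.
   * prevSegmentEndOffset: end offset of the latest completed segment, or null if there is none.
   */
  static long chooseStartOffset(long kafkaSmallestOffset, Long prevSegmentEndOffset) {
    if (prevSegmentEndOffset == null) {
      return kafkaSmallestOffset; // No completed predecessor: fall back to what kafka offers.
    }
    // Normal case: resume exactly where the previous segment stopped. If kafka has already expired
    // those messages (prev end < smallest retained offset), we have data loss and must take the smallest.
    return Math.max(kafkaSmallestOffset, prevSegmentEndOffset);
  }

  public static void main(String[] args) {
    System.out.println(chooseStartOffset(100, 150L)); // 150: previous end offset wins (higher-offset case)
    System.out.println(chooseStartOffset(200, 150L)); // 200: data loss, start at kafka's smallest
    System.out.println(chooseStartOffset(100, null)); // 100: no predecessor (empty-partition case)
  }
}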
Use of org.apache.helix.ZNRecord in project pinot by linkedin.
From class PinotLLCRealtimeSegmentManagerTest, method testUpdatingKafkaPartitions:
@Test
public void testUpdatingKafkaPartitions() throws Exception {
  FakePinotLLCRealtimeSegmentManager segmentManager = new FakePinotLLCRealtimeSegmentManager(false, null);
  int nInstances = 3;
  int nKafkaPartitions = 8;
  int nReplicas = 3;
  final String topic = "someTopic";
  final String rtTableName = "table_REALTIME";
  List<String> instances = getInstanceList(nInstances);
  final String startOffset = KAFKA_OFFSET;

  // Populate 'partitionSet' with all kafka partitions; as we find partitions in the assignment, we will remove
  // each one from this set.
  Set<Integer> partitionSet = new HashSet<>(nKafkaPartitions);
  for (int i = 0; i < nKafkaPartitions; i++) {
    partitionSet.add(i);
  }

  // Set up the initial entries
  segmentManager.setupHelixEntries(topic, rtTableName, nKafkaPartitions, instances, nReplicas, startOffset,
      DUMMY_HOST, null, true, 1000);
  ZNRecord partitionAssignment = segmentManager._partitionAssignment;
  segmentManager._currentKafkaPartitionCount = nKafkaPartitions;
  segmentManager._currentInstanceList = instances;

  // A call to update the partition list should do nothing.
  AbstractTableConfig tableConfig = makeTableConfig(nReplicas, topic);
  segmentManager.updateKafkaPartitionsIfNecessary(rtTableName, tableConfig);
  Assert.assertTrue(segmentManager._partitionAssignment == partitionAssignment);

  // Change the number of kafka partitions to 9; we should generate a new partition assignment.
  nKafkaPartitions = 9;
  segmentManager._currentKafkaPartitionCount = nKafkaPartitions;
  segmentManager.updateKafkaPartitionsIfNecessary(rtTableName, tableConfig);
  partitionAssignment = validatePartitionAssignment(segmentManager, nKafkaPartitions, nReplicas);

  // Now reduce the number of instances; we should not update anything.
  segmentManager._currentInstanceList = getInstanceList(nInstances - 1);
  segmentManager.updateKafkaPartitionsIfNecessary(rtTableName, tableConfig);
  Assert.assertTrue(partitionAssignment == segmentManager._partitionAssignment);

  // Add one more server; we should update the assignment again.
  nInstances++;
  segmentManager._currentInstanceList = getInstanceList(nInstances);
  segmentManager.updateKafkaPartitionsIfNecessary(rtTableName, tableConfig);
  Assert.assertTrue(partitionAssignment != segmentManager._partitionAssignment);
  partitionAssignment = validatePartitionAssignment(segmentManager, nKafkaPartitions, nReplicas);

  // Increase the replica count by one; we should update the assignment.
  nReplicas++;
  tableConfig = makeTableConfig(nReplicas, topic);
  segmentManager.updateKafkaPartitionsIfNecessary(rtTableName, tableConfig);
  Assert.assertTrue(partitionAssignment != segmentManager._partitionAssignment);
  partitionAssignment = validatePartitionAssignment(segmentManager, nKafkaPartitions, nReplicas);

  // Change the list of servers while keeping the number of servers the same.
  // We should see a change in the partition assignment.
  String server1 = segmentManager._currentInstanceList.get(0);
  segmentManager._currentInstanceList.set(0, server1 + "_new");
  Assert.assertEquals(nInstances, segmentManager._currentInstanceList.size());
  segmentManager.updateKafkaPartitionsIfNecessary(rtTableName, tableConfig);
  Assert.assertTrue(partitionAssignment != segmentManager._partitionAssignment);
  partitionAssignment = validatePartitionAssignment(segmentManager, nKafkaPartitions, nReplicas);
}
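validatePartitionAssignment is a test helper not shown on this page; from its call sites it presumably verifies that the assignment ZNRecord maps every kafka partition to exactly nReplicas instances. A hedged re-creation of such a check against the ZNRecord API follows; the helper name and error handling are illustrative.

import java.util.Arrays;
import java.util.List;
import org.apache.helix.ZNRecord;

public class PartitionAssignmentCheck {
  // Illustrative version of what a validatePartitionAssignment helper could verify: one list field per
  // partition id, each holding exactly nReplicas instance names.
  static void checkAssignment(ZNRecord assignment, int nKafkaPartitions, int nReplicas) {
    for (int partition = 0; partition < nKafkaPartitions; partition++) {
      List<String> instances = assignment.getListField(Integer.toString(partition));
      if (instances == null || instances.size() != nReplicas) {
        throw new AssertionError("Partition " + partition + " should have " + nReplicas + " replicas");
      }
    }
  }

  public static void main(String[] args) {
    ZNRecord assignment = new ZNRecord("table_REALTIME");
    for (int p = 0; p < 2; p++) {
      assignment.setListField(Integer.toString(p), Arrays.asList("Server_1", "Server_2", "Server_3"));
    }
    checkAssignment(assignment, 2, 3); // Passes silently: 2 partitions, 3 replicas each.
  }
}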