Use of com.linkedin.pinot.common.utils.LLCSegmentName in project pinot by linkedin.
In class PinotLLCRealtimeSegmentManager, method completeCommittingSegmentsInternal.
private void completeCommittingSegmentsInternal(String realtimeTableName,
    Map<Integer, MinMaxPriorityQueue<LLCSegmentName>> partitionToLatestSegments) {
  IdealState idealState = getTableIdealState(realtimeTableName);
  Set<String> segmentNamesIS = idealState.getPartitionSet();
  final ZNRecord partitionAssignment = getKafkaPartitionAssignment(realtimeTableName);
  for (Map.Entry<Integer, MinMaxPriorityQueue<LLCSegmentName>> entry : partitionToLatestSegments.entrySet()) {
    final LLCSegmentName segmentName = entry.getValue().pollFirst();
    final String segmentId = segmentName.getSegmentName();
    final int partitionId = entry.getKey();
    if (!segmentNamesIS.contains(segmentId)) {
      LOGGER.info("{}:Repairing segment for partition {}. Segment {} not found in idealstate", realtimeTableName,
          partitionId, segmentId);
      List<String> newInstances = partitionAssignment.getListField(Integer.toString(partitionId));
      LOGGER.info("{}: Assigning segment {} to {}", realtimeTableName, segmentId, newInstances);
      // TODO Re-write num-partitions in metadata if needed.
      // If there was a prev segment in the same partition, then we need to fix it to be ONLINE.
      LLCSegmentName prevSegmentName = entry.getValue().pollLast();
      String prevSegmentNameStr = null;
      if (prevSegmentName != null) {
        prevSegmentNameStr = prevSegmentName.getSegmentName();
      }
      updateIdealState(realtimeTableName, newInstances, prevSegmentNameStr, segmentId);
    }
  }
}
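Everything in this repair path works off the components encoded in an LLC segment name: table name, Kafka partition id, sequence number and creation time. A minimal sketch of that round trip, with an illustrative table name and partition (the name string shown in the comment is only indicative of the format):

// Build a name from its components, then parse the same components back out of the string.
LLCSegmentName segmentName = new LLCSegmentName("myTable", 3, 7, System.currentTimeMillis());
String segmentId = segmentName.getSegmentName();                         // roughly "myTable__3__7__<creationTime>"
int partitionId = new LLCSegmentName(segmentId).getPartitionId();        // 3
int sequenceNumber = new LLCSegmentName(segmentId).getSequenceNumber();  // 7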
Use of com.linkedin.pinot.common.utils.LLCSegmentName in project pinot by linkedin.
In class PinotLLCRealtimeSegmentManager, method commitSegment.
/**
 * This method is invoked after the realtime segment is uploaded but before a response is sent to the server.
 * It updates the propertystore segment metadata from IN_PROGRESS to DONE, and also creates new propertystore
 * records for new segments, and puts them in idealstate in CONSUMING state.
 *
 * @param rawTableName Raw table name
 * @param committingSegmentNameStr Committing segment name
 * @param nextOffset The offset with which the next segment should start.
 * @return true if the commit succeeded
 */
public boolean commitSegment(String rawTableName, final String committingSegmentNameStr, long nextOffset) {
  final long now = System.currentTimeMillis();
  final String realtimeTableName = TableNameBuilder.REALTIME_TABLE_NAME_BUILDER.forTable(rawTableName);
  final LLCRealtimeSegmentZKMetadata oldSegMetadata =
      getRealtimeSegmentZKMetadata(realtimeTableName, committingSegmentNameStr);
  final LLCSegmentName oldSegmentName = new LLCSegmentName(committingSegmentNameStr);
  final int partitionId = oldSegmentName.getPartitionId();
  final int oldSeqNum = oldSegmentName.getSequenceNumber();
  oldSegMetadata.setEndOffset(nextOffset);
  oldSegMetadata.setStatus(CommonConstants.Segment.Realtime.Status.DONE);
  oldSegMetadata.setDownloadUrl(
      ControllerConf.constructDownloadUrl(rawTableName, committingSegmentNameStr, _controllerConf.generateVipUrl()));
  // Pull segment metadata from incoming segment and set it in zk segment metadata
  SegmentMetadataImpl segmentMetadata = extractSegmentMetadata(rawTableName, committingSegmentNameStr);
  oldSegMetadata.setCrc(Long.valueOf(segmentMetadata.getCrc()));
  oldSegMetadata.setStartTime(segmentMetadata.getTimeInterval().getStartMillis());
  oldSegMetadata.setEndTime(segmentMetadata.getTimeInterval().getEndMillis());
  oldSegMetadata.setTimeUnit(TimeUnit.MILLISECONDS);
  oldSegMetadata.setIndexVersion(segmentMetadata.getVersion());
  oldSegMetadata.setTotalRawDocs(segmentMetadata.getTotalRawDocs());
  final ZNRecord oldZnRecord = oldSegMetadata.toZNRecord();
  final String oldZnodePath =
      ZKMetadataProvider.constructPropertyStorePathForSegment(realtimeTableName, committingSegmentNameStr);
  final ZNRecord partitionAssignment = getKafkaPartitionAssignment(realtimeTableName);
  // creating a new segment
  if (partitionAssignment == null) {
    LOGGER.warn("Kafka partition assignment not found for {}", realtimeTableName);
    throw new RuntimeException("Kafka partition assignment not found. Not committing segment");
  }
  List<String> newInstances = partitionAssignment.getListField(Integer.toString(partitionId));
  // Construct segment metadata and idealstate for the new segment
  final int newSeqNum = oldSeqNum + 1;
  final long newStartOffset = nextOffset;
  LLCSegmentName newHolder = new LLCSegmentName(oldSegmentName.getTableName(), partitionId, newSeqNum, now);
  final String newSegmentNameStr = newHolder.getSegmentName();
  ZNRecord newZnRecord =
      makeZnRecordForNewSegment(rawTableName, newInstances.size(), newStartOffset, newSegmentNameStr);
  final LLCRealtimeSegmentZKMetadata newSegmentZKMetadata = new LLCRealtimeSegmentZKMetadata(newZnRecord);
  updateFlushThresholdForSegmentMetadata(newSegmentZKMetadata, partitionAssignment,
      getRealtimeTableFlushSizeForTable(rawTableName));
  newZnRecord = newSegmentZKMetadata.toZNRecord();
  final String newZnodePath =
      ZKMetadataProvider.constructPropertyStorePathForSegment(realtimeTableName, newSegmentNameStr);
  List<String> paths = new ArrayList<>(2);
  paths.add(oldZnodePath);
  paths.add(newZnodePath);
  List<ZNRecord> records = new ArrayList<>(2);
  records.add(oldZnRecord);
  records.add(newZnRecord);
  /*
   * Update zookeeper in two steps.
   *
   * Step 1: Update PROPERTYSTORE to change the segment metadata for old segment and add a new one for new segment
   * Step 2: Update IDEALSTATES to include the new segment in the idealstate for the table in CONSUMING state, and
   *         change the old segment to ONLINE state.
   *
   * The controller may fail between these two steps, so when a new controller takes over as leader, it needs to
   * check whether there are any recent segments in PROPERTYSTORE that are not accounted for in idealState. If so,
   * it should create the new segments in idealState.
   *
   * If the controller fails after step-2, we are fine because the idealState has the new segments.
   * If the controller fails before step-1, the server will see this as an upload failure, and will re-try.
   */
  writeSegmentsToPropertyStore(paths, records, realtimeTableName);
  // TODO Introduce a controller failure here for integration testing
  // When multiple segments of the same table complete around the same time it is possible that
  // the idealstate update fails due to contention. We serialize the updates to the idealstate
  // to reduce this contention. We may still contend with RetentionManager, or other updates
  // to idealstate from other controllers, but then we have the retry mechanism to get around that.
  // hash code can be negative, so make sure we are getting a positive lock index
  int lockIndex = (realtimeTableName.hashCode() & Integer.MAX_VALUE) % NUM_LOCKS;
  Lock lock = _idealstateUpdateLocks[lockIndex];
  try {
    lock.lock();
    updateIdealState(realtimeTableName, newInstances, committingSegmentNameStr, newSegmentNameStr);
    LOGGER.info("Changed {} to ONLINE and created {} in CONSUMING", committingSegmentNameStr, newSegmentNameStr);
  } finally {
    lock.unlock();
  }
  return true;
}
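The lock-striping step at the end of commitSegment hinges on the bit trick called out in the comment above: clearing the sign bit always yields a non-negative index, whereas Math.abs() would still return a negative value for Integer.MIN_VALUE. A small standalone sketch of the same computation (NUM_LOCKS = 64 is an assumed value here, not taken from the class):

// Positive lock-index computation; the table name and NUM_LOCKS are illustrative only.
int NUM_LOCKS = 64;
int hash = "myTable_REALTIME".hashCode();                // may be negative
int lockIndex = (hash & Integer.MAX_VALUE) % NUM_LOCKS;  // always in [0, NUM_LOCKS)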
Use of com.linkedin.pinot.common.utils.LLCSegmentName in project pinot by linkedin.
In class PinotLLCRealtimeSegmentManager, method updateFlushThresholdForSegmentMetadata.
void updateFlushThresholdForSegmentMetadata(LLCRealtimeSegmentZKMetadata segmentZKMetadata,
    ZNRecord partitionAssignment, int tableFlushSize) {
  // Only update the flush threshold if there is a valid table flush size
  if (tableFlushSize < 1) {
    return;
  }
  // Gather list of instances for this partition
  Object2IntMap<String> partitionCountForInstance = new Object2IntLinkedOpenHashMap<>();
  String segmentPartitionId = new LLCSegmentName(segmentZKMetadata.getSegmentName()).getPartitionRange();
  for (String instanceName : partitionAssignment.getListField(segmentPartitionId)) {
    partitionCountForInstance.put(instanceName, 0);
  }
  // Find the maximum number of partitions served for each instance that is serving this segment
  int maxPartitionCountPerInstance = 1;
  for (Map.Entry<String, List<String>> partitionAndInstanceList : partitionAssignment.getListFields().entrySet()) {
    for (String instance : partitionAndInstanceList.getValue()) {
      if (partitionCountForInstance.containsKey(instance)) {
        int partitionCountForThisInstance = partitionCountForInstance.getInt(instance);
        partitionCountForThisInstance++;
        partitionCountForInstance.put(instance, partitionCountForThisInstance);
        if (maxPartitionCountPerInstance < partitionCountForThisInstance) {
          maxPartitionCountPerInstance = partitionCountForThisInstance;
        }
      }
    }
  }
  // Configure the segment size flush limit based on the maximum number of partitions allocated to a replica
  int segmentFlushSize = (int) (((float) tableFlushSize) / maxPartitionCountPerInstance);
  segmentZKMetadata.setSizeThresholdToFlushSegment(segmentFlushSize);
}
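The division at the end spreads the table-level flush size across the partitions hosted by the busiest replica, so per-server memory stays roughly constant as partitions are added. A worked example with purely illustrative numbers:

// Illustrative values, not from the snippet above.
int tableFlushSize = 5_000_000;            // table-level row threshold
int maxPartitionCountPerInstance = 4;      // busiest replica hosts 4 of the table's partitions
int segmentFlushSize = (int) (((float) tableFlushSize) / maxPartitionCountPerInstance);
// segmentFlushSize == 1_250_000 rows per consuming segment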
Use of com.linkedin.pinot.common.utils.LLCSegmentName in project pinot by linkedin.
In class ValidationManager, method validateLLCSegments.
// For LLC segments, validate that there is at least one segment in CONSUMING state for every partition.
void validateLLCSegments(final String realtimeTableName, AbstractTableConfig tableConfig) {
  LOGGER.info("Validating LLC Segments for {}", realtimeTableName);
  Map<String, String> streamConfigs = tableConfig.getIndexingConfig().getStreamConfigs();
  ZNRecord partitionAssignment = _llcRealtimeSegmentManager.getKafkaPartitionAssignment(realtimeTableName);
  if (partitionAssignment == null) {
    LOGGER.warn("No partition assignment found for table {}", realtimeTableName);
    return;
  }
  Map<String, List<String>> partitionToHostsMap = partitionAssignment.getListFields();
  // Keep a set of kafka partitions, and remove the partition when we find a segment in CONSUMING state in
  // that partition.
  Set<Integer> nonConsumingKafkaPartitions = new HashSet<>(partitionToHostsMap.size());
  for (String partitionStr : partitionToHostsMap.keySet()) {
    nonConsumingKafkaPartitions.add(Integer.valueOf(partitionStr));
  }
  IdealState idealState =
      HelixHelper.getTableIdealState(_pinotHelixResourceManager.getHelixZkManager(), realtimeTableName);
  if (!idealState.isEnabled()) {
    // No validation to be done.
    LOGGER.info("Skipping validation for {} since it is disabled", realtimeTableName);
    return;
  }
  // Walk through all segments in the idealState, looking for one instance that is in CONSUMING state. If we find
  // one, remove the kafka partition that the segment belongs to from the kafka partition set.
  // Make sure that there are at least some LLC segments in place. If there are no LLC segments, it is possible
  // that this table is in the process of being disabled for LLC.
  Set<String> segmentIds = idealState.getPartitionSet();
  List<String> llcSegments = new ArrayList<>(segmentIds.size());
  for (String segmentId : segmentIds) {
    if (SegmentName.isLowLevelConsumerSegmentName(segmentId)) {
      llcSegments.add(segmentId);
      Map<String, String> stateMap = idealState.getInstanceStateMap(segmentId);
      Iterator<String> iterator = stateMap.values().iterator();
      // If there is at least one instance in CONSUMING state, we are good.
      boolean foundConsuming = false;
      while (iterator.hasNext() && !foundConsuming) {
        String stateString = iterator.next();
        if (stateString.equals(PinotHelixSegmentOnlineOfflineStateModelGenerator.CONSUMING_STATE)) {
          LOGGER.info("Found CONSUMING segment {}", segmentId);
          foundConsuming = true;
        }
      }
      if (foundConsuming) {
        LLCSegmentName llcSegmentName = new LLCSegmentName(segmentId);
        nonConsumingKafkaPartitions.remove(llcSegmentName.getPartitionId());
      }
    }
  }
  // The kafka partition set now has all the partitions that do not have any segments in CONSUMING state.
  if (!llcSegments.isEmpty()) {
    // Raise the metric only if there is at least one llc segment in the idealstate.
    _validationMetrics.updateNumNonConsumingPartitionsMetric(realtimeTableName, nonConsumingKafkaPartitions.size());
    // Recreate a segment for the partitions that are missing one.
    for (Integer kafkaPartition : nonConsumingKafkaPartitions) {
      LOGGER.warn("Table {}, kafka partition {} has no segments in CONSUMING state (out of {} llc segments)",
          realtimeTableName, kafkaPartition, llcSegments.size());
    }
    if (_autoCreateOnError) {
      _llcRealtimeSegmentManager.createConsumingSegment(realtimeTableName, nonConsumingKafkaPartitions, llcSegments,
          tableConfig);
      _llcRealtimeSegmentManager.completeCommittingSegments(realtimeTableName, llcSegments);
    }
  }
  // Make this call after other validations (so that we verify that we are consistent against the existing partition
  // assignment). This call may end up changing the kafka partition assignment for the table.
  _llcRealtimeSegmentManager.updateKafkaPartitionsIfNecessary(realtimeTableName, tableConfig);
}
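The validation loop above only needs two facts from a segment id: whether it is an LLC segment at all, and which Kafka partition it belongs to. A minimal sketch of that check, using an illustrative segment id (the exact LLC name format shown is indicative only):

String segmentId = "myTable__2__15__20170101T0000Z";                     // illustrative LLC-style name
if (SegmentName.isLowLevelConsumerSegmentName(segmentId)) {
  int kafkaPartition = new LLCSegmentName(segmentId).getPartitionId();   // 2 in this example
  // The partition would then be dropped from nonConsumingKafkaPartitions, as in the loop above.
}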