Use of com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata in project pinot by linkedin.
The class SegmentCompletionManager, method lookupOrCreateFsm.
// We need to make sure that we never create multiple FSMs for the same segment, so this method must be synchronized.
private synchronized SegmentCompletionFSM lookupOrCreateFsm(final LLCSegmentName segmentName, String msgType) {
  final String segmentNameStr = segmentName.getSegmentName();
  SegmentCompletionFSM fsm = _fsmMap.get(segmentNameStr);
  if (fsm == null) {
    // Look up the property store to see if this is a completed segment.
    try {
      // TODO: if we keep a list of the last few committed segments, we don't need to go to zk for this.
      final String realtimeTableName = TableNameBuilder.REALTIME_TABLE_NAME_BUILDER.forTable(segmentName.getTableName());
      LLCRealtimeSegmentZKMetadata segmentMetadata =
          _segmentManager.getRealtimeSegmentZKMetadata(realtimeTableName, segmentName.getSegmentName());
      if (segmentMetadata == null) {
        // It is possible that we are in the process of reverting configuration back to high-level consumers.
        LOGGER.warn("Segment metadata not found for {}", segmentNameStr);
        throw new RuntimeException("Segment metadata not found for " + segmentNameStr);
      }
      if (segmentMetadata.getStatus().equals(CommonConstants.Segment.Realtime.Status.DONE)) {
        // Best to go through the state machine for this case as well, so that all state-handling code is in one place.
        // Also good for synchronization: multiple threads may take this path, and we don't want multiple instances
        // of the FSM to be created for the same commit sequence at the same time.
        final long endOffset = segmentMetadata.getEndOffset();
        fsm = SegmentCompletionFSM.fsmInCommit(_segmentManager, this, segmentName, segmentMetadata.getNumReplicas(), endOffset);
      } else if (msgType.equals(SegmentCompletionProtocol.MSG_TYPE_STOPPED_CONSUMING)) {
        fsm = SegmentCompletionFSM.fsmStoppedConsuming(_segmentManager, this, segmentName, segmentMetadata.getNumReplicas());
      } else {
        // The segment is in the process of completing, and this is the first server to respond. Create the FSM
        // in the holding state.
        fsm = SegmentCompletionFSM.fsmInHolding(_segmentManager, this, segmentName, segmentMetadata.getNumReplicas());
      }
      LOGGER.info("Created FSM {}", fsm);
      _fsmMap.put(segmentNameStr, fsm);
    } catch (Exception e) {
      // Server gone wonky, or the segment does not exist in the property store.
      LOGGER.error("Exception reading segment {} from propertystore", segmentNameStr, e);
      throw new RuntimeException("Failed to read segment " + segmentNameStr + " from propertystore", e);
    }
  }
  return fsm;
}
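The synchronized keyword on lookupOrCreateFsm is what enforces the at-most-one-FSM-per-segment invariant. The same single-creation guarantee can be expressed with ConcurrentHashMap.computeIfAbsent, as in the minimal sketch below; FsmRegistry, SegmentFsm, and createFsmFromZkMetadata are hypothetical stand-ins, not Pinot APIs, and the sketch omits the checked-exception handling that the real method needs around its ZooKeeper read.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Hypothetical sketch: "at most one FSM per segment" via computeIfAbsent instead of a synchronized method.
class FsmRegistry {
  private final ConcurrentMap<String, SegmentFsm> _fsmMap = new ConcurrentHashMap<>();

  SegmentFsm lookupOrCreate(String segmentNameStr) {
    // computeIfAbsent runs the factory at most once per key, even under concurrent callers.
    return _fsmMap.computeIfAbsent(segmentNameStr, name -> createFsmFromZkMetadata(name));
  }

  private SegmentFsm createFsmFromZkMetadata(String segmentNameStr) {
    // Placeholder for the ZK metadata lookup and status-based FSM construction shown above.
    return new SegmentFsm(segmentNameStr);
  }
}

class SegmentFsm {
  final String segmentName;
  SegmentFsm(String segmentName) { this.segmentName = segmentName; }
}

A factory that can fail (as the ZooKeeper lookup above can) is often easier to reason about behind an explicit synchronized method, which may be why the class is written the way it is.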
Use of com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata in project pinot by linkedin.
The class PinotLLCRealtimeSegmentManager, method makeZnRecordForNewSegment.
private ZNRecord makeZnRecordForNewSegment(String realtimeTableName, int numReplicas, long startOffset, String newSegmentNameStr) {
  final LLCRealtimeSegmentZKMetadata newSegMetadata = new LLCRealtimeSegmentZKMetadata();
  newSegMetadata.setCreationTime(System.currentTimeMillis());
  newSegMetadata.setStartOffset(startOffset);
  newSegMetadata.setEndOffset(END_OFFSET_FOR_CONSUMING_SEGMENTS);
  newSegMetadata.setNumReplicas(numReplicas);
  newSegMetadata.setTableName(realtimeTableName);
  newSegMetadata.setSegmentName(newSegmentNameStr);
  newSegMetadata.setStatus(CommonConstants.Segment.Realtime.Status.IN_PROGRESS);
  return newSegMetadata.toZNRecord();
}
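The ZNRecord produced here round-trips back into metadata: createSegment further down rebuilds an LLCRealtimeSegmentZKMetadata from the record to apply the flush threshold, then serializes it again. A minimal sketch of that round trip, with made-up field values (the segment name below is a hypothetical LLC-style name, not taken from the source):

import org.apache.helix.ZNRecord;
import com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata;
import com.linkedin.pinot.common.utils.CommonConstants;

// Illustrative round-trip check: serialize, rehydrate, verify.
static void roundTripExample() {
  LLCRealtimeSegmentZKMetadata metadata = new LLCRealtimeSegmentZKMetadata();
  metadata.setTableName("myTable");
  metadata.setSegmentName("myTable__0__0__20170101T0000Z"); // hypothetical LLC segment name
  metadata.setStartOffset(12345L);
  metadata.setStatus(CommonConstants.Segment.Realtime.Status.IN_PROGRESS);
  ZNRecord record = metadata.toZNRecord();
  LLCRealtimeSegmentZKMetadata copy = new LLCRealtimeSegmentZKMetadata(record);
  assert copy.getStartOffset() == 12345L;
  assert copy.getSegmentName().equals(metadata.getSegmentName());
}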
Use of com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata in project pinot by linkedin.
The class PinotLLCRealtimeSegmentManager, method setupInitialSegments.
protected void setupInitialSegments(String realtimeTableName, ZNRecord partitionAssignment, String topicName,
    String initialOffset, String bootstrapHosts, IdealState idealState, boolean create, int nReplicas, int flushSize) {
  List<String> currentSegments = getExistingSegments(realtimeTableName);
  // Make sure that there are no low-level segments existing.
  if (currentSegments != null) {
    for (String segment : currentSegments) {
      if (!SegmentName.isHighLevelConsumerSegmentName(segment)) {
        // We found an existing low-level segment; we do not support re-creating low-level realtime segments.
        throw new RuntimeException("Low-level segments already exist for table " + realtimeTableName);
      }
    }
  }
  // Map of segment names to the server instances that host the segment.
  final Map<String, List<String>> idealStateEntries = new HashMap<>(4);
  final Map<String, List<String>> partitionToServersMap = partitionAssignment.getListFields();
  final int nPartitions = partitionToServersMap.size();
  // Create one segment entry in the property store for each kafka partition.
  // Any of these may already be there, so bail out cleanly if they are already present.
  final long now = System.currentTimeMillis();
  final int seqNum = STARTING_SEQUENCE_NUMBER;
  List<LLCRealtimeSegmentZKMetadata> segmentZKMetadatas = new ArrayList<>();
  // Create metadata for each segment.
  for (int i = 0; i < nPartitions; i++) {
    final List<String> instances = partitionToServersMap.get(Integer.toString(i));
    LLCRealtimeSegmentZKMetadata metadata = new LLCRealtimeSegmentZKMetadata();
    final String rawTableName = TableNameBuilder.extractRawTableName(realtimeTableName);
    LLCSegmentName llcSegmentName = new LLCSegmentName(rawTableName, i, seqNum, now);
    final String segName = llcSegmentName.getSegmentName();
    metadata.setCreationTime(now);
    final long startOffset = getPartitionOffset(topicName, bootstrapHosts, initialOffset, i);
    LOGGER.info("Setting start offset for segment {} to {}", segName, startOffset);
    metadata.setStartOffset(startOffset);
    metadata.setEndOffset(END_OFFSET_FOR_CONSUMING_SEGMENTS);
    metadata.setNumReplicas(instances.size());
    metadata.setTableName(rawTableName);
    metadata.setSegmentName(segName);
    metadata.setStatus(CommonConstants.Segment.Realtime.Status.IN_PROGRESS);
    segmentZKMetadatas.add(metadata);
    idealStateEntries.put(segName, instances);
  }
  // Compute the row-count flush threshold for each segment.
  for (LLCRealtimeSegmentZKMetadata segmentZKMetadata : segmentZKMetadatas) {
    updateFlushThresholdForSegmentMetadata(segmentZKMetadata, partitionAssignment, flushSize);
  }
  // Write metadata for each segment to the Helix property store.
  List<String> paths = new ArrayList<>(nPartitions);
  List<ZNRecord> records = new ArrayList<>(nPartitions);
  for (LLCRealtimeSegmentZKMetadata segmentZKMetadata : segmentZKMetadatas) {
    ZNRecord record = segmentZKMetadata.toZNRecord();
    final String znodePath =
        ZKMetadataProvider.constructPropertyStorePathForSegment(realtimeTableName, segmentZKMetadata.getSegmentName());
    paths.add(znodePath);
    records.add(record);
  }
  writeSegmentsToPropertyStore(paths, records, realtimeTableName);
  LOGGER.info("Added {} segments to propertyStore for table {}", paths.size(), realtimeTableName);
  updateIdealState(idealState, realtimeTableName, idealStateEntries, create, nReplicas);
}
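setupInitialSegments reads the partition assignment through ZNRecord.getListFields(), so the expected shape is one list field per kafka partition id, each holding the server instances for that partition. A hedged sketch of such a record for a two-partition topic (the record id and instance names are illustrative):

import java.util.Arrays;
import org.apache.helix.ZNRecord;

static ZNRecord examplePartitionAssignment() {
  ZNRecord partitionAssignment = new ZNRecord("myTable_REALTIME"); // record id is illustrative
  partitionAssignment.setListField("0", Arrays.asList("Server_host1_8098", "Server_host2_8098"));
  partitionAssignment.setListField("1", Arrays.asList("Server_host2_8098", "Server_host3_8098"));
  // Two list fields -> nPartitions == 2, so setupInitialSegments creates one IN_PROGRESS segment
  // per partition, each with numReplicas == instances.size() == 2.
  return partitionAssignment;
}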
Use of com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata in project pinot by linkedin.
The class PinotLLCRealtimeSegmentManager, method createSegment.
private void createSegment(String realtimeTableName, int numReplicas, int partitionId, int seqNum,
    List<String> serverInstances, long startOffset, ZNRecord partitionAssignment) {
  LOGGER.info("Attempting to auto-create a segment for partition {} of table {}", partitionId, realtimeTableName);
  final List<String> propStorePaths = new ArrayList<>(1);
  final List<ZNRecord> propStoreEntries = new ArrayList<>(1);
  long now = System.currentTimeMillis();
  final String tableName = TableNameBuilder.extractRawTableName(realtimeTableName);
  LLCSegmentName newSegmentName = new LLCSegmentName(tableName, partitionId, seqNum, now);
  final String newSegmentNameStr = newSegmentName.getSegmentName();
  ZNRecord newZnRecord = makeZnRecordForNewSegment(realtimeTableName, numReplicas, startOffset, newSegmentNameStr);
  final LLCRealtimeSegmentZKMetadata newSegmentZKMetadata = new LLCRealtimeSegmentZKMetadata(newZnRecord);
  updateFlushThresholdForSegmentMetadata(newSegmentZKMetadata, partitionAssignment,
      getRealtimeTableFlushSizeForTable(realtimeTableName));
  newZnRecord = newSegmentZKMetadata.toZNRecord();
  final String newZnodePath = ZKMetadataProvider.constructPropertyStorePathForSegment(realtimeTableName, newSegmentNameStr);
  propStorePaths.add(newZnodePath);
  propStoreEntries.add(newZnRecord);
  writeSegmentsToPropertyStore(propStorePaths, propStoreEntries, realtimeTableName);
  updateIdealState(realtimeTableName, serverInstances, null, newSegmentNameStr);
  LOGGER.info("Successful auto-create of CONSUMING segment {}", newSegmentNameStr);
  _controllerMetrics.addMeteredTableValue(realtimeTableName, ControllerMeter.LLC_AUTO_CREATED_PARTITIONS, 1);
}
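Both setupInitialSegments and createSegment delegate the row-count threshold to updateFlushThresholdForSegmentMetadata, which this listing does not show. Below is a hedged sketch of one plausible computation, under the assumption that the table-level flush size is divided by the largest number of kafka partitions hosted on any single server, so that no server buffers more than flushSize rows across its consuming segments; this is an assumption about the implementation, not code from the source.

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.helix.ZNRecord;

// Hypothetical helper; the real updateFlushThresholdForSegmentMetadata is not shown in this listing.
static int perSegmentFlushThreshold(ZNRecord partitionAssignment, int tableFlushSize) {
  // Count how many partitions each server instance consumes.
  Map<String, Integer> partitionsPerServer = new HashMap<>();
  for (List<String> servers : partitionAssignment.getListFields().values()) {
    for (String server : servers) {
      partitionsPerServer.merge(server, 1, Integer::sum);
    }
  }
  // Divide the table-level budget by the most heavily loaded server's partition count.
  int maxPartitionsOnOneServer = 1;
  for (int count : partitionsPerServer.values()) {
    maxPartitionsOnOneServer = Math.max(maxPartitionsOnOneServer, count);
  }
  return tableFlushSize / maxPartitionsOnOneServer;
}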
Use of com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata in project pinot by linkedin.
The class PinotLLCRealtimeSegmentManager, method getBetterStartOffsetIfNeeded.
private long getBetterStartOffsetIfNeeded(final String realtimeTableName, final int partition,
    final LLCSegmentName latestSegment, final long oldestOffsetInKafka, final int nextSeqNum) {
  final LLCRealtimeSegmentZKMetadata oldSegMetadata = getRealtimeSegmentZKMetadata(realtimeTableName, latestSegment.getSegmentName());
  CommonConstants.Segment.Realtime.Status status = oldSegMetadata.getStatus();
  long segmentStartOffset = oldestOffsetInKafka;
  // Offset at which the previous segment intended to start consuming.
  final long prevSegStartOffset = oldSegMetadata.getStartOffset();
  if (status.equals(CommonConstants.Segment.Realtime.Status.IN_PROGRESS)) {
    if (oldestOffsetInKafka <= prevSegStartOffset) {
      // We still have the same start offset available; re-use it.
      segmentStartOffset = prevSegStartOffset;
      LOGGER.info("Choosing previous segment start offset {} for table {} for partition {}, sequence {}",
          prevSegStartOffset, realtimeTableName, partition, nextSeqNum);
    } else {
      // There is data loss: start from the earliest offset still in kafka.
      LOGGER.warn("Data lost from kafka offset {} to {} for table {} partition {} sequence {}",
          prevSegStartOffset, oldestOffsetInKafka, realtimeTableName, partition, nextSeqNum);
      _controllerMetrics.addMeteredTableValue(realtimeTableName, ControllerMeter.LLC_KAFKA_DATA_LOSS, 1);
    }
  } else {
    // Status must be DONE, so the previous segment has a valid end offset
    // (it would be 0 if the segment had not completed).
    final long prevSegEndOffset = oldSegMetadata.getEndOffset();
    if (oldestOffsetInKafka < prevSegEndOffset) {
      // We don't want to create a segment that overlaps in data with the previous segment. We know that the previous
      // segment's end offset is still available in kafka, so use that.
      segmentStartOffset = prevSegEndOffset;
      LOGGER.info("Choosing previous segment end offset {} for table {} for partition {}, sequence {}",
          prevSegEndOffset, realtimeTableName, partition, nextSeqNum);
    } else if (oldestOffsetInKafka > prevSegEndOffset) {
      // Kafka's oldest offset is greater than the end offset of the previous segment, so there is data loss.
      LOGGER.warn("Data lost from kafka offset {} to {} for table {} partition {} sequence {}",
          prevSegEndOffset, oldestOffsetInKafka, realtimeTableName, partition, nextSeqNum);
      _controllerMetrics.addMeteredTableValue(realtimeTableName, ControllerMeter.LLC_KAFKA_DATA_LOSS, 1);
    } else {
      // The two happen to be equal. A rarity, so log it.
      LOGGER.info("Kafka earliest offset {} is the same as the new segment start offset", oldestOffsetInKafka);
    }
  }
  return segmentStartOffset;
}
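For a completed previous segment, the selection rule above reduces to max(oldestOffsetInKafka, prevSegEndOffset), with data loss metered whenever kafka's retention has already discarded the boundary offset. A small illustrative harness (chooseStartOffset is a hypothetical helper mirroring only the DONE branch; the offsets are made up):

// Hypothetical helper: never start before the previous segment's end offset, but never ask
// kafka for an offset it no longer retains.
static long chooseStartOffset(long oldestOffsetInKafka, long prevSegEndOffset) {
  return Math.max(oldestOffsetInKafka, prevSegEndOffset);
}

// With prevSegEndOffset = 500:
//   chooseStartOffset(300, 500) == 500  -> re-use the boundary; no overlap with the previous segment
//   chooseStartOffset(800, 500) == 800  -> offsets 500..799 expired from kafka; LLC_KAFKA_DATA_LOSS is metered
//   chooseStartOffset(500, 500) == 500  -> exact match, logged as a rarity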