Search in sources :

Example 31 with LLCSegmentName

use of com.linkedin.pinot.common.utils.LLCSegmentName in project pinot by linkedin.

the class PinotLLCRealtimeSegmentManager method completeCommittingSegmentsInternal.

private void completeCommittingSegmentsInternal(String realtimeTableName, Map<Integer, MinMaxPriorityQueue<LLCSegmentName>> partitionToLatestSegments) {
    IdealState idealState = getTableIdealState(realtimeTableName);
    Set<String> segmentNamesIS = idealState.getPartitionSet();
    final ZNRecord partitionAssignment = getKafkaPartitionAssignment(realtimeTableName);
    for (Map.Entry<Integer, MinMaxPriorityQueue<LLCSegmentName>> entry : partitionToLatestSegments.entrySet()) {
        final LLCSegmentName segmentName = entry.getValue().pollFirst();
        final String segmentId = segmentName.getSegmentName();
        final int partitionId = entry.getKey();
        if (!segmentNamesIS.contains(segmentId)) {
  "{}:Repairing segment for partition {}. Segment {} not found in idealstate", realtimeTableName, partitionId, segmentId);
            List<String> newInstances = partitionAssignment.getListField(Integer.toString(partitionId));
  "{}: Assigning segment {} to {}", realtimeTableName, segmentId, newInstances);
            // TODO Re-write num-partitions in metadata if needed.
            // If there was a prev segment in the same partition, then we need to fix it to be ONLINE.
            LLCSegmentName prevSegmentName = entry.getValue().pollLast();
            String prevSegmentNameStr = null;
            if (prevSegmentName != null) {
                prevSegmentNameStr = prevSegmentName.getSegmentName();
            updateIdealState(realtimeTableName, newInstances, prevSegmentNameStr, segmentId);
Also used : MinMaxPriorityQueue(com.google.common.collect.MinMaxPriorityQueue) Object2IntLinkedOpenHashMap(it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap) Map(java.util.Map) HashMap(java.util.HashMap) Object2IntMap(it.unimi.dsi.fastutil.objects.Object2IntMap) LLCSegmentName(com.linkedin.pinot.common.utils.LLCSegmentName) IdealState(org.apache.helix.model.IdealState) ZNRecord(org.apache.helix.ZNRecord)

Example 32 with LLCSegmentName

use of com.linkedin.pinot.common.utils.LLCSegmentName in project pinot by linkedin.

the class PinotLLCRealtimeSegmentManager method commitSegment.

  /**
   * This method is invoked after the realtime segment is uploaded but before a response is sent to the server.
   * It updates the propertystore segment metadata from IN_PROGRESS to DONE, and also creates new propertystore
   * records for new segments, and puts them in idealstate in CONSUMING state.
   * @param rawTableName Raw table name
   * @param committingSegmentNameStr Committing segment name
   * @param nextOffset The offset with which the next segment should start.
   * @return true once the propertystore and the idealstate have been updated
   */
public boolean commitSegment(String rawTableName, final String committingSegmentNameStr, long nextOffset) {
    final long now = System.currentTimeMillis();
    final String realtimeTableName = TableNameBuilder.REALTIME_TABLE_NAME_BUILDER.forTable(rawTableName);
    final LLCRealtimeSegmentZKMetadata oldSegMetadata = getRealtimeSegmentZKMetadata(realtimeTableName, committingSegmentNameStr);
    final LLCSegmentName oldSegmentName = new LLCSegmentName(committingSegmentNameStr);
    final int partitionId = oldSegmentName.getPartitionId();
    final int oldSeqNum = oldSegmentName.getSequenceNumber();
    oldSegMetadata.setDownloadUrl(ControllerConf.constructDownloadUrl(rawTableName, committingSegmentNameStr, _controllerConf.generateVipUrl()));
    // Pull segment metadata from incoming segment and set it in zk segment metadata
    SegmentMetadataImpl segmentMetadata = extractSegmentMetadata(rawTableName, committingSegmentNameStr);
    final ZNRecord oldZnRecord = oldSegMetadata.toZNRecord();
    final String oldZnodePath = ZKMetadataProvider.constructPropertyStorePathForSegment(realtimeTableName, committingSegmentNameStr);
    final ZNRecord partitionAssignment = getKafkaPartitionAssignment(realtimeTableName);
    // creating a new segment
    if (partitionAssignment == null) {
        LOGGER.warn("Kafka partition assignment not found for {}", realtimeTableName);
        throw new RuntimeException("Kafka partition assigment not found. Not committing segment");
    List<String> newInstances = partitionAssignment.getListField(Integer.toString(partitionId));
    // Construct segment metadata and idealstate for the new segment
    final int newSeqNum = oldSeqNum + 1;
    final long newStartOffset = nextOffset;
    LLCSegmentName newHolder = new LLCSegmentName(oldSegmentName.getTableName(), partitionId, newSeqNum, now);
    final String newSegmentNameStr = newHolder.getSegmentName();
    ZNRecord newZnRecord = makeZnRecordForNewSegment(rawTableName, newInstances.size(), newStartOffset, newSegmentNameStr);
    final LLCRealtimeSegmentZKMetadata newSegmentZKMetadata = new LLCRealtimeSegmentZKMetadata(newZnRecord);
    updateFlushThresholdForSegmentMetadata(newSegmentZKMetadata, partitionAssignment, getRealtimeTableFlushSizeForTable(rawTableName));
    newZnRecord = newSegmentZKMetadata.toZNRecord();
    final String newZnodePath = ZKMetadataProvider.constructPropertyStorePathForSegment(realtimeTableName, newSegmentNameStr);
    List<String> paths = new ArrayList<>(2);
    List<ZNRecord> records = new ArrayList<>(2);
     * Update zookeeper in two steps.
     * Step 1: Update PROPERTYSTORE to change the segment metadata for old segment and add a new one for new segment
     * Step 2: Update IDEALSTATES to include the new segment in the idealstate for the table in CONSUMING state, and change
     *         the old segment to ONLINE state.
     * The controller may fail between these two steps, so when a new controller takes over as leader, it needs to
     * check whether there are any recent segments in PROPERTYSTORE that are not accounted for in idealState. If so,
     * it should create the new segments in idealState.
     * If the controller fails after step-2, we are fine because the idealState has the new segments.
     * If the controller fails before step-1, the server will see this as an upload failure, and will re-try.
    writeSegmentsToPropertyStore(paths, records, realtimeTableName);
    // TODO Introduce a controller failure here for integration testing
    // When multiple segments of the same table complete around the same time it is possible that
    // the idealstate udpate fails due to contention. We serialize the updates to the idealstate
    // to reduce this contention. We may still contend with RetentionManager, or other updates
    // to idealstate from other controllers, but then we have the retry mechanism to get around that.
    // hash code can be negative, so make sure we are getting a positive lock index
    int lockIndex = (realtimeTableName.hashCode() & Integer.MAX_VALUE) % NUM_LOCKS;
    Lock lock = _idealstateUpdateLocks[lockIndex];
    try {
        updateIdealState(realtimeTableName, newInstances, committingSegmentNameStr, newSegmentNameStr);"Changed {} to ONLINE and created {} in CONSUMING", committingSegmentNameStr, newSegmentNameStr);
    } finally {
    return true;
Also used : ArrayList(java.util.ArrayList) SegmentMetadataImpl(com.linkedin.pinot.core.segment.index.SegmentMetadataImpl) LLCRealtimeSegmentZKMetadata(com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata) LLCSegmentName(com.linkedin.pinot.common.utils.LLCSegmentName) ZNRecord(org.apache.helix.ZNRecord) ReentrantLock(java.util.concurrent.locks.ReentrantLock) Lock(java.util.concurrent.locks.Lock)

Example 33 with LLCSegmentName

use of com.linkedin.pinot.common.utils.LLCSegmentName in project pinot by linkedin.

the class PinotLLCRealtimeSegmentManager method updateFlushThresholdForSegmentMetadata.

void updateFlushThresholdForSegmentMetadata(LLCRealtimeSegmentZKMetadata segmentZKMetadata, ZNRecord partitionAssignment, int tableFlushSize) {
    // Only update the flush threshold if there is a valid table flush size
    if (tableFlushSize < 1) {
    // Gather list of instances for this partition
    Object2IntMap<String> partitionCountForInstance = new Object2IntLinkedOpenHashMap<>();
    String segmentPartitionId = new LLCSegmentName(segmentZKMetadata.getSegmentName()).getPartitionRange();
    for (String instanceName : partitionAssignment.getListField(segmentPartitionId)) {
        partitionCountForInstance.put(instanceName, 0);
    // Find the maximum number of partitions served for each instance that is serving this segment
    int maxPartitionCountPerInstance = 1;
    for (Map.Entry<String, List<String>> partitionAndInstanceList : partitionAssignment.getListFields().entrySet()) {
        for (String instance : partitionAndInstanceList.getValue()) {
            if (partitionCountForInstance.containsKey(instance)) {
                int partitionCountForThisInstance = partitionCountForInstance.getInt(instance);
                partitionCountForInstance.put(instance, partitionCountForThisInstance);
                if (maxPartitionCountPerInstance < partitionCountForThisInstance) {
                    maxPartitionCountPerInstance = partitionCountForThisInstance;
    // Configure the segment size flush limit based on the maximum number of partitions allocated to a replica
    int segmentFlushSize = (int) (((float) tableFlushSize) / maxPartitionCountPerInstance);
Also used : Object2IntLinkedOpenHashMap(it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap) List(java.util.List) ArrayList(java.util.ArrayList) LLCSegmentName(com.linkedin.pinot.common.utils.LLCSegmentName) Object2IntLinkedOpenHashMap(it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap) Map(java.util.Map) HashMap(java.util.HashMap) Object2IntMap(it.unimi.dsi.fastutil.objects.Object2IntMap)

Example 34 with LLCSegmentName

use of com.linkedin.pinot.common.utils.LLCSegmentName in project pinot by linkedin.

the class ValidationManager method validateLLCSegments.

// For LLC segments, validate that there is at least one segment in CONSUMING state for every partition.
void validateLLCSegments(final String realtimeTableName, AbstractTableConfig tableConfig) {"Validating LLC Segments for {}", realtimeTableName);
    Map<String, String> streamConfigs = tableConfig.getIndexingConfig().getStreamConfigs();
    ZNRecord partitionAssignment = _llcRealtimeSegmentManager.getKafkaPartitionAssignment(realtimeTableName);
    if (partitionAssignment == null) {
        LOGGER.warn("No partition assignment found for table {}", realtimeTableName);
    Map<String, List<String>> partitionToHostsMap = partitionAssignment.getListFields();
    // Keep a set of kafka partitions, and remove the partition when we find a segment in CONSUMING state in
    // that partition.
    Set<Integer> nonConsumingKafkaPartitions = new HashSet<>(partitionToHostsMap.size());
    for (String partitionStr : partitionToHostsMap.keySet()) {
    IdealState idealState = HelixHelper.getTableIdealState(_pinotHelixResourceManager.getHelixZkManager(), realtimeTableName);
    if (!idealState.isEnabled()) {
        // No validation to be done."Skipping validation for {} since it is disabled", realtimeTableName);
    // Walk through all segments in the idealState, looking for one instance that is in CONSUMING state. If we find one
    // remove the kafka partition that the segment belongs to, from the kafka partition set.
    // Make sure that there are at least some LLC segments in place. If there are no LLC segments, it is possible
    // that this table is in the process of being disabled for LLC
    Set<String> segmentIds = idealState.getPartitionSet();
    List<String> llcSegments = new ArrayList<>(segmentIds.size());
    for (String segmentId : segmentIds) {
        if (SegmentName.isLowLevelConsumerSegmentName(segmentId)) {
            Map<String, String> stateMap = idealState.getInstanceStateMap(segmentId);
            Iterator<String> iterator = stateMap.values().iterator();
            // If there is at least one instance in CONSUMING state, we are good.
            boolean foundConsuming = false;
            while (iterator.hasNext() && !foundConsuming) {
                String stateString =;
                if (stateString.equals(PinotHelixSegmentOnlineOfflineStateModelGenerator.CONSUMING_STATE)) {
          "Found CONSUMING segment {}", segmentId);
                    foundConsuming = true;
            if (foundConsuming) {
                LLCSegmentName llcSegmentName = new LLCSegmentName(segmentId);
    // Kafka partition set now has all the partitions that do not have any segments in CONSUMING state.
    if (!llcSegments.isEmpty()) {
        // Raise the metric only if there is at least one llc segment in the idealstate.
        _validationMetrics.updateNumNonConsumingPartitionsMetric(realtimeTableName, nonConsumingKafkaPartitions.size());
        // Recreate a segment for the partitions that are missing one.
        for (Integer kafkaPartition : nonConsumingKafkaPartitions) {
            LOGGER.warn("Table {}, kafka partition {} has no segments in CONSUMING state (out of {} llc segments)", realtimeTableName, kafkaPartition, llcSegments.size());
        if (_autoCreateOnError) {
            _llcRealtimeSegmentManager.createConsumingSegment(realtimeTableName, nonConsumingKafkaPartitions, llcSegments, tableConfig);
            _llcRealtimeSegmentManager.completeCommittingSegments(realtimeTableName, llcSegments);
    // Make this call after other validations (so that we verify that we are consistent against the existing partition
    // assignment). This call may end up changing the kafka partition assignment for the table.
    _llcRealtimeSegmentManager.updateKafkaPartitionsIfNecessary(realtimeTableName, tableConfig);
Also used : ArrayList(java.util.ArrayList) LLCSegmentName(com.linkedin.pinot.common.utils.LLCSegmentName) IdealState(org.apache.helix.model.IdealState) ArrayList(java.util.ArrayList) List(java.util.List) ZNRecord(org.apache.helix.ZNRecord) HashSet(java.util.HashSet)


LLCSegmentName (com.linkedin.pinot.common.utils.LLCSegmentName)34 Test (org.testng.annotations.Test)16 ArrayList (java.util.ArrayList)13 SegmentCompletionProtocol (com.linkedin.pinot.common.protocols.SegmentCompletionProtocol)12 ZNRecord (org.apache.helix.ZNRecord)11 IdealState (org.apache.helix.model.IdealState)10 LLCRealtimeSegmentZKMetadata (com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata)9 HashMap (java.util.HashMap)8 Request (com.linkedin.pinot.common.protocols.SegmentCompletionProtocol.Request)6 Object2IntLinkedOpenHashMap (it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap)5 ExternalView (org.apache.helix.model.ExternalView)5 Field (java.lang.reflect.Field)4 HashSet (java.util.HashSet)4 List (java.util.List)4 BaseConfiguration (org.apache.commons.configuration.BaseConfiguration)4 HLCSegmentName (com.linkedin.pinot.common.utils.HLCSegmentName)3 InstanceConfig (org.apache.helix.model.InstanceConfig)3 BeforeTest (org.testng.annotations.BeforeTest)3 MinMaxPriorityQueue (com.google.common.collect.MinMaxPriorityQueue) AbstractTableConfig (com.linkedin.pinot.common.config.AbstractTableConfig)2