Example 1 with BuildingShardSpec

Use of org.apache.druid.timeline.partition.BuildingShardSpec in project druid by druid-io.

From the class SegmentPublisherHelper, the method annotateShardSpec:

/**
 * This method fills missing information in the shard spec if necessary when publishing segments.
 *
 * - When time chunk lock is used, the non-appending task should set the proper size of the core partitions for
 *   dynamically-partitioned segments. See {@link #annotateCorePartitionSetSizeFn}.
 * - When segment lock is used, the overwriting task should set the proper size of the atomic update group.
 *   See {@link #annotateAtomicUpdateGroupFn}.
 */
static Set<DataSegment> annotateShardSpec(Set<DataSegment> segments) {
    final Map<Interval, List<DataSegment>> intervalToSegments = new HashMap<>();
    segments.forEach(
        segment -> intervalToSegments.computeIfAbsent(segment.getInterval(), k -> new ArrayList<>()).add(segment)
    );
    for (Entry<Interval, List<DataSegment>> entry : intervalToSegments.entrySet()) {
        final Interval interval = entry.getKey();
        final List<DataSegment> segmentsPerInterval = entry.getValue();
        final ShardSpec firstShardSpec = segmentsPerInterval.get(0).getShardSpec();
        final boolean anyMismatch = segmentsPerInterval.stream()
            .anyMatch(segment -> segment.getShardSpec().getClass() != firstShardSpec.getClass());
        if (anyMismatch) {
            throw new ISE("Mismatched shardSpecs in interval[%s] for segments[%s]", interval, segmentsPerInterval);
        }
        final Function<DataSegment, DataSegment> annotateFn;
        if (firstShardSpec instanceof OverwriteShardSpec) {
            annotateFn = annotateAtomicUpdateGroupFn(segmentsPerInterval.size());
        } else if (firstShardSpec instanceof BuildingShardSpec) {
            // Sanity check. BuildingShardSpec is used in non-appending mode,
            // where the segments in each interval must have contiguous partitionIds
            // so that they are queryable (see PartitionHolder.isComplete()).
            final int expectedCorePartitionSetSize = segmentsPerInterval.size();
            final int actualCorePartitionSetSize = Math.toIntExact(
                segmentsPerInterval.stream()
                    .filter(segment -> segment.getShardSpec().getPartitionNum() < expectedCorePartitionSetSize)
                    .count()
            );
            if (expectedCorePartitionSetSize != actualCorePartitionSetSize) {
                LOG.errorSegments(segmentsPerInterval, "Cannot publish segments due to incomplete time chunk");
                throw new ISE(
                    "Cannot publish segments due to incomplete time chunk for interval[%s]. "
                    + "Expected [%s] segments in the core partition, but only [%s] segments are found. "
                    + "See task logs for more details about these segments.",
                    interval,
                    expectedCorePartitionSetSize,
                    actualCorePartitionSetSize
                );
            }
            annotateFn = annotateCorePartitionSetSizeFn(expectedCorePartitionSetSize);
        } else if (firstShardSpec instanceof BucketNumberedShardSpec) {
            throw new ISE("Cannot publish segments with shardSpec[%s]", firstShardSpec);
        } else {
            annotateFn = null;
        }
        if (annotateFn != null) {
            intervalToSegments.put(
                interval,
                segmentsPerInterval.stream().map(annotateFn).collect(Collectors.toList())
            );
        }
    }
    return intervalToSegments.values().stream().flatMap(Collection::stream).collect(Collectors.toSet());
}
Also used : Logger(org.apache.druid.java.util.common.logger.Logger) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) BuildingShardSpec(org.apache.druid.timeline.partition.BuildingShardSpec) Collection(java.util.Collection) OverwriteShardSpec(org.apache.druid.timeline.partition.OverwriteShardSpec) Set(java.util.Set) ISE(org.apache.druid.java.util.common.ISE) HashMap(java.util.HashMap) Function(java.util.function.Function) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) Interval(org.joda.time.Interval) List(java.util.List) Map(java.util.Map) DataSegment(org.apache.druid.timeline.DataSegment) Entry(java.util.Map.Entry) BucketNumberedShardSpec(org.apache.druid.timeline.partition.BucketNumberedShardSpec)
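
The two helper functions referenced in the snippet above are not shown on this page. Here is a rough sketch of what they plausibly do, assuming DataSegment.withShardSpec, BuildingShardSpec.convert(int), and OverwriteShardSpec.withAtomicUpdateGroupSize(short) from Druid's timeline API; treat this as an illustration rather than the project's exact code:

// Sketch only: approximates the helpers referenced by annotateShardSpec.
private static Function<DataSegment, DataSegment> annotateCorePartitionSetSizeFn(int corePartitionSetSize) {
    return segment -> {
        // A BuildingShardSpec is finalized into the real shard spec once the
        // total number of core partitions in the time chunk is known.
        final BuildingShardSpec<?> shardSpec = (BuildingShardSpec<?>) segment.getShardSpec();
        return segment.withShardSpec(shardSpec.convert(corePartitionSetSize));
    };
}

private static Function<DataSegment, DataSegment> annotateAtomicUpdateGroupFn(int atomicUpdateGroupSize) {
    return segment -> {
        // OverwriteShardSpec records how many segments must become visible
        // to queries together (the atomic update group).
        final OverwriteShardSpec shardSpec = (OverwriteShardSpec) segment.getShardSpec();
        return segment.withShardSpec(shardSpec.withAtomicUpdateGroupSize((short) atomicUpdateGroupSize));
    };
}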

Example 2 with BuildingShardSpec

Use of org.apache.druid.timeline.partition.BuildingShardSpec in project druid by druid-io.

From the class ParallelIndexSupervisorTask, the method getPartitionToLocations:

/**
 * Creates a map from partition (interval + bucketId) to the corresponding
 * PartitionLocations. Note that the bucketId may be different from the final
 * partitionId (refer to {@link BuildingShardSpec} for more details).
 */
static Map<Partition, List<PartitionLocation>> getPartitionToLocations(Map<String, GeneratedPartitionsReport> subTaskIdToReport) {
    // Create a map from partition to the list of reports (PartitionStat and subTaskId).
    // Sort by (interval, bucketId) to maintain the order of partitionIds within each interval.
    final Map<Partition, List<PartitionReport>> partitionToReports = new TreeMap<>(
        Comparator.comparingLong((Partition partition) -> partition.getInterval().getStartMillis())
                  .thenComparingLong(partition -> partition.getInterval().getEndMillis())
                  .thenComparingInt(Partition::getBucketId)
    );
    subTaskIdToReport.forEach(
        (subTaskId, report) -> report.getPartitionStats().forEach(
            partitionStat -> partitionToReports
                .computeIfAbsent(Partition.fromStat(partitionStat), p -> new ArrayList<>())
                .add(new PartitionReport(subTaskId, partitionStat))
        )
    );
    final Map<Partition, List<PartitionLocation>> partitionToLocations = new HashMap<>();
    Interval prevInterval = null;
    final AtomicInteger partitionId = new AtomicInteger(0);
    for (Entry<Partition, List<PartitionReport>> entry : partitionToReports.entrySet()) {
        final Partition partition = entry.getKey();
        // Reset the partitionId if this is a new interval
        Interval interval = partition.getInterval();
        if (!interval.equals(prevInterval)) {
            partitionId.set(0);
            prevInterval = interval;
        }
        // Use any PartitionStat of this partition to create a shard spec
        final List<PartitionReport> reportsOfPartition = entry.getValue();
        final BuildingShardSpec<?> shardSpec = reportsOfPartition.get(0)
            .getPartitionStat()
            .getSecondaryPartition()
            .convert(partitionId.getAndIncrement());
        // Create a PartitionLocation for each PartitionStat
        final List<PartitionLocation> locationsOfPartition = reportsOfPartition.stream()
            .map(report -> report.getPartitionStat().toPartitionLocation(report.getSubTaskId(), shardSpec))
            .collect(Collectors.toList());
        partitionToLocations.put(partition, locationsOfPartition);
    }
    return partitionToLocations;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) TaskReport(org.apache.druid.indexing.common.TaskReport) TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) PartitionBoundaries(org.apache.druid.timeline.partition.PartitionBoundaries) Produces(javax.ws.rs.Produces) IngestionState(org.apache.druid.indexer.IngestionState) Pair(org.apache.druid.java.util.common.Pair) MediaType(javax.ws.rs.core.MediaType) TaskActionClient(org.apache.druid.indexing.common.actions.TaskActionClient) SegmentTransactionalInsertAction(org.apache.druid.indexing.common.actions.SegmentTransactionalInsertAction) FiniteFirehoseFactory(org.apache.druid.data.input.FiniteFirehoseFactory) Map(java.util.Map) StringDistribution(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution) AbstractBatchIndexTask(org.apache.druid.indexing.common.task.AbstractBatchIndexTask) InputFormat(org.apache.druid.data.input.InputFormat) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) Set(java.util.Set) ISE(org.apache.druid.java.util.common.ISE) TaskState(org.apache.druid.indexer.TaskState) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) IndexTaskUtils(org.apache.druid.indexing.common.task.IndexTaskUtils) Granularity(org.apache.druid.java.util.common.granularity.Granularity) GET(javax.ws.rs.GET) Tasks(org.apache.druid.indexing.common.task.Tasks) TaskStatus(org.apache.druid.indexer.TaskStatus) ArrayList(java.util.ArrayList) IndexTask(org.apache.druid.indexing.common.task.IndexTask) Interval(org.joda.time.Interval) HttpServletRequest(javax.servlet.http.HttpServletRequest) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) StringSketchMerger(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) Nullable(javax.annotation.Nullable) BuildingShardSpec(org.apache.druid.timeline.partition.BuildingShardSpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) Throwables(com.google.common.base.Throwables) StringDistributionMerger(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger) IOException(java.io.IOException) TreeMap(java.util.TreeMap) ChatHandlers(org.apache.druid.segment.realtime.firehose.ChatHandlers) Preconditions(com.google.common.base.Preconditions) ArbitraryGranularitySpec(org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec) SubTaskSpecStatus(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTaskRunner.SubTaskSpecStatus) HllSketch(org.apache.datasketches.hll.HllSketch) AuthorizerMapper(org.apache.druid.server.security.AuthorizerMapper) Path(javax.ws.rs.Path) Memory(org.apache.datasketches.memory.Memory) TaskResource(org.apache.druid.indexing.common.task.TaskResource) MonotonicNonNull(org.checkerframework.checker.nullness.qual.MonotonicNonNull) ChatHandler(org.apache.druid.segment.realtime.firehose.ChatHandler) QueryParam(javax.ws.rs.QueryParam) Consumes(javax.ws.rs.Consumes) Union(org.apache.datasketches.hll.Union) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Task(org.apache.druid.indexing.common.task.Task) SmileMediaTypes(com.fasterxml.jackson.jaxrs.smile.SmileMediaTypes) Context(javax.ws.rs.core.Context) ImmutableMap(com.google.common.collect.ImmutableMap) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) Collection(java.util.Collection) StringUtils(org.apache.druid.java.util.common.StringUtils) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Action(org.apache.druid.server.security.Action) Collectors(java.util.stream.Collectors) MaxAllowedLocksExceededException(org.apache.druid.indexing.common.task.batch.MaxAllowedLocksExceededException) Objects(java.util.Objects) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) List(java.util.List) Response(javax.ws.rs.core.Response) DataSegment(org.apache.druid.timeline.DataSegment) Entry(java.util.Map.Entry) CurrentSubTaskHolder(org.apache.druid.indexing.common.task.CurrentSubTaskHolder) Logger(org.apache.druid.java.util.common.logger.Logger) PathParam(javax.ws.rs.PathParam) CollectionUtils(org.apache.druid.utils.CollectionUtils) HashMap(java.util.HashMap) Multimap(com.google.common.collect.Multimap) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) Function(java.util.function.Function) TuningConfig(org.apache.druid.segment.indexing.TuningConfig) HashSet(java.util.HashSet) InputSource(org.apache.druid.data.input.InputSource) RowIngestionMetersTotals(org.apache.druid.segment.incremental.RowIngestionMetersTotals) Status(javax.ws.rs.core.Response.Status) DimensionRangePartitionsSpec(org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec) ParseExceptionReport(org.apache.druid.segment.incremental.ParseExceptionReport) POST(javax.ws.rs.POST) TransactionalSegmentPublisher(org.apache.druid.segment.realtime.appenderator.TransactionalSegmentPublisher) DateTime(org.joda.time.DateTime) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) IngestionStatsAndErrorsTaskReport(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReport) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) IntermediaryDataManager(org.apache.druid.indexing.worker.shuffle.IntermediaryDataManager) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Comparator(java.util.Comparator) Collections(java.util.Collections)
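
To make the bucketId-to-partitionId renumbering concrete, here is a standalone sketch with hypothetical intervals and bucketIds (plain strings stand in for Interval; none of these values come from the code above). BucketIds within an interval need not be contiguous, while the assigned partitionIds always are and restart at 0 for each new interval:

import java.util.LinkedHashMap;
import java.util.Map;

// Standalone sketch of the renumbering performed by getPartitionToLocations.
public class BucketRenumberingSketch {
    public static void main(String[] args) {
        // (interval, bucketId) pairs, already sorted by (interval, bucketId)
        String[][] sortedBuckets = {
            {"2021-01-01/2021-01-02", "0"},
            {"2021-01-01/2021-01-02", "5"}, // bucketIds need not be contiguous
            {"2021-01-02/2021-01-03", "2"}
        };
        Map<String, Integer> bucketToPartitionId = new LinkedHashMap<>();
        String prevInterval = null;
        int partitionId = 0;
        for (String[] bucket : sortedBuckets) {
            if (!bucket[0].equals(prevInterval)) {
                partitionId = 0; // reset the counter on each new interval
                prevInterval = bucket[0];
            }
            bucketToPartitionId.put(bucket[0] + " bucket " + bucket[1], partitionId++);
        }
        // Prints partitionIds 0, 1 for the first interval, then 0 for the second
        bucketToPartitionId.forEach((k, v) -> System.out.println(k + " -> partitionId " + v));
    }
}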

Example 3 with BuildingShardSpec

Use of org.apache.druid.timeline.partition.BuildingShardSpec in project druid by druid-io.

From the class PartialGenericSegmentMergeTask, the method createIntervalAndIntegerToShardSpec:

private static Table<Interval, Integer, BuildingShardSpec<?>> createIntervalAndIntegerToShardSpec(List<PartitionLocation> partitionLocations) {
    final Table<Interval, Integer, BuildingShardSpec<?>> intervalAndIntegerToShardSpec = HashBasedTable.create();
    partitionLocations.forEach(p -> {
        final ShardSpec currShardSpec = intervalAndIntegerToShardSpec.get(p.getInterval(), p.getBucketId());
        if (currShardSpec == null) {
            intervalAndIntegerToShardSpec.put(p.getInterval(), p.getBucketId(), p.getShardSpec());
        } else if (!p.getShardSpec().equals(currShardSpec)) {
            throw new ISE(
                "interval %s, bucketId %s mismatched shard specs: %s and %s",
                p.getInterval(),
                p.getBucketId(),
                currShardSpec,
                p.getShardSpec()
            );
        }
    });
    return intervalAndIntegerToShardSpec;
}
Also used : ISE(org.apache.druid.java.util.common.ISE) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) BuildingShardSpec(org.apache.druid.timeline.partition.BuildingShardSpec) Interval(org.joda.time.Interval)
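
The Guava Table used here keys each cell by a (row, column) pair, i.e. (interval, bucketId). A minimal standalone sketch of that access pattern, with strings standing in for Interval and BuildingShardSpec (hypothetical values, not taken from the snippet above):

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;

// Standalone sketch of the Guava Table access pattern used above.
public class ShardSpecTableSketch {
    public static void main(String[] args) {
        Table<String, Integer, String> intervalAndBucketToShardSpec = HashBasedTable.create();
        // put(rowKey, columnKey, value): here (interval, bucketId) -> shardSpec
        intervalAndBucketToShardSpec.put("2021-01-01/2021-01-02", 0, "buildingShardSpec-A");
        // get returns null when nothing is stored for a cell, which is how
        // createIntervalAndIntegerToShardSpec detects a first-seen bucket
        System.out.println(intervalAndBucketToShardSpec.get("2021-01-01/2021-01-02", 0)); // buildingShardSpec-A
        System.out.println(intervalAndBucketToShardSpec.get("2021-01-01/2021-01-02", 1)); // null
    }
}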

Aggregations

ISE (org.apache.druid.java.util.common.ISE) 3
BuildingShardSpec (org.apache.druid.timeline.partition.BuildingShardSpec) 3
Interval (org.joda.time.Interval) 3
ArrayList (java.util.ArrayList) 2
Collection (java.util.Collection) 2
HashMap (java.util.HashMap) 2
List (java.util.List) 2
Map (java.util.Map) 2
Entry (java.util.Map.Entry) 2
Set (java.util.Set) 2
Function (java.util.function.Function) 2
Collectors (java.util.stream.Collectors) 2
Logger (org.apache.druid.java.util.common.logger.Logger) 2
DataSegment (org.apache.druid.timeline.DataSegment) 2
ShardSpec (org.apache.druid.timeline.partition.ShardSpec) 2
JsonCreator (com.fasterxml.jackson.annotation.JsonCreator) 1
JsonProperty (com.fasterxml.jackson.annotation.JsonProperty) 1
SmileMediaTypes (com.fasterxml.jackson.jaxrs.smile.SmileMediaTypes) 1
VisibleForTesting (com.google.common.annotations.VisibleForTesting) 1
Preconditions (com.google.common.base.Preconditions) 1