
Example 1 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class HadoopDruidDetermineConfigurationJobTest, method testRunWithHashedPartitionsSpecCreateHashBasedNumberedShardSpecWithHashPartitionFunction:

@Test
public void testRunWithHashedPartitionsSpecCreateHashBasedNumberedShardSpecWithHashPartitionFunction() {
    final Set<Interval> intervals = ImmutableSet.of(Intervals.of("2020-01-01/P1D"), Intervals.of("2020-01-02/P1D"), Intervals.of("2020-01-03/P1D"));
    final HashedPartitionsSpec partitionsSpec = new HashedPartitionsSpec(null, 2, null, HashPartitionFunction.MURMUR3_32_ABS, null, null);
    final HadoopDruidIndexerConfig config = Mockito.mock(HadoopDruidIndexerConfig.class);
    Mockito.when(config.isDeterminingPartitions()).thenReturn(false);
    Mockito.when(config.getPartitionsSpec()).thenReturn(partitionsSpec);
    Mockito.when(config.getSegmentGranularIntervals()).thenReturn(intervals);
    final ArgumentCaptor<Map<Long, List<HadoopyShardSpec>>> resultCaptor = ArgumentCaptor.forClass(Map.class);
    Mockito.doNothing().when(config).setShardSpecs(resultCaptor.capture());
    final HadoopDruidDetermineConfigurationJob job = new HadoopDruidDetermineConfigurationJob(config);
    Assert.assertTrue(job.run());
    final Map<Long, List<HadoopyShardSpec>> shardSpecs = resultCaptor.getValue();
    Assert.assertEquals(3, shardSpecs.size());
    for (Interval interval : intervals) {
        final List<HadoopyShardSpec> shardSpecsPerInterval = shardSpecs.get(interval.getStartMillis());
        Assert.assertEquals(2, shardSpecsPerInterval.size());
        for (int i = 0; i < shardSpecsPerInterval.size(); i++) {
            Assert.assertEquals(
                new HashBasedNumberedShardSpec(i, shardSpecsPerInterval.size(), i, shardSpecsPerInterval.size(), null, HashPartitionFunction.MURMUR3_32_ABS, new ObjectMapper()),
                shardSpecsPerInterval.get(i).getActualSpec()
            );
        }
    }
}
Also used : HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Interval(org.joda.time.Interval) Test(org.junit.Test)
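
The test expects each interval to end up with two HashBasedNumberedShardSpecs built on MURMUR3_32_ABS. As a rough illustration of what that partition function does, here is a conceptual sketch of the bucketing, not Druid's actual HashPartitionFunction implementation (the real one serializes the group key with Jackson before hashing); it only assumes Guava on the classpath, which Druid bundles:

import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;

public class Murmur3AbsBucketing {
    // Conceptual equivalent of MURMUR3_32_ABS: hash the serialized row key
    // with 32-bit Murmur3, then map it into [0, numBuckets) via abs-mod.
    static int bucket(byte[] serializedGroupKey, int numBuckets) {
        int hash = Hashing.murmur3_32().hashBytes(serializedGroupKey).asInt();
        return Math.abs(hash % numBuckets);
    }

    public static void main(String[] args) {
        byte[] key = "2020-01-01T00:00:00Z|dimValue".getBytes(StandardCharsets.UTF_8);
        // With numShards = 2, as in the test, every row lands in bucket 0 or 1.
        System.out.println(bucket(key, 2));
    }
}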

Example 2 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class ParallelIndexSupervisorTask, method runHashPartitionMultiPhaseParallel:

@VisibleForTesting
TaskStatus runHashPartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception {
    TaskState state;
    ParallelIndexIngestionSpec ingestionSchemaToUse = ingestionSchema;
    if (!(ingestionSchema.getTuningConfig().getPartitionsSpec() instanceof HashedPartitionsSpec)) {
        // only range and hash partitioning is supported for multiphase parallel ingestion, see runMultiPhaseParallel()
        throw new ISE("forceGuaranteedRollup is set but partitionsSpec [%s] is not a single_dim or hash partition spec.", ingestionSchema.getTuningConfig().getPartitionsSpec());
    }
    final Map<Interval, Integer> intervalToNumShards;
    HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) ingestionSchema.getTuningConfig().getPartitionsSpec();
    final boolean needsInputSampling = partitionsSpec.getNumShards() == null || ingestionSchemaToUse.getDataSchema().getGranularitySpec().inputIntervals().isEmpty();
    if (needsInputSampling) {
        // 0. need to determine intervals and numShards by scanning the data
        LOG.info("Needs to determine intervals or numShards, beginning %s phase.", PartialDimensionCardinalityTask.TYPE);
        ParallelIndexTaskRunner<PartialDimensionCardinalityTask, DimensionCardinalityReport> cardinalityRunner = createRunner(toolbox, this::createPartialDimensionCardinalityRunner);
        state = runNextPhase(cardinalityRunner);
        if (state.isFailure()) {
            String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, cardinalityRunner.getName());
            return TaskStatus.failure(getId(), errMsg);
        }
        if (cardinalityRunner.getReports().isEmpty()) {
            String msg = "No valid rows for hash partitioning." + " All rows may have invalid timestamps or have been filtered out.";
            LOG.warn(msg);
            return TaskStatus.success(getId(), msg);
        }
        if (partitionsSpec.getNumShards() == null) {
            int effectiveMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null ? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT : partitionsSpec.getMaxRowsPerSegment();
            LOG.info("effective maxRowsPerSegment is: " + effectiveMaxRowsPerSegment);
            intervalToNumShards = determineNumShardsFromCardinalityReport(cardinalityRunner.getReports().values(), effectiveMaxRowsPerSegment);
        } else {
            intervalToNumShards = CollectionUtils.mapValues(mergeCardinalityReports(cardinalityRunner.getReports().values()), k -> partitionsSpec.getNumShards());
        }
        ingestionSchemaToUse = rewriteIngestionSpecWithIntervalsIfMissing(ingestionSchemaToUse, intervalToNumShards.keySet());
    } else {
        // numShards will be determined in PartialHashSegmentGenerateTask
        intervalToNumShards = null;
    }
    // 1. Partial segment generation phase
    final ParallelIndexIngestionSpec segmentCreateIngestionSpec = ingestionSchemaToUse;
    ParallelIndexTaskRunner<PartialHashSegmentGenerateTask, GeneratedPartitionsReport> indexingRunner = createRunner(toolbox, f -> createPartialHashSegmentGenerateRunner(toolbox, segmentCreateIngestionSpec, intervalToNumShards));
    state = runNextPhase(indexingRunner);
    if (state.isFailure()) {
        String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, indexingRunner.getName());
        return TaskStatus.failure(getId(), errMsg);
    }
    // 2. Partial segment merge phase
    // partition (interval, partitionId) -> partition locations
    Map<Partition, List<PartitionLocation>> partitionToLocations = getPartitionToLocations(indexingRunner.getReports());
    final List<PartialSegmentMergeIOConfig> ioConfigs = createGenericMergeIOConfigs(ingestionSchema.getTuningConfig().getTotalNumMergeTasks(), partitionToLocations);
    final ParallelIndexIngestionSpec segmentMergeIngestionSpec = ingestionSchemaToUse;
    final ParallelIndexTaskRunner<PartialGenericSegmentMergeTask, PushedSegmentsReport> mergeRunner = createRunner(toolbox, tb -> createPartialGenericSegmentMergeRunner(tb, ioConfigs, segmentMergeIngestionSpec));
    state = runNextPhase(mergeRunner);
    TaskStatus taskStatus;
    if (state.isSuccess()) {
        // noinspection ConstantConditions
        publishSegments(toolbox, mergeRunner.getReports());
        if (awaitSegmentAvailabilityTimeoutMillis > 0) {
            waitForSegmentAvailability(mergeRunner.getReports());
        }
        taskStatus = TaskStatus.success(getId());
    } else {
        // there is only success or failure after running....
        Preconditions.checkState(state.isFailure(), "Unrecognized state after task is complete[%s]", state);
        String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, mergeRunner.getName());
        taskStatus = TaskStatus.failure(getId(), errMsg);
    }
    toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports(taskStatus, segmentAvailabilityConfirmationCompleted));
    return taskStatus;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) TaskReport(org.apache.druid.indexing.common.TaskReport) TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) PartitionBoundaries(org.apache.druid.timeline.partition.PartitionBoundaries) Produces(javax.ws.rs.Produces) IngestionState(org.apache.druid.indexer.IngestionState) Pair(org.apache.druid.java.util.common.Pair) MediaType(javax.ws.rs.core.MediaType) TaskActionClient(org.apache.druid.indexing.common.actions.TaskActionClient) SegmentTransactionalInsertAction(org.apache.druid.indexing.common.actions.SegmentTransactionalInsertAction) FiniteFirehoseFactory(org.apache.druid.data.input.FiniteFirehoseFactory) Map(java.util.Map) StringDistribution(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution) AbstractBatchIndexTask(org.apache.druid.indexing.common.task.AbstractBatchIndexTask) InputFormat(org.apache.druid.data.input.InputFormat) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) Set(java.util.Set) ISE(org.apache.druid.java.util.common.ISE) TaskState(org.apache.druid.indexer.TaskState) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) IndexTaskUtils(org.apache.druid.indexing.common.task.IndexTaskUtils) Granularity(org.apache.druid.java.util.common.granularity.Granularity) GET(javax.ws.rs.GET) Tasks(org.apache.druid.indexing.common.task.Tasks) TaskStatus(org.apache.druid.indexer.TaskStatus) ArrayList(java.util.ArrayList) IndexTask(org.apache.druid.indexing.common.task.IndexTask) Interval(org.joda.time.Interval) HttpServletRequest(javax.servlet.http.HttpServletRequest) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) StringSketchMerger(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) Nullable(javax.annotation.Nullable) BuildingShardSpec(org.apache.druid.timeline.partition.BuildingShardSpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) Throwables(com.google.common.base.Throwables) StringDistributionMerger(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger) IOException(java.io.IOException) TreeMap(java.util.TreeMap) ChatHandlers(org.apache.druid.segment.realtime.firehose.ChatHandlers) Preconditions(com.google.common.base.Preconditions) ArbitraryGranularitySpec(org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec) SubTaskSpecStatus(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTaskRunner.SubTaskSpecStatus) HllSketch(org.apache.datasketches.hll.HllSketch) AuthorizerMapper(org.apache.druid.server.security.AuthorizerMapper) Path(javax.ws.rs.Path) Memory(org.apache.datasketches.memory.Memory) TaskResource(org.apache.druid.indexing.common.task.TaskResource) MonotonicNonNull(org.checkerframework.checker.nullness.qual.MonotonicNonNull) ChatHandler(org.apache.druid.segment.realtime.firehose.ChatHandler) QueryParam(javax.ws.rs.QueryParam) Consumes(javax.ws.rs.Consumes) Union(org.apache.datasketches.hll.Union) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Task(org.apache.druid.indexing.common.task.Task) SmileMediaTypes(com.fasterxml.jackson.jaxrs.smile.SmileMediaTypes) Context(javax.ws.rs.core.Context) ImmutableMap(com.google.common.collect.ImmutableMap) 
NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) Collection(java.util.Collection) StringUtils(org.apache.druid.java.util.common.StringUtils) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Action(org.apache.druid.server.security.Action) Collectors(java.util.stream.Collectors) MaxAllowedLocksExceededException(org.apache.druid.indexing.common.task.batch.MaxAllowedLocksExceededException) Objects(java.util.Objects) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) List(java.util.List) Response(javax.ws.rs.core.Response) DataSegment(org.apache.druid.timeline.DataSegment) Entry(java.util.Map.Entry) CurrentSubTaskHolder(org.apache.druid.indexing.common.task.CurrentSubTaskHolder) Logger(org.apache.druid.java.util.common.logger.Logger) PathParam(javax.ws.rs.PathParam) CollectionUtils(org.apache.druid.utils.CollectionUtils) HashMap(java.util.HashMap) Multimap(com.google.common.collect.Multimap) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) Function(java.util.function.Function) TuningConfig(org.apache.druid.segment.indexing.TuningConfig) HashSet(java.util.HashSet) InputSource(org.apache.druid.data.input.InputSource) RowIngestionMetersTotals(org.apache.druid.segment.incremental.RowIngestionMetersTotals) Status(javax.ws.rs.core.Response.Status) DimensionRangePartitionsSpec(org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec) ParseExceptionReport(org.apache.druid.segment.incremental.ParseExceptionReport) POST(javax.ws.rs.POST) TransactionalSegmentPublisher(org.apache.druid.segment.realtime.appenderator.TransactionalSegmentPublisher) DateTime(org.joda.time.DateTime) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) IngestionStatsAndErrorsTaskReport(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReport) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) IntermediaryDataManager(org.apache.druid.indexing.worker.shuffle.IntermediaryDataManager) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Comparator(java.util.Comparator) Collections(java.util.Collections)
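
When numShards is not set, the supervisor derives it per interval from the cardinality reports gathered in phase 0. A simplified sketch of the idea behind determineNumShardsFromCardinalityReport follows; the real method unions HllSketch registers per interval before dividing, and the interval type here is reduced to a plain String to keep the sketch self-contained:

import java.util.HashMap;
import java.util.Map;

public class NumShardsSketch {
    // Sketch: one shard per maxRowsPerSegment estimated distinct rows,
    // never fewer than one shard per interval.
    static Map<String, Integer> determineNumShards(
        Map<String, Double> intervalToCardinality, // interval -> estimated distinct rows
        int maxRowsPerSegment
    ) {
        final Map<String, Integer> intervalToNumShards = new HashMap<>();
        for (Map.Entry<String, Double> entry : intervalToCardinality.entrySet()) {
            final int numShards = (int) Math.max(1, Math.ceil(entry.getValue() / maxRowsPerSegment));
            intervalToNumShards.put(entry.getKey(), numShards);
        }
        return intervalToNumShards;
    }
}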

Example 3 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class PartialHashSegmentGenerateTask, method createSegmentAllocator:

@Override
SegmentAllocatorForBatch createSegmentAllocator(TaskToolbox toolbox, ParallelIndexSupervisorTaskClient taskClient) throws IOException {
    final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
    final ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    final HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) tuningConfig.getGivenOrDefaultPartitionsSpec();
    return SegmentAllocators.forNonLinearPartitioning(
        toolbox,
        getDataSource(),
        getSubtaskSpecId(),
        granularitySpec,
        new SupervisorTaskAccess(supervisorTaskId, taskClient),
        createHashPartitionAnalysisFromPartitionsSpec(granularitySpec, partitionsSpec, intervalToNumShardsOverride)
    );
}
Also used : HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec)
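
createHashPartitionAnalysisFromPartitionsSpec pairs each bucket interval with a shard count. A plausible sketch of that resolution follows; resolveNumShards is a hypothetical helper, and the real method builds a HashPartitionAnalysis over the granularity spec's bucket intervals rather than a plain map:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HashPartitionAnalysisSketch {
    // For each bucket interval, the override produced by the cardinality
    // phase wins; otherwise fall back to the numShards configured on the
    // partitions spec.
    static Map<String, Integer> resolveNumShards(
        List<String> bucketIntervals,
        Map<String, Integer> intervalToNumShardsOverride, // may be null
        int defaultNumShards
    ) {
        final Map<String, Integer> analysis = new HashMap<>();
        for (String interval : bucketIntervals) {
            final Integer override = intervalToNumShardsOverride == null
                ? null
                : intervalToNumShardsOverride.get(interval);
            analysis.put(interval, override != null ? override : defaultNumShards);
        }
        return analysis;
    }
}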

Example 4 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class CompactionTaskParallelRunTest, method testRunParallelWithHashPartitioningMatchCompactionState:

@Test
public void testRunParallelWithHashPartitioningMatchCompactionState() throws Exception {
    // Hash partitioning is not supported with segment lock yet
    Assume.assumeFalse(lockGranularity == LockGranularity.SEGMENT);
    runIndexTask(null, true);
    final Builder builder = new Builder(DATA_SOURCE, getSegmentCacheManagerFactory(), RETRY_POLICY_FACTORY);
    final CompactionTask compactionTask = builder
        .inputSpec(new CompactionIntervalSpec(INTERVAL_TO_INDEX, null))
        .tuningConfig(newTuningConfig(new HashedPartitionsSpec(null, 3, null), 2, true))
        .build();
    final Set<DataSegment> compactedSegments = runTask(compactionTask);
    for (DataSegment segment : compactedSegments) {
        // Expect compaction state to exist as store compaction state by default
        Map<String, String> expectedLongSumMetric = new HashMap<>();
        expectedLongSumMetric.put("type", "longSum");
        expectedLongSumMetric.put("name", "val");
        expectedLongSumMetric.put("fieldName", "val");
        expectedLongSumMetric.put("expression", null);
        Assert.assertSame(HashBasedNumberedShardSpec.class, segment.getShardSpec().getClass());
        CompactionState expectedState = new CompactionState(
            new HashedPartitionsSpec(null, 3, null),
            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("ts", "dim"))),
            ImmutableList.of(expectedLongSumMetric),
            null,
            compactionTask.getTuningConfig().getIndexSpec().asMap(getObjectMapper()),
            getObjectMapper().readValue(
                getObjectMapper().writeValueAsString(
                    new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, true, ImmutableList.of(segment.getInterval()))
                ),
                Map.class
            )
        );
        Assert.assertEquals(expectedState, segment.getLastCompactionState());
    }
}
Also used : HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) HashMap(java.util.HashMap) Builder(org.apache.druid.indexing.common.task.CompactionTask.Builder) DataSegment(org.apache.druid.timeline.DataSegment) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CompactionState(org.apache.druid.timeline.CompactionState) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) AbstractParallelIndexSupervisorTaskTest(org.apache.druid.indexing.common.task.batch.parallel.AbstractParallelIndexSupervisorTaskTest) Test(org.junit.Test)
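
The three-argument HashedPartitionsSpec(null, 3, null) used above is a spec with numShards = 3 and no explicit partition dimensions. One way to see its JSON form is to serialize it directly; this sketch assumes the Druid indexing jars are on the classpath, and the exact field set may vary across Druid versions:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;

public class HashedSpecJson {
    public static void main(String[] args) throws Exception {
        // Serializes the same spec the test builds; expected to print a
        // "hashed" partitionsSpec with numShards = 3 (illustrative only,
        // a plain ObjectMapper rather than Druid's configured mapper).
        HashedPartitionsSpec spec = new HashedPartitionsSpec(null, 3, null);
        System.out.println(new ObjectMapper().writeValueAsString(spec));
    }
}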

Example 5 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class CompactionTaskParallelRunTest, method testCompactHashAndDynamicPartitionedSegments:

@Test
public void testCompactHashAndDynamicPartitionedSegments() {
    runIndexTask(new HashedPartitionsSpec(null, 2, null), false);
    runIndexTask(null, true);
    final Builder builder = new Builder(DATA_SOURCE, getSegmentCacheManagerFactory(), RETRY_POLICY_FACTORY);
    final CompactionTask compactionTask = builder
        .inputSpec(new CompactionIntervalSpec(INTERVAL_TO_INDEX, null))
        .tuningConfig(AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING)
        .build();
    final Map<Interval, List<DataSegment>> intervalToSegments = SegmentUtils.groupSegmentsByInterval(runTask(compactionTask));
    Assert.assertEquals(3, intervalToSegments.size());
    Assert.assertEquals(ImmutableSet.of(Intervals.of("2014-01-01T00/PT1H"), Intervals.of("2014-01-01T01/PT1H"), Intervals.of("2014-01-01T02/PT1H")), intervalToSegments.keySet());
    for (Entry<Interval, List<DataSegment>> entry : intervalToSegments.entrySet()) {
        final List<DataSegment> segmentsInInterval = entry.getValue();
        Assert.assertEquals(1, segmentsInInterval.size());
        final ShardSpec shardSpec = segmentsInInterval.get(0).getShardSpec();
        if (lockGranularity == LockGranularity.TIME_CHUNK) {
            Assert.assertSame(NumberedShardSpec.class, shardSpec.getClass());
            final NumberedShardSpec numberedShardSpec = (NumberedShardSpec) shardSpec;
            Assert.assertEquals(0, numberedShardSpec.getPartitionNum());
            Assert.assertEquals(1, numberedShardSpec.getNumCorePartitions());
        } else {
            Assert.assertSame(NumberedOverwriteShardSpec.class, shardSpec.getClass());
            final NumberedOverwriteShardSpec numberedShardSpec = (NumberedOverwriteShardSpec) shardSpec;
            Assert.assertEquals(PartitionIds.NON_ROOT_GEN_START_PARTITION_ID, numberedShardSpec.getPartitionNum());
            Assert.assertEquals(1, numberedShardSpec.getAtomicUpdateGroupSize());
        }
    }
}
Also used : HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Builder(org.apache.druid.indexing.common.task.CompactionTask.Builder) DataSegment(org.apache.druid.timeline.DataSegment) DimensionRangeShardSpec(org.apache.druid.timeline.partition.DimensionRangeShardSpec) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) SingleDimensionShardSpec(org.apache.druid.timeline.partition.SingleDimensionShardSpec) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Interval(org.joda.time.Interval) AbstractParallelIndexSupervisorTaskTest(org.apache.druid.indexing.common.task.batch.parallel.AbstractParallelIndexSupervisorTaskTest) Test(org.junit.Test)
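
The branch on lockGranularity above reflects how compaction overwrites segments: with time-chunk locking the replacement segments get fresh root-generation NumberedShardSpecs, while with segment locking they are overshadowing NumberedOverwriteShardSpecs whose partition ids start at PartitionIds.NON_ROOT_GEN_START_PARTITION_ID. A condensed, self-contained restatement of that expectation (the constant value is a stand-in for Druid's, not read from the library):

public class ShardSpecExpectationSketch {
    enum LockGranularity { TIME_CHUNK, SEGMENT }

    // Stand-in for PartitionIds.NON_ROOT_GEN_START_PARTITION_ID: overwriting
    // (non-root-generation) segments draw partition ids from a reserved upper
    // range so they never collide with root-generation ids.
    static final int NON_ROOT_GEN_START_PARTITION_ID = 32768;

    // Condensed restatement of the test's branch: which partition id the
    // single compacted segment per interval is expected to carry.
    static int expectedPartitionNum(LockGranularity lock) {
        return lock == LockGranularity.TIME_CHUNK ? 0 : NON_ROOT_GEN_START_PARTITION_ID;
    }
}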

Aggregations

HashedPartitionsSpec (org.apache.druid.indexer.partitions.HashedPartitionsSpec): 43
Test (org.junit.Test): 31
Interval (org.joda.time.Interval): 20
DataSegment (org.apache.druid.timeline.DataSegment): 15
List (java.util.List): 14
ImmutableList (com.google.common.collect.ImmutableList): 12
PartitionsSpec (org.apache.druid.indexer.partitions.PartitionsSpec): 12
Map (java.util.Map): 11
SingleDimensionPartitionsSpec (org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec): 11
HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec): 11
ArrayList (java.util.ArrayList): 10
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 9
StringUtils (org.apache.druid.java.util.common.StringUtils): 9
File (java.io.File): 8
HashMap (java.util.HashMap): 8
DynamicPartitionsSpec (org.apache.druid.indexer.partitions.DynamicPartitionsSpec): 8
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 8
HashPartitionFunction (org.apache.druid.timeline.partition.HashPartitionFunction): 8
ImmutableMap (com.google.common.collect.ImmutableMap): 7
IOException (java.io.IOException): 7