Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class HadoopDruidDetermineConfigurationJobTest, the method testRunWithHashedPartitionsSpecCreateHashBasedNumberedShardSpecWithHashPartitionFunction.
@Test
public void testRunWithHashedPartitionsSpecCreateHashBasedNumberedShardSpecWithHashPartitionFunction()
{
  final Set<Interval> intervals = ImmutableSet.of(
      Intervals.of("2020-01-01/P1D"), Intervals.of("2020-01-02/P1D"), Intervals.of("2020-01-03/P1D")
  );
  final HashedPartitionsSpec partitionsSpec =
      new HashedPartitionsSpec(null, 2, null, HashPartitionFunction.MURMUR3_32_ABS, null, null);
  final HadoopDruidIndexerConfig config = Mockito.mock(HadoopDruidIndexerConfig.class);
  Mockito.when(config.isDeterminingPartitions()).thenReturn(false);
  Mockito.when(config.getPartitionsSpec()).thenReturn(partitionsSpec);
  Mockito.when(config.getSegmentGranularIntervals()).thenReturn(intervals);
  final ArgumentCaptor<Map<Long, List<HadoopyShardSpec>>> resultCaptor = ArgumentCaptor.forClass(Map.class);
  Mockito.doNothing().when(config).setShardSpecs(resultCaptor.capture());

  final HadoopDruidDetermineConfigurationJob job = new HadoopDruidDetermineConfigurationJob(config);
  Assert.assertTrue(job.run());

  final Map<Long, List<HadoopyShardSpec>> shardSpecs = resultCaptor.getValue();
  Assert.assertEquals(3, shardSpecs.size());
  for (Interval interval : intervals) {
    final List<HadoopyShardSpec> shardSpecsPerInterval = shardSpecs.get(interval.getStartMillis());
    Assert.assertEquals(2, shardSpecsPerInterval.size());
    for (int i = 0; i < shardSpecsPerInterval.size(); i++) {
      Assert.assertEquals(
          new HashBasedNumberedShardSpec(i, shardSpecsPerInterval.size(), i, shardSpecsPerInterval.size(), null, HashPartitionFunction.MURMUR3_32_ABS, new ObjectMapper()),
          shardSpecsPerInterval.get(i).getActualSpec()
      );
    }
  }
}
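For reference, the expected shard specs asserted per interval above can be built with a small helper like the sketch below. This is illustrative only and not part of the test; the HashBasedNumberedShardSpec argument order simply mirrors the constructor call in the assertion, and the parameter meanings stated in the comment are assumptions.

// Hypothetical helper mirroring the assertion above: for a fixed numShards, shard i is
// assumed to serve as both partition number and bucket id, with numShards core partitions
// and buckets. Argument order follows the constructor call used in the test.
private static List<HashBasedNumberedShardSpec> expectedShardSpecs(int numShards, ObjectMapper mapper)
{
  final List<HashBasedNumberedShardSpec> specs = new ArrayList<>();
  for (int i = 0; i < numShards; i++) {
    specs.add(new HashBasedNumberedShardSpec(i, numShards, i, numShards, null, HashPartitionFunction.MURMUR3_32_ABS, mapper));
  }
  return specs;
}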
Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class ParallelIndexSupervisorTask, the method runHashPartitionMultiPhaseParallel.
@VisibleForTesting
TaskStatus runHashPartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception
{
  TaskState state;
  ParallelIndexIngestionSpec ingestionSchemaToUse = ingestionSchema;

  if (!(ingestionSchema.getTuningConfig().getPartitionsSpec() instanceof HashedPartitionsSpec)) {
    // only range and hash partitioning is supported for multiphase parallel ingestion, see runMultiPhaseParallel()
    throw new ISE(
        "forceGuaranteedRollup is set but partitionsSpec [%s] is not a single_dim or hash partition spec.",
        ingestionSchema.getTuningConfig().getPartitionsSpec()
    );
  }

  final Map<Interval, Integer> intervalToNumShards;
  HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) ingestionSchema.getTuningConfig().getPartitionsSpec();
  final boolean needsInputSampling =
      partitionsSpec.getNumShards() == null
      || ingestionSchemaToUse.getDataSchema().getGranularitySpec().inputIntervals().isEmpty();

  if (needsInputSampling) {
    // 0. need to determine intervals and numShards by scanning the data
    LOG.info("Needs to determine intervals or numShards, beginning %s phase.", PartialDimensionCardinalityTask.TYPE);
    ParallelIndexTaskRunner<PartialDimensionCardinalityTask, DimensionCardinalityReport> cardinalityRunner =
        createRunner(toolbox, this::createPartialDimensionCardinalityRunner);

    state = runNextPhase(cardinalityRunner);
    if (state.isFailure()) {
      String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, cardinalityRunner.getName());
      return TaskStatus.failure(getId(), errMsg);
    }

    if (cardinalityRunner.getReports().isEmpty()) {
      String msg = "No valid rows for hash partitioning."
                   + " All rows may have invalid timestamps or have been filtered out.";
      LOG.warn(msg);
      return TaskStatus.success(getId(), msg);
    }

    if (partitionsSpec.getNumShards() == null) {
      int effectiveMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null
                                       ? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT
                                       : partitionsSpec.getMaxRowsPerSegment();
      LOG.info("effective maxRowsPerSegment is: " + effectiveMaxRowsPerSegment);
      intervalToNumShards = determineNumShardsFromCardinalityReport(
          cardinalityRunner.getReports().values(),
          effectiveMaxRowsPerSegment
      );
    } else {
      intervalToNumShards = CollectionUtils.mapValues(
          mergeCardinalityReports(cardinalityRunner.getReports().values()),
          k -> partitionsSpec.getNumShards()
      );
    }

    ingestionSchemaToUse = rewriteIngestionSpecWithIntervalsIfMissing(ingestionSchemaToUse, intervalToNumShards.keySet());
  } else {
    // numShards will be determined in PartialHashSegmentGenerateTask
    intervalToNumShards = null;
  }

  // 1. Partial segment generation phase
  final ParallelIndexIngestionSpec segmentCreateIngestionSpec = ingestionSchemaToUse;
  ParallelIndexTaskRunner<PartialHashSegmentGenerateTask, GeneratedPartitionsReport> indexingRunner = createRunner(
      toolbox,
      f -> createPartialHashSegmentGenerateRunner(toolbox, segmentCreateIngestionSpec, intervalToNumShards)
  );

  state = runNextPhase(indexingRunner);
  if (state.isFailure()) {
    String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, indexingRunner.getName());
    return TaskStatus.failure(getId(), errMsg);
  }

  // 2. Partial segment merge phase
  // partition (interval, partitionId) -> partition locations
  Map<Partition, List<PartitionLocation>> partitionToLocations = getPartitionToLocations(indexingRunner.getReports());
  final List<PartialSegmentMergeIOConfig> ioConfigs = createGenericMergeIOConfigs(
      ingestionSchema.getTuningConfig().getTotalNumMergeTasks(),
      partitionToLocations
  );

  final ParallelIndexIngestionSpec segmentMergeIngestionSpec = ingestionSchemaToUse;
  final ParallelIndexTaskRunner<PartialGenericSegmentMergeTask, PushedSegmentsReport> mergeRunner = createRunner(
      toolbox,
      tb -> createPartialGenericSegmentMergeRunner(tb, ioConfigs, segmentMergeIngestionSpec)
  );
  state = runNextPhase(mergeRunner);

  TaskStatus taskStatus;
  if (state.isSuccess()) {
    // noinspection ConstantConditions
    publishSegments(toolbox, mergeRunner.getReports());
    if (awaitSegmentAvailabilityTimeoutMillis > 0) {
      waitForSegmentAvailability(mergeRunner.getReports());
    }
    taskStatus = TaskStatus.success(getId());
  } else {
    // there is only success or failure after running....
    Preconditions.checkState(state.isFailure(), "Unrecognized state after task is complete[%s]", state);
    String errMsg = StringUtils.format(TASK_PHASE_FAILURE_MSG, mergeRunner.getName());
    taskStatus = TaskStatus.failure(getId(), errMsg);
  }

  toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports(taskStatus, segmentAvailabilityConfirmationCompleted));
  return taskStatus;
}
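When numShards is not given, determineNumShardsFromCardinalityReport derives a per-interval shard count from the cardinality phase. The actual implementation works from DimensionCardinalityReports; the sketch below only illustrates the assumed shape of that calculation (roughly one shard per maxRowsPerSegment of estimated cardinality, and at least one shard per interval) and is not the real method.

// Illustrative sketch only: assumes cardinality has already been collapsed to a
// per-interval estimate, which is a simplification of what the cardinality reports contain.
private static Map<Interval, Integer> numShardsFromCardinality(
    Map<Interval, Long> intervalToCardinality,
    int maxRowsPerSegment
)
{
  final Map<Interval, Integer> intervalToNumShards = new HashMap<>();
  for (Map.Entry<Interval, Long> entry : intervalToCardinality.entrySet()) {
    // at least one shard per interval; otherwise cardinality / maxRowsPerSegment, rounded up
    final long numShards = Math.max(1, (entry.getValue() + maxRowsPerSegment - 1) / maxRowsPerSegment);
    intervalToNumShards.put(entry.getKey(), (int) numShards);
  }
  return intervalToNumShards;
}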
Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class PartialHashSegmentGenerateTask, the method createSegmentAllocator.
@Override
SegmentAllocatorForBatch createSegmentAllocator(TaskToolbox toolbox, ParallelIndexSupervisorTaskClient taskClient) throws IOException
{
  final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
  final ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
  final HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) tuningConfig.getGivenOrDefaultPartitionsSpec();
  return SegmentAllocators.forNonLinearPartitioning(
      toolbox,
      getDataSource(),
      getSubtaskSpecId(),
      granularitySpec,
      new SupervisorTaskAccess(supervisorTaskId, taskClient),
      createHashPartitionAnalysisFromPartitionsSpec(granularitySpec, partitionsSpec, intervalToNumShardsOverride)
  );
}
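The intervalToNumShardsOverride used here is the per-interval map produced by the supervisor's cardinality phase (see runHashPartitionMultiPhaseParallel above). The helper below is a hypothetical sketch of the bucket-count choice per interval, not the actual createHashPartitionAnalysisFromPartitionsSpec logic.

// Hypothetical helper, not the real partition-analysis code: prefer the per-interval
// override computed during the cardinality phase; otherwise fall back to the spec-level
// numShards, which the supervisor code above only leaves unset when an override exists.
private static int bucketCountForInterval(
    Interval interval,
    HashedPartitionsSpec partitionsSpec,
    @Nullable Map<Interval, Integer> intervalToNumShardsOverride
)
{
  if (intervalToNumShardsOverride != null && intervalToNumShardsOverride.containsKey(interval)) {
    return intervalToNumShardsOverride.get(interval);
  }
  return partitionsSpec.getNumShards();
}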
Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class CompactionTaskParallelRunTest, the method testRunParallelWithHashPartitioningMatchCompactionState.
@Test
public void testRunParallelWithHashPartitioningMatchCompactionState() throws Exception
{
  // Hash partitioning is not supported with segment lock yet
  Assume.assumeFalse(lockGranularity == LockGranularity.SEGMENT);

  runIndexTask(null, true);

  final Builder builder = new Builder(DATA_SOURCE, getSegmentCacheManagerFactory(), RETRY_POLICY_FACTORY);
  final CompactionTask compactionTask = builder
      .inputSpec(new CompactionIntervalSpec(INTERVAL_TO_INDEX, null))
      .tuningConfig(newTuningConfig(new HashedPartitionsSpec(null, 3, null), 2, true))
      .build();
  final Set<DataSegment> compactedSegments = runTask(compactionTask);

  for (DataSegment segment : compactedSegments) {
    // Expect compaction state to exist as store compaction state by default
    Map<String, String> expectedLongSumMetric = new HashMap<>();
    expectedLongSumMetric.put("type", "longSum");
    expectedLongSumMetric.put("name", "val");
    expectedLongSumMetric.put("fieldName", "val");
    expectedLongSumMetric.put("expression", null);
    Assert.assertSame(HashBasedNumberedShardSpec.class, segment.getShardSpec().getClass());
    CompactionState expectedState = new CompactionState(
        new HashedPartitionsSpec(null, 3, null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("ts", "dim"))),
        ImmutableList.of(expectedLongSumMetric),
        null,
        compactionTask.getTuningConfig().getIndexSpec().asMap(getObjectMapper()),
        getObjectMapper().readValue(
            getObjectMapper().writeValueAsString(
                new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, true, ImmutableList.of(segment.getInterval()))
            ),
            Map.class
        )
    );
    Assert.assertEquals(expectedState, segment.getLastCompactionState());
  }
}
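The assertEquals on CompactionState above relies on HashedPartitionsSpec value equality: a spec rebuilt with the same arguments compares equal to the one recorded during compaction. A brief illustrative check, not part of the test:

// Illustrative only: two hashed specs built with the same arguments are equal,
// which is what makes the expectedState comparison above work.
HashedPartitionsSpec recorded = new HashedPartitionsSpec(null, 3, null);
HashedPartitionsSpec rebuilt = new HashedPartitionsSpec(null, 3, null);
Assert.assertEquals(recorded, rebuilt);
Assert.assertEquals(Integer.valueOf(3), rebuilt.getNumShards());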
Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class CompactionTaskParallelRunTest, the method testCompactHashAndDynamicPartitionedSegments.
@Test
public void testCompactHashAndDynamicPartitionedSegments()
{
  runIndexTask(new HashedPartitionsSpec(null, 2, null), false);
  runIndexTask(null, true);

  final Builder builder = new Builder(DATA_SOURCE, getSegmentCacheManagerFactory(), RETRY_POLICY_FACTORY);
  final CompactionTask compactionTask = builder
      .inputSpec(new CompactionIntervalSpec(INTERVAL_TO_INDEX, null))
      .tuningConfig(AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING)
      .build();

  final Map<Interval, List<DataSegment>> intervalToSegments =
      SegmentUtils.groupSegmentsByInterval(runTask(compactionTask));
  Assert.assertEquals(3, intervalToSegments.size());
  Assert.assertEquals(
      ImmutableSet.of(
          Intervals.of("2014-01-01T00/PT1H"), Intervals.of("2014-01-01T01/PT1H"), Intervals.of("2014-01-01T02/PT1H")
      ),
      intervalToSegments.keySet()
  );

  for (Entry<Interval, List<DataSegment>> entry : intervalToSegments.entrySet()) {
    final List<DataSegment> segmentsInInterval = entry.getValue();
    Assert.assertEquals(1, segmentsInInterval.size());
    final ShardSpec shardSpec = segmentsInInterval.get(0).getShardSpec();
    if (lockGranularity == LockGranularity.TIME_CHUNK) {
      Assert.assertSame(NumberedShardSpec.class, shardSpec.getClass());
      final NumberedShardSpec numberedShardSpec = (NumberedShardSpec) shardSpec;
      Assert.assertEquals(0, numberedShardSpec.getPartitionNum());
      Assert.assertEquals(1, numberedShardSpec.getNumCorePartitions());
    } else {
      Assert.assertSame(NumberedOverwriteShardSpec.class, shardSpec.getClass());
      final NumberedOverwriteShardSpec numberedShardSpec = (NumberedOverwriteShardSpec) shardSpec;
      Assert.assertEquals(PartitionIds.NON_ROOT_GEN_START_PARTITION_ID, numberedShardSpec.getPartitionNum());
      Assert.assertEquals(1, numberedShardSpec.getAtomicUpdateGroupSize());
    }
  }
}
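The branch above distinguishes the two locking modes by shard spec type. A hypothetical predicate summarizing that distinction, using only the APIs already exercised in the test:

// Hypothetical helper, not in the test: segment-lock compaction produces
// NumberedOverwriteShardSpecs whose partition ids start at
// PartitionIds.NON_ROOT_GEN_START_PARTITION_ID, while time-chunk locking
// produces plain NumberedShardSpecs numbered from 0.
private static boolean isOverwriteShardSpec(ShardSpec shardSpec)
{
  return shardSpec instanceof NumberedOverwriteShardSpec
         && shardSpec.getPartitionNum() >= PartitionIds.NON_ROOT_GEN_START_PARTITION_ID;
}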