Use of org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis in project druid by druid-io.
From the class PartialHashSegmentGenerateTask, method createHashPartitionAnalysisFromPartitionsSpec.
/**
 * Creates shard specs based on the given configurations. The return value is a map between intervals created
 * based on the segment granularity and the shard specs to be created.
 * Note that the shard specs to be created are pairs of {@link PartialShardSpec} and the number of segments per
 * interval, and are filled only when {@link #isGuaranteedRollup} = true. Otherwise, the return value contains
 * only the set of intervals generated based on the segment granularity.
 */
public static HashPartitionAnalysis createHashPartitionAnalysisFromPartitionsSpec(
    GranularitySpec granularitySpec,
    @Nonnull HashedPartitionsSpec partitionsSpec,
    @Nullable Map<Interval, Integer> intervalToNumShardsOverride
)
{
  final HashPartitionAnalysis partitionAnalysis = new HashPartitionAnalysis(partitionsSpec);
  if (intervalToNumShardsOverride != null) {
    // Some intervals populated from granularitySpec can be missing in intervalToNumShardsOverride
    // because intervalToNumShardsOverride contains only the intervals which exist in input data.
    // We only care about the intervals in intervalToNumShardsOverride here.
    intervalToNumShardsOverride.forEach(partitionAnalysis::updateBucket);
  } else {
    final Iterable<Interval> intervals = granularitySpec.sortedBucketIntervals();
    final int numBucketsPerInterval = partitionsSpec.getNumShards() == null ? 1 : partitionsSpec.getNumShards();
    intervals.forEach(interval -> partitionAnalysis.updateBucket(interval, numBucketsPerInterval));
  }
  return partitionAnalysis;
}
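In either branch the method reduces to repeated updateBucket calls on a HashPartitionAnalysis. A minimal sketch of that flow driven by hand, using only the calls shown on this page; the intervals and bucket counts are made up for illustration:

// Hypothetical values; the constructor arguments mirror the tests below.
final HashedPartitionsSpec spec = new HashedPartitionsSpec(null, null, null);
final HashPartitionAnalysis analysis = new HashPartitionAnalysis(spec);
analysis.updateBucket(Intervals.of("2020-01-01/2020-01-02"), 2);
analysis.updateBucket(Intervals.of("2020-01-02/2020-01-03"), 3);
// Two time partitions, with 2 and 3 hash buckets respectively.
Assert.assertEquals(2, analysis.getNumTimePartitions());
Assert.assertEquals(2, analysis.getBucketAnalysis(Intervals.of("2020-01-01/2020-01-02")).intValue());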
Use of org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis in project druid by druid-io.
From the class HashPartitionCachingLocalSegmentAllocatorTest, method setup.
@Before
public void setup() throws IOException
{
  TaskToolbox toolbox = createToolbox();
  HashPartitionAnalysis partitionAnalysis = new HashPartitionAnalysis(PARTITIONS_SPEC);
  partitionAnalysis.updateBucket(INTERVAL, NUM_PARTITONS);
  target = SegmentAllocators.forNonLinearPartitioning(
      toolbox,
      DATASOURCE,
      TASKID,
      new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, ImmutableList.of()),
      new SupervisorTaskAccessWithNullClient(SUPERVISOR_TASKID),
      partitionAnalysis
  );
  sequenceNameFunction = ((CachingLocalSegmentAllocator) target).getSequenceNameFunction();
}
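For context, a sketch of how the resulting sequence-name function might be exercised in a test; the getSequenceName(Interval, InputRow) signature is an assumption of this sketch, not confirmed by the snippet above:

// Hypothetical usage; assumes SequenceNameFunction exposes
// getSequenceName(Interval, InputRow).
final InputRow row = new MapBasedInputRow(
    DateTimes.of("2020-01-01T01:00:00"),
    ImmutableList.of("dim"),
    ImmutableMap.of("dim", "a")
);
final String sequenceName = sequenceNameFunction.getSequenceName(INTERVAL, row);
// Rows that hash to the same bucket of INTERVAL share a sequence name,
// so the allocator assigns them to the same segment.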
Use of org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis in project druid by druid-io.
From the class PartialHashSegmentGenerateTaskTest, method testCreateHashPartitionAnalysisFromPartitionsSpecWithNumShardsMap.
@Test
public void testCreateHashPartitionAnalysisFromPartitionsSpecWithNumShardsMap()
{
  final List<Interval> intervals = ImmutableList.of(
      Intervals.of("2020-01-01/2020-01-02"),
      Intervals.of("2020-01-02/2020-01-03"),
      Intervals.of("2020-01-03/2020-01-04")
  );
  final Map<Interval, Integer> intervalToNumShards = ImmutableMap.of(
      Intervals.of("2020-01-01/2020-01-02"), 1,
      Intervals.of("2020-01-02/2020-01-03"), 2,
      Intervals.of("2020-01-03/2020-01-04"), 3
  );
  final HashPartitionAnalysis partitionAnalysis = PartialHashSegmentGenerateTask.createHashPartitionAnalysisFromPartitionsSpec(
      new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, intervals),
      new HashedPartitionsSpec(null, null, null),
      intervalToNumShards
  );
  Assert.assertEquals(intervals.size(), partitionAnalysis.getNumTimePartitions());
  for (Interval interval : intervals) {
    Assert.assertEquals(intervalToNumShards.get(interval).intValue(), partitionAnalysis.getBucketAnalysis(interval).intValue());
  }
}
Use of org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis in project druid by druid-io.
From the class IndexTask, method createShardSpecsFromInput.
private PartitionAnalysis createShardSpecsFromInput(
    ObjectMapper jsonMapper,
    IndexIngestionSpec ingestionSchema,
    InputSource inputSource,
    File tmpDir,
    GranularitySpec granularitySpec,
    @Nonnull PartitionsSpec partitionsSpec,
    boolean determineIntervals
) throws IOException
{
  assert partitionsSpec.getType() != SecondaryPartitionType.RANGE;
  long determineShardSpecsStartMillis = System.currentTimeMillis();
  final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = collectIntervalsAndShardSpecs(
      jsonMapper,
      ingestionSchema,
      inputSource,
      tmpDir,
      granularitySpec,
      partitionsSpec,
      determineIntervals
  );
  final PartitionAnalysis<Integer, ?> partitionAnalysis;
  if (partitionsSpec.getType() == SecondaryPartitionType.LINEAR) {
    partitionAnalysis = new LinearPartitionAnalysis((DynamicPartitionsSpec) partitionsSpec);
  } else if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
    partitionAnalysis = new HashPartitionAnalysis((HashedPartitionsSpec) partitionsSpec);
  } else {
    throw new UOE("%s", partitionsSpec.getClass().getName());
  }
  for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : hllCollectors.entrySet()) {
    final Interval interval = entry.getKey();
    final int numBucketsPerInterval;
    if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
      final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
      final HyperLogLogCollector collector = entry.getValue().orNull();
      if (partitionsSpec.needsDeterminePartitions(false)) {
        final long numRows = Preconditions.checkNotNull(collector, "HLL collector").estimateCardinalityRound();
        final int nonNullMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null
                                             ? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT
                                             : partitionsSpec.getMaxRowsPerSegment();
        numBucketsPerInterval = (int) Math.ceil((double) numRows / nonNullMaxRowsPerSegment);
        log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numBucketsPerInterval);
      } else {
        numBucketsPerInterval = hashedPartitionsSpec.getNumShards() == null ? 1 : hashedPartitionsSpec.getNumShards();
        log.info("Creating [%,d] buckets for interval [%s]", numBucketsPerInterval, interval);
      }
    } else {
      numBucketsPerInterval = 1;
    }
    partitionAnalysis.updateBucket(interval, numBucketsPerInterval);
  }
  log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
  return partitionAnalysis;
}
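A worked instance of the bucket arithmetic in the determine-partitions branch above; the row estimate and segment cap are made up for illustration:

// Hypothetical numbers: an HLL estimate of 12M rows with a 5M-row
// segment cap yields ceil(12e6 / 5e6) = 3 hash buckets for the interval.
final long numRows = 12_000_000L;
final int maxRowsPerSegment = 5_000_000;
final int numBucketsPerInterval = (int) Math.ceil((double) numRows / maxRowsPerSegment);
// numBucketsPerInterval == 3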
Use of org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis in project druid by druid-io.
From the class PartialHashSegmentGenerateTaskTest, method testCreateHashPartitionAnalysisFromPartitionsSpecWithNumShardsReturningAnalysisOfValidNumBuckets.
@Test
public void testCreateHashPartitionAnalysisFromPartitionsSpecWithNumShardsReturningAnalysisOfValidNumBuckets()
{
  final List<Interval> intervals = ImmutableList.of(
      Intervals.of("2020-01-01/2020-01-02"),
      Intervals.of("2020-01-02/2020-01-03"),
      Intervals.of("2020-01-03/2020-01-04")
  );
  final int expectedNumBuckets = 5;
  final HashPartitionAnalysis partitionAnalysis = PartialHashSegmentGenerateTask.createHashPartitionAnalysisFromPartitionsSpec(
      new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, intervals),
      new HashedPartitionsSpec(null, expectedNumBuckets, null),
      null
  );
  Assert.assertEquals(intervals.size(), partitionAnalysis.getNumTimePartitions());
  for (Interval interval : intervals) {
    Assert.assertEquals(expectedNumBuckets, partitionAnalysis.getBucketAnalysis(interval).intValue());
  }
}