Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution in project druid by druid-io.
The class ParallelIndexSupervisorTask, method determineRangePartition:
private PartitionBoundaries determineRangePartition(Collection<StringDistribution> distributions)
{
  StringDistributionMerger distributionMerger = new StringSketchMerger();
  distributions.forEach(distributionMerger::merge);
  StringDistribution mergedDistribution = distributionMerger.getResult();

  DimensionRangePartitionsSpec partitionsSpec =
      (DimensionRangePartitionsSpec) ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec();

  final PartitionBoundaries partitions;
  Integer targetRowsPerSegment = partitionsSpec.getTargetRowsPerSegment();
  if (targetRowsPerSegment == null) {
    partitions = mergedDistribution.getEvenPartitionsByMaxSize(partitionsSpec.getMaxRowsPerSegment());
  } else {
    partitions = mergedDistribution.getEvenPartitionsByTargetSize(targetRowsPerSegment);
  }
  return partitions;
}
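For context, the merge-then-slice flow above can be exercised on its own. The following is a minimal, hypothetical sketch that uses only the calls already shown on this page (StringSketch, StringSketchMerger, StringTuple.create, getEvenPartitionsByTargetSize); the dimension values and the target size are made up for illustration.

// Hypothetical illustration: merge per-worker sketches and derive partition boundaries.
// Values and target size are made up; only calls shown elsewhere on this page are used.
StringSketch workerA = new StringSketch();
workerA.put(StringTuple.create("apple"));
workerA.put(StringTuple.create("banana"));

StringSketch workerB = new StringSketch();
workerB.put(StringTuple.create("cherry"));

StringDistributionMerger merger = new StringSketchMerger();
merger.merge(workerA);
merger.merge(workerB);
StringDistribution merged = merger.getResult();

// Slice the merged distribution into ranges of roughly one million rows each.
PartitionBoundaries boundaries = merged.getEvenPartitionsByTargetSize(1_000_000);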
Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution in project druid by druid-io.
The class PartialDimensionDistributionTask, method runTask:
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception
{
  DataSchema dataSchema = ingestionSchema.getDataSchema();
  GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();

  DimensionRangePartitionsSpec partitionsSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
  Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
  final List<String> partitionDimensions = partitionsSpec.getPartitionDimensions();
  Preconditions.checkArgument(
      partitionDimensions != null && !partitionDimensions.isEmpty(),
      "partitionDimension required in partitionsSpec"
  );
  boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped();

  InputSource inputSource =
      ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
  InputFormat inputFormat =
      inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
  final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
  final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
      buildSegmentsMeters,
      tuningConfig.isLogParseExceptions(),
      tuningConfig.getMaxParseExceptions(),
      tuningConfig.getMaxSavedParseExceptions()
  );
  final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();

  try (
      final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
          toolbox.getIndexingTmpDir(),
          dataSchema,
          inputSource,
          inputFormat,
          determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
          buildSegmentsMeters,
          parseExceptionHandler
      );
      HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimensions, SKIP_NULL)
          .delegate(inputRowIterator)
          .granularitySpec(granularitySpec)
          .build()
  ) {
    Map<Interval, StringDistribution> distribution =
        determineDistribution(iterator, granularitySpec, partitionDimensions, isAssumeGrouped);
    sendReport(toolbox, new DimensionDistributionReport(getId(), distribution));
  }

  return TaskStatus.success(getId());
}
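After runTask completes, the DimensionDistributionReport sent above is collected by the supervisor, which groups the per-interval distributions from all subtasks before calling determineRangePartition (shown earlier) for each interval. A rough sketch of that grouping step, assuming the report exposes a getIntervalToDistribution() getter and that "reports" is the collection of subtask reports (both assumptions for illustration):

// Rough sketch: group distributions from several subtask reports by interval.
// Assumes DimensionDistributionReport#getIntervalToDistribution() and the local
// variable "reports" exist; neither is shown on this page.
Map<Interval, List<StringDistribution>> intervalToDistributions = new HashMap<>();
for (DimensionDistributionReport report : reports) {
  report.getIntervalToDistribution().forEach(
      (interval, distribution) ->
          intervalToDistributions.computeIfAbsent(interval, k -> new ArrayList<>()).add(distribution)
  );
}
// Each value list can then be passed to determineRangePartition(Collection<StringDistribution>).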
Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution in project druid by druid-io.
The class PartialDimensionDistributionTask, method determineDistribution:
private Map<Interval, StringDistribution> determineDistribution(
    HandlingInputRowIterator inputRowIterator,
    GranularitySpec granularitySpec,
    List<String> partitionDimensions,
    boolean isAssumeGrouped
)
{
  Map<Interval, StringDistribution> intervalToDistribution = new HashMap<>();
  InputRowFilter inputRowFilter =
      !isAssumeGrouped && granularitySpec.isRollup() ? dedupInputRowFilterSupplier.get() : new PassthroughInputRowFilter();

  while (inputRowIterator.hasNext()) {
    InputRow inputRow = inputRowIterator.next();
    if (inputRow == null) {
      continue;
    }

    final Interval interval;
    if (granularitySpec.inputIntervals().isEmpty()) {
      interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
    } else {
      final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
      // this interval must exist since it passed the rowFilter
      assert optInterval.isPresent();
      interval = optInterval.get();
    }

    String[] values = new String[partitionDimensions.size()];
    for (int i = 0; i < partitionDimensions.size(); ++i) {
      List<String> dimensionValues = inputRow.getDimension(partitionDimensions.get(i));
      if (dimensionValues != null && !dimensionValues.isEmpty()) {
        values[i] = Iterables.getOnlyElement(dimensionValues);
      }
    }
    final StringTuple partitionDimensionValues = StringTuple.create(values);

    if (inputRowFilter.accept(interval, partitionDimensionValues, inputRow)) {
      StringDistribution stringDistribution =
          intervalToDistribution.computeIfAbsent(interval, k -> new StringSketch());
      stringDistribution.put(partitionDimensionValues);
    }
  }

  // DedupInputRowFilter may not accept the min/max dimensionValue. If needed, add the min/max
  // values to the distributions so they have an accurate min/max.
  inputRowFilter.getIntervalToMinPartitionDimensionValue()
                .forEach((interval, min) -> intervalToDistribution.get(interval).putIfNewMin(min));
  inputRowFilter.getIntervalToMaxPartitionDimensionValue()
                .forEach((interval, max) -> intervalToDistribution.get(interval).putIfNewMax(max));

  return intervalToDistribution;
}
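The min/max correction at the end matters because a deduplicating filter may drop the rows that carry the smallest or largest partition-dimension value, which would otherwise skew the resulting boundaries. A minimal standalone sketch of the same accumulation pattern, with made-up intervals and tuples:

// Minimal sketch of the accumulation pattern above; intervals and values are made up.
Map<Interval, StringDistribution> intervalToDistribution = new HashMap<>();
Interval day = Intervals.of("2020-01-01/2020-01-02");

StringDistribution sketch = intervalToDistribution.computeIfAbsent(day, k -> new StringSketch());
sketch.put(StringTuple.create("bbb"));  // rows accepted by the filter
sketch.put(StringTuple.create("ccc"));

// Rows carrying the true min/max may have been filtered out (e.g. deduplicated),
// so the extremes are re-applied afterwards.
sketch.putIfNewMin(StringTuple.create("aaa"));
sketch.putIfNewMax(StringTuple.create("zzz"));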
Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution in project druid by druid-io.
The class DimensionDistributionReportTest, method setup:
@Before
public void setup()
{
  Interval interval = Intervals.ETERNITY;
  StringSketch sketch = new StringSketch();
  Map<Interval, StringDistribution> intervalToDistribution = Collections.singletonMap(interval, sketch);
  String taskId = "abc";
  target = new DimensionDistributionReport(taskId, intervalToDistribution);
}
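The fixture above only builds the report; a follow-up test would typically assert against it. One plausible shape, assuming the report exposes getTaskId() and getIntervalToDistribution() getters (assumptions; this page does not show those accessors):

// Hypothetical follow-up test; the getter names are assumptions.
@Test
public void reportsTaskIdAndDistributions()
{
  Assert.assertEquals("abc", target.getTaskId());
  Assert.assertEquals(1, target.getIntervalToDistribution().size());
  Assert.assertTrue(target.getIntervalToDistribution().containsKey(Intervals.ETERNITY));
}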