
Example 1 with StringDistribution

Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution in project druid by druid-io.

From the class ParallelIndexSupervisorTask, the method determineRangePartition:

private PartitionBoundaries determineRangePartition(Collection<StringDistribution> distributions) {
    // Merge the per-subtask distributions into a single sketch.
    StringDistributionMerger distributionMerger = new StringSketchMerger();
    distributions.forEach(distributionMerger::merge);
    StringDistribution mergedDistribution = distributionMerger.getResult();
    DimensionRangePartitionsSpec partitionsSpec =
        (DimensionRangePartitionsSpec) ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec();
    // Prefer the target row count when specified; otherwise fall back to the max row count.
    final PartitionBoundaries partitions;
    Integer targetRowsPerSegment = partitionsSpec.getTargetRowsPerSegment();
    if (targetRowsPerSegment == null) {
        partitions = mergedDistribution.getEvenPartitionsByMaxSize(partitionsSpec.getMaxRowsPerSegment());
    } else {
        partitions = mergedDistribution.getEvenPartitionsByTargetSize(targetRowsPerSegment);
    }
    return partitions;
}
Also used: AtomicInteger (java.util.concurrent.atomic.AtomicInteger), StringDistribution (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution), StringSketchMerger (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger), StringDistributionMerger (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger), DimensionRangePartitionsSpec (org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec), PartitionBoundaries (org.apache.druid.timeline.partition.PartitionBoundaries)
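
For context, the following is a minimal, self-contained sketch of the merge-then-partition flow this method performs. It uses only the calls visible in the example above; the sample values and the target size of 1000 are illustrative assumptions.

import org.apache.druid.data.input.StringTuple;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger;
import org.apache.druid.timeline.partition.PartitionBoundaries;

public class MergeAndPartitionSketch {
    public static void main(String[] args) {
        // Two sketches standing in for the distributions gathered by two subtasks.
        StringSketch first = new StringSketch();
        first.put(StringTuple.create("apple"));
        first.put(StringTuple.create("banana"));
        StringSketch second = new StringSketch();
        second.put(StringTuple.create("cherry"));

        // Merge them exactly as determineRangePartition does.
        StringDistributionMerger merger = new StringSketchMerger();
        merger.merge(first);
        merger.merge(second);
        StringDistribution merged = merger.getResult();

        // Derive boundaries aiming at roughly 1000 rows per segment (illustrative value).
        PartitionBoundaries boundaries = merged.getEvenPartitionsByTargetSize(1000);
        System.out.println("partition boundaries: " + boundaries);
    }
}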

Example 2 with StringDistribution

Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution in project druid by druid-io.

From the class PartialDimensionDistributionTask, the method runTask:

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    // Range partitioning requires a partitionsSpec with at least one partition dimension.
    DimensionRangePartitionsSpec partitionsSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    final List<String> partitionDimensions = partitionsSpec.getPartitionDimensions();
    Preconditions.checkArgument(partitionDimensions != null && !partitionDimensions.isEmpty(), "partitionDimension required in partitionsSpec");
    boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped();
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
        buildSegmentsMeters,
        tuningConfig.isLogParseExceptions(),
        tuningConfig.getMaxParseExceptions(),
        tuningConfig.getMaxSavedParseExceptions()
    );
    // If no input intervals were specified, they must be determined from the data itself.
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (
        final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
            toolbox.getIndexingTmpDir(),
            dataSchema,
            inputSource,
            inputFormat,
            determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
            buildSegmentsMeters,
            parseExceptionHandler
        );
        HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimensions, SKIP_NULL)
            .delegate(inputRowIterator)
            .granularitySpec(granularitySpec)
            .build()
    ) {
        // Compute the per-interval dimension-value distribution and report it to the supervisor.
        Map<Interval, StringDistribution> distribution = determineDistribution(iterator, granularitySpec, partitionDimensions, isAssumeGrouped);
        sendReport(toolbox, new DimensionDistributionReport(getId(), distribution));
    }
    return TaskStatus.success(getId());
}
Also used: InputSource (org.apache.druid.data.input.InputSource), StringDistribution (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution), DimensionRangePartitionsSpec (org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec), HandlingInputRowIterator (org.apache.druid.data.input.HandlingInputRowIterator), DataSchema (org.apache.druid.segment.indexing.DataSchema), GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec), InputFormat (org.apache.druid.data.input.InputFormat), ParseExceptionHandler (org.apache.druid.segment.incremental.ParseExceptionHandler), InputRow (org.apache.druid.data.input.InputRow), RangePartitionIndexTaskInputRowIteratorBuilder (org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder), RowIngestionMeters (org.apache.druid.segment.incremental.RowIngestionMeters), Interval (org.joda.time.Interval)
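
The preconditions above require a range partitionsSpec carrying at least one partition dimension. The following hedged sketch constructs such a spec directly; the constructor argument order (targetRowsPerSegment, maxRowsPerSegment, partitionDimensions, assumeGrouped), the target of 5,000,000 rows, and the dimension name "country" are assumptions for illustration.

import java.util.Collections;
import org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec;

public class RangeSpecSketch {
    public static void main(String[] args) {
        // Hypothetical spec satisfying the checks in runTask: a non-null spec
        // with a non-empty partitionDimensions list.
        DimensionRangePartitionsSpec spec = new DimensionRangePartitionsSpec(
            5_000_000,                            // targetRowsPerSegment (nullable; set here)
            null,                                 // maxRowsPerSegment (nullable; unused when target is set)
            Collections.singletonList("country"), // partitionDimensions: must be non-empty
            false                                 // assumeGrouped
        );
        System.out.println(spec.getPartitionDimensions());
    }
}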

Example 3 with StringDistribution

Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution in project druid by druid-io.

From the class PartialDimensionDistributionTask, the method determineDistribution:

private Map<Interval, StringDistribution> determineDistribution(
    HandlingInputRowIterator inputRowIterator,
    GranularitySpec granularitySpec,
    List<String> partitionDimensions,
    boolean isAssumeGrouped
) {
    Map<Interval, StringDistribution> intervalToDistribution = new HashMap<>();
    // When rollup is enabled and the input is not already grouped, deduplicate rows so that
    // rows which will roll up together are counted only once; otherwise pass rows through.
    InputRowFilter inputRowFilter =
        !isAssumeGrouped && granularitySpec.isRollup() ? dedupInputRowFilterSupplier.get() : new PassthroughInputRowFilter();
    while (inputRowIterator.hasNext()) {
        InputRow inputRow = inputRowIterator.next();
        if (inputRow == null) {
            continue;
        }
        // Determine the time bucket for this row.
        final Interval interval;
        if (granularitySpec.inputIntervals().isEmpty()) {
            interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
        } else {
            final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
            // This interval must exist since the row passed the rowFilter.
            assert optInterval.isPresent();
            interval = optInterval.get();
        }
        // Collect the value of each partition dimension into a tuple.
        String[] values = new String[partitionDimensions.size()];
        for (int i = 0; i < partitionDimensions.size(); ++i) {
            List<String> dimensionValues = inputRow.getDimension(partitionDimensions.get(i));
            if (dimensionValues != null && !dimensionValues.isEmpty()) {
                values[i] = Iterables.getOnlyElement(dimensionValues);
            }
        }
        final StringTuple partitionDimensionValues = StringTuple.create(values);
        if (inputRowFilter.accept(interval, partitionDimensionValues, inputRow)) {
            StringDistribution stringDistribution =
                intervalToDistribution.computeIfAbsent(interval, k -> new StringSketch());
            stringDistribution.put(partitionDimensionValues);
        }
    }
    // DedupInputRowFilter may not accept the min/max dimension value. If needed, add the min/max
    // values to the distributions so they have an accurate min/max.
    inputRowFilter.getIntervalToMinPartitionDimensionValue()
        .forEach((interval, min) -> intervalToDistribution.get(interval).putIfNewMin(min));
    inputRowFilter.getIntervalToMaxPartitionDimensionValue()
        .forEach((interval, max) -> intervalToDistribution.get(interval).putIfNewMax(max));
    return intervalToDistribution;
}
Also used: StringDistribution (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution), HashMap (java.util.HashMap), StringSketch (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch), InputRow (org.apache.druid.data.input.InputRow), StringTuple (org.apache.druid.data.input.StringTuple), Interval (org.joda.time.Interval)
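
To see the accumulation pattern of determineDistribution in isolation, here is a minimal sketch. Intervals.ETERNITY stands in for a real time bucket, the city names are made up, and the patch-up at the end mirrors the putIfNewMin/putIfNewMax calls above.

import java.util.HashMap;
import java.util.Map;
import org.apache.druid.data.input.StringTuple;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch;
import org.apache.druid.java.util.common.Intervals;
import org.joda.time.Interval;

public class DistributionAccumulationSketch {
    public static void main(String[] args) {
        Map<Interval, StringDistribution> intervalToDistribution = new HashMap<>();
        Interval bucket = Intervals.ETERNITY; // stands in for a row's time bucket

        // Accumulate accepted rows' partition-dimension tuples, creating the
        // sketch lazily per interval, as determineDistribution does.
        for (String value : new String[]{"berlin", "amsterdam", "zurich"}) {
            StringDistribution dist =
                intervalToDistribution.computeIfAbsent(bucket, k -> new StringSketch());
            dist.put(StringTuple.create(value));
        }

        // Dedup filtering can drop the extreme values, so the real code patches
        // the min/max back in afterward; the same calls are shown here.
        intervalToDistribution.get(bucket).putIfNewMin(StringTuple.create("aachen"));
        intervalToDistribution.get(bucket).putIfNewMax(StringTuple.create("zwolle"));
    }
}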

Example 4 with StringDistribution

Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution in project druid by druid-io.

From the class DimensionDistributionReportTest, the method setup:

@Before
public void setup() {
    // Build a report over a single, all-time interval holding one empty sketch.
    Interval interval = Intervals.ETERNITY;
    StringSketch sketch = new StringSketch();
    Map<Interval, StringDistribution> intervalToDistribution = Collections.singletonMap(interval, sketch);
    String taskId = "abc";
    target = new DimensionDistributionReport(taskId, intervalToDistribution);
}
Also used: StringSketch (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch), StringDistribution (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution), Interval (org.joda.time.Interval), Before (org.junit.Before)
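
Tying Examples 1 and 4 together, a hypothetical consumer might merge the distributions carried by a report before computing boundaries. Only the merger calls are taken from Example 1; the getter name getIntervalToDistribution() is an assumption here.

import java.util.Map;
import org.apache.druid.indexing.common.task.batch.parallel.DimensionDistributionReport;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger;
import org.joda.time.Interval;

public class ReportConsumerSketch {
    // Merges all distributions from one report into a single sketch;
    // getIntervalToDistribution() is an assumed accessor for the report's map.
    static StringDistribution mergeAll(DimensionDistributionReport report) {
        StringDistributionMerger merger = new StringSketchMerger();
        Map<Interval, StringDistribution> byInterval = report.getIntervalToDistribution();
        byInterval.values().forEach(merger::merge);
        return merger.getResult();
    }
}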

Aggregations

StringDistribution (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution): 4 usages
Interval (org.joda.time.Interval): 3 usages
InputRow (org.apache.druid.data.input.InputRow): 2 usages
DimensionRangePartitionsSpec (org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec): 2 usages
StringSketch (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch): 2 usages
HashMap (java.util.HashMap): 1 usage
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 1 usage
HandlingInputRowIterator (org.apache.druid.data.input.HandlingInputRowIterator): 1 usage
InputFormat (org.apache.druid.data.input.InputFormat): 1 usage
InputSource (org.apache.druid.data.input.InputSource): 1 usage
StringTuple (org.apache.druid.data.input.StringTuple): 1 usage
StringDistributionMerger (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger): 1 usage
StringSketchMerger (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger): 1 usage
RangePartitionIndexTaskInputRowIteratorBuilder (org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder): 1 usage
ParseExceptionHandler (org.apache.druid.segment.incremental.ParseExceptionHandler): 1 usage
RowIngestionMeters (org.apache.druid.segment.incremental.RowIngestionMeters): 1 usage
DataSchema (org.apache.druid.segment.indexing.DataSchema): 1 usage
GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec): 1 usage
PartitionBoundaries (org.apache.druid.timeline.partition.PartitionBoundaries): 1 usage
Before (org.junit.Before): 1 usage