Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch in the druid project by druid-io.
The method determineDistribution of the class PartialDimensionDistributionTask.
/**
 * Scans the input rows and builds, per time-chunk interval, a sketch of the values seen for the
 * partition dimensions. The resulting distributions are later used to pick range-partition
 * boundaries.
 *
 * @param inputRowIterator    source of input rows; may yield nulls, which are skipped
 * @param granularitySpec     determines the interval bucket each row's timestamp falls into
 * @param partitionDimensions dimensions whose values are folded into the sketch (single-valued only)
 * @param isAssumeGrouped     when true, rows are assumed pre-grouped and no dedup filtering is done
 * @return map from interval to the distribution of partition-dimension values observed in it
 */
private Map<Interval, StringDistribution> determineDistribution(HandlingInputRowIterator inputRowIterator, GranularitySpec granularitySpec, List<String> partitionDimensions, boolean isAssumeGrouped) {
  final Map<Interval, StringDistribution> distributions = new HashMap<>();
  // Rollup without the grouped assumption means duplicate rows may exist; dedup them so the
  // sketch is not skewed. Otherwise every row passes through untouched.
  final boolean needsDedup = !isAssumeGrouped && granularitySpec.isRollup();
  final InputRowFilter rowFilter = needsDedup ? dedupInputRowFilterSupplier.get() : new PassthroughInputRowFilter();

  while (inputRowIterator.hasNext()) {
    final InputRow row = inputRowIterator.next();
    if (row == null) {
      continue;
    }

    final Interval bucketInterval;
    if (granularitySpec.inputIntervals().isEmpty()) {
      bucketInterval = granularitySpec.getSegmentGranularity().bucket(row.getTimestamp());
    } else {
      final Optional<Interval> maybeInterval = granularitySpec.bucketInterval(row.getTimestamp());
      // this interval must exist since it passed the rowFilter
      assert maybeInterval.isPresent();
      bucketInterval = maybeInterval.get();
    }

    final int dimCount = partitionDimensions.size();
    final String[] dimValues = new String[dimCount];
    for (int dim = 0; dim < dimCount; dim++) {
      final List<String> rawValues = row.getDimension(partitionDimensions.get(dim));
      if (rawValues != null && !rawValues.isEmpty()) {
        // Partition dimensions must be single-valued; getOnlyElement throws otherwise.
        dimValues[dim] = Iterables.getOnlyElement(rawValues);
      }
    }

    final StringTuple tuple = StringTuple.create(dimValues);
    if (rowFilter.accept(bucketInterval, tuple, row)) {
      distributions.computeIfAbsent(bucketInterval, ignored -> new StringSketch()).put(tuple);
    }
  }

  // DedupInputRowFilter may not accept the min/max dimensionValue. If needed, add the min/max
  // values to the distributions so they have an accurate min/max.
  rowFilter.getIntervalToMinPartitionDimensionValue().forEach((interval, min) -> distributions.get(interval).putIfNewMin(min));
  rowFilter.getIntervalToMaxPartitionDimensionValue().forEach((interval, max) -> distributions.get(interval).putIfNewMax(max));
  return distributions;
}
Use of org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch in the druid project by druid-io.
The method setup of the class DimensionDistributionReportTest.
@Before
public void setup() {
  // Fixture: a report for a single eternal interval backed by an empty sketch.
  final Map<Interval, StringDistribution> intervalToDistribution =
      Collections.singletonMap(Intervals.ETERNITY, new StringSketch());
  target = new DimensionDistributionReport("abc", intervalToDistribution);
}
Aggregations