Search in sources :

Example 6 with StringTuple

use of org.apache.druid.data.input.StringTuple in project druid by druid-io.

the class DimensionRangeShardSpecTest method testPossibleInDomain_nonNullValues_pruningScenarios.

/**
 * Checks that a shard spanning (Earth, France, Paris) to (Earth, USA, New York) is reported
 * as NOT possible for several query domains that fall outside of the shard's boundaries.
 */
@Test
public void testPossibleInDomain_nonNullValues_pruningScenarios() {
    setDimensions("planet", "country", "city");

    final StringTuple lowerBound = StringTuple.create("Earth", "France", "Paris");
    final StringTuple upperBound = StringTuple.create("Earth", "USA", "New York");

    // Range set that matches every possible value of a dimension: (-INF, INF)
    final RangeSet<String> everything = TreeRangeSet.create();
    everything.add(Range.all());

    final ShardSpec shardSpec = new DimensionRangeShardSpec(dimensions, lowerBound, upperBound, 0, null);
    final Map<String, RangeSet<String>> queryDomain = new HashMap<>();

    // Scenario 1: ((-INF, Earth) U (Earth, INF)) * (-INF, INF) * (-INF, INF)
    // planet: the segment range is exactly {Earth}, which the query excludes -> PRUNE
    populateDomain(
        queryDomain,
        getUnion(getRangeSet(Range.lessThan("Earth")), getRangeSet(Range.greaterThan("Earth"))),
        everything,
        everything
    );
    assertFalse(shardSpec.possibleInDomain(queryDomain));

    // Scenario 2: (-INF, INF) * (-INF, France) * (-INF, INF)
    // planet:  effective domain is {Earth}, equal to both the start and the end prefix
    // country: [France, USA] INTERSECTION (-INF, France) is empty -> PRUNE
    populateDomain(
        queryDomain,
        everything,
        getRangeSet(Range.lessThan("France")),
        everything
    );
    assertFalse(shardSpec.possibleInDomain(queryDomain));

    // Scenario 3: (-INF, INF) * (-INF, France] * (-INF, Paris)
    // planet:  effective domain is {Earth}, equal to both the start and the end prefix
    // country: [France, USA] INTERSECTION (-INF, France] is {France}, the start prefix
    // city:    [Paris, INF) INTERSECTION (-INF, Paris) is empty -> PRUNE
    populateDomain(
        queryDomain,
        everything,
        getRangeSet(Range.atMost("France")),
        getRangeSet(Range.lessThan("Paris"))
    );
    assertFalse(shardSpec.possibleInDomain(queryDomain));

    // Scenario 4: {Earth} * {USA} * (New York, INF)
    // planet:  effective domain is {Earth}, equal to both the start and the end prefix
    // country: effective domain is {USA}, the end prefix
    // city:    (-INF, New York] INTERSECTION (New York, INF) is empty -> PRUNE
    populateDomain(
        queryDomain,
        getRangeSet(Range.singleton("Earth")),
        getRangeSet(Range.singleton("USA")),
        getRangeSet(Range.greaterThan("New York"))
    );
    assertFalse(shardSpec.possibleInDomain(queryDomain));
}
Also used : HashMap(java.util.HashMap) RangeSet(com.google.common.collect.RangeSet) TreeRangeSet(com.google.common.collect.TreeRangeSet) StringTuple(org.apache.druid.data.input.StringTuple) Test(org.junit.Test)

Example 7 with StringTuple

use of org.apache.druid.data.input.StringTuple in project druid by druid-io.

the class DimensionRangeShardSpec method isInChunk.

/**
 * Checks whether the given row belongs to the chunk delimited by {@code start} and {@code end}.
 *
 * <p>The row is reduced to a tuple with one entry per partition dimension. A dimension only
 * contributes a value when the row has exactly one value for it; a missing, empty or
 * multi-valued dimension is represented as {@code null} in the tuple.
 *
 * @param dimensions names of the partition dimensions, in tuple order
 * @param start      inclusive lower bound of the chunk, or {@code null} for no lower bound
 * @param end        exclusive upper bound of the chunk, or {@code null} for no upper bound
 * @param inputRow   the row to test
 * @return true if the row's tuple is at or after {@code start} and strictly before {@code end}
 */
public static boolean isInChunk(List<String> dimensions, @Nullable StringTuple start, @Nullable StringTuple end, InputRow inputRow) {
    final String[] inputDimensionValues = new String[dimensions.size()];
    for (int i = 0; i < inputDimensionValues.length; ++i) {
        // Only a single-valued dimension yields a value; null, empty and multi-valued all map to null
        final List<String> values = inputRow.getDimension(dimensions.get(i));
        inputDimensionValues[i] = values != null && values.size() == 1 ? values.get(0) : null;
    }
    final StringTuple inputRowTuple = StringTuple.create(inputDimensionValues);
    // Check the bound for null BEFORE comparing, so an open bound never reaches compareTo
    // and the result does not depend on how compareTo treats a null argument.
    final boolean atOrAfterStart = start == null || inputRowTuple.compareTo(start) >= 0;
    final boolean beforeEnd = end == null || inputRowTuple.compareTo(end) < 0;
    return atOrAfterStart && beforeEnd;
}
Also used : StringTuple(org.apache.druid.data.input.StringTuple)

Example 8 with StringTuple

use of org.apache.druid.data.input.StringTuple in project druid by druid-io.

the class DimensionRangeShardSpecTest method testPossibleInDomain_withNullStart.

/**
 * Exercises possibleInDomain on a shard that has no lower bound (null start) and ends at
 * (Saturn, Foo, Bar), covering both accepted and pruned query domains.
 */
@Test
public void testPossibleInDomain_withNullStart() {
    setDimensions("planet", "country", "city");

    // A null start means the shard is unbounded from below: (-INF, -INF, -INF)
    final StringTuple lowerBound = null;
    final StringTuple upperBound = StringTuple.create("Saturn", "Foo", "Bar");

    // Range set that matches every possible value of a dimension: (-INF, INF)
    final RangeSet<String> everything = TreeRangeSet.create();
    everything.add(Range.all());

    final ShardSpec shardSpec = new DimensionRangeShardSpec(dimensions, lowerBound, upperBound, 0, null);
    final Map<String, RangeSet<String>> queryDomain = new HashMap<>();

    // Scenario 1: {Mars} * {Zoo, Zuu} * {Blah, Random}
    // planet: (-INF, Saturn] INTERSECTION {Mars} is {Mars}, non-empty and not the end prefix -> ACCEPT
    populateDomain(
        queryDomain,
        getRangeSet(Range.singleton("Mars")),
        getUnion(getRangeSet(Range.singleton("Zoo")), getRangeSet(Range.singleton("Zuu"))),
        getUnion(getRangeSet(Range.singleton("Blah")), getRangeSet(Range.singleton("Random")))
    );
    assertTrue(shardSpec.possibleInDomain(queryDomain));

    // Scenario 2: {Saturn} * (-INF, INF) * (-INF, INF)
    // planet:  effective domain is {Saturn}, the end prefix
    // country: (-INF, Foo] holds more than the boundary value -> ACCEPT
    populateDomain(
        queryDomain,
        getRangeSet(Range.singleton("Saturn")),
        everything,
        everything
    );
    assertTrue(shardSpec.possibleInDomain(queryDomain));

    // Scenario 3: {Saturn} * {Zoo} * (-INF, INF)
    // planet:  effective domain is {Saturn}, the end prefix
    // country: (-INF, Foo] INTERSECTION {Zoo} is empty -> PRUNE
    populateDomain(
        queryDomain,
        getRangeSet(Range.singleton("Saturn")),
        getRangeSet(Range.singleton("Zoo")),
        everything
    );
    assertFalse(shardSpec.possibleInDomain(queryDomain));

    // Scenario 4: {Xeon} * (-INF, INF) * (-INF, INF)
    // planet: (-INF, Saturn] INTERSECTION {Xeon} is empty -> PRUNE
    populateDomain(
        queryDomain,
        getRangeSet(Range.singleton("Xeon")),
        everything,
        everything
    );
    assertFalse(shardSpec.possibleInDomain(queryDomain));
}
Also used : HashMap(java.util.HashMap) RangeSet(com.google.common.collect.RangeSet) TreeRangeSet(com.google.common.collect.TreeRangeSet) StringTuple(org.apache.druid.data.input.StringTuple) Test(org.junit.Test)

Example 9 with StringTuple

use of org.apache.druid.data.input.StringTuple in project druid by druid-io.

the class DimensionRangeShardSpec method possibleInDomain.

/**
 * Decides whether this segment may contain rows matching the query domain, walking the
 * partition dimensions from first to last.
 *
 * <p>Notation: Set[:i] is the cartesian product Set[0] * ... * Set[i - 1], and
 * EffectiveDomain[:i] is QueryDomain[:i] INTERSECTION SegmentRange[:i].
 *
 * <pre>
 * i = 1
 * if EffectiveDomain[:i] == {}:
 *   PRUNE segment
 * else if EffectiveDomain[:i] == {start[:i]} || EffectiveDomain[:i] == {end[:i]}:
 *   if i == number of dimensions:
 *     ACCEPT segment
 *   else:
 *     REPEAT with i = i + 1
 * else:
 *   ACCEPT segment
 * </pre>
 *
 * <p>Examples for an index on (Hour, Minute, Second), i.e. three dimensions:
 *
 * <pre>
 * I)   start = (3, 25, 10), end = (5, 15, 30)
 *      query domain = {3} * [0, 10] * {10, 20, 30, 40}
 *      EffectiveDomain[:1] == {3} == {start[:1]}
 *      EffectiveDomain[:2] == {3} * ([0, 10] INTERSECTION [25, INF))
 *                          == {} -> PRUNE
 *
 * II)  start = (3, 25, 10), end = (5, 15, 30)
 *      query domain = {4} * [0, 10] * {10, 20, 30, 40}
 *      EffectiveDomain[:1] == {4} (!= {} && != {start[:1]} && != {end[:1]}) -> ACCEPT
 *
 * III) start = (3, 25, 10), end = (5, 15, 30)
 *      query domain = {5} * [0, 10] * {10, 20, 30, 40}
 *      EffectiveDomain[:1] == {5} == {end[:1]}
 *      EffectiveDomain[:2] == {5} * ([0, 10] INTERSECTION (-INF, 15])
 *                          == {5} * [0, 10] (!= {} && != {end[:2]}) -> ACCEPT
 *
 * IV)  start = (3, 25, 10), end = (5, 15, 30)
 *      query domain = {5} * [15, 40] * {10, 20, 30, 40}
 *      EffectiveDomain[:1] == {5} == {end[:1]}
 *      EffectiveDomain[:2] == {5} * ([15, 40] INTERSECTION (-INF, 15])
 *                          == {5} * {15} == {end[:2]}
 *      EffectiveDomain[:3] == {5} * {15} * ({10, 20, 30, 40} INTERSECTION (-INF, 30])
 *                          == {5} * {15} * {10, 20, 30} != {} -> ACCEPT
 *
 * V)   start = (3, 25, 10), end = (5, 15, 30)
 *      query domain = {5} * [15, 40] * {50}
 *      EffectiveDomain[:1] == {5} == {end[:1]}
 *      EffectiveDomain[:2] == {5} * ([15, 40] INTERSECTION (-INF, 15])
 *                          == {5} * {15} == {end[:2]}
 *      EffectiveDomain[:3] == {5} * {15} * ({50} INTERSECTION (-INF, 30])
 *                          == {5} * {15} * {}
 *                          == {} -> PRUNE
 * </pre>
 *
 * @param domain The domain inferred from the query, keyed by dimension name. Assumed to be
 *               non-empty; a dimension without an entry is treated as unconstrained.
 * @return true if the segment needs to be considered for the query, false if it can be pruned
 */
@Override
public boolean possibleInDomain(Map<String, RangeSet<String>> domain) {
    final int dimensionCount = dimensions.size();
    // A missing boundary is modelled as a tuple of nulls, i.e. unbounded in every dimension
    final StringTuple lowerBound = start != null ? start : new StringTuple(new String[dimensionCount]);
    final StringTuple upperBound = end != null ? end : new StringTuple(new String[dimensionCount]);

    // True while the effective domain of all previous dimensions is exactly the start prefix
    boolean pinnedToStart = true;
    // True while the effective domain of all previous dimensions is exactly the end prefix
    boolean pinnedToEnd = true;

    for (int idx = 0; idx < dimensionCount; idx++) {
        final String startValue = lowerBound.get(idx);
        final String endValue = upperBound.get(idx);

        // A dimension the query does not constrain matches everything
        RangeSet<String> queryRanges = domain.get(dimensions.get(idx));
        if (queryRanges == null) {
            queryRanges = TreeRangeSet.create();
            queryRanges.add(Range.all());
        }

        // A boundary only restricts this dimension while every earlier dimension sits on that boundary
        Range<String> segmentRange = Range.all();
        if (pinnedToStart && startValue != null) {
            segmentRange = segmentRange.intersection(Range.atLeast(startValue));
        }
        if (pinnedToEnd && endValue != null) {
            segmentRange = segmentRange.intersection(Range.atMost(endValue));
        }

        // EffectiveDomain[idx] = QueryDomain[idx] INTERSECTION SegmentRange[idx]
        final RangeSet<String> effectiveRanges = queryRanges.subRangeSet(segmentRange);
        if (effectiveRanges.isEmpty()) {
            // Nothing the query asks for can live in this segment
            return false;
        }

        // Stay pinned to a boundary only if the effective domain is just that boundary value
        pinnedToStart = pinnedToStart && isRangeSetSingletonWithVal(effectiveRanges, startValue);
        pinnedToEnd = pinnedToEnd && isRangeSetSingletonWithVal(effectiveRanges, endValue);

        if (!(pinnedToStart || pinnedToEnd)) {
            // The effective domain reaches inside the segment; later dimensions cannot prune it
            return true;
        }
    }
    return true;
}
Also used : StringTuple(org.apache.druid.data.input.StringTuple)

Example 10 with StringTuple

use of org.apache.druid.data.input.StringTuple in project druid by druid-io.

the class PartialDimensionDistributionTask method determineDistribution.

/**
 * Collects, per time interval, the distribution of partition-dimension value tuples seen in
 * the input rows.
 *
 * <p>Rows returned as {@code null} by the iterator are skipped. A partition dimension that is
 * missing or empty on a row is left as {@code null} in the tuple; otherwise its single value
 * is taken via {@code Iterables.getOnlyElement}, which rejects multi-valued dimensions.
 *
 * @param inputRowIterator    source of the rows to inspect
 * @param granularitySpec     used to map a row timestamp to its interval and to check rollup
 * @param partitionDimensions names of the dimensions forming the partition tuple, in order
 * @param isAssumeGrouped     when true, the pass-through row filter is always used
 * @return map from interval to the distribution of tuples accepted by the row filter
 */
private Map<Interval, StringDistribution> determineDistribution(HandlingInputRowIterator inputRowIterator, GranularitySpec granularitySpec, List<String> partitionDimensions, boolean isAssumeGrouped) {
    final Map<Interval, StringDistribution> distributions = new HashMap<>();

    // The dedup filter is only used for rollup ingestion when the input is not assumed grouped
    final InputRowFilter rowFilter;
    if (isAssumeGrouped || !granularitySpec.isRollup()) {
        rowFilter = new PassthroughInputRowFilter();
    } else {
        rowFilter = dedupInputRowFilterSupplier.get();
    }

    while (inputRowIterator.hasNext()) {
        final InputRow row = inputRowIterator.next();
        if (row == null) {
            continue;
        }

        final Interval bucket;
        if (!granularitySpec.inputIntervals().isEmpty()) {
            final Optional<Interval> maybeBucket = granularitySpec.bucketInterval(row.getTimestamp());
            // this interval must exist since it passed the rowFilter
            assert maybeBucket.isPresent();
            bucket = maybeBucket.get();
        } else {
            // No explicit intervals configured: derive the bucket from the segment granularity
            bucket = granularitySpec.getSegmentGranularity().bucket(row.getTimestamp());
        }

        // Build the tuple of partition values; entries stay null for missing/empty dimensions
        final int tupleSize = partitionDimensions.size();
        final String[] tupleValues = new String[tupleSize];
        for (int pos = 0; pos < tupleSize; ++pos) {
            final List<String> rowValues = row.getDimension(partitionDimensions.get(pos));
            if (rowValues == null || rowValues.isEmpty()) {
                continue;
            }
            tupleValues[pos] = Iterables.getOnlyElement(rowValues);
        }
        final StringTuple tuple = StringTuple.create(tupleValues);

        if (rowFilter.accept(bucket, tuple, row)) {
            distributions.computeIfAbsent(bucket, ignored -> new StringSketch()).put(tuple);
        }
    }

    // DedupInputRowFilter may not accept the min/max dimensionValue. If needed, add the min/max
    // values to the distributions so they have an accurate min/max.
    rowFilter.getIntervalToMinPartitionDimensionValue()
             .forEach((bucket, smallest) -> distributions.get(bucket).putIfNewMin(smallest));
    rowFilter.getIntervalToMaxPartitionDimensionValue()
             .forEach((bucket, largest) -> distributions.get(bucket).putIfNewMax(largest));
    return distributions;
}
Also used : StringDistribution(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution) HashMap(java.util.HashMap) StringSketch(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch) InputRow(org.apache.druid.data.input.InputRow) StringTuple(org.apache.druid.data.input.StringTuple) Interval(org.joda.time.Interval)

Aggregations

StringTuple (org.apache.druid.data.input.StringTuple)11 HashMap (java.util.HashMap)5 Test (org.junit.Test)5 RangeSet (com.google.common.collect.RangeSet)4 TreeRangeSet (com.google.common.collect.TreeRangeSet)4 Interval (org.joda.time.Interval)2 File (java.io.File)1 ArrayList (java.util.ArrayList)1 InputRow (org.apache.druid.data.input.InputRow)1 StringDistribution (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution)1 StringSketch (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch)1 DataSegment (org.apache.druid.timeline.DataSegment)1 DimensionRangeShardSpec (org.apache.druid.timeline.partition.DimensionRangeShardSpec)1 PartitionBoundaries (org.apache.druid.timeline.partition.PartitionBoundaries)1