Search in sources:

Example 6 with DictionaryEncodedColumn

Use of org.apache.druid.segment.column.DictionaryEncodedColumn in the druid project by druid-io.

From the class QueryableIndexVectorColumnSelectorFactory, method makeSingleValueDimensionSelector.

/**
 * Returns the single-value vector selector for {@code dimensionSpec}, creating it on first request and
 * serving it from {@code singleValueDimensionSelectorCache} afterwards.
 *
 * <p>Resolution order on a cache miss:
 * <ol>
 *   <li>If the dimension names a virtual column, the virtual columns are asked for a selector built from
 *       the index and offset; if that yields {@code null}, they are asked again with this factory.</li>
 *   <li>If the physical column is missing, is not reported as dictionary encoded, or is not of type
 *       STRING, a nil selector is returned.</li>
 *   <li>Otherwise the dictionary-encoded column produces the selector, which is then decorated by the spec.</li>
 * </ol>
 *
 * @param dimensionSpec the dimension to select; must be vectorizable
 * @return the cached or newly created selector
 * @throws ISE if the spec cannot be vectorized, or if the column may hold multiple values
 */
@Override
public SingleValueDimensionVectorSelector makeSingleValueDimensionSelector(final DimensionSpec dimensionSpec) {
    if (!dimensionSpec.canVectorize()) {
        throw new ISE("DimensionSpec[%s] cannot be vectorized", dimensionSpec);
    }

    // Explicit get-then-put instead of computeIfAbsent(): creating a selector may modify the cache itself through
    // virtual column references, triggering a ConcurrentModificationException in JDK 9 and above.
    final SingleValueDimensionVectorSelector cached = singleValueDimensionSelectorCache.get(dimensionSpec);
    if (cached != null) {
        return cached;
    }

    final String dimension = dimensionSpec.getDimension();
    final SingleValueDimensionVectorSelector created;

    if (virtualColumns.exists(dimension)) {
        final SingleValueDimensionVectorSelector fromIndex =
            virtualColumns.makeSingleValueDimensionVectorSelector(dimensionSpec, index, offset);
        created = fromIndex != null
            ? fromIndex
            : virtualColumns.makeSingleValueDimensionVectorSelector(dimensionSpec, this);
    } else {
        final ColumnHolder holder = index.getColumnHolder(dimension);
        final boolean dictionaryEncodedString = holder != null
            && holder.getCapabilities().isDictionaryEncoded().isTrue()
            && holder.getCapabilities().is(ValueType.STRING);

        if (!dictionaryEncodedString) {
            // Asking for a single-value dimension selector on a non-string column gets you a bunch of nulls.
            created = NilVectorSelector.create(offset);
        } else if (holder.getCapabilities().hasMultipleValues().isMaybeTrue()) {
            // Asking for a single-value dimension selector on a multi-value column gets you an error.
            throw new ISE("Column[%s] is multi-value, do not ask for a single-value selector", dimension);
        } else {
            @SuppressWarnings("unchecked")
            final DictionaryEncodedColumn<String> column = (DictionaryEncodedColumn<String>) getCachedColumn(dimension);
            // The holder was verified non-null above, so the column is expected to be present.
            assert column != null;
            created = dimensionSpec.decorate(column.makeSingleValueDimensionVectorSelector(offset));
        }
    }

    singleValueDimensionSelectorCache.put(dimensionSpec, created);
    return created;
}
Also used : QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) VirtualColumns(org.apache.druid.segment.VirtualColumns) DictionaryEncodedColumn(org.apache.druid.segment.column.DictionaryEncodedColumn) Closer(org.apache.druid.java.util.common.io.Closer) QueryableIndex(org.apache.druid.segment.QueryableIndex) ISE(org.apache.druid.java.util.common.ISE) ValueType(org.apache.druid.segment.column.ValueType) HashMap(java.util.HashMap) Function(java.util.function.Function) ColumnHolder(org.apache.druid.segment.column.ColumnHolder) DimensionSpec(org.apache.druid.query.dimension.DimensionSpec) Map(java.util.Map) ColumnCapabilities(org.apache.druid.segment.column.ColumnCapabilities) Nullable(javax.annotation.Nullable) BaseColumn(org.apache.druid.segment.column.BaseColumn) DimensionSpec(org.apache.druid.query.dimension.DimensionSpec) ColumnHolder(org.apache.druid.segment.column.ColumnHolder) ISE(org.apache.druid.java.util.common.ISE) DictionaryEncodedColumn(org.apache.druid.segment.column.DictionaryEncodedColumn)

Example 7 with DictionaryEncodedColumn

Use of org.apache.druid.segment.column.DictionaryEncodedColumn in the druid project by druid-io.

From the class QueryableIndexVectorColumnSelectorFactory, method makeMultiValueDimensionSelector.

/**
 * Returns the multi-value vector selector for {@code dimensionSpec}, creating it on first request and
 * serving it from {@code multiValueDimensionSelectorCache} afterwards.
 *
 * <p>On a cache miss, a dimension that names a virtual column is resolved through the virtual columns
 * (first with the index and offset, then, if that yields {@code null}, with this factory). Any other
 * dimension must be backed by a column that passes the multi-value string check below; its
 * dictionary-encoded column produces the selector, which is then decorated by the spec.
 *
 * @param dimensionSpec the dimension to select; must be vectorizable
 * @return the cached or newly created selector
 * @throws ISE if the spec cannot be vectorized, or if the column is missing, reports {@code isFalse()}
 *         for dictionary encoding, is not of type STRING, or reports {@code isFalse()} for multiple values
 */
@Override
public MultiValueDimensionVectorSelector makeMultiValueDimensionSelector(final DimensionSpec dimensionSpec) {
    if (!dimensionSpec.canVectorize()) {
        throw new ISE("DimensionSpec[%s] cannot be vectorized", dimensionSpec);
    }

    // Explicit get-then-put instead of computeIfAbsent(): creating a selector may modify the cache itself through
    // virtual column references, triggering a ConcurrentModificationException in JDK 9 and above.
    final MultiValueDimensionVectorSelector cached = multiValueDimensionSelectorCache.get(dimensionSpec);
    if (cached != null) {
        return cached;
    }

    final String dimension = dimensionSpec.getDimension();
    final MultiValueDimensionVectorSelector created;

    if (virtualColumns.exists(dimension)) {
        final MultiValueDimensionVectorSelector fromIndex =
            virtualColumns.makeMultiValueDimensionVectorSelector(dimensionSpec, index, offset);
        created = fromIndex != null
            ? fromIndex
            : virtualColumns.makeMultiValueDimensionVectorSelector(dimensionSpec, this);
    } else {
        final ColumnHolder holder = index.getColumnHolder(dimension);
        final boolean multiValueString = holder != null
            && !holder.getCapabilities().isDictionaryEncoded().isFalse()
            && holder.getCapabilities().is(ValueType.STRING)
            && !holder.getCapabilities().hasMultipleValues().isFalse();

        if (!multiValueString) {
            throw new ISE("Column[%s] is not a multi-value string column, do not ask for a multi-value selector", dimension);
        }

        @SuppressWarnings("unchecked")
        final DictionaryEncodedColumn<String> column = (DictionaryEncodedColumn<String>) getCachedColumn(dimension);
        // The holder was verified non-null above, so the column is expected to be present.
        assert column != null;
        created = dimensionSpec.decorate(column.makeMultiValueDimensionVectorSelector(offset));
    }

    multiValueDimensionSelectorCache.put(dimensionSpec, created);
    return created;
}
Also used : QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) VirtualColumns(org.apache.druid.segment.VirtualColumns) DictionaryEncodedColumn(org.apache.druid.segment.column.DictionaryEncodedColumn) Closer(org.apache.druid.java.util.common.io.Closer) QueryableIndex(org.apache.druid.segment.QueryableIndex) ISE(org.apache.druid.java.util.common.ISE) ValueType(org.apache.druid.segment.column.ValueType) HashMap(java.util.HashMap) Function(java.util.function.Function) ColumnHolder(org.apache.druid.segment.column.ColumnHolder) DimensionSpec(org.apache.druid.query.dimension.DimensionSpec) Map(java.util.Map) ColumnCapabilities(org.apache.druid.segment.column.ColumnCapabilities) Nullable(javax.annotation.Nullable) BaseColumn(org.apache.druid.segment.column.BaseColumn) DimensionSpec(org.apache.druid.query.dimension.DimensionSpec) ColumnHolder(org.apache.druid.segment.column.ColumnHolder) ISE(org.apache.druid.java.util.common.ISE) DictionaryEncodedColumn(org.apache.druid.segment.column.DictionaryEncodedColumn)

Example 8 with DictionaryEncodedColumn

Use of org.apache.druid.segment.column.DictionaryEncodedColumn in the druid project by druid-io.

From the class IndexMergerNullHandlingTest, method testStringColumnNullHandling.

/**
 * Verifies how null-like values of a string dimension "d" end up in a persisted segment.
 *
 * <p>The test builds a set of "non-null" and "null" row flavors, then for every non-empty subset of those
 * flavors persists an incremental index containing exactly those rows, loads it back, and checks:
 * <ul>
 *   <li>the column is absent when every row in the subset is a null flavor;</li>
 *   <li>otherwise, the column's dictionary, multi-value flag, cardinality, row contents, and the bitmap
 *       index entry for null all match what is computed directly from the input rows.</li>
 * </ul>
 *
 * <p>{@code normalize} and {@code getRow} are helpers defined outside this method.
 *
 * @throws Exception if persisting or loading the index fails
 */
@Test
public void testStringColumnNullHandling() throws Exception {
    // Row flavors that are expected to produce a non-null value for "d".
    List<Map<String, Object>> nonNullFlavors = new ArrayList<>();
    nonNullFlavors.add(ImmutableMap.of("d", "a"));
    nonNullFlavors.add(ImmutableMap.of("d", ImmutableList.of("a", "b")));
    // Row flavors that are expected to be treated as null for "d".
    List<Map<String, Object>> nullFlavors = new ArrayList<>();
    Map<String, Object> mMissing = ImmutableMap.of();
    Map<String, Object> mEmptyList = ImmutableMap.of("d", Collections.emptyList());
    // ImmutableMap is not used here; a HashMap is used to hold the explicit null value.
    Map<String, Object> mNull = new HashMap<>();
    mNull.put("d", null);
    Map<String, Object> mEmptyString = ImmutableMap.of("d", "");
    Map<String, Object> mListOfNull = ImmutableMap.of("d", Collections.singletonList(null));
    Map<String, Object> mListOfEmptyString = ImmutableMap.of("d", Collections.singletonList(""));
    nullFlavors.add(mMissing);
    nullFlavors.add(mEmptyList);
    nullFlavors.add(mNull);
    nullFlavors.add(mListOfNull);
    // Empty strings count as null flavors only when NullHandling.replaceWithDefault() is true.
    if (NullHandling.replaceWithDefault()) {
        nullFlavors.add(mEmptyString);
        nullFlavors.add(mListOfEmptyString);
    } else {
        nonNullFlavors.add(mEmptyString);
        nonNullFlavors.add(mListOfEmptyString);
    }
    Set<Map<String, Object>> allValues = new HashSet<>();
    allValues.addAll(nonNullFlavors);
    allValues.addAll(nullFlavors);
    // Exercise every non-empty combination of row flavors.
    for (Set<Map<String, Object>> subset : Sets.powerSet(allValues)) {
        if (subset.isEmpty()) {
            continue;
        }
        final List<Map<String, Object>> subsetList = new ArrayList<>(subset);
        // Index with no aggregators; each row is added at timestamp 0 with the single dimension "d".
        IncrementalIndex toPersist = IncrementalIndexTest.createIndex(new AggregatorFactory[] {});
        for (Map<String, Object> m : subsetList) {
            toPersist.add(new MapBasedInputRow(0L, ImmutableList.of("d"), m));
        }
        final File tempDir = temporaryFolder.newFolder();
        // Persist to disk and load the result back as a QueryableIndex, closed at the end of the iteration.
        try (QueryableIndex index = indexIO.loadIndex(indexMerger.persist(toPersist, tempDir, indexSpec, null))) {
            final ColumnHolder columnHolder = index.getColumnHolder("d");
            if (nullFlavors.containsAll(subsetList)) {
                // all null -> should be missing
                Assert.assertNull(subsetList.toString(), columnHolder);
            } else {
                Assert.assertNotNull(subsetList.toString(), columnHolder);
                // The column has multiple values if there are any lists with > 1 element in the input set.
                final boolean hasMultipleValues = subsetList.stream().anyMatch(m -> m.get("d") instanceof List && (((List) m.get("d")).size() > 1));
                // Compute all unique values, the same way that IndexMerger is expected to do it.
                final Set<String> uniqueValues = new HashSet<>();
                for (Map<String, Object> m : subsetList) {
                    final List<String> dValues = normalize(m.get("d"), hasMultipleValues);
                    uniqueValues.addAll(dValues);
                    // Any null-flavored row contributes a null entry to the expected dictionary.
                    if (nullFlavors.contains(m)) {
                        uniqueValues.add(null);
                    }
                }
                try (final DictionaryEncodedColumn<String> dictionaryColumn = (DictionaryEncodedColumn<String>) columnHolder.getColumn()) {
                    // Verify unique values against the dictionary.
                    // Expected: unique values sorted with nulls first; actual: names looked up for ids 0..cardinality-1.
                    Assert.assertEquals(subsetList.toString(), uniqueValues.stream().sorted(Comparators.naturalNullsFirst()).collect(Collectors.toList()), IntStream.range(0, dictionaryColumn.getCardinality()).mapToObj(dictionaryColumn::lookupName).collect(Collectors.toList()));
                    Assert.assertEquals(subsetList.toString(), hasMultipleValues, dictionaryColumn.hasMultipleValues());
                    Assert.assertEquals(subsetList.toString(), uniqueValues.size(), dictionaryColumn.getCardinality());
                    // Verify the expected set of rows was indexed, ignoring order.
                    // Both sides are de-duplicated with distinct() before being compared as multisets.
                    Assert.assertEquals(subsetList.toString(), ImmutableMultiset.copyOf(subsetList.stream().map(m -> normalize(m.get("d"), hasMultipleValues)).distinct().collect(Collectors.toList())), ImmutableMultiset.copyOf(IntStream.range(0, index.getNumRows()).mapToObj(rowNumber -> getRow(dictionaryColumn, rowNumber)).distinct().collect(Collectors.toList())));
                    // Verify that the bitmap index for null is correct.
                    final BitmapIndex bitmapIndex = columnHolder.getBitmapIndex();
                    // Read through the column to find all the rows that should match null.
                    final List<Integer> expectedNullRows = new ArrayList<>();
                    for (int i = 0; i < index.getNumRows(); i++) {
                        final List<String> row = getRow(dictionaryColumn, i);
                        // A row matches null if it is empty or any of its values is null or equivalent.
                        if (row.isEmpty() || row.stream().anyMatch(NullHandling::isNullOrEquivalent)) {
                            expectedNullRows.add(i);
                        }
                    }
                    Assert.assertEquals(subsetList.toString(), expectedNullRows.size() > 0, bitmapIndex.hasNulls());
                    if (expectedNullRows.size() > 0) {
                        // Null is expected at dictionary index 0 whenever any row matches null.
                        Assert.assertEquals(subsetList.toString(), 0, bitmapIndex.getIndex(null));
                        final ImmutableBitmap nullBitmap = bitmapIndex.getBitmap(bitmapIndex.getIndex(null));
                        // Collect the row numbers set in the null bitmap, in iteration order.
                        final List<Integer> actualNullRows = new ArrayList<>();
                        final IntIterator iterator = nullBitmap.iterator();
                        while (iterator.hasNext()) {
                            actualNullRows.add(iterator.next());
                        }
                        Assert.assertEquals(subsetList.toString(), expectedNullRows, actualNullRows);
                    } else {
                        // With no null rows, looking up null in the bitmap index is expected to return -1.
                        Assert.assertEquals(-1, bitmapIndex.getIndex(null));
                    }
                }
            }
        }
    }
}
Also used : IntIterator(org.roaringbitmap.IntIterator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) BitmapIndex(org.apache.druid.segment.column.BitmapIndex) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) MapBasedInputRow(org.apache.druid.data.input.MapBasedInputRow) HashSet(java.util.HashSet) ColumnHolder(org.apache.druid.segment.column.ColumnHolder) IncrementalIndex(org.apache.druid.segment.incremental.IncrementalIndex) ImmutableBitmap(org.apache.druid.collections.bitmap.ImmutableBitmap) DictionaryEncodedColumn(org.apache.druid.segment.column.DictionaryEncodedColumn) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) File(java.io.File) IncrementalIndexTest(org.apache.druid.segment.data.IncrementalIndexTest) Test(org.junit.Test)

Example 9 with DictionaryEncodedColumn

Use of org.apache.druid.segment.column.DictionaryEncodedColumn in the druid project by druid-io.

From the class SeekableStreamIndexTaskTestBase, method readSegmentColumn.

/**
 * Reads all values of a dictionary-encoded string column from a published segment.
 *
 * <p>The segment is located under
 * {@code <segmentDirectory>/<dataSource>/<intervalStart>_<intervalEnd>/<version>/<partitionNumber>},
 * its {@code index.zip} (taken from the first entry of that directory) is unzipped into a per-segment
 * folder below {@code directory}, and the unzipped index is loaded to read the column row by row.
 *
 * <p>Rows are read with {@code getSingleValueRow}, so the column is presumably expected to be
 * single-valued — confirm against callers.
 *
 * @param column     name of the column to read
 * @param descriptor identifies the segment (interval, version, partition number)
 * @return the column's value for each row, in row order
 * @throws IOException if the output directory cannot be created, no segment files are found, or
 *                     unzipping, loading, or closing the index fails
 */
protected List<String> readSegmentColumn(final String column, final SegmentDescriptor descriptor) throws IOException {
    File indexBasePath = new File(StringUtils.format("%s/%s/%s_%s/%s/%d", getSegmentDirectory(), OLD_DATA_SCHEMA.getDataSource(), descriptor.getInterval().getStart(), descriptor.getInterval().getEnd(), descriptor.getVersion(), descriptor.getPartitionNumber()));
    File outputLocation = new File(directory, StringUtils.format("%s_%s_%s_%s", descriptor.getInterval().getStart(), descriptor.getInterval().getEnd(), descriptor.getVersion(), descriptor.getPartitionNumber()));
    // mkdir() returns false both on failure and when the directory already exists; only the former is an error.
    if (!outputLocation.mkdir() && !outputLocation.isDirectory()) {
        throw new IOException("Could not create directory " + outputLocation);
    }
    // listFiles() returns null when the path is not a readable directory; fail with a clear message
    // instead of a NullPointerException or ArrayIndexOutOfBoundsException.
    final File[] segmentFiles = indexBasePath.listFiles();
    if (segmentFiles == null || segmentFiles.length == 0) {
        throw new IOException("No segment files found under " + indexBasePath);
    }
    CompressionUtils.unzip(Files.asByteSource(new File(segmentFiles[0], "index.zip")), outputLocation, Predicates.alwaysFalse(), false);
    IndexIO indexIO = new TestUtils().getTestIndexIO();
    List<String> values = new ArrayList<>();
    // Close both the loaded index and the column once the values have been copied out, so the
    // resources backing the segment are released.
    try (QueryableIndex index = indexIO.loadIndex(outputLocation);
         DictionaryEncodedColumn<String> theColumn = (DictionaryEncodedColumn<String>) index.getColumnHolder(column).getColumn()) {
        for (int i = 0; i < theColumn.length(); i++) {
            int id = theColumn.getSingleValueRow(i);
            String value = theColumn.lookupName(id);
            values.add(value);
        }
    }
    return values;
}
Also used : TestUtils(org.apache.druid.indexing.common.TestUtils) IndexIO(org.apache.druid.segment.IndexIO) QueryableIndex(org.apache.druid.segment.QueryableIndex) ArrayList(java.util.ArrayList) DictionaryEncodedColumn(org.apache.druid.segment.column.DictionaryEncodedColumn) File(java.io.File)

Aggregations

DictionaryEncodedColumn (org.apache.druid.segment.column.DictionaryEncodedColumn)9 ColumnHolder (org.apache.druid.segment.column.ColumnHolder)6 BaseColumn (org.apache.druid.segment.column.BaseColumn)5 Nullable (javax.annotation.Nullable)4 HashMap (java.util.HashMap)3 Map (java.util.Map)3 QueryableIndex (org.apache.druid.segment.QueryableIndex)3 BitmapIndex (org.apache.druid.segment.column.BitmapIndex)3 ColumnCapabilities (org.apache.druid.segment.column.ColumnCapabilities)3 File (java.io.File)2 ArrayList (java.util.ArrayList)2 Function (java.util.function.Function)2 ISE (org.apache.druid.java.util.common.ISE)2 Closer (org.apache.druid.java.util.common.io.Closer)2 DimensionSpec (org.apache.druid.query.dimension.DimensionSpec)2 RuntimeShapeInspector (org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector)2 QueryableIndexStorageAdapter (org.apache.druid.segment.QueryableIndexStorageAdapter)2 VirtualColumns (org.apache.druid.segment.VirtualColumns)2 ValueType (org.apache.druid.segment.column.ValueType)2 CloseableIndexed (org.apache.druid.segment.data.CloseableIndexed)2