Search in sources :

Example 11 with Blocklet

use of org.apache.carbondata.core.indexstore.Blocklet in project carbondata by apache.

the class TableIndex method pruneMultiThread.

/**
 * Prunes the indexes of the given segments using a pool of worker threads.
 *
 * <p>The indexes are first partitioned into at most {@code numOfThreadsForPruning} groups that hold
 * a roughly equal number of entries. Each group is pruned by its own task, the results are merged
 * per segment, and finally everything is appended to {@code blocklets}.
 *
 * <p>Segments that have no index list (or an empty one) in {@code indexes} are skipped, the same
 * way {@code pruneWithFilter} skips them.
 *
 * @param segments segments whose indexes are pruned
 * @param filter filter applied while pruning
 * @param blocklets list that receives the pruned blocklets; it is also the return value
 * @param indexes index list of every segment, keyed by segment
 * @param totalFiles expected sum of {@code getNumberOfEntries()} over all indexes being pruned
 * @return {@code blocklets}, extended with every pruned blocklet
 * @throws RuntimeException if the grouping does not account for exactly {@code totalFiles}
 *     entries, if a pruning task fails, or if the calling thread is interrupted while pruning
 *     tasks are still unfinished
 */
private List<ExtendedBlocklet> pruneMultiThread(List<Segment> segments, final IndexFilter filter, List<ExtendedBlocklet> blocklets, final Map<Segment, List<Index>> indexes, int totalFiles) {
    /*
     *********************************************************************************
     * Below is an example of how this part of the code works.
     * Consider a scenario with 5 segments, 10 indexes in each segment,
     * and each index has one record. So 50 records in total.
     *
     * The indexes in each segment look like below.
     * s0 [0-9], s1 [0-9], s2 [0-9], s3[0-9], s4[0-9]
     *
     * If the number of threads is 4, filesPerEachThread = 50/4 = 12 files per thread.
     *
     * SegmentIndexGroup looks like below: [SegmentId, fromIndex, toIndex]
     * In each segment only the indexes between fromIndex and toIndex are processed.
     *
     * Final result will be: (4 lists created as numOfThreadsForPruning is 4)
     * Thread1 list: s0 [0-9], s1 [0-1]  : 12 files
     * Thread2 list: s1 [2-9], s2 [0-3]  : 12 files
     * Thread3 list: s2 [4-9], s3 [0-5]  : 12 files
     * Thread4 list: s3 [6-9], s4 [0-9]  : 14 files
     * so each thread will process an almost equal number of records.
     *
     *********************************************************************************
     */
    int numOfThreadsForPruning = CarbonProperties.getNumOfThreadsForPruning();
    int filesPerEachThread = totalFiles / numOfThreadsForPruning;
    int prev;
    int filesCount = 0;
    int processedFileCount = 0;
    List<List<SegmentIndexGroup>> indexListForEachThread = new ArrayList<>(numOfThreadsForPruning);
    List<SegmentIndexGroup> segmentIndexGroupList = new ArrayList<>();
    Set<String> missingSISegments = filter.getMissingSISegments();
    for (Segment segment : segments) {
        List<Index> eachSegmentIndexList = indexes.get(segment);
        if (eachSegmentIndexList == null || eachSegmentIndexList.isEmpty()) {
            // nothing to prune for this segment; grouping it would create an invalid [0, -1] range
            continue;
        }
        prev = 0;
        for (int i = 0; i < eachSegmentIndexList.size(); i++) {
            Index index = eachSegmentIndexList.get(i);
            filesCount += index.getNumberOfEntries();
            if (filesCount >= filesPerEachThread) {
                if (indexListForEachThread.size() != numOfThreadsForPruning - 1) {
                    // not the last segmentList
                    segmentIndexGroupList.add(new SegmentIndexGroup(segment, prev, i));
                    // save the last value to process in next thread
                    prev = i + 1;
                    indexListForEachThread.add(segmentIndexGroupList);
                    segmentIndexGroupList = new ArrayList<>();
                    processedFileCount += filesCount;
                    filesCount = 0;
                } else {
                    // last group: keep accumulating, everything remaining goes to the final thread
                    processedFileCount += filesCount;
                    filesCount = 0;
                }
            }
        }
        if (prev == 0 || prev != eachSegmentIndexList.size()) {
            // if prev == 0. Add a segment's all indexes
            // eachSegmentIndexList.size() != prev, adding the last remaining indexes of this segment
            segmentIndexGroupList.add(new SegmentIndexGroup(segment, prev, eachSegmentIndexList.size() - 1));
        }
    }
    // adding the last segmentList data
    indexListForEachThread.add(segmentIndexGroupList);
    processedFileCount += filesCount;
    if (processedFileCount != totalFiles) {
        // this should not happen
        throw new RuntimeException(" not all the files processed ");
    }
    if (indexListForEachThread.size() < numOfThreadsForPruning) {
        // If the total indexes fitted in lesser number of threads than numOfThreadsForPruning.
        // Launch only that many threads where indexes are fitted while grouping.
        LOG.info("indexes is distributed in " + indexListForEachThread.size() + " threads");
        numOfThreadsForPruning = indexListForEachThread.size();
    }
    LOG.info("Number of threads selected for multi-thread block pruning is " + numOfThreadsForPruning + ". total files: " + totalFiles + ". total segments: " + segments.size());
    List<Future<Void>> results = new ArrayList<>(numOfThreadsForPruning);
    final Map<Segment, List<ExtendedBlocklet>> prunedBlockletMap = new ConcurrentHashMap<>(segments.size());
    final ExecutorService executorService = Executors.newFixedThreadPool(numOfThreadsForPruning);
    final String threadName = Thread.currentThread().getName();
    for (int i = 0; i < numOfThreadsForPruning; i++) {
        final List<SegmentIndexGroup> segmentIndexGroups = indexListForEachThread.get(i);
        results.add(executorService.submit(new Callable<Void>() {

            @Override
            public Void call() throws IOException {
                // the worker takes over the name of the submitting thread
                Thread.currentThread().setName(threadName);
                for (SegmentIndexGroup segmentIndexGroup : segmentIndexGroups) {
                    List<ExtendedBlocklet> pruneBlocklets = new ArrayList<>();
                    List<Index> indexList = indexes.get(segmentIndexGroup.getSegment());
                    SegmentProperties segmentProperties = segmentPropertiesFetcher.getSegmentPropertiesFromIndex(indexList.get(0));
                    Segment segment = segmentIndexGroup.getSegment();
                    boolean isExternalOrMissingSISegment = segment.getSegmentPath() != null || (missingSISegments != null && missingSISegments.contains(segment.getSegmentNo()));
                    if (filter.isResolvedOnSegment(segmentProperties)) {
                        FilterExecutor filterExecutor;
                        if (!isExternalOrMissingSISegment) {
                            filterExecutor = FilterUtil.getFilterExecutorTree(filter.getResolver(), segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);
                        } else {
                            filterExecutor = FilterUtil.getFilterExecutorTree(filter.getExternalSegmentResolver(), segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);
                        }
                        for (int i = segmentIndexGroup.getFromIndex(); i <= segmentIndexGroup.getToIndex(); i++) {
                            List<Blocklet> dmPruneBlocklets;
                            if (!isExternalOrMissingSISegment) {
                                dmPruneBlocklets = indexList.get(i).prune(filter.getResolver(), segmentProperties, filterExecutor, table);
                            } else {
                                dmPruneBlocklets = indexList.get(i).prune(filter.getExternalSegmentResolver(), segmentProperties, filterExecutor, table);
                            }
                            pruneBlocklets.addAll(addSegmentId(blockletDetailsFetcher.getExtendedBlocklets(dmPruneBlocklets, segment), segment));
                        }
                    } else {
                        // the filter is not resolved for this segment's properties:
                        // work on a fresh copy of the expression and resolve it per segment
                        Expression filterExpression = filter.getNewCopyOfExpression();
                        FilterExecutor filterExecutor;
                        if (!isExternalOrMissingSISegment) {
                            filterExecutor = FilterUtil.getFilterExecutorTree(new IndexFilter(segmentProperties, table, filterExpression).getResolver(), segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);
                        } else {
                            filterExecutor = FilterUtil.getFilterExecutorTree(new IndexFilter(segmentProperties, table, filterExpression).getExternalSegmentResolver(), segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);
                        }
                        for (int i = segmentIndexGroup.getFromIndex(); i <= segmentIndexGroup.getToIndex(); i++) {
                            List<Blocklet> dmPruneBlocklets;
                            if (!isExternalOrMissingSISegment) {
                                dmPruneBlocklets = indexList.get(i).prune(filterExpression, segmentProperties, table, filterExecutor);
                            } else {
                                dmPruneBlocklets = indexList.get(i).prune(filter.getExternalSegmentFilter(), segmentProperties, table, filterExecutor);
                            }
                            pruneBlocklets.addAll(addSegmentId(blockletDetailsFetcher.getExtendedBlocklets(dmPruneBlocklets, segment), segment));
                        }
                    }
                    // one segment can be split across several threads; the lock guards both the
                    // check-then-put and the addAll on the shared (non thread-safe) list
                    synchronized (prunedBlockletMap) {
                        List<ExtendedBlocklet> pruneBlockletsExisting = prunedBlockletMap.get(segmentIndexGroup.getSegment());
                        if (pruneBlockletsExisting != null) {
                            pruneBlockletsExisting.addAll(pruneBlocklets);
                        } else {
                            prunedBlockletMap.put(segmentIndexGroup.getSegment(), pruneBlocklets);
                        }
                    }
                }
                return null;
            }
        }));
    }
    executorService.shutdown();
    try {
        executorService.awaitTermination(2, TimeUnit.HOURS);
    } catch (InterruptedException e) {
        LOG.error("Error in pruning index in multi-thread: " + e.getMessage());
        // cancel the outstanding pruning work and keep the interrupt visible to the caller,
        // so the result check below fails fast instead of blocking on unfinished tasks
        executorService.shutdownNow();
        Thread.currentThread().interrupt();
    }
    // check for error
    for (Future<Void> result : results) {
        try {
            result.get();
        } catch (InterruptedException e) {
            // restore the flag that Future.get() cleared before propagating
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (ExecutionException e) {
            throw new RuntimeException(e);
        }
    }
    for (Map.Entry<Segment, List<ExtendedBlocklet>> entry : prunedBlockletMap.entrySet()) {
        blocklets.addAll(entry.getValue());
    }
    return blocklets;
}
Also used : FilterExecutor(org.apache.carbondata.core.scan.filter.executer.FilterExecutor) ArrayList(java.util.ArrayList) Index(org.apache.carbondata.core.index.dev.Index) BlockIndex(org.apache.carbondata.core.indexstore.blockletindex.BlockIndex) CoarseGrainIndex(org.apache.carbondata.core.index.dev.cgindex.CoarseGrainIndex) Callable(java.util.concurrent.Callable) ArrayList(java.util.ArrayList) List(java.util.List) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) ExecutionException(java.util.concurrent.ExecutionException) ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) Blocklet(org.apache.carbondata.core.indexstore.Blocklet) FineGrainBlocklet(org.apache.carbondata.core.index.dev.fgindex.FineGrainBlocklet) Expression(org.apache.carbondata.core.scan.expression.Expression) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) SegmentProperties(org.apache.carbondata.core.datastore.block.SegmentProperties) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Example 12 with Blocklet

use of org.apache.carbondata.core.indexstore.Blocklet in project carbondata by apache.

the class TableIndex method prune.

/**
 * Prunes the supplied indexes for a single distributable split. Meant to be invoked on whichever
 * machine receives the distributable object once the work has been distributed.
 *
 * <p>When the index factory reports the fine-grain (FG) level, every pruned blocklet is also
 * serialized to {@code <tablePath>/<indexName>/<System.nanoTime()>} and that path is recorded on
 * the returned blocklet.
 *
 * @param indices indexes to prune
 * @param distributable split that supplies the segment being pruned
 * @param filterExp resolved filter used to build the filter executor and to prune each index
 * @param partitions partition specs; converted to partition locations for the segment-properties
 *     lookup
 * @return one detailed blocklet per pruned blocklet, each tagged with the split's segment
 * @throws IOException propagated from the underlying fetcher, file-system or serializer calls
 */
public List<ExtendedBlocklet> prune(List<Index> indices, IndexInputSplit distributable, FilterResolverIntf filterExp, List<PartitionSpec> partitions) throws IOException {
    Set<Path> partitionsToPrune = getPartitionLocations(partitions);
    SegmentProperties segmentProperties = segmentPropertiesFetcher.getSegmentProperties(distributable.getSegment(), partitionsToPrune);
    FilterExecutor filterExecutor = FilterUtil.getFilterExecutorTree(filterExp, segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);

    // step 1: collect the blocklets that survive pruning in every index
    List<Blocklet> prunedBlocklets = new ArrayList<>();
    for (Index index : indices) {
        prunedBlocklets.addAll(index.prune(filterExp, segmentProperties, filterExecutor, table));
    }

    // step 2: prepare the directory that receives serialized fine-grain blocklets
    BlockletSerializer serializer = new BlockletSerializer();
    String indexWriteDir = identifier.getTablePath() + CarbonCommonConstants.FILE_SEPARATOR + indexSchema.getIndexName();
    if (IndexLevel.FG == indexFactory.getIndexLevel()) {
        FileFactory.mkdirs(indexWriteDir);
    }

    // step 3: turn every pruned blocklet into its detailed form
    List<ExtendedBlocklet> detailedBlocklets = new ArrayList<>();
    for (Blocklet pruned : prunedBlocklets) {
        ExtendedBlocklet detailed = blockletDetailsFetcher.getExtendedBlocklet(pruned, distributable.getSegment());
        if (IndexLevel.FG == indexFactory.getIndexLevel()) {
            // the nano-time suffix gives each serialized blocklet its own file name
            String blockletWritePath = indexWriteDir + CarbonCommonConstants.FILE_SEPARATOR + System.nanoTime();
            detailed.setIndexWriterPath(blockletWritePath);
            serializer.serializeBlocklet((FineGrainBlocklet) pruned, blockletWritePath);
        }
        detailed.setSegment(distributable.getSegment());
        detailedBlocklets.add(detailed);
    }
    return detailedBlocklets;
}
Also used : Path(org.apache.hadoop.fs.Path) ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) Blocklet(org.apache.carbondata.core.indexstore.Blocklet) FineGrainBlocklet(org.apache.carbondata.core.index.dev.fgindex.FineGrainBlocklet) FilterExecutor(org.apache.carbondata.core.scan.filter.executer.FilterExecutor) ArrayList(java.util.ArrayList) Index(org.apache.carbondata.core.index.dev.Index) BlockIndex(org.apache.carbondata.core.indexstore.blockletindex.BlockIndex) CoarseGrainIndex(org.apache.carbondata.core.index.dev.cgindex.CoarseGrainIndex) SegmentProperties(org.apache.carbondata.core.datastore.block.SegmentProperties) BlockletSerializer(org.apache.carbondata.core.index.dev.BlockletSerializer) ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet)

Example 13 with Blocklet

use of org.apache.carbondata.core.indexstore.Blocklet in project carbondata by apache.

the class TableIndex method pruneWithFilter.

/**
 * Prunes the indexes of every segment one after another on the calling thread and appends the
 * resulting blocklets to {@code blocklets}.
 *
 * <p>Segments that are {@code null} or have no indexes are skipped. External segments and
 * segments listed in the filter's missing-SI set are pruned with the external-segment resolver
 * or filter; all other segments use the regular resolver or expression.
 *
 * @param segments segments to prune
 * @param filter filter applied while pruning
 * @param partitionLocations partition locations passed to the segment-properties lookup when the
 *     first index of a segment is not a {@code BlockIndex}
 * @param blocklets list that receives the pruned blocklets; it is also the return value
 * @param indexes index list of every segment, keyed by segment
 * @return {@code blocklets}, extended with every pruned blocklet
 * @throws IOException propagated from the index and fetcher calls
 */
private List<ExtendedBlocklet> pruneWithFilter(List<Segment> segments, IndexFilter filter, Set<Path> partitionLocations, List<ExtendedBlocklet> blocklets, Map<Segment, List<Index>> indexes) throws IOException {
    Set<String> missingSISegments = filter.getMissingSISegments();
    for (Segment segment : segments) {
        List<Index> segmentIndices = indexes.get(segment);
        boolean nothingToPrune = segment == null || segmentIndices == null || segmentIndices.isEmpty();
        if (nothingToPrune) {
            continue;
        }
        boolean useExternalResolver = segment.isExternalSegment() || (missingSISegments != null && missingSISegments.contains(segment.getSegmentNo()));
        List<Blocklet> prunedForSegment = new ArrayList<>();
        // a BlockIndex supplies its own segment properties; otherwise look them up by segment
        SegmentProperties segmentProperties = segmentIndices.get(0) instanceof BlockIndex
            ? segmentPropertiesFetcher.getSegmentPropertiesFromIndex(segmentIndices.get(0))
            : segmentPropertiesFetcher.getSegmentProperties(segment, partitionLocations);
        FilterExecutor filterExecutor;
        if (filter.isResolvedOnSegment(segmentProperties)) {
            // the filter's resolver fits these segment properties and is used directly
            if (useExternalResolver) {
                filterExecutor = FilterUtil.getFilterExecutorTree(filter.getExternalSegmentResolver(), segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);
            } else {
                filterExecutor = FilterUtil.getFilterExecutorTree(filter.getResolver(), segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);
            }
            for (Index index : segmentIndices) {
                if (useExternalResolver) {
                    prunedForSegment.addAll(index.prune(filter.getExternalSegmentResolver(), segmentProperties, filterExecutor, this.table));
                } else {
                    prunedForSegment.addAll(index.prune(filter.getResolver(), segmentProperties, filterExecutor, this.table));
                }
            }
        } else {
            // otherwise build a segment-specific IndexFilter from the raw expression
            Expression expression = filter.getExpression();
            if (useExternalResolver) {
                filterExecutor = FilterUtil.getFilterExecutorTree(new IndexFilter(segmentProperties, table, expression).getExternalSegmentResolver(), segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);
            } else {
                filterExecutor = FilterUtil.getFilterExecutorTree(new IndexFilter(segmentProperties, table, expression).getResolver(), segmentProperties, null, table.getMinMaxCacheColumns(segmentProperties), false);
            }
            for (Index index : segmentIndices) {
                if (useExternalResolver) {
                    prunedForSegment.addAll(index.prune(filter.getExternalSegmentFilter(), segmentProperties, table, filterExecutor));
                } else {
                    prunedForSegment.addAll(index.prune(filter.getExpression(), segmentProperties, table, filterExecutor));
                }
            }
        }
        blocklets.addAll(addSegmentId(blockletDetailsFetcher.getExtendedBlocklets(prunedForSegment, segment), segment));
    }
    return blocklets;
}
Also used : FilterExecutor(org.apache.carbondata.core.scan.filter.executer.FilterExecutor) ArrayList(java.util.ArrayList) Index(org.apache.carbondata.core.index.dev.Index) BlockIndex(org.apache.carbondata.core.indexstore.blockletindex.BlockIndex) CoarseGrainIndex(org.apache.carbondata.core.index.dev.cgindex.CoarseGrainIndex) BlockIndex(org.apache.carbondata.core.indexstore.blockletindex.BlockIndex) ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) Blocklet(org.apache.carbondata.core.indexstore.Blocklet) FineGrainBlocklet(org.apache.carbondata.core.index.dev.fgindex.FineGrainBlocklet) Expression(org.apache.carbondata.core.scan.expression.Expression) SegmentProperties(org.apache.carbondata.core.datastore.block.SegmentProperties)

Example 14 with Blocklet

use of org.apache.carbondata.core.indexstore.Blocklet in project carbondata by apache.

the class BlockletIndexFactory method getExtendedBlocklets.

/**
 * Resolves the detail information of the given blocklets for one segment. Per the original
 * contract the lookup is based on blockletId, blockId and segmentId, and it lives in
 * BlockletIndexFactory because the detail information is only available in this default index.
 *
 * @param blocklets blocklets to resolve; an empty list yields an empty result
 * @param segment segment whose block-index identifiers are used for the lookup
 * @return the detailed blocklets, in the same order as {@code blocklets}; when the first element
 *     is already an {@code ExtendedBlocklet}, every element is cast and returned as is
 * @throws IOException propagated from the identifier and blocklet lookups
 */
@Override
public List<ExtendedBlocklet> getExtendedBlocklets(List<Blocklet> blocklets, Segment segment) throws IOException {
    List<ExtendedBlocklet> detailedBlocklets = new ArrayList<>(blocklets.size() + 1);
    // nothing to resolve: hand back the empty result straight away
    if (blocklets.isEmpty()) {
        return detailedBlocklets;
    }
    // already detailed (judged by the first element): a cast is all that is needed
    if (blocklets.get(0) instanceof ExtendedBlocklet) {
        for (Blocklet alreadyDetailed : blocklets) {
            detailedBlocklets.add((ExtendedBlocklet) alreadyDetailed);
        }
        return detailedBlocklets;
    }
    // wrap each of the segment's block-index identifiers together with the carbon table
    Set<TableBlockIndexUniqueIdentifier> identifiers = getTableBlockIndexUniqueIdentifiers(segment);
    Set<TableBlockIndexUniqueIdentifierWrapper> wrappers = new HashSet<>(identifiers.size());
    for (TableBlockIndexUniqueIdentifier id : identifiers) {
        TableBlockIndexUniqueIdentifierWrapper wrapper = new TableBlockIndexUniqueIdentifierWrapper(id, this.getCarbonTable());
        wrappers.add(wrapper);
    }
    // fetch the detail information of every blocklet from the blocklet index
    for (Blocklet plain : blocklets) {
        ExtendedBlocklet detailed = getExtendedBlocklet(wrappers, plain);
        detailedBlocklets.add(detailed);
    }
    return detailedBlocklets;
}
Also used : ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) Blocklet(org.apache.carbondata.core.indexstore.Blocklet) TableBlockIndexUniqueIdentifierWrapper(org.apache.carbondata.core.indexstore.TableBlockIndexUniqueIdentifierWrapper) ArrayList(java.util.ArrayList) TableBlockIndexUniqueIdentifier(org.apache.carbondata.core.indexstore.TableBlockIndexUniqueIdentifier) ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) HashSet(java.util.HashSet)

Example 15 with Blocklet

use of org.apache.carbondata.core.indexstore.Blocklet in project carbondata by apache.

the class BlockIndex method prune.

/**
 * Prunes the rows held in {@code memoryDMStore} and returns a blocklet for every row that is kept.
 *
 * <p>Without a filter every row is kept. With a filter a row is kept only when
 * {@code addBlockBasedOnMinMaxValue} accepts its min/max values. When {@code ExplainCollector}
 * is enabled, total and hit counts are reported to it.
 *
 * @param filterExp resolved filter, or {@code null} to keep every row
 * @param filterExecutor executor used for the min/max check; replaced by one built from this
 *     index's own segment properties when {@code segmentProperties} fails validation
 * @param segmentProperties segment properties the supplied executor was built for
 * @return the blocklets that survive pruning; empty when the store has no rows
 */
private List<Blocklet> prune(FilterResolverIntf filterExp, FilterExecutor filterExecutor, SegmentProperties segmentProperties) {
    if (memoryDMStore.getRowCount() == 0) {
        return new ArrayList<>();
    }
    List<Blocklet> prunedBlocklets = new ArrayList<>();
    CarbonRowSchema[] rowSchema = getFileFooterEntrySchema();
    String filePath = getFilePath();
    int rowCount = memoryDMStore.getRowCount();
    // the total is only computed when explain output is being collected
    int totalBlocklets = ExplainCollector.enabled() ? getTotalBlocklets() : 0;
    int hitBlocklets = 0;
    if (filterExp != null) {
        // No B-tree jump logic here: the prepared start and end keys are not
        // correct for old store scenarios, so every row is visited.
        // flag deciding whether min/max is used in executor pruning for BlockletIndex
        boolean useMinMaxForPruning = useMinMaxForExecutorPruning(filterExp);
        FilterExecutor executor = filterExecutor;
        if (!validateSegmentProperties(segmentProperties)) {
            executor = FilterUtil.getFilterExecutorTree(filterExp, getSegmentProperties(), null, getMinMaxCacheColumns(), false);
        }
        for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
            IndexRow row = memoryDMStore.getIndexRow(rowSchema, rowIndex);
            boolean[] minMaxFlag = getMinMaxFlag(row, BLOCK_MIN_MAX_FLAG);
            String fileName = getFileNameWithFilePath(row, filePath);
            short blockletId = getBlockletId(row);
            boolean keepRow = addBlockBasedOnMinMaxValue(executor, getMinMaxValue(row, MAX_VALUES_INDEX), getMinMaxValue(row, MIN_VALUES_INDEX), minMaxFlag, fileName, blockletId);
            if (!keepRow) {
                continue;
            }
            prunedBlocklets.add(createBlocklet(row, fileName, blockletId, useMinMaxForPruning));
            if (ExplainCollector.enabled()) {
                hitBlocklets += getBlockletNumOfEntry(rowIndex);
            }
        }
    } else {
        // no filter: every row becomes a blocklet and everything counts as a hit
        for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
            IndexRow row = memoryDMStore.getIndexRow(rowSchema, rowIndex);
            String fileName = getFileNameWithFilePath(row, filePath);
            short blockletId = getBlockletId(row);
            prunedBlocklets.add(createBlocklet(row, fileName, blockletId, false));
        }
        hitBlocklets = totalBlocklets;
    }
    if (ExplainCollector.enabled()) {
        ExplainCollector.setShowPruningInfo(true);
        ExplainCollector.addTotalBlocklets(totalBlocklets);
        ExplainCollector.addTotalBlocks(getTotalBlocks());
        ExplainCollector.addDefaultIndexPruningHit(hitBlocklets);
    }
    return prunedBlocklets;
}
Also used : ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) Blocklet(org.apache.carbondata.core.indexstore.Blocklet) IndexRow(org.apache.carbondata.core.indexstore.row.IndexRow) ArrayList(java.util.ArrayList) CarbonRowSchema(org.apache.carbondata.core.indexstore.schema.CarbonRowSchema)

Aggregations

ArrayList (java.util.ArrayList)16 Blocklet (org.apache.carbondata.core.indexstore.Blocklet)16 ExtendedBlocklet (org.apache.carbondata.core.indexstore.ExtendedBlocklet)12 SegmentProperties (org.apache.carbondata.core.datastore.block.SegmentProperties)5 CoarseGrainIndex (org.apache.carbondata.core.index.dev.cgindex.CoarseGrainIndex)4 HashSet (java.util.HashSet)3 Index (org.apache.carbondata.core.index.dev.Index)3 FineGrainBlocklet (org.apache.carbondata.core.index.dev.fgindex.FineGrainBlocklet)3 BlockIndex (org.apache.carbondata.core.indexstore.blockletindex.BlockIndex)3 FilterExecuter (org.apache.carbondata.core.scan.filter.executer.FilterExecuter)3 FilterExecutor (org.apache.carbondata.core.scan.filter.executer.FilterExecutor)3 BitSet (java.util.BitSet)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 DataMap (org.apache.carbondata.core.datamap.dev.DataMap)2 CoarseGrainDataMap (org.apache.carbondata.core.datamap.dev.cgdatamap.CoarseGrainDataMap)2 FineGrainBlocklet (org.apache.carbondata.core.datamap.dev.fgdatamap.FineGrainBlocklet)2 TableBlockIndexUniqueIdentifier (org.apache.carbondata.core.indexstore.TableBlockIndexUniqueIdentifier)2 Expression (org.apache.carbondata.core.scan.expression.Expression)2 IOException (java.io.IOException)1