Search in sources :

Example 1 with TableSegmentUniqueIdentifier

use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.

the class CarbonInputFormat method getSplits.

/**
   * {@inheritDoc}
   * The table path to read is taken from the FileInputFormat.INPUT_DIR
   * configuration.
   *
   * @param job job context whose configuration describes the table to read
   * @return List<InputSplit> list of CarbonInputSplit
   * @throws IOException for example when the table schema cannot be loaded
   */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    AbsoluteTableIdentifier tableIdentifier = getAbsoluteTableIdentifier(job.getConfiguration());
    CacheClient indexCacheClient = new CacheClient(tableIdentifier.getStorePath());
    try {
        List<String> staleSegmentIds = new ArrayList<>();
        List<UpdateVO> staleTimestampRanges = new ArrayList<>();
        // when no segments are set to access, get all valid segments and
        // set them into the configuration
        if (getSegmentsToAccess(job).length == 0) {
            SegmentStatusManager statusManager = new SegmentStatusManager(tableIdentifier);
            SegmentStatusManager.ValidAndInvalidSegmentsInfo segmentsInfo = statusManager.getValidAndInvalidSegments();
            SegmentUpdateStatusManager updateManager = new SegmentUpdateStatusManager(tableIdentifier);
            setSegmentsToAccess(job.getConfiguration(), segmentsInfo.getValidSegments());
            if (segmentsInfo.getValidSegments().size() == 0) {
                // nothing valid to read
                return new ArrayList<>(0);
            }
            staleSegmentIds.addAll(segmentsInfo.getInvalidSegments());
            if (!staleSegmentIds.isEmpty()) {
                // remember the invalid timestamp range of every invalid segment
                for (String staleId : staleSegmentIds) {
                    staleTimestampRanges.add(updateManager.getInvalidTimestampRange(staleId));
                }
                // remove the entries of the invalid segments from the segment index
                List<TableSegmentUniqueIdentifier> staleKeys = new ArrayList<>(staleSegmentIds.size());
                for (String staleId : staleSegmentIds) {
                    staleKeys.add(new TableSegmentUniqueIdentifier(tableIdentifier, staleId));
                }
                indexCacheClient.getSegmentAccessClient().invalidateAll(staleKeys);
            }
        }
        // process and resolve the expression
        Expression filter = getFilterPredicates(job.getConfiguration());
        CarbonTable carbonTable = getCarbonTable(job.getConfiguration());
        if (carbonTable == null) {
            // the table is null in case of a corrupt schema file
            throw new IOException("Missing/Corrupt schema file for table.");
        }
        CarbonInputFormatUtil.processFilterExpression(filter, carbonTable);
        // prune partitions for a filter query on a partition table
        BitSet matchedPartitions = null;
        PartitionInfo partitionInfo = (filter == null) ? null : carbonTable.getPartitionInfo(carbonTable.getFactTableName());
        if (partitionInfo != null) {
            Partitioner partitioner = PartitionUtil.getPartitioner(partitionInfo);
            BitSet candidates = new FilterExpressionProcessor().getFilteredPartitions(filter, partitionInfo, partitioner);
            if (candidates.cardinality() == 0) {
                // no partition is required
                return new ArrayList<InputSplit>();
            }
            // when all partitions are required there is nothing to prune,
            // so the bit set is only kept if it excludes at least one partition
            if (candidates.cardinality() != partitioner.numPartitions()) {
                matchedPartitions = candidates;
            }
        }
        FilterResolverIntf resolvedFilter = CarbonInputFormatUtil.resolveFilter(filter, tableIdentifier);
        // do block filtering and get the splits
        List<InputSplit> result = getSplits(job, resolvedFilter, matchedPartitions, indexCacheClient);
        // hand the invalid segments to the task side so the index entries can be removed there too
        if (!staleSegmentIds.isEmpty()) {
            for (InputSplit split : result) {
                CarbonInputSplit carbonSplit = (CarbonInputSplit) split;
                carbonSplit.setInvalidSegments(staleSegmentIds);
                carbonSplit.setInvalidTimestampRange(staleTimestampRanges);
            }
        }
        return result;
    } finally {
        // close the cache client to clear LRU cache memory
        indexCacheClient.close();
    }
}
Also used : SegmentUpdateStatusManager(org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) SegmentStatusManager(org.apache.carbondata.core.statusmanager.SegmentStatusManager) IOException(java.io.IOException) UpdateVO(org.apache.carbondata.core.mutate.UpdateVO) TableSegmentUniqueIdentifier(org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) FilterExpressionProcessor(org.apache.carbondata.core.scan.filter.FilterExpressionProcessor) AbsoluteTableIdentifier(org.apache.carbondata.core.metadata.AbsoluteTableIdentifier) Expression(org.apache.carbondata.core.scan.expression.Expression) PartitionInfo(org.apache.carbondata.core.metadata.schema.PartitionInfo) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Partitioner(org.apache.carbondata.core.scan.partition.Partitioner) FilterResolverIntf(org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf)

Example 2 with TableSegmentUniqueIdentifier

use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.

the class CarbonInputFormat method getDataBlocksOfSegment.

/**
   * Collects the data blocks of the given segment. Index entries whose task id
   * is not set in matchedPartitions are skipped (a null bit set keeps every
   * entry). Without a resolver all blocks of an index are taken, otherwise only
   * the blocks accepted by the filter.
   */
private List<DataRefNode> getDataBlocksOfSegment(JobContext job, FilterExpressionProcessor filterExpressionProcessor, AbsoluteTableIdentifier absoluteTableIdentifier, FilterResolverIntf resolver, BitSet matchedPartitions, String segmentId, CacheClient cacheClient, SegmentUpdateStatusManager updateStatusManager) throws IOException {
    Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> taskIndexes = null;
    try {
        QueryStatisticsRecorder driverRecorder = CarbonTimeStatisticsFactory.createDriverRecorder();
        QueryStatistic loadStatistic = new QueryStatistic();
        taskIndexes = getSegmentAbstractIndexs(job, absoluteTableIdentifier, segmentId, cacheClient, updateStatusManager);
        List<DataRefNode> matchedBlocks = new LinkedList<DataRefNode>();
        if (taskIndexes != null) {
            for (Map.Entry<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> indexEntry : taskIndexes.entrySet()) {
                int taskId = CarbonTablePath.DataFileUtil.getTaskIdFromTaskNo(indexEntry.getKey().taskNo);
                // skip the entry when its partition is not required
                if (matchedPartitions != null && !matchedPartitions.get(taskId)) {
                    continue;
                }
                AbstractIndex index = indexEntry.getValue();
                if (resolver == null) {
                    // no filter is given: take all blocks from the Btree index
                    matchedBlocks.addAll(getDataBlocksOfIndex(index));
                } else {
                    // apply the filter and take the matching blocks
                    matchedBlocks.addAll(filterExpressionProcessor.getFilterredBlocks(index.getDataRefNode(), resolver, index, absoluteTableIdentifier));
                }
            }
        }
        loadStatistic.addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER, System.currentTimeMillis());
        driverRecorder.recordStatisticsForDriver(loadStatistic, job.getConfiguration().get("query.id"));
        return matchedBlocks;
    } finally {
        // clear the access count of the segment once its usage is done, so that on
        // low memory systems the same memory can be utilized efficiently
        if (taskIndexes != null) {
            List<TableSegmentUniqueIdentifier> segmentKeys = new ArrayList<>(1);
            segmentKeys.add(new TableSegmentUniqueIdentifier(absoluteTableIdentifier, segmentId));
            cacheClient.getSegmentAccessClient().clearAccessCount(segmentKeys);
        }
    }
}
Also used : DataRefNode(org.apache.carbondata.core.datastore.DataRefNode) TableSegmentUniqueIdentifier(org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier) AbstractIndex(org.apache.carbondata.core.datastore.block.AbstractIndex) QueryStatisticsRecorder(org.apache.carbondata.core.stats.QueryStatisticsRecorder) SegmentTaskIndexStore(org.apache.carbondata.core.datastore.SegmentTaskIndexStore) QueryStatistic(org.apache.carbondata.core.stats.QueryStatistic)

Example 3 with TableSegmentUniqueIdentifier

use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.

the class ManageDictionaryAndBTree method invalidateBTreeCache.

/**
 * Invalidates, for every given segment of the table, the matching entry of
 * the DRIVER_BTREE cache obtained from the cache provider.
 *
 * @param absoluteTableIdentifier identifier of the table the segments belong to
 * @param segments                segment numbers whose cache entries are invalidated
 */
public static void invalidateBTreeCache(AbsoluteTableIdentifier absoluteTableIdentifier, String[] segments) {
    Cache<Object, Object> btreeCache = CacheProvider.getInstance().createCache(CacheType.DRIVER_BTREE);
    for (int i = 0; i < segments.length; i++) {
        // the cache key is the (table, segment) pair
        btreeCache.invalidate(new TableSegmentUniqueIdentifier(absoluteTableIdentifier, segments[i]));
    }
}
Also used : TableSegmentUniqueIdentifier(org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier)

Example 4 with TableSegmentUniqueIdentifier

use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.

the class InMemoryBTreeIndex method getSegmentAbstractIndexs.

/**
 * Returns the task-to-index map of the current segment. The map is taken from
 * the segment cache when it is already present there; otherwise the table block
 * infos of the job are registered on the cache key and the segment tree is
 * loaded through the cache. The cache client is closed in every case.
 */
private Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> getSegmentAbstractIndexs(JobContext job, AbsoluteTableIdentifier identifier) throws IOException {
    CacheClient client = new CacheClient();
    TableSegmentUniqueIdentifier segmentKey = new TableSegmentUniqueIdentifier(identifier, segment.getId());
    try {
        SegmentTaskIndexWrapper cachedWrapper = client.getSegmentAccessClient().getIfPresent(segmentKey);
        Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> cachedIndexes = (cachedWrapper == null) ? null : cachedWrapper.getTaskIdToTableSegmentMap();
        if (cachedIndexes != null) {
            // segment tree is already loaded
            return cachedIndexes;
        }
        // segment tree is not loaded yet: load it
        List<TableBlockInfo> blockInfos = getTableBlockInfo(job);
        Map<String, List<TableBlockInfo>> blocksBySegment = new HashMap<>();
        blocksBySegment.put(segment.getId(), blockInfos);
        segmentKey.setSegmentToTableBlocksInfos(blocksBySegment);
        // TODO: loadAndGetTaskIdToSegmentsMap can be optimized, use tableBlockInfoList as input
        // get Btree blocks for given segment
        SegmentTaskIndexWrapper loadedWrapper = client.getSegmentAccessClient().get(segmentKey);
        return loadedWrapper.getTaskIdToTableSegmentMap();
    } finally {
        client.close();
    }
}
Also used : CacheClient(org.apache.carbondata.hadoop.CacheClient) TableBlockInfo(org.apache.carbondata.core.datastore.block.TableBlockInfo) SegmentTaskIndexWrapper(org.apache.carbondata.core.datastore.block.SegmentTaskIndexWrapper) HashMap(java.util.HashMap) AbstractIndex(org.apache.carbondata.core.datastore.block.AbstractIndex) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) TableSegmentUniqueIdentifier(org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier)

Example 5 with TableSegmentUniqueIdentifier

use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.

the class CarbonInputFormat method getSegmentAbstractIndexs.

/**
   * It returns index for each task file.
   *
   * The cached index wrapper of the segment is looked up first. When nothing is
   * cached, or when isSegmentUpdate reports the cached wrapper as updated, the
   * table block infos of the segment are collected and, if there are any, the
   * index is (re)loaded through the segment cache. For an updated segment the
   * returned map only holds the keys collected in validTaskKeys.
   *
   * @param job
   * @param absoluteTableIdentifier
   * @param segmentId
   * @param cacheClient client used to reach the segment cache
   * @param updateStatusManager source of the invalid timestamp range of the segment
   * @return task-to-index map; null when nothing was cached and no table block
   *         info was found for the segment
   * @throws IOException
   */
private Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> getSegmentAbstractIndexs(JobContext job, AbsoluteTableIdentifier absoluteTableIdentifier, String segmentId, CacheClient cacheClient, SegmentUpdateStatusManager updateStatusManager) throws IOException {
    Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> segmentIndexMap = null;
    SegmentTaskIndexWrapper segmentTaskIndexWrapper = null;
    boolean isSegmentUpdated = false;
    // task keys of the already cached index; stays null unless the segment is updated
    Set<SegmentTaskIndexStore.TaskBucketHolder> taskKeys = null;
    TableSegmentUniqueIdentifier tableSegmentUniqueIdentifier = new TableSegmentUniqueIdentifier(absoluteTableIdentifier, segmentId);
    segmentTaskIndexWrapper = cacheClient.getSegmentAccessClient().getIfPresent(tableSegmentUniqueIdentifier);
    UpdateVO updateDetails = updateStatusManager.getInvalidTimestampRange(segmentId);
    if (null != segmentTaskIndexWrapper) {
        segmentIndexMap = segmentTaskIndexWrapper.getTaskIdToTableSegmentMap();
        if (isSegmentUpdate(segmentTaskIndexWrapper, updateDetails)) {
            // NOTE: keySet() is a view of the cached map, not a copy
            taskKeys = segmentIndexMap.keySet();
            isSegmentUpdated = true;
        }
    }
    // if segment tree is not loaded, load the segment tree
    if (segmentIndexMap == null || isSegmentUpdated) {
        // if the segment is updated only the updated blocks TableInfo instance has to be
        // retrieved. the same will be filtered based on taskKeys , if the task is same
        // for the block then dont add it since already its btree is loaded.
        // NOTE(review): validTaskKeys is presumably filled by getTableBlockInfo below - confirm
        Set<SegmentTaskIndexStore.TaskBucketHolder> validTaskKeys = new HashSet<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
        // NOTE(review): the invalid timestamp range is fetched a second time here instead of
        // reusing updateDetails - confirm whether that is intentional
        List<TableBlockInfo> tableBlockInfoList = getTableBlockInfo(job, tableSegmentUniqueIdentifier, taskKeys, updateStatusManager.getInvalidTimestampRange(segmentId), updateStatusManager, segmentId, validTaskKeys);
        if (!tableBlockInfoList.isEmpty()) {
            Map<String, List<TableBlockInfo>> segmentToTableBlocksInfos = new HashMap<>();
            segmentToTableBlocksInfos.put(segmentId, tableBlockInfoList);
            // get Btree blocks for given segment
            tableSegmentUniqueIdentifier.setSegmentToTableBlocksInfos(segmentToTableBlocksInfos);
            tableSegmentUniqueIdentifier.setIsSegmentUpdated(isSegmentUpdated);
            segmentTaskIndexWrapper = cacheClient.getSegmentAccessClient().get(tableSegmentUniqueIdentifier);
            segmentIndexMap = segmentTaskIndexWrapper.getTaskIdToTableSegmentMap();
        }
        // taskKeys is only non-null for an updated segment: narrow the result to the valid task keys
        if (null != taskKeys) {
            Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> finalMap = new HashMap<>(validTaskKeys.size());
            for (SegmentTaskIndexStore.TaskBucketHolder key : validTaskKeys) {
                // a key missing from segmentIndexMap is stored with a null value
                finalMap.put(key, segmentIndexMap.get(key));
            }
            segmentIndexMap = finalMap;
        }
    }
    return segmentIndexMap;
}
Also used : TableBlockInfo(org.apache.carbondata.core.datastore.block.TableBlockInfo) UpdateVO(org.apache.carbondata.core.mutate.UpdateVO) TableSegmentUniqueIdentifier(org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier) SegmentTaskIndexWrapper(org.apache.carbondata.core.datastore.block.SegmentTaskIndexWrapper) AbstractIndex(org.apache.carbondata.core.datastore.block.AbstractIndex) SegmentTaskIndexStore(org.apache.carbondata.core.datastore.SegmentTaskIndexStore)

Aggregations

TableSegmentUniqueIdentifier (org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier): 6; SegmentTaskIndexStore (org.apache.carbondata.core.datastore.SegmentTaskIndexStore): 3; AbstractIndex (org.apache.carbondata.core.datastore.block.AbstractIndex): 3; SegmentTaskIndexWrapper (org.apache.carbondata.core.datastore.block.SegmentTaskIndexWrapper): 2; TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo): 2; UpdateVO (org.apache.carbondata.core.mutate.UpdateVO): 2; IOException (java.io.IOException): 1; Field (java.lang.reflect.Field): 1; ArrayList (java.util.ArrayList): 1; HashMap (java.util.HashMap): 1; LinkedList (java.util.LinkedList): 1; List (java.util.List): 1; BlockIndexStore (org.apache.carbondata.core.datastore.BlockIndexStore): 1; DataRefNode (org.apache.carbondata.core.datastore.DataRefNode): 1; TableBlockUniqueIdentifier (org.apache.carbondata.core.datastore.block.TableBlockUniqueIdentifier): 1; AbsoluteTableIdentifier (org.apache.carbondata.core.metadata.AbsoluteTableIdentifier): 1; PartitionInfo (org.apache.carbondata.core.metadata.schema.PartitionInfo): 1; CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable): 1; Expression (org.apache.carbondata.core.scan.expression.Expression): 1; FilterExpressionProcessor (org.apache.carbondata.core.scan.filter.FilterExpressionProcessor): 1