use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.
the class CarbonInputFormat method getSplits.
/**
 * {@inheritDoc}
 * The table to read is resolved from the job configuration
 * (FileInputFormat.INPUT_DIR is used to get the table path).
 *
 * @param job job context whose configuration carries the table, segment and filter settings
 * @return the splits to read; an empty list when there is no valid segment or when the
 *         filter matches no partition
 * @throws IOException if the table schema cannot be obtained (missing/corrupt schema file)
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  AbsoluteTableIdentifier identifier = getAbsoluteTableIdentifier(job.getConfiguration());
  CacheClient cacheClient = new CacheClient(identifier.getStorePath());
  try {
    List<String> invalidSegments = new ArrayList<>();
    List<UpdateVO> invalidTimestampsList = new ArrayList<>();

    // when the caller did not choose segments, resolve the valid ones and store them
    // in the configuration
    if (getSegmentsToAccess(job).length == 0) {
      SegmentStatusManager statusManager = new SegmentStatusManager(identifier);
      SegmentStatusManager.ValidAndInvalidSegmentsInfo segmentsInfo =
          statusManager.getValidAndInvalidSegments();
      SegmentUpdateStatusManager updateManager = new SegmentUpdateStatusManager(identifier);
      setSegmentsToAccess(job.getConfiguration(), segmentsInfo.getValidSegments());
      if (segmentsInfo.getValidSegments().isEmpty()) {
        return new ArrayList<>(0);
      }

      // remember the invalid segments and their invalid timestamp ranges
      invalidSegments.addAll(segmentsInfo.getInvalidSegments());
      for (String invalidId : invalidSegments) {
        invalidTimestampsList.add(updateManager.getInvalidTimestampRange(invalidId));
      }

      // drop the segment index entries of the invalid segments from the cache
      if (!invalidSegments.isEmpty()) {
        List<TableSegmentUniqueIdentifier> staleKeys = new ArrayList<>(invalidSegments.size());
        for (String invalidId : invalidSegments) {
          staleKeys.add(new TableSegmentUniqueIdentifier(identifier, invalidId));
        }
        cacheClient.getSegmentAccessClient().invalidateAll(staleKeys);
      }
    }

    // process and resolve the filter expression
    Expression filter = getFilterPredicates(job.getConfiguration());
    CarbonTable carbonTable = getCarbonTable(job.getConfiguration());
    if (carbonTable == null) {
      // the table is null in case of a corrupt schema file
      throw new IOException("Missing/Corrupt schema file for table.");
    }
    CarbonInputFormatUtil.processFilterExpression(filter, carbonTable);

    // prune partitions for a filter query on a partitioned table; null means "no pruning"
    BitSet matchedPartitions = null;
    if (filter != null) {
      PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getFactTableName());
      if (partitionInfo != null) {
        Partitioner partitioner = PartitionUtil.getPartitioner(partitionInfo);
        BitSet candidates =
            new FilterExpressionProcessor().getFilteredPartitions(filter, partitionInfo, partitioner);
        int matchedCount = candidates.cardinality();
        if (matchedCount == 0) {
          // no partition is required
          return new ArrayList<InputSplit>();
        }
        // keep the bit set only when it excludes at least one partition
        if (matchedCount != partitioner.numPartitions()) {
          matchedPartitions = candidates;
        }
      }
    }

    FilterResolverIntf filterInterface = CarbonInputFormatUtil.resolveFilter(filter, identifier);

    // do block filtering and get the splits
    List<InputSplit> splits = getSplits(job, filterInterface, matchedPartitions, cacheClient);

    // hand the invalid segments to every split so the task side can remove its index entries
    if (!invalidSegments.isEmpty()) {
      for (InputSplit split : splits) {
        CarbonInputSplit carbonSplit = (CarbonInputSplit) split;
        carbonSplit.setInvalidSegments(invalidSegments);
        carbonSplit.setInvalidTimestampRange(invalidTimestampsList);
      }
    }
    return splits;
  } finally {
    // close the cache client to clear LRU cache memory
    cacheClient.close();
  }
}
use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.
the class CarbonInputFormat method getDataBlocksOfSegment.
/**
 * Get the data blocks of the given segment.
 *
 * For every task index of the segment whose task id is selected by
 * {@code matchedPartitions} (or for all of them when it is null), the blocks are taken
 * either unfiltered from the index (no resolver) or through the filter processor.
 *
 * @return the collected blocks; empty when the segment has no index map
 * @throws IOException propagated from loading the segment indexes
 */
private List<DataRefNode> getDataBlocksOfSegment(JobContext job,
    FilterExpressionProcessor filterExpressionProcessor,
    AbsoluteTableIdentifier absoluteTableIdentifier, FilterResolverIntf resolver,
    BitSet matchedPartitions, String segmentId, CacheClient cacheClient,
    SegmentUpdateStatusManager updateStatusManager) throws IOException {
  Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> indexByTask = null;
  try {
    QueryStatisticsRecorder driverRecorder = CarbonTimeStatisticsFactory.createDriverRecorder();
    QueryStatistic loadStatistic = new QueryStatistic();
    indexByTask = getSegmentAbstractIndexs(job, absoluteTableIdentifier, segmentId, cacheClient,
        updateStatusManager);
    List<DataRefNode> selectedBlocks = new LinkedList<DataRefNode>();
    if (indexByTask != null) {
      for (Map.Entry<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> indexEntry
          : indexByTask.entrySet()) {
        int taskId =
            CarbonTablePath.DataFileUtil.getTaskIdFromTaskNo(indexEntry.getKey().taskNo);
        // skip the task when partition pruning is active and its partition is not required
        if (matchedPartitions != null && !matchedPartitions.get(taskId)) {
          continue;
        }
        AbstractIndex index = indexEntry.getValue();
        List<DataRefNode> blocksOfTask;
        if (resolver != null) {
          // apply the filter and get the matching blocks
          blocksOfTask = filterExpressionProcessor.getFilterredBlocks(index.getDataRefNode(),
              resolver, index, absoluteTableIdentifier);
        } else {
          // no filter is given: get all blocks from the Btree index
          blocksOfTask = getDataBlocksOfIndex(index);
        }
        selectedBlocks.addAll(blocksOfTask);
      }
    }
    loadStatistic.addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER,
        System.currentTimeMillis());
    driverRecorder.recordStatisticsForDriver(loadStatistic,
        job.getConfiguration().get("query.id"));
    return selectedBlocks;
  } finally {
    // clear the access count of this segment once its index map was obtained; per the
    // original note this lets the same memory be utilized efficiently on low memory systems
    if (indexByTask != null) {
      List<TableSegmentUniqueIdentifier> accessedSegments = new ArrayList<>(1);
      accessedSegments.add(new TableSegmentUniqueIdentifier(absoluteTableIdentifier, segmentId));
      cacheClient.getSegmentAccessClient().clearAccessCount(accessedSegments);
    }
  }
}
use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.
the class ManageDictionaryAndBTree method invalidateBTreeCache.
/**
 * Removes the BTree instances of the given segments from the driver BTree LRU cache.
 *
 * @param absoluteTableIdentifier identifier of the table the segments belong to
 * @param segments segment numbers whose cache entries are invalidated, one at a time
 */
public static void invalidateBTreeCache(AbsoluteTableIdentifier absoluteTableIdentifier,
    String[] segments) {
  Cache<Object, Object> btreeCache =
      CacheProvider.getInstance().createCache(CacheType.DRIVER_BTREE);
  for (int i = 0; i < segments.length; i++) {
    // the cache key is the (table, segment) pair
    btreeCache.invalidate(new TableSegmentUniqueIdentifier(absoluteTableIdentifier, segments[i]));
  }
}
use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.
the class InMemoryBTreeIndex method getSegmentAbstractIndexs.
/**
 * Returns the task-to-index map of this instance's segment.
 *
 * The map is taken from the segment access client when an entry is already present;
 * otherwise the table block infos of the job are attached to the segment identifier and
 * the entry is loaded through the client. The cache client is always closed afterwards.
 *
 * @param job job context used to collect the table block infos
 * @param identifier identifier of the table the segment belongs to
 * @return the task-to-index map of the segment
 * @throws IOException propagated from collecting block infos or loading the index
 */
private Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> getSegmentAbstractIndexs(
    JobContext job, AbsoluteTableIdentifier identifier) throws IOException {
  CacheClient cacheClient = new CacheClient();
  TableSegmentUniqueIdentifier segmentKey =
      new TableSegmentUniqueIdentifier(identifier, segment.getId());
  try {
    SegmentTaskIndexWrapper cachedWrapper =
        cacheClient.getSegmentAccessClient().getIfPresent(segmentKey);
    Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> indexByTask = null;
    if (cachedWrapper != null) {
      indexByTask = cachedWrapper.getTaskIdToTableSegmentMap();
    }
    if (indexByTask != null) {
      // segment tree is already loaded
      return indexByTask;
    }

    // segment tree is not loaded: hand the block infos to the key and load the tree
    List<TableBlockInfo> blockInfos = getTableBlockInfo(job);
    Map<String, List<TableBlockInfo>> blocksBySegment = new HashMap<>();
    blocksBySegment.put(segment.getId(), blockInfos);
    segmentKey.setSegmentToTableBlocksInfos(blocksBySegment);
    // TODO: loadAndGetTaskIdToSegmentsMap can be optimized, use tableBlockInfoList as input
    // get Btree blocks for the given segment
    SegmentTaskIndexWrapper loadedWrapper =
        cacheClient.getSegmentAccessClient().get(segmentKey);
    return loadedWrapper.getTaskIdToTableSegmentMap();
  } finally {
    cacheClient.close();
  }
}
use of org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier in project carbondata by apache.
the class CarbonInputFormat method getSegmentAbstractIndexs.
/**
 * Returns the index of each task file of the given segment.
 *
 * A cached entry is returned as is unless {@code isSegmentUpdate} reports the segment as
 * updated. When nothing is cached, or the segment was updated, the table block infos are
 * collected and, if there are any, the index is (re)loaded through the cache client. For an
 * updated segment the result is restricted to the keys placed in the valid-task-key set
 * that is handed to {@code getTableBlockInfo} (presumably filled by that call).
 *
 * @param job job context used to collect the table block infos
 * @param absoluteTableIdentifier identifier of the table the segment belongs to
 * @param segmentId id of the segment
 * @param cacheClient client giving access to the segment index cache
 * @param updateStatusManager source of the invalid timestamp range of the segment
 * @return the task-to-index map; may be null when nothing is cached and no block info
 *         was found
 * @throws IOException propagated from collecting block infos or loading the index
 */
private Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> getSegmentAbstractIndexs(
    JobContext job, AbsoluteTableIdentifier absoluteTableIdentifier, String segmentId,
    CacheClient cacheClient, SegmentUpdateStatusManager updateStatusManager)
    throws IOException {
  TableSegmentUniqueIdentifier segmentKey =
      new TableSegmentUniqueIdentifier(absoluteTableIdentifier, segmentId);
  SegmentTaskIndexWrapper indexWrapper =
      cacheClient.getSegmentAccessClient().getIfPresent(segmentKey);
  UpdateVO updateDetails = updateStatusManager.getInvalidTimestampRange(segmentId);

  Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> indexByTask = null;
  Set<SegmentTaskIndexStore.TaskBucketHolder> cachedTaskKeys = null;
  boolean segmentUpdated = false;
  if (indexWrapper != null) {
    indexByTask = indexWrapper.getTaskIdToTableSegmentMap();
    if (isSegmentUpdate(indexWrapper, updateDetails)) {
      cachedTaskKeys = indexByTask.keySet();
      segmentUpdated = true;
    }
  }

  // the segment tree is loaded and not updated: nothing more to do
  if (indexByTask != null && !segmentUpdated) {
    return indexByTask;
  }

  // if the segment is updated only the updated blocks' TableInfo instances have to be
  // retrieved. They are filtered based on the cached task keys: a block whose task is
  // already known is not added again since its btree is loaded.
  Set<SegmentTaskIndexStore.TaskBucketHolder> validTaskKeys =
      new HashSet<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
  List<TableBlockInfo> blockInfos = getTableBlockInfo(job, segmentKey, cachedTaskKeys,
      updateStatusManager.getInvalidTimestampRange(segmentId), updateStatusManager, segmentId,
      validTaskKeys);
  if (!blockInfos.isEmpty()) {
    Map<String, List<TableBlockInfo>> blocksBySegment = new HashMap<>();
    blocksBySegment.put(segmentId, blockInfos);
    // get Btree blocks for the given segment
    segmentKey.setSegmentToTableBlocksInfos(blocksBySegment);
    segmentKey.setIsSegmentUpdated(segmentUpdated);
    indexWrapper = cacheClient.getSegmentAccessClient().get(segmentKey);
    indexByTask = indexWrapper.getTaskIdToTableSegmentMap();
  }

  if (cachedTaskKeys == null) {
    return indexByTask;
  }

  // updated segment: expose only the entries of the valid task keys
  Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> validIndexes =
      new HashMap<>(validTaskKeys.size());
  for (SegmentTaskIndexStore.TaskBucketHolder validKey : validTaskKeys) {
    validIndexes.put(validKey, indexByTask.get(validKey));
  }
  return validIndexes;
}
Aggregations