use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
the class CarbonCompactionExecutor method processTableBlocks.
/**
* Processes the table blocks for compaction.
*
* @return Map of String with Carbon iterators
* Map has 2 elements: UNSORTED and SORTED
* Map(UNSORTED) = List of Iterators which yield unsorted data
* Map(SORTED) = List of Iterators which yield sorted data
* For range-column compaction a filter expression is supplied and pushed into the query model
*/
public Map<String, List<RawResultIterator>> processTableBlocks(Configuration configuration, Expression filterExpr) throws IOException {
Map<String, List<RawResultIterator>> resultList = new HashMap<>(2);
resultList.put(CarbonCompactionUtil.UNSORTED_IDX, new ArrayList<RawResultIterator>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE));
resultList.put(CarbonCompactionUtil.SORTED_IDX, new ArrayList<RawResultIterator>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE));
List<TableBlockInfo> tableBlockInfos = null;
QueryModelBuilder builder = null;
if (null == filterExpr) {
builder = new QueryModelBuilder(carbonTable).projectAllColumns().dataConverter(dataTypeConverter).enableForcedDetailRawQuery();
} else {
builder = new QueryModelBuilder(carbonTable).projectAllColumns().filterExpression(new IndexFilter(carbonTable, filterExpr)).dataConverter(dataTypeConverter).enableForcedDetailRawQuery().convertToRangeFilter(false);
}
if (enablePageLevelReaderForCompaction()) {
builder.enableReadPageByPage();
}
queryModel = builder.build();
// iterate each seg ID
for (Map.Entry<String, TaskBlockInfo> taskMap : segmentMapping.entrySet()) {
String segmentId = taskMap.getKey();
List<DataFileFooter> listMetadata = dataFileMetadataSegMapping.get(segmentId);
// for each segment get taskblock info
TaskBlockInfo taskBlockInfo = taskMap.getValue();
Set<String> taskBlockListMapping = taskBlockInfo.getTaskSet();
// Check if block needs sorting or not
boolean sortingRequired = !CarbonCompactionUtil.isSortedByCurrentSortColumns(carbonTable, listMetadata.get(0));
for (String task : taskBlockListMapping) {
tableBlockInfos = taskBlockInfo.getTableBlockInfoList(task);
// during an update the column value size (cardinality) may change within the segment,
// which can cause failures while converting the rows. So get all the blocks present in a
// task, split them into multiple lists of the same column value size, and create a separate
// RawResultIterator for each list. If all the blocks share the same column value size,
// then make a single RawResultIterator for all the blocks
List<List<TableBlockInfo>> listOfTableBlocksBasedOnKeyLength = getListOfTableBlocksBasedOnColumnValueSize(tableBlockInfos);
for (List<TableBlockInfo> tableBlockInfoList : listOfTableBlocksBasedOnKeyLength) {
Collections.sort(tableBlockInfoList);
LOGGER.info("for task -" + task + "- in segment id -" + segmentId + "- block size is -" + tableBlockInfos.size());
queryModel.setTableBlockInfos(tableBlockInfoList);
if (sortingRequired) {
resultList.get(CarbonCompactionUtil.UNSORTED_IDX).add(getRawResultIterator(configuration, segmentId, task, tableBlockInfoList));
} else {
resultList.get(CarbonCompactionUtil.SORTED_IDX).add(getRawResultIterator(configuration, segmentId, task, tableBlockInfoList));
}
}
}
}
return resultList;
}
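For range-column compaction the caller passes a filter Expression that processTableBlocks wraps into an IndexFilter. A minimal sketch of how such a call could look, built with CarbonData's standard expression classes; the column name "id", the bounds, and the pre-existing executor and hadoopConf variables are assumptions for illustration only:

import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.scan.expression.ColumnExpression;
import org.apache.carbondata.core.scan.expression.Expression;
import org.apache.carbondata.core.scan.expression.LiteralExpression;
import org.apache.carbondata.core.scan.expression.conditional.GreaterThanEqualToExpression;
import org.apache.carbondata.core.scan.expression.conditional.LessThanExpression;
import org.apache.carbondata.core.scan.expression.logical.AndExpression;

// Hypothetical range predicate for a range column "id": 1000 <= id < 2000.
Expression lowerBound = new GreaterThanEqualToExpression(
    new ColumnExpression("id", DataTypes.INT),
    new LiteralExpression(1000, DataTypes.INT));
Expression upperBound = new LessThanExpression(
    new ColumnExpression("id", DataTypes.INT),
    new LiteralExpression(2000, DataTypes.INT));
Expression rangeFilter = new AndExpression(lowerBound, upperBound);

// "executor" (a CarbonCompactionExecutor) and "hadoopConf" are assumed to already exist.
Map<String, List<RawResultIterator>> iterators =
    executor.processTableBlocks(hadoopConf, rangeFilter);
List<RawResultIterator> alreadySorted = iterators.get(CarbonCompactionUtil.SORTED_IDX);
List<RawResultIterator> needSorting = iterators.get(CarbonCompactionUtil.UNSORTED_IDX);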
use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
the class CarbonTableReader method getInputSplits.
/**
* Get the carbon multi-block input splits
*
* @param tableCacheModel cached table
* @param filters carbonData filters
* @param filteredPartitions matched partitionSpec for the filter
* @param config hadoop conf
* @return list of multi-block splits
* @throws IOException
*/
public List<CarbonLocalMultiBlockSplit> getInputSplits(CarbonTableCacheModel tableCacheModel, Expression filters, List<PartitionSpec> filteredPartitions, Configuration config) throws IOException {
List<CarbonLocalInputSplit> result = new ArrayList<>();
List<CarbonLocalMultiBlockSplit> multiBlockSplitList = new ArrayList<>();
CarbonTable carbonTable = tableCacheModel.getCarbonTable();
TableInfo tableInfo = tableCacheModel.getCarbonTable().getTableInfo();
config.set("presto.cli.query.id", prestoQueryId);
config.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
String carbonTablePath = carbonTable.getAbsoluteTableIdentifier().getTablePath();
config.set(CarbonTableInputFormat.INPUT_DIR, carbonTablePath);
config.set(CarbonTableInputFormat.DATABASE_NAME, carbonTable.getDatabaseName());
config.set(CarbonTableInputFormat.TABLE_NAME, carbonTable.getTableName());
config.set("query.id", queryId);
CarbonInputFormat.setTransactionalTable(config, carbonTable.isTransactionalTable());
CarbonInputFormat.setTableInfo(config, carbonTable.getTableInfo());
if (CarbonProperties.getInstance().isCoarseGrainSecondaryIndex(tableInfo.getDatabaseName(), tableInfo.getFactTable().getTableName(), "true")) {
CarbonInputFormat.checkAndSetSecondaryIndexPruning(carbonTable.getTableInfo(), filters, config);
}
JobConf jobConf = new JobConf(config);
try {
CarbonTableInputFormat.setTableInfo(config, tableInfo);
CarbonTableInputFormat<Object> carbonTableInputFormat = createInputFormat(jobConf, carbonTable.getAbsoluteTableIdentifier(), new IndexFilter(carbonTable, filters, true), filteredPartitions);
Job job = Job.getInstance(jobConf);
List<InputSplit> splits = carbonTableInputFormat.getSplits(job);
Gson gson = new Gson();
if (splits != null && splits.size() > 0) {
for (InputSplit inputSplit : splits) {
CarbonInputSplit carbonInputSplit = (CarbonInputSplit) inputSplit;
result.add(new CarbonLocalInputSplit(carbonInputSplit.getSegmentId(), carbonInputSplit.getPath().toString(), carbonInputSplit.getStart(), carbonInputSplit.getLength(), Arrays.asList(carbonInputSplit.getLocations()), carbonInputSplit.getNumberOfBlocklets(), carbonInputSplit.getVersion().number(), carbonInputSplit.getDeleteDeltaFiles(), carbonInputSplit.getBlockletId(), gson.toJson(carbonInputSplit.getDetailInfo()), carbonInputSplit.getFileFormat().ordinal()));
}
// Use block distribution: group splits belonging to the same segment and file into one multi-block split
List<List<CarbonLocalInputSplit>> inputSplits = new ArrayList<>(result.stream().collect(Collectors.groupingBy(carbonInput -> {
if (FileFormat.ROW_V1.equals(carbonInput.getFileFormat())) {
return carbonInput.getSegmentId().concat(carbonInput.getPath()).concat(carbonInput.getStart() + "");
}
return carbonInput.getSegmentId().concat(carbonInput.getPath());
})).values());
// TODO: try to optimize the below logic as it may slow down for huge splits
for (int j = 0; j < inputSplits.size(); j++) {
multiBlockSplitList.add(new CarbonLocalMultiBlockSplit(inputSplits.get(j), inputSplits.get(j).stream().flatMap(f -> Arrays.stream(getLocations(f))).distinct().toArray(String[]::new)));
}
LOGGER.error("Size fo MultiblockList " + multiBlockSplitList.size());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return multiBlockSplitList;
}
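The grouping step above turns per-block splits into multi-block splits: splits from the same segment and file collapse into one group, while streaming ROW_V1 files are additionally keyed by their start offset. A self-contained sketch of that grouping idea with placeholder types (SplitInfo and groupKey are illustrative, not CarbonData classes):

import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;

class SplitInfo {
  final String segmentId;
  final String path;
  final long start;
  final boolean streamingRowFormat; // corresponds to ROW_V1 files in the original code

  SplitInfo(String segmentId, String path, long start, boolean streamingRowFormat) {
    this.segmentId = segmentId;
    this.path = path;
    this.start = start;
    this.streamingRowFormat = streamingRowFormat;
  }

  // Same segment + same file go together; streaming files stay split per start offset.
  String groupKey() {
    return streamingRowFormat ? segmentId + path + start : segmentId + path;
  }
}

class BlockDistributionSketch {
  static Collection<List<SplitInfo>> groupSplits(List<SplitInfo> splits) {
    return splits.stream()
        .collect(Collectors.groupingBy(SplitInfo::groupKey))
        .values();
  }
}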
use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
the class CarbondataPageSource method createQueryModel.
/**
* @param carbondataSplit hive split carrying the serialized carbon multi-block split
* @param tableHandle connector table handle used to derive the filter predicate
* @param columns projection columns to read
* @param conf hadoop configuration used to build the job conf
* @return query model for reading the given split
*/
private QueryModel createQueryModel(HiveSplit carbondataSplit, ConnectorTableHandle tableHandle, List<? extends ColumnHandle> columns, Configuration conf) {
try {
CarbonProjection carbonProjection = getCarbonProjection(columns);
conf.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
String carbonTablePath = carbonTable.getAbsoluteTableIdentifier().getTablePath();
CarbonTableInputFormat.setTransactionalTable(conf, carbonTable.getTableInfo().isTransactionalTable());
CarbonTableInputFormat.setTableInfo(conf, carbonTable.getTableInfo());
conf.set(CarbonTableInputFormat.INPUT_DIR, carbonTablePath);
conf.set("query.id", queryId);
JobConf jobConf = new JobConf(conf);
HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
CarbonTableInputFormat carbonTableInputFormat = createInputFormat(jobConf, carbonTable, new IndexFilter(carbonTable, PrestoFilterUtil.parseFilterExpression(hiveTable.getCompactEffectivePredicate())), carbonProjection);
TaskAttemptContextImpl hadoopAttemptContext = new TaskAttemptContextImpl(jobConf, new TaskAttemptID("", 1, TaskType.MAP, 0, 0));
CarbonMultiBlockSplit carbonInputSplit = CarbonLocalMultiBlockSplit.convertSplit(carbondataSplit.getSchema().getProperty("carbonSplit"));
QueryModel queryModel = carbonTableInputFormat.createQueryModel(carbonInputSplit, hadoopAttemptContext);
queryModel.setQueryId(queryId);
queryModel.setVectorReader(true);
queryModel.setStatisticsRecorder(CarbonTimeStatisticsFactory.createExecutorRecorder(queryModel.getQueryId()));
List<TableBlockInfo> tableBlockInfoList = CarbonInputSplit.createBlocks(carbonInputSplit.getAllSplits());
queryModel.setTableBlockInfos(tableBlockInfoList);
return queryModel;
} catch (IOException e) {
throw new RuntimeException("Unable to get the Query Model ", e);
}
}
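The same kind of QueryModel can also be assembled directly through QueryModelBuilder, as the compaction executor earlier on this page does. A minimal sketch under the assumption that carbonTable, filterExpr, dataTypeConverter and queryId are already available in scope:

// Minimal sketch only; mirrors the builder calls shown in processTableBlocks above.
QueryModel model = new QueryModelBuilder(carbonTable)
    .projectAllColumns()
    .filterExpression(new IndexFilter(carbonTable, filterExpr))
    .dataConverter(dataTypeConverter)
    .enableForcedDetailRawQuery()
    .build();
model.setQueryId(queryId);
model.setVectorReader(true);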
use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
the class CarbondataPageSource method createQueryModel.
/**
* @param carbondataSplit hive split carrying the serialized carbon multi-block split
* @param columns projection columns to read
* @param conf hadoop configuration used to build the job conf
* @return query model for reading the given split
*/
private QueryModel createQueryModel(HiveSplit carbondataSplit, List<? extends ColumnHandle> columns, Configuration conf) {
try {
CarbonProjection carbonProjection = getCarbonProjection(columns);
conf.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
String carbonTablePath = carbonTable.getAbsoluteTableIdentifier().getTablePath();
CarbonTableInputFormat.setTransactionalTable(conf, carbonTable.getTableInfo().isTransactionalTable());
CarbonTableInputFormat.setTableInfo(conf, carbonTable.getTableInfo());
conf.set(CarbonTableInputFormat.INPUT_DIR, carbonTablePath);
conf.set("query.id", queryId);
JobConf jobConf = new JobConf(conf);
CarbonTableInputFormat carbonTableInputFormat = createInputFormat(jobConf, carbonTable, new IndexFilter(carbonTable, PrestoFilterUtil.parseFilterExpression(carbondataSplit.getEffectivePredicate())), carbonProjection);
TaskAttemptContextImpl hadoopAttemptContext = new TaskAttemptContextImpl(jobConf, new TaskAttemptID("", 1, TaskType.MAP, 0, 0));
CarbonMultiBlockSplit carbonInputSplit = CarbonLocalMultiBlockSplit.convertSplit(carbondataSplit.getSchema().getProperty("carbonSplit"));
QueryModel queryModel = carbonTableInputFormat.createQueryModel(carbonInputSplit, hadoopAttemptContext);
queryModel.setQueryId(queryId);
queryModel.setVectorReader(true);
queryModel.setStatisticsRecorder(CarbonTimeStatisticsFactory.createExecutorRecorder(queryModel.getQueryId()));
List<TableBlockInfo> tableBlockInfoList = CarbonInputSplit.createBlocks(carbonInputSplit.getAllSplits());
queryModel.setTableBlockInfos(tableBlockInfoList);
return queryModel;
} catch (IOException e) {
throw new RuntimeException("Unable to get the Query Model ", e);
}
}
use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
the class BlockletIndexFactory method getTableBlockIndexUniqueIdentifierUsingSegmentMinMax.
/**
* Using block-level min/max values, identify whether the segment has to be added for further
* pruning and whether its segment index info has to be loaded into the cache
* @param segment segment to be checked for whether its block indexes need to be loaded
* @param segmentMetaDataInfo block-level min/max values of the segment
* @param filter filter expression
* @param identifiers tableBlockIndexUniqueIdentifiers
* @param tableBlockIndexUniqueIdentifierWrappers list to which the wrapped tableBlockIndexUniqueIdentifiers are added
*/
private void getTableBlockIndexUniqueIdentifierUsingSegmentMinMax(Segment segment, SegmentMetaDataInfo segmentMetaDataInfo, IndexFilter filter, Set<TableBlockIndexUniqueIdentifier> identifiers, List<TableBlockIndexUniqueIdentifierWrapper> tableBlockIndexUniqueIdentifierWrappers) {
boolean isScanRequired = false;
Map<String, SegmentColumnMetaDataInfo> segmentColumnMetaDataInfoMap = segmentMetaDataInfo.getSegmentColumnMetaDataInfoMap();
int length = segmentColumnMetaDataInfoMap.size();
// Add columnSchemas based on the columns present in segment
List<ColumnSchema> columnSchemas = new ArrayList<>();
byte[][] min = new byte[length][];
byte[][] max = new byte[length][];
boolean[] minMaxFlag = new boolean[length];
int i = 0;
// get current columnSchema list for the table
Map<String, ColumnSchema> tableColumnSchemas = this.getCarbonTable().getTableInfo().getFactTable().getListOfColumns().stream().collect(Collectors.toMap(ColumnSchema::getColumnUniqueId, ColumnSchema::clone));
// fill min,max and columnSchema values
for (Map.Entry<String, SegmentColumnMetaDataInfo> columnMetaData : segmentColumnMetaDataInfoMap.entrySet()) {
ColumnSchema columnSchema = tableColumnSchemas.get(columnMetaData.getKey());
if (null != columnSchema) {
// get segment sort column and column drift info
boolean isSortColumnInSegment = columnMetaData.getValue().isSortColumn();
boolean isColumnDriftInSegment = columnMetaData.getValue().isColumnDrift();
if (null != columnSchema.getColumnProperties()) {
// get current sort column and column drift info from current columnSchema
String isSortColumn = columnSchema.getColumnProperties().get(CarbonCommonConstants.SORT_COLUMNS);
String isColumnDrift = columnSchema.getColumnProperties().get(CarbonCommonConstants.COLUMN_DRIFT);
if (null != isSortColumn) {
if (isSortColumn.equalsIgnoreCase("true") && !isSortColumnInSegment) {
// Unset current column schema column properties
modifyColumnSchemaForSortColumn(columnSchema, isColumnDriftInSegment, isColumnDrift, false);
} else if (isSortColumn.equalsIgnoreCase("false") && isSortColumnInSegment) {
// set sort column to true in current column schema column properties
modifyColumnSchemaForSortColumn(columnSchema, isColumnDriftInSegment, isColumnDrift, true);
}
} else {
modifyColumnSchemaForSortColumn(columnSchema, isColumnDriftInSegment, isColumnDrift, false);
}
}
columnSchemas.add(columnSchema);
min[i] = columnMetaData.getValue().getColumnMinValue();
max[i] = columnMetaData.getValue().getColumnMaxValue();
minMaxFlag[i] = min[i].length != 0 && max[i].length != 0;
i++;
}
}
// get segmentProperties using created columnSchemas list
SegmentProperties segmentProperties = SegmentPropertiesAndSchemaHolder.getInstance().addSegmentProperties(this.getCarbonTable(), columnSchemas, segment.getSegmentNo()).getSegmentProperties();
FilterResolverIntf resolver = new IndexFilter(segmentProperties, this.getCarbonTable(), filter.getExpression()).getResolver();
// prepare filter executor using IndexFilter resolver
FilterExecutor filterExecutor = FilterUtil.getFilterExecutorTree(resolver, segmentProperties, null, null, false);
// check if block has to be pruned based on segment minmax
BitSet scanRequired = filterExecutor.isScanRequired(max, min, minMaxFlag);
if (!scanRequired.isEmpty()) {
isScanRequired = true;
}
if (isScanRequired) {
for (TableBlockIndexUniqueIdentifier tableBlockIndexUniqueIdentifier : identifiers) {
tableBlockIndexUniqueIdentifierWrappers.add(new TableBlockIndexUniqueIdentifierWrapper(tableBlockIndexUniqueIdentifier, this.getCarbonTable()));
}
}
}
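The pruning decision boils down to comparing the filter against the per-column min/max arrays while honoring the minMaxFlag for columns whose statistics are unusable. An illustrative, self-contained sketch of that check for a single equality predicate (placeholder logic, not the CarbonData FilterExecutor; byte comparison is simplified to signed lexicographic order):

import java.util.Arrays;
import java.util.BitSet;

class MinMaxPruningSketch {
  // Returns a non-empty BitSet when the segment has to be scanned for
  // an equality predicate "column == literal" on the given column ordinal.
  static BitSet isScanRequired(byte[][] max, byte[][] min, boolean[] minMaxFlag,
                               int columnOrdinal, byte[] literal) {
    BitSet scanRequired = new BitSet(1);
    if (!minMaxFlag[columnOrdinal]) {
      // Min/max not recorded for this column: cannot prune, must scan.
      scanRequired.set(0);
      return scanRequired;
    }
    boolean insideRange = Arrays.compare(literal, min[columnOrdinal]) >= 0
        && Arrays.compare(literal, max[columnOrdinal]) <= 0;
    if (insideRange) {
      scanRequired.set(0);
    }
    return scanRequired;
  }
}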