Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonStreamRecordReader, method initialize.
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    // input
    if (split instanceof CarbonInputSplit) {
      fileSplit = (CarbonInputSplit) split;
    } else if (split instanceof CarbonMultiBlockSplit) {
      fileSplit = ((CarbonMultiBlockSplit) split).getAllSplits().get(0);
    } else {
      fileSplit = (FileSplit) split;
    }
    // metadata
    hadoopConf = context.getConfiguration();
    if (model == null) {
      CarbonTableInputFormat format = new CarbonTableInputFormat<Object>();
      model = format.createQueryModel(split, context);
    }
    carbonTable = model.getTable();
    List<CarbonDimension> dimensions =
        carbonTable.getDimensionByTableName(carbonTable.getTableName());
    dimensionCount = dimensions.size();
    List<CarbonMeasure> measures =
        carbonTable.getMeasureByTableName(carbonTable.getTableName());
    measureCount = measures.size();
    List<CarbonColumn> carbonColumnList =
        carbonTable.getStreamStorageOrderColumn(carbonTable.getTableName());
    storageColumns = carbonColumnList.toArray(new CarbonColumn[carbonColumnList.size()]);
    isNoDictColumn = CarbonDataProcessorUtil.getNoDictionaryMapping(storageColumns);
    directDictionaryGenerators = new DirectDictionaryGenerator[storageColumns.length];
    for (int i = 0; i < storageColumns.length; i++) {
      if (storageColumns[i].hasEncoding(Encoding.DIRECT_DICTIONARY)) {
        directDictionaryGenerators[i] = DirectDictionaryKeyGeneratorFactory
            .getDirectDictionaryGenerator(storageColumns[i].getDataType());
      }
    }
    measureDataTypes = new DataType[measureCount];
    for (int i = 0; i < measureCount; i++) {
      measureDataTypes[i] = storageColumns[dimensionCount + i].getDataType();
    }
    // decode data
    allNonNull = new BitSet(storageColumns.length);
    projection = model.getProjectionColumns();
    isRequired = new boolean[storageColumns.length];
    boolean[] isFiltlerDimensions = model.getIsFilterDimensions();
    boolean[] isFiltlerMeasures = model.getIsFilterMeasures();
    isFilterRequired = new boolean[storageColumns.length];
    filterMap = new int[storageColumns.length];
    for (int i = 0; i < storageColumns.length; i++) {
      if (storageColumns[i].isDimension()) {
        if (isFiltlerDimensions[storageColumns[i].getOrdinal()]) {
          isRequired[i] = true;
          isFilterRequired[i] = true;
          filterMap[i] = storageColumns[i].getOrdinal();
        }
      } else {
        if (isFiltlerMeasures[storageColumns[i].getOrdinal()]) {
          isRequired[i] = true;
          isFilterRequired[i] = true;
          filterMap[i] = carbonTable.getDimensionOrdinalMax() + storageColumns[i].getOrdinal();
        }
      }
    }
    isProjectionRequired = new boolean[storageColumns.length];
    projectionMap = new int[storageColumns.length];
    for (int i = 0; i < storageColumns.length; i++) {
      for (int j = 0; j < projection.length; j++) {
        if (storageColumns[i].getColName().equals(projection[j].getColName())) {
          isRequired[i] = true;
          isProjectionRequired[i] = true;
          projectionMap[i] = j;
          break;
        }
      }
    }
    // initialize filter
    if (null != model.getFilterExpressionResolverTree()) {
      initializeFilter();
    } else if (projection.length == 0) {
      skipScanData = true;
    }
  }
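For context, the following is a minimal driver sketch showing how such a record reader is exercised for one split. It assumes only the standard Hadoop mapreduce RecordReader lifecycle (initialize, nextKeyValue, getCurrentValue, close); how the concrete reader instance is constructed is not shown in the snippet above, so it is taken as given here.

// Minimal sketch: reading every row a RecordReader produces for one split.
// Only standard Hadoop mapreduce APIs are used; the reader instance is assumed
// to be created by the enclosing framework.
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public final class StreamReaderDriver {
  static long readSplit(RecordReader<Void, Object> reader, InputSplit split,
      TaskAttemptContext context) throws Exception {
    // initialize() resolves the split type (CarbonInputSplit, CarbonMultiBlockSplit
    // or plain FileSplit) and builds the projection/filter maps, as shown above.
    reader.initialize(split, context);
    long rows = 0;
    try {
      while (reader.nextKeyValue()) {
        Object row = reader.getCurrentValue(); // row would be consumed here
        rows++;
      }
    } finally {
      reader.close();
    }
    return rows;
  }
}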
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonInputFormat, method getDataBlocksOfSegment.
  /**
   * Get the data blocks of the given segment.
   */
  protected List<CarbonInputSplit> getDataBlocksOfSegment(JobContext job, CarbonTable carbonTable,
      FilterResolverIntf resolver, BitSet matchedPartitions, List<Segment> segmentIds,
      PartitionInfo partitionInfo, List<Integer> oldPartitionIdList) throws IOException {
    QueryStatisticsRecorder recorder = CarbonTimeStatisticsFactory.createDriverRecorder();
    QueryStatistic statistic = new QueryStatistic();
    // get tokens for all the FileSystems required for the table path
    TokenCache.obtainTokensForNamenodes(job.getCredentials(),
        new Path[] { new Path(carbonTable.getTablePath()) }, job.getConfiguration());
    boolean distributedCG = Boolean.parseBoolean(CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP,
            CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP_DEFAULT));
    DataMapExprWrapper dataMapExprWrapper =
        DataMapChooser.get().choose(getOrCreateCarbonTable(job.getConfiguration()), resolver);
    DataMapJob dataMapJob = getDataMapJob(job.getConfiguration());
    List<PartitionSpec> partitionsToPrune = getPartitionsToPrune(job.getConfiguration());
    List<ExtendedBlocklet> prunedBlocklets;
    if (distributedCG || dataMapExprWrapper.getDataMapType() == DataMapLevel.FG) {
      DistributableDataMapFormat datamapDstr =
          new DistributableDataMapFormat(carbonTable, dataMapExprWrapper, segmentIds,
              partitionsToPrune, BlockletDataMapFactory.class.getName());
      prunedBlocklets = dataMapJob.execute(datamapDstr, resolver);
      // apply the filter expression on the blocklets
      prunedBlocklets = dataMapExprWrapper.pruneBlocklets(prunedBlocklets);
    } else {
      prunedBlocklets = dataMapExprWrapper.prune(segmentIds, partitionsToPrune);
    }
    List<CarbonInputSplit> resultFilterredBlocks = new ArrayList<>();
    int partitionIndex = 0;
    List<Integer> partitionIdList = new ArrayList<>();
    if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
      partitionIdList = partitionInfo.getPartitionIds();
    }
    for (ExtendedBlocklet blocklet : prunedBlocklets) {
      long partitionId = CarbonTablePath.DataFileUtil
          .getTaskIdFromTaskNo(CarbonTablePath.DataFileUtil.getTaskNo(blocklet.getPath()));
      // a normal query uses the newest partitionIdList; alter table partition
      // passes the old list (oldPartitionIdList) and uses that instead
      if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
        if (oldPartitionIdList != null) {
          partitionIndex = oldPartitionIdList.indexOf((int) partitionId);
        } else {
          partitionIndex = partitionIdList.indexOf((int) partitionId);
        }
      }
      if (partitionIndex != -1) {
        // skip this partition if it is not required
        if (matchedPartitions == null || matchedPartitions.get(partitionIndex)) {
          CarbonInputSplit inputSplit = convertToCarbonInputSplit(blocklet);
          if (inputSplit != null) {
            resultFilterredBlocks.add(inputSplit);
          }
        }
      }
    }
    statistic.addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER, System.currentTimeMillis());
    recorder.recordStatisticsForDriver(statistic, job.getConfiguration().get("query.id"));
    return resultFilterredBlocks;
  }
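The partition check at the end of the loop above is easy to misread: a partition index of -1 means the split's partition id was not found in the (old or new) partition id list, and a null matchedPartitions BitSet means no pruning applies. A small standalone sketch of that decision follows; the class and method names are hypothetical, not CarbonData API.

// Illustrative helper mirroring the partition filter in getDataBlocksOfSegment:
// keep a blocklet only when its partition index was resolved and either no
// pruning applies (matchedPartitions == null) or its bit is set.
import java.util.BitSet;

final class PartitionPruneCheck {
  static boolean isBlockletRequired(BitSet matchedPartitions, int partitionIndex) {
    if (partitionIndex == -1) {
      return false; // partition id not present in the partition id list
    }
    return matchedPartitions == null || matchedPartitions.get(partitionIndex);
  }
}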
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonInputFormat, method convertToCarbonInputSplit.
  private CarbonInputSplit convertToCarbonInputSplit(ExtendedBlocklet blocklet) throws IOException {
    CarbonInputSplit split = CarbonInputSplit.from(blocklet.getSegmentId(), blocklet.getBlockletId(),
        new FileSplit(new Path(blocklet.getPath()), 0, blocklet.getLength(), blocklet.getLocations()),
        ColumnarFormatVersion.valueOf((short) blocklet.getDetailInfo().getVersionNumber()),
        blocklet.getDataMapWriterPath());
    split.setDetailInfo(blocklet.getDetailInfo());
    return split;
  }
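CarbonInputSplit.from() wraps a plain Hadoop FileSplit built from the blocklet's path, start offset, length and host locations. Below is a minimal sketch of that underlying FileSplit construction; the path, length and host values are placeholders for illustration only.

// Hedged sketch of the Hadoop FileSplit that convertToCarbonInputSplit() wraps.
// All concrete values (path, length, hosts) are illustrative placeholders.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

final class FileSplitExample {
  static FileSplit exampleSplit() {
    Path dataFile = new Path("/tmp/example/part-0-0.carbondata"); // placeholder path
    long start = 0L;                        // blocklet start offset within the file
    long length = 4L * 1024 * 1024;         // blocklet length in bytes (placeholder)
    String[] hosts = { "host1", "host2" };  // preferred locations (placeholder)
    return new FileSplit(dataFile, start, length, hosts);
  }
}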
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonTableInputFormat, method getSplitsOfOneSegment.
  /**
   * Read data in one segment; used for the alter table partition statement.
   * @param job the job context
   * @param targetSegment the segment to read
   * @param oldPartitionIdList the old partition ids before partitionInfo was changed
   * @param partitionInfo the current partition info of the table
   * @return the input splits of the target segment
   */
  public List<InputSplit> getSplitsOfOneSegment(JobContext job, String targetSegment,
      List<Integer> oldPartitionIdList, PartitionInfo partitionInfo) {
    List<Segment> invalidSegments = new ArrayList<>();
    List<UpdateVO> invalidTimestampsList = new ArrayList<>();
    List<Segment> segmentList = new ArrayList<>();
    segmentList.add(new Segment(targetSegment, null));
    setSegmentsToAccess(job.getConfiguration(), segmentList);
    try {
      // process and resolve the filter expression
      Expression filter = getFilterPredicates(job.getConfiguration());
      CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
      // this will be null in case of a corrupt schema file
      if (null == carbonTable) {
        throw new IOException("Missing/Corrupt schema file for table.");
      }
      carbonTable.processFilterExpression(filter, null, null);
      TableProvider tableProvider = new SingleTableProvider(carbonTable);
      // prune partitions for a filter query on a partitioned table
      String partitionIds = job.getConfiguration().get(ALTER_PARTITION_ID);
      // matchedPartitions records the partition index, not the partition id
      BitSet matchedPartitions = null;
      if (partitionInfo != null) {
        matchedPartitions =
            setMatchedPartitions(partitionIds, filter, partitionInfo, oldPartitionIdList);
        if (matchedPartitions != null) {
          if (matchedPartitions.cardinality() == 0) {
            return new ArrayList<InputSplit>();
          } else if (matchedPartitions.cardinality() == partitionInfo.getNumPartitions()) {
            matchedPartitions = null;
          }
        }
      }
      FilterResolverIntf filterInterface = carbonTable.resolveFilter(filter, tableProvider);
      // do block filtering and get the splits
      List<InputSplit> splits = getSplits(job, filterInterface, segmentList, matchedPartitions,
          partitionInfo, oldPartitionIdList, new SegmentUpdateStatusManager(carbonTable));
      // pass the invalid segments to the task side so their index entries can be removed there
      if (invalidSegments.size() > 0) {
        for (InputSplit split : splits) {
          ((CarbonInputSplit) split).setInvalidSegments(invalidSegments);
          ((CarbonInputSplit) split).setInvalidTimestampRange(invalidTimestampsList);
        }
      }
      return splits;
    } catch (IOException e) {
      throw new RuntimeException("Can't get splits of the target segment ", e);
    }
  }
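The following is a hedged sketch of calling getSplitsOfOneSegment() from driver-side code, as an alter-table-partition flow would. The method signature comes from the snippet above; the package of CarbonTableInputFormat, the configuration setup and the example partition id values are assumptions.

// Hedged sketch: obtaining the splits of a single segment on the driver side.
import java.util.Arrays;
import java.util.List;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat; // package assumed
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;

final class OneSegmentSplitsExample {
  static List<InputSplit> splitsForSegment(Configuration conf, String segmentId)
      throws Exception {
    Job job = Job.getInstance(conf);
    CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
    // Old partition ids as they were before the partition info changed;
    // the concrete values here are placeholders.
    List<Integer> oldPartitionIdList = Arrays.asList(0, 1, 2);
    // Passing null for PartitionInfo skips the partition-pruning branch above.
    return format.getSplitsOfOneSegment(job, segmentId, oldPartitionIdList, null);
  }
}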
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class InMemoryBTreeIndex, method getTableBlockInfo.
  /**
   * Returns the table block info for all splits of the given segment.
   *
   * @param job job context
   * @return list of table block info
   * @throws IOException
   */
  private List<TableBlockInfo> getTableBlockInfo(JobContext job) throws IOException {
    List<TableBlockInfo> tableBlockInfoList = new ArrayList<>();
    // identify table blocks from all file locations of the given segment
    for (InputSplit inputSplit : segment.getAllSplits(job)) {
      CarbonInputSplit carbonInputSplit = (CarbonInputSplit) inputSplit;
      BlockletInfos blockletInfos = new BlockletInfos(carbonInputSplit.getNumberOfBlocklets(), 0,
          carbonInputSplit.getNumberOfBlocklets());
      tableBlockInfoList.add(new TableBlockInfo(carbonInputSplit.getPath().toString(),
          carbonInputSplit.getBlockletId(), carbonInputSplit.getStart(), segment.getId(),
          carbonInputSplit.getLocations(), carbonInputSplit.getLength(), blockletInfos,
          carbonInputSplit.getVersion(), carbonInputSplit.getDeleteDeltaFiles()));
    }
    return tableBlockInfoList;
  }
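As a small usage illustration, the same split list can be walked with only the CarbonInputSplit getters that appear above; the aggregation here (summing blocklet counts) is just an example, not something the project does at this point.

// Illustrative sketch: summing blocklet counts over a segment's splits, using
// only getters that appear in getTableBlockInfo() above.
import java.util.List;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.hadoop.mapreduce.InputSplit;

final class BlockletCountExample {
  static int totalBlocklets(List<InputSplit> splits) {
    int total = 0;
    for (InputSplit split : splits) {
      CarbonInputSplit carbonSplit = (CarbonInputSplit) split;
      total += carbonSplit.getNumberOfBlocklets();
    }
    return total;
  }
}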