Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonTableReader, method getInputSplits2.
public List<CarbonLocalInputSplit> getInputSplits2(CarbonTableCacheModel tableCacheModel,
    Expression filters) {
  List<CarbonLocalInputSplit> result = new ArrayList<>();
  if (config.getUnsafeMemoryInMb() != null) {
    CarbonProperties.getInstance().addProperty(CarbonCommonConstants.UNSAFE_WORKING_MEMORY_IN_MB,
        config.getUnsafeMemoryInMb());
  }
  CarbonTable carbonTable = tableCacheModel.carbonTable;
  TableInfo tableInfo = tableCacheModel.carbonTable.getTableInfo();
  Configuration config = new Configuration();
  config.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
  String carbonTablePath = carbonTable.getAbsoluteTableIdentifier().getTablePath();
  config.set(CarbonTableInputFormat.INPUT_DIR, carbonTablePath);
  config.set(CarbonTableInputFormat.DATABASE_NAME, carbonTable.getDatabaseName());
  config.set(CarbonTableInputFormat.TABLE_NAME, carbonTable.getTableName());
  try {
    CarbonTableInputFormat.setTableInfo(config, tableInfo);
    CarbonTableInputFormat carbonTableInputFormat =
        createInputFormat(config, carbonTable.getAbsoluteTableIdentifier(), filters);
    JobConf jobConf = new JobConf(config);
    Job job = Job.getInstance(jobConf);
    List<InputSplit> splits = carbonTableInputFormat.getSplits(job);
    CarbonInputSplit carbonInputSplit = null;
    Gson gson = new Gson();
    if (splits != null && splits.size() > 0) {
      for (InputSplit inputSplit : splits) {
        carbonInputSplit = (CarbonInputSplit) inputSplit;
        result.add(new CarbonLocalInputSplit(carbonInputSplit.getSegmentId(),
            carbonInputSplit.getPath().toString(), carbonInputSplit.getStart(),
            carbonInputSplit.getLength(), Arrays.asList(carbonInputSplit.getLocations()),
            carbonInputSplit.getNumberOfBlocklets(), carbonInputSplit.getVersion().number(),
            carbonInputSplit.getDeleteDeltaFiles(),
            gson.toJson(carbonInputSplit.getDetailInfo())));
      }
    }
  } catch (IOException e) {
    throw new RuntimeException("Error creating Splits from CarbonTableInputFormat", e);
  }
  return result;
}
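For orientation, here is a minimal, hypothetical caller of getInputSplits2. It relies only on the method signature shown above; the CarbonTableReader and CarbonTableCacheModel instances are assumed to be wired up elsewhere (for example by the connector runtime), the filter may be null when nothing is pushed down, and the package paths of the Presto-side classes are assumptions.

import java.util.List;

import org.apache.carbondata.core.scan.expression.Expression;
// The exact packages of the Presto-side classes are assumed here:
import org.apache.carbondata.presto.impl.CarbonLocalInputSplit;
import org.apache.carbondata.presto.impl.CarbonTableCacheModel;
import org.apache.carbondata.presto.impl.CarbonTableReader;

public class ConnectorSplitPlanning {
  // Hypothetical caller: reader and cacheModel come from the connector runtime,
  // pushedFilter is the (possibly null) filter expression pushed down by the engine.
  static List<CarbonLocalInputSplit> planSplits(CarbonTableReader reader,
      CarbonTableCacheModel cacheModel, Expression pushedFilter) {
    List<CarbonLocalInputSplit> splits = reader.getInputSplits2(cacheModel, pushedFilter);
    // Each CarbonLocalInputSplit carries the segment id, file path, offset/length, locations
    // and the Gson-serialized detail info built in getInputSplits2 above.
    System.out.println("Planned " + splits.size() + " Carbon splits");
    return splits;
  }
}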
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonTableInputFormat, method getSplits.
/**
 * Get the list of blocks/blocklets for the accessible segments and convert them
 * into CarbonInputSplit instances.
 * @param job JobContext carrying the Configuration (table info, segments, filter)
 * @return list of CarbonInputSplit
 * @throws IOException if the table schema is missing or split creation fails
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  carbonTable = getOrCreateCarbonTable(job.getConfiguration());
  if (null == carbonTable) {
    throw new IOException("Missing/Corrupt schema file for table.");
  }
  // global dictionary is not supported since 2.0
  if (carbonTable.getTableInfo().getFactTable().getTableProperties()
      .containsKey(CarbonCommonConstants.DICTIONARY_INCLUDE)) {
    DeprecatedFeatureException.globalDictNotSupported();
  }
  List<InputSplit> splits = new LinkedList<>();
  if (CarbonProperties.isQueryStageInputEnabled()) {
    // collect the stage files and create splits for them, so that they are also
    // included for the query
    try {
      List<InputSplit> stageInputSplits =
          StageInputCollector.createInputSplits(carbonTable, job.getConfiguration());
      splits.addAll(stageInputSplits);
    } catch (ExecutionException | InterruptedException e) {
      LOG.error("Failed to create input splits from stage files", e);
      throw new IOException(e);
    }
  }
  this.readCommittedScope = getReadCommitted(job, carbonTable.getAbsoluteTableIdentifier());
  LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
  String updateDeltaVersion = job.getConfiguration().get(UPDATE_DELTA_VERSION);
  SegmentUpdateStatusManager updateStatusManager;
  if (updateDeltaVersion != null) {
    updateStatusManager =
        new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails, updateDeltaVersion);
  } else {
    updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails);
  }
  List<String> invalidSegmentIds = new ArrayList<>();
  List<Segment> streamSegments = null;
  // get all valid segments and set them into the configuration
  SegmentStatusManager segmentStatusManager = new SegmentStatusManager(
      carbonTable.getAbsoluteTableIdentifier(), readCommittedScope.getConfiguration());
  SegmentStatusManager.ValidAndInvalidSegmentsInfo segments =
      segmentStatusManager.getValidAndInvalidSegments(carbonTable.isMV(), loadMetadataDetails,
          this.readCommittedScope);
  if (getValidateSegmentsToAccess(job.getConfiguration())) {
    List<Segment> validSegments = segments.getValidSegments();
    streamSegments = segments.getStreamSegments();
    streamSegments = getFilteredSegment(job, streamSegments, true, readCommittedScope);
    if (validSegments.size() == 0) {
      splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
      return splits;
    }
    List<Segment> filteredSegmentToAccess =
        getFilteredSegment(job, segments.getValidSegments(), true, readCommittedScope);
    if (filteredSegmentToAccess.size() == 0) {
      splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
      return splits;
    } else {
      setSegmentsToAccess(job.getConfiguration(), filteredSegmentToAccess);
    }
    // remove entries from the segment index if there are invalid segments
    for (Segment segment : segments.getInvalidSegments()) {
      invalidSegmentIds.add(segment.getSegmentNo());
    }
    if (invalidSegmentIds.size() > 0) {
      IndexStoreManager.getInstance().clearInvalidSegments(
          getOrCreateCarbonTable(job.getConfiguration()), invalidSegmentIds);
    }
  }
  List<Segment> validAndInProgressSegments = new ArrayList<>(segments.getValidSegments());
  // Also add in-progress segments, because a Secondary Index table load reads data
  // from segments that are still in progress.
  validAndInProgressSegments.addAll(segments.getListOfInProgressSegments());
  List<Segment> segmentToAccess =
      getFilteredSegment(job, validAndInProgressSegments, false, readCommittedScope);
  String segmentFileName = job.getConfiguration().get(CarbonCommonConstants.CURRENT_SEGMENTFILE);
  if (segmentFileName != null) {
    // per segment there is only one file ("current.segment")
    segmentToAccess.get(0).setSegmentFileName(segmentFileName + CarbonTablePath.SEGMENT_EXT);
  }
  // process and resolve the filter expression
  IndexFilter indexFilter = getFilterPredicates(job.getConfiguration());
  if (indexFilter != null) {
    indexFilter.resolve(false);
  }
  // do block filtering and get the splits
  List<InputSplit> batchSplits = getSplits(job, indexFilter, segmentToAccess,
      updateStatusManager, segments.getInvalidSegments());
  splits.addAll(batchSplits);
  // add all splits of streaming
  List<InputSplit> splitsOfStreaming = getSplitsOfStreaming(job, streamSegments, carbonTable);
  if (!splitsOfStreaming.isEmpty()) {
    splits.addAll(splitsOfStreaming);
  }
  return splits;
}
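The CarbonTableReader snippet at the top of this page already shows the configuration keys getSplits expects; a condensed, hedged sketch of that driver-side call sequence looks like the following. It assumes a loaded CarbonTable is at hand, skips the filter push-down (the private createInputFormat helper) and the JobConf wrapping, and uses only class names and constants that appear in the snippets above.

import java.io.IOException;
import java.util.List;

import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;

public class DriverSplitPlanning {
  // Trimmed-down version of getInputSplits2 above: put the table identity into the Hadoop
  // Configuration, then let CarbonTableInputFormat.getSplits do segment validation,
  // index pruning and streaming-split collection.
  static List<InputSplit> planSplits(CarbonTable carbonTable) throws IOException {
    Configuration conf = new Configuration();
    conf.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
    conf.set(CarbonTableInputFormat.INPUT_DIR,
        carbonTable.getAbsoluteTableIdentifier().getTablePath());
    conf.set(CarbonTableInputFormat.DATABASE_NAME, carbonTable.getDatabaseName());
    conf.set(CarbonTableInputFormat.TABLE_NAME, carbonTable.getTableName());
    CarbonTableInputFormat.setTableInfo(conf, carbonTable.getTableInfo());

    // No filter is configured here, so getSplits plans over all valid segments.
    CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
    Job job = Job.getInstance(conf);
    return format.getSplits(job);
  }
}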
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class StreamRecordReader, method initialize.
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
  // input
  if (split instanceof CarbonInputSplit) {
    fileSplit = (CarbonInputSplit) split;
  } else if (split instanceof CarbonMultiBlockSplit) {
    fileSplit = ((CarbonMultiBlockSplit) split).getAllSplits().get(0);
  } else {
    fileSplit = (FileSplit) split;
  }
  // metadata
  hadoopConf = context.getConfiguration();
  if (model == null) {
    CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
    model = format.createQueryModel(split, context);
  }
  carbonTable = model.getTable();
  List<CarbonDimension> dimensions = carbonTable.getVisibleDimensions();
  dimensionCount = dimensions.size();
  List<CarbonMeasure> measures = carbonTable.getVisibleMeasures();
  measureCount = measures.size();
  List<CarbonColumn> carbonColumnList = carbonTable.getStreamStorageOrderColumn();
  storageColumns = carbonColumnList.toArray(new CarbonColumn[carbonColumnList.size()]);
  isNoDictColumn = CarbonDataProcessorUtil.getNoDictionaryMapping(storageColumns);
  directDictionaryGenerators = new DirectDictionaryGenerator[storageColumns.length];
  for (int i = 0; i < storageColumns.length; i++) {
    if (storageColumns[i].getDataType() == DataTypes.DATE) {
      directDictionaryGenerators[i] = DirectDictionaryKeyGeneratorFactory
          .getDirectDictionaryGenerator(storageColumns[i].getDataType());
    }
  }
  dimensionsIsVarcharTypeMap = new boolean[dimensionCount];
  for (int i = 0; i < dimensionCount; i++) {
    dimensionsIsVarcharTypeMap[i] = storageColumns[i].getDataType() == DataTypes.VARCHAR;
  }
  measureDataTypes = new DataType[measureCount];
  for (int i = 0; i < measureCount; i++) {
    measureDataTypes[i] = storageColumns[dimensionCount + i].getDataType();
  }
  // decode data
  allNonNull = new BitSet(storageColumns.length);
  projection = model.getProjectionColumns();
  isRequired = new boolean[storageColumns.length];
  boolean[] isFilterDimensions = model.getIsFilterDimensions();
  boolean[] isFilterMeasures = model.getIsFilterMeasures();
  isFilterRequired = new boolean[storageColumns.length];
  filterMap = new int[storageColumns.length];
  for (int i = 0; i < storageColumns.length; i++) {
    if (storageColumns[i].isDimension()) {
      if (isFilterDimensions[storageColumns[i].getOrdinal()]) {
        isRequired[i] = true;
        isFilterRequired[i] = true;
        filterMap[i] = storageColumns[i].getOrdinal();
      }
    } else {
      if (isFilterMeasures[storageColumns[i].getOrdinal()]) {
        isRequired[i] = true;
        isFilterRequired[i] = true;
        filterMap[i] = carbonTable.getDimensionOrdinalMax() + storageColumns[i].getOrdinal();
      }
    }
  }
  isProjectionRequired = new boolean[storageColumns.length];
  projectionMap = new int[storageColumns.length];
  for (int j = 0; j < projection.length; j++) {
    for (int i = 0; i < storageColumns.length; i++) {
      if (storageColumns[i].getColName().equals(projection[j].getColName())) {
        isRequired[i] = true;
        isProjectionRequired[i] = true;
        projectionMap[i] = j;
        break;
      }
    }
  }
  // initialize filter
  if (null != model.getIndexFilter()) {
    initializeFilter();
  } else if (projection.length == 0) {
    skipScanData = true;
  }
}
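Once initialize has built the column mappings, the reader is consumed through the standard MapReduce RecordReader contract. The sketch below is a generic driver loop around any such reader, including the StreamRecordReader above; the Void/Object key-value generics are an assumption about the reader's declared types, and the method itself is illustrative.

import java.io.IOException;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class StreamScanExample {
  // Generic consumption loop: initialize against a split, iterate rows, then close.
  // The <Void, Object> generics are assumed; adjust to whatever the concrete reader exposes.
  static long countRows(RecordReader<Void, Object> reader, InputSplit split,
      TaskAttemptContext context) throws IOException, InterruptedException {
    reader.initialize(split, context);
    long rows = 0;
    try {
      while (reader.nextKeyValue()) {
        // A real consumer would convert or emit the current row here.
        reader.getCurrentValue();
        rows++;
      }
    } finally {
      reader.close();
    }
    return rows;
  }
}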
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonVectorizedRecordReader, method initialize.
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
    throws IOException {
  List<CarbonInputSplit> splitList;
  if (inputSplit instanceof CarbonInputSplit) {
    // Read the footer offset and set it.
    CarbonInputSplit carbonInputSplit = ((CarbonInputSplit) inputSplit);
    String splitPath = carbonInputSplit.getFilePath();
    if ((null != carbonInputSplit.getDetailInfo()
        && carbonInputSplit.getDetailInfo().getBlockFooterOffset() == 0L)
        || (null == carbonInputSplit.getDetailInfo() && carbonInputSplit.getStart() == 0)) {
      FileReader reader = FileFactory.getFileHolder(FileFactory.getFileType(splitPath),
          taskAttemptContext.getConfiguration());
      ByteBuffer buffer = reader.readByteBuffer(FileFactory.getUpdatedFilePath(splitPath),
          ((CarbonInputSplit) inputSplit).getLength() - 8, 8);
      if (carbonInputSplit.getDetailInfo() == null) {
        carbonInputSplit.setStart(buffer.getLong());
      } else {
        carbonInputSplit.getDetailInfo().setBlockFooterOffset(buffer.getLong());
      }
      reader.finish();
    }
    splitList = new ArrayList<>(1);
    splitList.add((CarbonInputSplit) inputSplit);
  } else {
    throw new RuntimeException("unsupported input split type: " + inputSplit);
  }
  List<TableBlockInfo> tableBlockInfoList = CarbonInputSplit.createBlocks(splitList);
  queryModel.setTableBlockInfos(tableBlockInfoList);
  queryModel.setVectorReader(true);
  try {
    queryExecutor =
        QueryExecutorFactory.getQueryExecutor(queryModel, taskAttemptContext.getConfiguration());
    iterator = (AbstractDetailQueryResultIterator) queryExecutor.execute(queryModel);
    initBatch();
  } catch (Exception e) {
    LOGGER.error(e);
    throw e;
  }
}
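The footer handling above works because a CarbonData file ends with an 8-byte long pointing at the start of its footer, which is why the reader fetches bytes [length - 8, length) and decodes them with getLong(). The standalone sketch below illustrates the same idea with plain java.nio instead of Carbon's FileFactory/FileReader abstraction; it is illustrative only, assumes a local file path of at least 8 bytes, and does not handle short reads.

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

public class FooterOffsetExample {
  // Read the trailing 8 bytes of a file and decode them as a big-endian long,
  // mirroring the footer-offset lookup in initialize() above (local files only;
  // the real code goes through FileFactory so it also works on HDFS/S3).
  static long readFooterOffset(String filePath) throws IOException {
    try (RandomAccessFile file = new RandomAccessFile(filePath, "r");
        FileChannel channel = file.getChannel()) {
      ByteBuffer buffer = ByteBuffer.allocate(8);
      channel.read(buffer, channel.size() - 8);
      buffer.flip();
      return buffer.getLong();
    }
  }
}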
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonInputFormat, method getDataBlocksOfSegment.
/**
 * Get the data blocks of the given segments.
 */
protected List<CarbonInputSplit> getDataBlocksOfSegment(JobContext job, CarbonTable carbonTable,
    IndexFilter expression, List<Segment> validSegments, List<Segment> invalidSegments,
    List<String> segmentsToBeRefreshed) throws IOException {
  QueryStatisticsRecorder recorder = CarbonTimeStatisticsFactory.createDriverRecorder();
  QueryStatistic statistic = new QueryStatistic();
  List<ExtendedBlocklet> prunedBlocklets = getPrunedBlocklets(job, carbonTable, expression,
      validSegments, invalidSegments, segmentsToBeRefreshed);
  List<CarbonInputSplit> resultFilteredBlocks = new ArrayList<>();
  for (ExtendedBlocklet blocklet : prunedBlocklets) {
    // matchedPartitions variable will be null in two cases:
    // 1. the table is not a partition table
    // 2. the table is a partition table and all partitions are matched by the query
    // For a partition table, the task id in the carbondata file name is the partition id;
    // if that partition is not required, it is skipped here.
    resultFilteredBlocks.add(blocklet.getInputSplit());
  }
  statistic.addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER, System.currentTimeMillis());
  recorder.recordStatisticsForDriver(statistic, job.getConfiguration().get("query.id"));
  return resultFilteredBlocks;
}
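Because getDataBlocksOfSegment returns plain CarbonInputSplit objects, the pruned result can be inspected with the same getters the CarbonTableReader snippet above already uses (getSegmentId, getPath, getLength, getLocations). The helper below is only a diagnostic sketch built on those getters; the class and method names are illustrative.

import java.io.IOException;
import java.util.List;

import org.apache.carbondata.hadoop.CarbonInputSplit;

public class SplitInspection {
  // Print one line per pruned block, using only getters exercised elsewhere on this page.
  static void describe(List<CarbonInputSplit> splits) throws IOException {
    for (CarbonInputSplit split : splits) {
      System.out.println("segment=" + split.getSegmentId()
          + " file=" + split.getPath()
          + " length=" + split.getLength()
          + " hosts=" + String.join(",", split.getLocations()));
    }
  }
}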