use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonFileInputFormat method getSplits.
/**
 * {@inheritDoc}
 * Configurations FileInputFormat.INPUT_DIR and CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS
 * are used to get the table path to read.
 *
 * @return
 * @throws IOException
 */
private List<InputSplit> getSplits(JobContext job, IndexFilter indexFilter,
    List<Segment> validSegments) throws IOException {
  numSegments = validSegments.size();
  // for each segment, fetch the blocks matching the filter in the driver BTree
  List<CarbonInputSplit> dataBlocksOfSegment = getDataBlocksOfSegment(job, carbonTable,
      indexFilter, validSegments, new ArrayList<>(), new ArrayList<>());
  numBlocks = dataBlocksOfSegment.size();
  List<String> allDeleteDeltaFiles = getAllDeleteDeltaFiles(carbonTable.getTablePath());
  if (CollectionUtils.isNotEmpty(allDeleteDeltaFiles)) {
    // attach the matching delete delta files to each block split
    for (CarbonInputSplit split : dataBlocksOfSegment) {
      split.setDeleteDeltaFiles(getDeleteDeltaFiles(split.getFilePath(), allDeleteDeltaFiles));
    }
  }
  return new LinkedList<>(dataBlocksOfSegment);
}
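For orientation, a hedged consumer-side sketch of the splits returned above: it iterates them and reports how many delete delta files were attached to each block. It assumes job, indexFilter, validSegments and LOGGER are in scope, and it only uses accessors that already appear in these snippets (getFilePath, getDeleteDeltaFiles); treat it as an illustration, not project code.

  // Illustrative only: inspect the delete delta files attached to each block split.
  for (InputSplit inputSplit : getSplits(job, indexFilter, validSegments)) {
    CarbonInputSplit split = (CarbonInputSplit) inputSplit;
    String[] deltaFiles = split.getDeleteDeltaFiles();
    LOGGER.info("Block " + split.getFilePath() + " has "
        + (deltaFiles == null ? 0 : deltaFiles.length) + " delete delta file(s)");
  }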
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonCompactionUtil method getOverallMinMax.
public static Object[] getOverallMinMax(CarbonInputSplit[] carbonInputSplits,
    CarbonColumn rangeCol, boolean isSortCol) {
  byte[] minVal = null;
  byte[] maxVal = null;
  int dictMinVal = Integer.MAX_VALUE;
  int dictMaxVal = Integer.MIN_VALUE;
  int idx = -1;
  DataType dataType = rangeCol.getDataType();
  Object[] minMaxVals = new Object[2];
  boolean isDictEncode = rangeCol.getDataType() == DataTypes.DATE;
  try {
    for (CarbonInputSplit split : carbonInputSplits) {
      DataFileFooter dataFileFooter =
          CarbonUtil.readMetadataFile(CarbonInputSplit.getTableBlockInfo(split), true);
      if (-1 == idx) {
        // locate the range column's index in the footer schema once
        List<ColumnSchema> allColumns = dataFileFooter.getColumnInTable();
        for (int i = 0; i < allColumns.size(); i++) {
          if (allColumns.get(i).getColumnName().equalsIgnoreCase(rangeCol.getColName())) {
            idx = i;
            break;
          }
        }
      }
      if (isDictEncode) {
        // dictionary-encoded (DATE) columns store surrogate keys, so compare as int
        byte[] tempMin = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMinValues()[idx];
        int tempMinVal = CarbonUtil.getSurrogateInternal(tempMin, 0, tempMin.length);
        byte[] tempMax = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMaxValues()[idx];
        int tempMaxVal = CarbonUtil.getSurrogateInternal(tempMax, 0, tempMax.length);
        if (dictMinVal > tempMinVal) {
          dictMinVal = tempMinVal;
        }
        if (dictMaxVal < tempMaxVal) {
          dictMaxVal = tempMaxVal;
        }
      } else {
        if (null == minVal) {
          minVal = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMinValues()[idx];
          maxVal = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMaxValues()[idx];
        } else {
          byte[] tempMin = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMinValues()[idx];
          byte[] tempMax = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMaxValues()[idx];
          if (ByteUtil.compare(tempMin, minVal) <= 0) {
            minVal = tempMin;
          }
          if (ByteUtil.compare(tempMax, maxVal) >= 0) {
            maxVal = tempMax;
          }
        }
      }
    }
    // Based on how the min/max value is stored in the footer we change the data
    if (isDictEncode) {
      minMaxVals[0] = dictMinVal;
      minMaxVals[1] = dictMaxVal;
    } else {
      if (!isSortCol && (dataType == DataTypes.INT || dataType == DataTypes.LONG)) {
        minMaxVals[0] = ByteUtil.toLong(minVal, 0, minVal.length);
        minMaxVals[1] = ByteUtil.toLong(maxVal, 0, maxVal.length);
      } else if (dataType == DataTypes.DOUBLE) {
        minMaxVals[0] = ByteUtil.toDouble(minVal, 0, minVal.length);
        minMaxVals[1] = ByteUtil.toDouble(maxVal, 0, maxVal.length);
      } else {
        minMaxVals[0] =
            DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(minVal, dataType, true);
        minMaxVals[1] =
            DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(maxVal, dataType, true);
      }
    }
  } catch (IOException e) {
    LOGGER.error(e.getMessage());
  }
  return minMaxVals;
}
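A hedged usage sketch: a compaction caller could use the returned pair to derive the overall value range of the range column across all chosen splits. The variable names splitsToCompact, rangeColumn and isSortColumn are illustrative assumptions, as is the LOGGER call.

  // Illustrative only: obtain the overall [min, max] of the range column across
  // the splits selected for compaction; the inputs are assumed to be prepared by the caller.
  Object[] minMax =
      CarbonCompactionUtil.getOverallMinMax(splitsToCompact, rangeColumn, isSortColumn);
  Object overallMin = minMax[0];  // Integer surrogate for DATE columns, else a typed value
  Object overallMax = minMax[1];
  LOGGER.info("Range column bounds for compaction: [" + overallMin + ", " + overallMax + "]");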
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonTableReader method getInputSplits.
/**
 * Get carbon multi-block input splits.
 *
 * @param tableCacheModel cached table
 * @param filters carbondata filters
 * @param filteredPartitions matched PartitionSpec for the filter
 * @param config hadoop conf
 * @return list of multi-block splits
 * @throws IOException
 */
public List<CarbonLocalMultiBlockSplit> getInputSplits(CarbonTableCacheModel tableCacheModel,
    Expression filters, List<PartitionSpec> filteredPartitions, Configuration config)
    throws IOException {
  List<CarbonLocalInputSplit> result = new ArrayList<>();
  List<CarbonLocalMultiBlockSplit> multiBlockSplitList = new ArrayList<>();
  CarbonTable carbonTable = tableCacheModel.getCarbonTable();
  TableInfo tableInfo = tableCacheModel.getCarbonTable().getTableInfo();
  config.set("presto.cli.query.id", prestoQueryId);
  config.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
  String carbonTablePath = carbonTable.getAbsoluteTableIdentifier().getTablePath();
  config.set(CarbonTableInputFormat.INPUT_DIR, carbonTablePath);
  config.set(CarbonTableInputFormat.DATABASE_NAME, carbonTable.getDatabaseName());
  config.set(CarbonTableInputFormat.TABLE_NAME, carbonTable.getTableName());
  config.set("query.id", queryId);
  CarbonInputFormat.setTransactionalTable(config, carbonTable.isTransactionalTable());
  CarbonInputFormat.setTableInfo(config, carbonTable.getTableInfo());
  if (CarbonProperties.getInstance().isCoarseGrainSecondaryIndex(tableInfo.getDatabaseName(),
      tableInfo.getFactTable().getTableName(), "true")) {
    CarbonInputFormat.checkAndSetSecondaryIndexPruning(carbonTable.getTableInfo(), filters, config);
  }
  JobConf jobConf = new JobConf(config);
  try {
    CarbonTableInputFormat.setTableInfo(config, tableInfo);
    CarbonTableInputFormat<Object> carbonTableInputFormat = createInputFormat(jobConf,
        carbonTable.getAbsoluteTableIdentifier(),
        new IndexFilter(carbonTable, filters, true), filteredPartitions);
    Job job = Job.getInstance(jobConf);
    List<InputSplit> splits = carbonTableInputFormat.getSplits(job);
    Gson gson = new Gson();
    if (splits != null && splits.size() > 0) {
      for (InputSplit inputSplit : splits) {
        CarbonInputSplit carbonInputSplit = (CarbonInputSplit) inputSplit;
        result.add(new CarbonLocalInputSplit(carbonInputSplit.getSegmentId(),
            carbonInputSplit.getPath().toString(), carbonInputSplit.getStart(),
            carbonInputSplit.getLength(), Arrays.asList(carbonInputSplit.getLocations()),
            carbonInputSplit.getNumberOfBlocklets(), carbonInputSplit.getVersion().number(),
            carbonInputSplit.getDeleteDeltaFiles(), carbonInputSplit.getBlockletId(),
            gson.toJson(carbonInputSplit.getDetailInfo()),
            carbonInputSplit.getFileFormat().ordinal()));
      }
      // Use block distribution: group blocklet-level splits that belong to the same block
      List<List<CarbonLocalInputSplit>> inputSplits = new ArrayList<>(
          result.stream().collect(Collectors.groupingBy(carbonInput -> {
            if (FileFormat.ROW_V1.equals(carbonInput.getFileFormat())) {
              return carbonInput.getSegmentId().concat(carbonInput.getPath())
                  .concat(carbonInput.getStart() + "");
            }
            return carbonInput.getSegmentId().concat(carbonInput.getPath());
          })).values());
      // TODO: try to optimize the below logic as it may slow down for huge splits
      for (int j = 0; j < inputSplits.size(); j++) {
        multiBlockSplitList.add(new CarbonLocalMultiBlockSplit(inputSplits.get(j),
            inputSplits.get(j).stream().flatMap(f -> Arrays.stream(getLocations(f)))
                .distinct().toArray(String[]::new)));
      }
      LOGGER.error("Size of MultiblockList " + multiBlockSplitList.size());
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return multiBlockSplitList;
}
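The block-distribution step above groups blocklet-level splits back under their parent block. Below is a simplified, self-contained illustration of the same Collectors.groupingBy pattern, with plain strings standing in for CarbonLocalInputSplit; the segment ids and paths are made up.

  import java.util.*;
  import java.util.stream.Collectors;

  public class GroupingSketch {
    public static void main(String[] args) {
      // "segmentId", "path" pairs standing in for blocklet-level splits
      List<String[]> splits = Arrays.asList(
          new String[] {"0", "/t/part-0-0.carbondata"},
          new String[] {"0", "/t/part-0-0.carbondata"},
          new String[] {"1", "/t/part-1-0.carbondata"});
      // group by segmentId + path, like the key built for non-ROW_V1 splits above
      Collection<List<String[]>> grouped = splits.stream()
          .collect(Collectors.groupingBy(s -> s[0].concat(s[1]))).values();
      grouped.forEach(g -> System.out.println(g.size() + " blocklet(s) -> one multi-block split"));
    }
  }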
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonReaderBuilder method totalRowCountInSplits.
private <T> void totalRowCountInSplits(Job job, List<InputSplit> splits,
    List<Long> rowCountInSplit) throws IOException, InterruptedException {
  CarbonFileInputFormat format = this.prepareFileInputFormat(job, false, true);
  long sum = 0;
  boolean isIUDTable = false;
  if (!StringUtils.isEmpty(this.tablePath)) {
    // Check if an update or delete happened on the table.
    CarbonFile emptyMetadataFile = FileFactory.getCarbonFile(this.tablePath
        + CarbonCommonConstants.FILE_SEPARATOR
        + CarbonCommonConstants.CARBON_SDK_EMPTY_METADATA_PATH, this.hadoopConf);
    if (emptyMetadataFile.exists() && emptyMetadataFile.isDirectory()) {
      isIUDTable = true;
    }
  }
  // If a filter is present or the table has update/delete data, count rows by building
  // a carbon reader; else get the row count from the detail info of each split.
  if (this.filterExpression != null || isIUDTable) {
    RecordReader reader = null;
    CarbonReader carbonReader = null;
    for (InputSplit split : splits) {
      List<RecordReader<Void, T>> readers = new ArrayList<>();
      try {
        reader = this.getRecordReader(job, format, readers, split);
        readers.add(reader);
        carbonReader = new CarbonReader<>(readers);
        while (carbonReader.hasNext()) {
          try {
            sum += carbonReader.readNextBatchRow().length;
          } catch (Exception ex) {
            LOGGER.error("Exception occurred while reading the batch row " + ex.getMessage());
          }
        }
        rowCountInSplit.add(sum);
      } finally {
        if (reader != null) {
          reader.close();
        }
        if (carbonReader != null) {
          carbonReader.close();
        }
      }
    }
  } else {
    for (InputSplit split : splits) {
      // Prepare a summation array of row counts of each blocklet; this is used for
      // pruning with pagination values. At the current index, it contains the sum of
      // rows of all previous blocklets plus the current one.
      sum += ((CarbonInputSplit) split).getDetailInfo().getRowCount();
      rowCountInSplit.add(sum);
    }
  }
}
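Because rowCountInSplit is a running sum, a pagination lookup can locate the split that holds a given global row index with a simple scan (or a binary search). A minimal sketch under that assumption follows; the helper name findSplitIndex is hypothetical and not part of the project.

  // Hypothetical helper: return the index of the split containing global row
  // number 'rowIndex' (0-based), given the running-sum list built above.
  private static int findSplitIndex(List<Long> rowCountInSplit, long rowIndex) {
    for (int i = 0; i < rowCountInSplit.size(); i++) {
      // rowCountInSplit.get(i) is the total number of rows up to and including split i
      if (rowIndex < rowCountInSplit.get(i)) {
        return i;
      }
    }
    return -1;  // rowIndex is beyond the last row
  }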
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonReaderBuilder method getSplits.
/**
 * Gets an array of CarbonInputSplits.
 * In carbondata, splits can be block level or blocklet level;
 * by default splits are block level.
 *
 * @param enableBlockletDistribution returns blocklet level splits if set to true,
 * else block level splits.
 * @return
 * @throws IOException
 */
public InputSplit[] getSplits(boolean enableBlockletDistribution) throws IOException {
  if (hadoopConf == null) {
    hadoopConf = FileFactory.getConfiguration();
  }
  Job job = null;
  List<InputSplit> splits;
  CarbonFileInputFormat format = null;
  try {
    job = new Job(new JobConf(hadoopConf));
    format = prepareFileInputFormat(job, enableBlockletDistribution, false);
    splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
    for (InputSplit split : splits) {
      // Load the detailInfo
      ((CarbonInputSplit) split).getDetailInfo();
    }
  } finally {
    if (format != null) {
      // Clear the index cache, as it is added in the getSplits() method
      IndexStoreManager.getInstance().clearIndexCache(
          format.getOrCreateCarbonTable(job.getConfiguration()).getAbsoluteTableIdentifier(), false);
    }
  }
  return splits.toArray(new InputSplit[splits.size()]);
}
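A hedged SDK-side usage sketch, assuming the builder is obtained through CarbonReader.builder(tablePath) as in the carbondata SDK examples; the table path is made up and the snippet only uses accessors that appear elsewhere in these examples (getFilePath).

  // Illustrative only: request block-level splits (false) for an SDK table path.
  CarbonReaderBuilder builder = CarbonReader.builder("/tmp/sdk_output");  // assumed path
  InputSplit[] splits = builder.getSplits(false);
  for (InputSplit split : splits) {
    // each entry is a CarbonInputSplit whose detailInfo has already been loaded
    System.out.println(((CarbonInputSplit) split).getFilePath());
  }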