Use of org.apache.carbondata.core.stream.StreamPruner in the Apache CarbonData project.
Class CarbonTableInputFormat, method getSplitsOfStreaming.
/**
 * Builds the input splits for the given streaming segments.
 *
 * <p>The candidate stream files are selected by a {@link StreamPruner}; the original note on
 * this method states that the file list comes from the .carbonindex file. Every non-empty
 * selected file is cut into {@code FileFormat.ROW_V1} splits.
 *
 * <p>Besides returning the splits, this method updates {@code numStreamSegments},
 * {@code hitStreamFiles} and {@code numStreamFiles} on this instance.
 *
 * @param job job context, used to read the split size limits and, when no resolver is
 *            passed in, the filter predicates from its configuration
 * @param streamSegments streaming segments to split; {@code null} or empty yields no splits
 * @param carbonTable table handed to the pruner; when {@code null}, no filter is looked up
 *                    from the job configuration
 * @param filterResolverIntf filter resolver for pruning; when {@code null} (and
 *                           {@code carbonTable} is not), it is resolved from the job
 *                           configuration if a filter is configured there
 * @return the splits of all selected, non-empty stream files; never {@code null}
 * @throws IOException if file system metadata or block locations cannot be read
 */
public List<InputSplit> getSplitsOfStreaming(JobContext job, List<Segment> streamSegments, CarbonTable carbonTable, FilterResolverIntf filterResolverIntf) throws IOException {
  List<InputSplit> result = new ArrayList<>();
  if (streamSegments == null || streamSegments.isEmpty()) {
    // nothing to split
    return result;
  }

  numStreamSegments = streamSegments.size();
  long lowerBound = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long upperBound = getMaxSplitSize(job);

  // fall back to the filter stored in the job configuration when the caller gave none
  FilterResolverIntf resolver = filterResolverIntf;
  if (resolver == null && carbonTable != null) {
    IndexFilter configuredFilter = getFilterPredicates(job.getConfiguration());
    if (configuredFilter != null) {
      configuredFilter.processFilterExpression();
      resolver = configuredFilter.getResolver();
    }
  }

  StreamPruner pruner = new StreamPruner(carbonTable);
  pruner.init(resolver);
  List<StreamFile> selectedFiles = pruner.prune(streamSegments);

  // keep the hit statistics of the streaming files
  this.hitStreamFiles = selectedFiles.size();
  this.numStreamFiles = pruner.getTotalFileNums();

  for (StreamFile selected : selectedFiles) {
    Path filePath = new Path(selected.getFilePath());
    long fileLength = selected.getFileSize();
    if (fileLength == 0) {
      // empty files produce no split
      continue;
    }

    FileSystem fileSystem = FileFactory.getFileSystem(filePath);
    FileStatus status = fileSystem.getFileStatus(filePath);
    BlockLocation[] locations = fileSystem.getFileBlockLocations(filePath, 0, fileLength);
    long chunkSize = computeSplitSize(status.getBlockSize(), lowerBound, upperBound);

    // emit full-size splits while more than 110% of a split is left; the 10% slop
    // avoids producing a very small split at the end of the file
    long offset = 0L;
    while (((double) (fileLength - offset)) / chunkSize > 1.1) {
      int index = getBlockIndex(locations, offset);
      result.add(makeSplit(selected.getSegmentNo(), selected.getFilePath(), offset, chunkSize, locations[index].getHosts(), locations[index].getCachedHosts(), FileFormat.ROW_V1));
      offset += chunkSize;
    }

    // whatever is left becomes the final split of this file
    long tail = fileLength - offset;
    if (tail != 0) {
      int index = getBlockIndex(locations, offset);
      result.add(makeSplit(selected.getSegmentNo(), selected.getFilePath(), offset, tail, locations[index].getHosts(), locations[index].getCachedHosts(), FileFormat.ROW_V1));
    }
  }
  return result;
}
Aggregations