Search in sources :

Example 1 with StreamPruner

Use of org.apache.carbondata.core.stream.StreamPruner in the carbondata project by Apache.

The getSplitsOfStreaming method of the CarbonTableInputFormat class.

/**
 * Uses the file list in the .carbonindex file to build the input splits for streaming segments.
 *
 * <p>Each non-empty stream file is cut into block-sized splits (with a 10% slop so a tiny
 * trailing split is merged into the previous one), tagged with the hosts of the underlying
 * HDFS blocks and the {@code ROW_V1} file format.
 *
 * @param job                job context providing the Hadoop configuration and split-size limits
 * @param streamSegments     streaming segments to split; may be {@code null} or empty
 * @param carbonTable        table used to resolve the filter and prune stream files
 * @param filterResolverIntf resolved filter; when {@code null} it is derived from the job
 *                           configuration (if the table and a filter predicate are available)
 * @return the list of input splits; empty when there are no streaming segments
 * @throws IOException if file status or block locations cannot be read from the file system
 */
public List<InputSplit> getSplitsOfStreaming(JobContext job, List<Segment> streamSegments, CarbonTable carbonTable, FilterResolverIntf filterResolverIntf) throws IOException {
    List<InputSplit> splits = new ArrayList<>();
    if (streamSegments == null || streamSegments.isEmpty()) {
        return splits;
    }
    numStreamSegments = streamSegments.size();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // Derive the filter from the job configuration when the caller did not supply one.
    if (filterResolverIntf == null && carbonTable != null) {
        IndexFilter filter = getFilterPredicates(job.getConfiguration());
        if (filter != null) {
            filter.processFilterExpression();
            filterResolverIntf = filter.getResolver();
        }
    }
    StreamPruner streamPruner = new StreamPruner(carbonTable);
    streamPruner.init(filterResolverIntf);
    List<StreamFile> streamFiles = streamPruner.prune(streamSegments);
    // record the hit information of the streaming files
    this.hitStreamFiles = streamFiles.size();
    this.numStreamFiles = streamPruner.getTotalFileNums();
    for (StreamFile streamFile : streamFiles) {
        Path path = new Path(streamFile.getFilePath());
        long length = streamFile.getFileSize();
        if (length == 0) {
            // empty files yield no splits
            continue;
        }
        FileSystem fs = FileFactory.getFileSystem(path);
        FileStatus file = fs.getFileStatus(path);
        BlockLocation[] blkLocations = fs.getFileBlockLocations(path, 0, length);
        long splitSize = computeSplitSize(file.getBlockSize(), minSize, maxSize);
        long bytesRemaining = length;
        // there is 10% slop to avoid generating a very small split at the end
        while (((double) bytesRemaining) / splitSize > 1.1) {
            addStreamSplit(splits, streamFile, length - bytesRemaining, splitSize, blkLocations);
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            addStreamSplit(splits, streamFile, length - bytesRemaining, bytesRemaining, blkLocations);
        }
    }
    return splits;
}

/**
 * Adds one {@code ROW_V1} split covering {@code [start, start + splitLength)} of the given
 * stream file, locating the HDFS block that contains the split's start offset for locality.
 */
private void addStreamSplit(List<InputSplit> splits, StreamFile streamFile, long start, long splitLength, BlockLocation[] blkLocations) {
    int blkIndex = getBlockIndex(blkLocations, start);
    splits.add(makeSplit(streamFile.getSegmentNo(), streamFile.getFilePath(), start, splitLength, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts(), FileFormat.ROW_V1));
}
Also used : StreamPruner(org.apache.carbondata.core.stream.StreamPruner) Path(org.apache.hadoop.fs.Path) CarbonTablePath(org.apache.carbondata.core.util.path.CarbonTablePath) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) StreamFile(org.apache.carbondata.core.stream.StreamFile) BlockLocation(org.apache.hadoop.fs.BlockLocation) FileSystem(org.apache.hadoop.fs.FileSystem) IndexFilter(org.apache.carbondata.core.index.IndexFilter) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit)

Aggregations

ArrayList (java.util.ArrayList)1 IndexFilter (org.apache.carbondata.core.index.IndexFilter)1 StreamFile (org.apache.carbondata.core.stream.StreamFile)1 StreamPruner (org.apache.carbondata.core.stream.StreamPruner)1 CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath)1 CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit)1 BlockLocation (org.apache.hadoop.fs.BlockLocation)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 InputSplit (org.apache.hadoop.mapreduce.InputSplit)1