Example 11 with IndexFilter

Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.

From the class CarbonTableInputFormat, method getSplitsOfStreaming.

/**
 * Use the file list in the .carbonindex file to get the splits for streaming segments.
 */
public List<InputSplit> getSplitsOfStreaming(JobContext job, List<Segment> streamSegments, CarbonTable carbonTable, FilterResolverIntf filterResolverIntf) throws IOException {
    List<InputSplit> splits = new ArrayList<>();
    if (streamSegments != null && !streamSegments.isEmpty()) {
        numStreamSegments = streamSegments.size();
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        if (filterResolverIntf == null) {
            if (carbonTable != null) {
                IndexFilter filter = getFilterPredicates(job.getConfiguration());
                if (filter != null) {
                    filter.processFilterExpression();
                    filterResolverIntf = filter.getResolver();
                }
            }
        }
        StreamPruner streamPruner = new StreamPruner(carbonTable);
        streamPruner.init(filterResolverIntf);
        List<StreamFile> streamFiles = streamPruner.prune(streamSegments);
        // record the hit information of the streaming files
        this.hitStreamFiles = streamFiles.size();
        this.numStreamFiles = streamPruner.getTotalFileNums();
        for (StreamFile streamFile : streamFiles) {
            Path path = new Path(streamFile.getFilePath());
            long length = streamFile.getFileSize();
            if (length != 0) {
                BlockLocation[] blkLocations;
                FileSystem fs = FileFactory.getFileSystem(path);
                FileStatus file = fs.getFileStatus(path);
                blkLocations = fs.getFileBlockLocations(path, 0, length);
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                // allow 10% slop to avoid generating a very small split at the end
                while (((double) bytesRemaining) / splitSize > 1.1) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(streamFile.getSegmentNo(), streamFile.getFilePath(), length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts(), FileFormat.ROW_V1));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(streamFile.getSegmentNo(), streamFile.getFilePath(), length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts(), FileFormat.ROW_V1));
                }
            }
        }
    }
    return splits;
}
Also used : StreamPruner(org.apache.carbondata.core.stream.StreamPruner) Path(org.apache.hadoop.fs.Path) CarbonTablePath(org.apache.carbondata.core.util.path.CarbonTablePath) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) StreamFile(org.apache.carbondata.core.stream.StreamFile) BlockLocation(org.apache.hadoop.fs.BlockLocation) FileSystem(org.apache.hadoop.fs.FileSystem) IndexFilter(org.apache.carbondata.core.index.IndexFilter) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit)
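
For reference, the split size used above follows the standard Hadoop FileInputFormat rule, max(minSize, min(maxSize, blockSize)), combined with the 10% slop check in the loop. Below is a minimal, self-contained sketch of that arithmetic in plain Java; it is independent of the CarbonData classes, and the block and file sizes in main are made-up example values.

import java.util.ArrayList;
import java.util.List;

public class SplitSizeSketch {

    // same rule as FileInputFormat.computeSplitSize(blockSize, minSize, maxSize)
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    // cut a file of the given length into {offset, length} pairs, keeping the
    // 10% slop so the final split is never left pointlessly small
    static List<long[]> split(long fileLength, long splitSize) {
        List<long[]> splits = new ArrayList<>();
        long bytesRemaining = fileLength;
        while (((double) bytesRemaining) / splitSize > 1.1) {
            splits.add(new long[] { fileLength - bytesRemaining, splitSize });
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits.add(new long[] { fileLength - bytesRemaining, bytesRemaining });
        }
        return splits;
    }

    public static void main(String[] args) {
        long splitSize = computeSplitSize(128L << 20, 1L, Long.MAX_VALUE);  // 128 MB block
        for (long[] s : split(300L << 20, splitSize)) {                     // 300 MB file
            System.out.println("offset=" + s[0] + " length=" + s[1]);
        }
    }
}

With a 128 MB split size and a 300 MB file this yields splits of 128 MB, 128 MB and 44 MB; without the 1.1 factor the tail could degenerate into a near-empty split.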

Example 12 with IndexFilter

Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.

From the class CarbonTableInputFormat, method getSplits.

/**
 * Get the list of blocks/blocklets and convert them into CarbonInputSplits.
 * @param job JobContext with Configuration
 * @return list of CarbonInputSplit
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    carbonTable = getOrCreateCarbonTable(job.getConfiguration());
    if (null == carbonTable) {
        throw new IOException("Missing/Corrupt schema file for table.");
    }
    // global dictionary is not supported since 2.0
    if (carbonTable.getTableInfo().getFactTable().getTableProperties().containsKey(CarbonCommonConstants.DICTIONARY_INCLUDE)) {
        DeprecatedFeatureException.globalDictNotSupported();
    }
    List<InputSplit> splits = new LinkedList<>();
    if (CarbonProperties.isQueryStageInputEnabled()) {
        // collect stage files and include their splits in the query
        try {
            List<InputSplit> stageInputSplits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration());
            splits.addAll(stageInputSplits);
        } catch (ExecutionException | InterruptedException e) {
            LOG.error("Failed to create input splits from stage files", e);
            throw new IOException(e);
        }
    }
    this.readCommittedScope = getReadCommitted(job, carbonTable.getAbsoluteTableIdentifier());
    LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
    String updateDeltaVersion = job.getConfiguration().get(UPDATE_DELTA_VERSION);
    SegmentUpdateStatusManager updateStatusManager;
    if (updateDeltaVersion != null) {
        updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails, updateDeltaVersion);
    } else {
        updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails);
    }
    List<String> invalidSegmentIds = new ArrayList<>();
    List<Segment> streamSegments = null;
    // get all valid segments and set them into the configuration
    SegmentStatusManager segmentStatusManager = new SegmentStatusManager(carbonTable.getAbsoluteTableIdentifier(), readCommittedScope.getConfiguration());
    SegmentStatusManager.ValidAndInvalidSegmentsInfo segments = segmentStatusManager.getValidAndInvalidSegments(carbonTable.isMV(), loadMetadataDetails, this.readCommittedScope);
    if (getValidateSegmentsToAccess(job.getConfiguration())) {
        List<Segment> validSegments = segments.getValidSegments();
        streamSegments = segments.getStreamSegments();
        streamSegments = getFilteredSegment(job, streamSegments, true, readCommittedScope);
        if (validSegments.size() == 0) {
            splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
            return splits;
        }
        List<Segment> filteredSegmentToAccess = getFilteredSegment(job, segments.getValidSegments(), true, readCommittedScope);
        if (filteredSegmentToAccess.size() == 0) {
            splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
            return splits;
        } else {
            setSegmentsToAccess(job.getConfiguration(), filteredSegmentToAccess);
        }
        // clear invalid segments, if any, from the segment index store
        for (Segment segment : segments.getInvalidSegments()) {
            invalidSegmentIds.add(segment.getSegmentNo());
        }
        if (invalidSegmentIds.size() > 0) {
            IndexStoreManager.getInstance().clearInvalidSegments(getOrCreateCarbonTable(job.getConfiguration()), invalidSegmentIds);
        }
    }
    List<Segment> validAndInProgressSegments = new ArrayList<>(segments.getValidSegments());
    // Also add in-progress segments to the filter, because a Secondary Index table load
    // reads data from in-progress segments.
    validAndInProgressSegments.addAll(segments.getListOfInProgressSegments());
    List<Segment> segmentToAccess = getFilteredSegment(job, validAndInProgressSegments, false, readCommittedScope);
    String segmentFileName = job.getConfiguration().get(CarbonCommonConstants.CURRENT_SEGMENTFILE);
    if (segmentFileName != null) {
        // each segment has only one segment file ("current.segment")
        segmentToAccess.get(0).setSegmentFileName(segmentFileName + CarbonTablePath.SEGMENT_EXT);
    }
    // process and resolve the expression
    IndexFilter indexFilter = getFilterPredicates(job.getConfiguration());
    if (indexFilter != null) {
        indexFilter.resolve(false);
    }
    // do block filtering and get splits
    List<InputSplit> batchSplits = getSplits(job, indexFilter, segmentToAccess, updateStatusManager, segments.getInvalidSegments());
    splits.addAll(batchSplits);
    // add all splits of streaming
    List<InputSplit> splitsOfStreaming = getSplitsOfStreaming(job, streamSegments, carbonTable);
    if (!splitsOfStreaming.isEmpty()) {
        splits.addAll(splitsOfStreaming);
    }
    return splits;
}
Also used : SegmentUpdateStatusManager(org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) LoadMetadataDetails(org.apache.carbondata.core.statusmanager.LoadMetadataDetails) ArrayList(java.util.ArrayList) SegmentStatusManager(org.apache.carbondata.core.statusmanager.SegmentStatusManager) IOException(java.io.IOException) LinkedList(java.util.LinkedList) Segment(org.apache.carbondata.core.index.Segment) IndexFilter(org.apache.carbondata.core.index.IndexFilter) ExecutionException(java.util.concurrent.ExecutionException) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit)
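
The filter handling in getSplits mirrors Example 11: an IndexFilter is built from the table and an Expression, then resolved before block pruning. The sketch below strings those calls together; it assumes an already-loaded CarbonTable named table, uses only the IndexFilter methods visible in the examples on this page (processFilterExpression, getResolver, resolve) plus the expression classes from the test further down, and the FilterResolverIntf import path is an assumption.

import org.apache.carbondata.core.index.IndexFilter;
import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.core.scan.expression.ColumnExpression;
import org.apache.carbondata.core.scan.expression.Expression;
import org.apache.carbondata.core.scan.expression.LiteralExpression;
import org.apache.carbondata.core.scan.expression.conditional.EqualToExpression;
import org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf;

private FilterResolverIntf buildAndResolveFilter(CarbonTable table) {
    // country = 'china', the same predicate used in testGetFilteredSplits below
    Expression expression = new EqualToExpression(
        new ColumnExpression("country", DataTypes.STRING),
        new LiteralExpression("china", DataTypes.STRING));
    IndexFilter filter = new IndexFilter(table, expression);
    // Example 11 resolves explicitly via processFilterExpression()/getResolver();
    // getSplits() above uses the shorthand filter.resolve(false) instead
    filter.processFilterExpression();
    return filter.getResolver();
}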

Example 13 with IndexFilter

Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.

From the class CarbonInputFormat, method getFilterPredicates.

public IndexFilter getFilterPredicates(Configuration configuration) {
    try {
        String filterExprString = configuration.get(FILTER_PREDICATE);
        if (filterExprString == null) {
            return null;
        }
        IndexFilter filter = (IndexFilter) ObjectSerializationUtil.convertStringToObject(filterExprString);
        if (filter != null) {
            CarbonTable carbonTable = getOrCreateCarbonTable(configuration);
            filter.setTable(carbonTable);
        }
        return filter;
    } catch (IOException e) {
        throw new RuntimeException("Error while reading filter expression", e);
    }
}
Also used : CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) IndexFilter(org.apache.carbondata.core.index.IndexFilter) IOException(java.io.IOException)
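
getFilterPredicates is the read side of the filter round trip: the write side is the CarbonTableInputFormat.setFilterPredicates(...) call used in the tests below, which presumably stores the IndexFilter with the counterpart helper ObjectSerializationUtil.convertObjectToString (assumed to live in org.apache.carbondata.core.util). The sketch below shows such a round trip under that assumption; the class name FilterRoundTrip and the key "filter.predicate.demo" are made up for illustration, while the real code uses CarbonInputFormat's internal FILTER_PREDICATE key.

import java.io.IOException;

import org.apache.carbondata.core.index.IndexFilter;
import org.apache.carbondata.core.util.ObjectSerializationUtil;
import org.apache.hadoop.conf.Configuration;

public class FilterRoundTrip {

    // illustrative key only; not the key CarbonInputFormat actually uses
    private static final String DEMO_KEY = "filter.predicate.demo";

    static void writeFilter(Configuration conf, IndexFilter filter) throws IOException {
        // assumed counterpart of the convertStringToObject call in getFilterPredicates()
        conf.set(DEMO_KEY, ObjectSerializationUtil.convertObjectToString(filter));
    }

    static IndexFilter readFilter(Configuration conf) throws IOException {
        String serialized = conf.get(DEMO_KEY);
        if (serialized == null) {
            return null;
        }
        return (IndexFilter) ObjectSerializationUtil.convertStringToObject(serialized);
    }
}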

Example 14 with IndexFilter

Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.

From the class CarbonTableInputFormatTest, method runJob.

private void runJob(String outPath, CarbonProjection projection, Expression filter) throws Exception {
    Configuration configuration = new Configuration();
    configuration.set("mapreduce.cluster.local.dir", new File(outPath + "1").getCanonicalPath());
    Job job = Job.getInstance(configuration);
    job.setJarByClass(CarbonTableInputFormatTest.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(Map.class);
    job.setInputFormatClass(CarbonTableInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    AbsoluteTableIdentifier abs = creator.getAbsoluteTableIdentifier();
    if (projection != null) {
        CarbonTableInputFormat.setColumnProjection(job.getConfiguration(), projection);
    }
    if (filter != null) {
        CarbonTableInputFormat.setFilterPredicates(job.getConfiguration(), new IndexFilter(loadModel.getCarbonDataLoadSchema().getCarbonTable(), filter));
    }
    CarbonTableInputFormat.setDatabaseName(job.getConfiguration(), abs.getCarbonTableIdentifier().getDatabaseName());
    CarbonTableInputFormat.setTableName(job.getConfiguration(), abs.getCarbonTableIdentifier().getTableName());
    FileInputFormat.addInputPath(job, new Path(abs.getTablePath()));
    CarbonUtil.deleteFoldersAndFiles(new File(outPath + "1"));
    FileOutputFormat.setOutputPath(job, new Path(outPath + "1"));
    job.getConfiguration().set("outpath", outPath);
    job.getConfiguration().set("query.id", String.valueOf(System.nanoTime()));
    boolean status = job.waitForCompletion(true);
}
Also used : Path(org.apache.hadoop.fs.Path) CarbonTablePath(org.apache.carbondata.core.util.path.CarbonTablePath) Configuration(org.apache.hadoop.conf.Configuration) AbsoluteTableIdentifier(org.apache.carbondata.core.metadata.AbsoluteTableIdentifier) IndexFilter(org.apache.carbondata.core.index.IndexFilter) Job(org.apache.hadoop.mapreduce.Job) File(java.io.File)

Example 15 with IndexFilter

Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.

From the class CarbonTableInputFormatTest, method testGetFilteredSplits.

@Test
public void testGetFilteredSplits() throws Exception {
    CarbonTableInputFormat carbonInputFormat = new CarbonTableInputFormat();
    JobConf jobConf = new JobConf(new Configuration());
    Job job = Job.getInstance(jobConf);
    job.getConfiguration().set("query.id", UUID.randomUUID().toString());
    String tblPath = creator.getAbsoluteTableIdentifier().getTablePath();
    FileInputFormat.addInputPath(job, new Path(tblPath));
    CarbonTableInputFormat.setDatabaseName(job.getConfiguration(), creator.getAbsoluteTableIdentifier().getDatabaseName());
    CarbonTableInputFormat.setTableName(job.getConfiguration(), creator.getAbsoluteTableIdentifier().getTableName());
    Expression expression = new EqualToExpression(new ColumnExpression("country", DataTypes.STRING), new LiteralExpression("china", DataTypes.STRING));
    CarbonTableInputFormat.setFilterPredicates(job.getConfiguration(), new IndexFilter(loadModel.getCarbonDataLoadSchema().getCarbonTable(), expression));
    List splits = carbonInputFormat.getSplits(job);
    Assert.assertTrue(splits != null);
    Assert.assertTrue(!splits.isEmpty());
}
Also used : Path(org.apache.hadoop.fs.Path) CarbonTablePath(org.apache.carbondata.core.util.path.CarbonTablePath) EqualToExpression(org.apache.carbondata.core.scan.expression.conditional.EqualToExpression) Configuration(org.apache.hadoop.conf.Configuration) ColumnExpression(org.apache.carbondata.core.scan.expression.ColumnExpression) Expression(org.apache.carbondata.core.scan.expression.Expression) EqualToExpression(org.apache.carbondata.core.scan.expression.conditional.EqualToExpression) LiteralExpression(org.apache.carbondata.core.scan.expression.LiteralExpression) ColumnExpression(org.apache.carbondata.core.scan.expression.ColumnExpression) LiteralExpression(org.apache.carbondata.core.scan.expression.LiteralExpression) CarbonTableInputFormat(org.apache.carbondata.hadoop.api.CarbonTableInputFormat) List(java.util.List) IndexFilter(org.apache.carbondata.core.index.IndexFilter) Job(org.apache.hadoop.mapreduce.Job) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)
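
setFilterPredicates accepts any Expression tree, so more selective predicates can be composed from the conditional and logical expression classes. Below is a sketch in the context of the test above, assuming org.apache.carbondata.core.scan.expression.logical.AndExpression is available; the second column name ("name") and its value are illustrative and may not exist in the sample table.

// country = 'china' AND name = 'aaa1'
Expression countryEq = new EqualToExpression(
    new ColumnExpression("country", DataTypes.STRING),
    new LiteralExpression("china", DataTypes.STRING));
Expression nameEq = new EqualToExpression(
    new ColumnExpression("name", DataTypes.STRING),    // illustrative column
    new LiteralExpression("aaa1", DataTypes.STRING));  // illustrative value
Expression combined = new AndExpression(countryEq, nameEq);
CarbonTableInputFormat.setFilterPredicates(job.getConfiguration(),
    new IndexFilter(loadModel.getCarbonDataLoadSchema().getCarbonTable(), combined));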

Aggregations

IndexFilter (org.apache.carbondata.core.index.IndexFilter): 27
Configuration (org.apache.hadoop.conf.Configuration): 16
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 16
JobConf (org.apache.hadoop.mapred.JobConf): 15
Job (org.apache.hadoop.mapreduce.Job): 15
ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc): 12
Test (org.junit.Test): 12
CarbonFileInputFormat (org.apache.carbondata.hadoop.api.CarbonFileInputFormat): 11
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 11
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 11
IOException (java.io.IOException): 9
ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc): 9
ArrayList (java.util.ArrayList): 8
List (java.util.List): 5
CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable): 5
CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit): 5
CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat): 5
HashMap (java.util.HashMap): 4
CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath): 4
Map (java.util.Map): 3