
Example 66 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project carbondata by apache.

the class CarbonInputFormat method getSplits.

/**
   * {@inheritDoc}
   * The configuration FileInputFormat.INPUT_DIR
   * is used to get the table path to read.
   *
   * @param job
   * @return List<InputSplit> list of CarbonInputSplit
   * @throws IOException
   */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    AbsoluteTableIdentifier identifier = getAbsoluteTableIdentifier(job.getConfiguration());
    CacheClient cacheClient = new CacheClient(identifier.getStorePath());
    try {
        List<String> invalidSegments = new ArrayList<>();
        List<UpdateVO> invalidTimestampsList = new ArrayList<>();
        // get all valid segments and set them into the configuration
        if (getSegmentsToAccess(job).length == 0) {
            SegmentStatusManager segmentStatusManager = new SegmentStatusManager(identifier);
            SegmentStatusManager.ValidAndInvalidSegmentsInfo segments = segmentStatusManager.getValidAndInvalidSegments();
            SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(identifier);
            setSegmentsToAccess(job.getConfiguration(), segments.getValidSegments());
            if (segments.getValidSegments().size() == 0) {
                return new ArrayList<>(0);
            }
            // remove entry in the segment index if there are invalid segments
            invalidSegments.addAll(segments.getInvalidSegments());
            for (String invalidSegmentId : invalidSegments) {
                invalidTimestampsList.add(updateStatusManager.getInvalidTimestampRange(invalidSegmentId));
            }
            if (invalidSegments.size() > 0) {
                List<TableSegmentUniqueIdentifier> invalidSegmentsIds = new ArrayList<>(invalidSegments.size());
                for (String segId : invalidSegments) {
                    invalidSegmentsIds.add(new TableSegmentUniqueIdentifier(identifier, segId));
                }
                cacheClient.getSegmentAccessClient().invalidateAll(invalidSegmentsIds);
            }
        }
        // process and resolve the expression
        Expression filter = getFilterPredicates(job.getConfiguration());
        CarbonTable carbonTable = getCarbonTable(job.getConfiguration());
        // this will be null in case of corrupt schema file.
        if (null == carbonTable) {
            throw new IOException("Missing/Corrupt schema file for table.");
        }
        CarbonInputFormatUtil.processFilterExpression(filter, carbonTable);
        // prune partitions for filter query on partition table
        BitSet matchedPartitions = null;
        if (null != filter) {
            PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getFactTableName());
            if (null != partitionInfo) {
                Partitioner partitioner = PartitionUtil.getPartitioner(partitionInfo);
                matchedPartitions = new FilterExpressionProcessor().getFilteredPartitions(filter, partitionInfo, partitioner);
                if (matchedPartitions.cardinality() == 0) {
                    // no partition matches the filter, so no splits are required
                    return new ArrayList<InputSplit>();
                }
                if (matchedPartitions.cardinality() == partitioner.numPartitions()) {
                    // all partitions are required, no need to prune partitions
                    matchedPartitions = null;
                }
            }
        }
        FilterResolverIntf filterInterface = CarbonInputFormatUtil.resolveFilter(filter, identifier);
        // do block filtering and get split
        List<InputSplit> splits = getSplits(job, filterInterface, matchedPartitions, cacheClient);
        // pass the invalid segments to the task side so their index entries can be removed there
        if (invalidSegments.size() > 0) {
            for (InputSplit split : splits) {
                ((CarbonInputSplit) split).setInvalidSegments(invalidSegments);
                ((CarbonInputSplit) split).setInvalidTimestampRange(invalidTimestampsList);
            }
        }
        return splits;
    } finally {
        // close the cache client to clear LRU cache memory
        cacheClient.close();
    }
}
Also used : SegmentUpdateStatusManager(org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) SegmentStatusManager(org.apache.carbondata.core.statusmanager.SegmentStatusManager) IOException(java.io.IOException) UpdateVO(org.apache.carbondata.core.mutate.UpdateVO) TableSegmentUniqueIdentifier(org.apache.carbondata.core.datastore.TableSegmentUniqueIdentifier) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) FilterExpressionProcessor(org.apache.carbondata.core.scan.filter.FilterExpressionProcessor) AbsoluteTableIdentifier(org.apache.carbondata.core.metadata.AbsoluteTableIdentifier) Expression(org.apache.carbondata.core.scan.expression.Expression) PartitionInfo(org.apache.carbondata.core.metadata.schema.PartitionInfo) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Partitioner(org.apache.carbondata.core.scan.partition.Partitioner) FilterResolverIntf(org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf)
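
A minimal driver sketch of how splits from this format could be requested through the standard MapReduce API. The store path below is a placeholder, and instantiating CarbonInputFormat directly with a raw Object type parameter is an assumption about this CarbonData version; the call pattern itself, a Job (which implements JobContext) handed to getSplits, is the same one the framework uses at job submission.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class CarbonSplitPlanningSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "carbon-split-planning");
        // The javadoc above says FileInputFormat.INPUT_DIR supplies the table path;
        // the path here is purely illustrative.
        FileInputFormat.addInputPath(job, new Path("/store/default/sample_table"));
        // Job implements JobContext, so it can be passed straight to getSplits.
        // Assumes CarbonInputFormat is directly instantiable in this version;
        // other versions expose a concrete subclass instead.
        CarbonInputFormat<Object> format = new CarbonInputFormat<>();
        List<InputSplit> splits = format.getSplits(job);
        System.out.println("planned " + splits.size() + " splits");
    }
}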

Example 67 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project carbondata by apache.

the class CarbonInputFormat method getSplitsInternal.

private List<InputSplit> getSplitsInternal(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    List<InputSplit> carbonSplits = new ArrayList<InputSplit>(splits.size());
    // identify table blocks
    for (InputSplit inputSplit : splits) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        String segmentId = CarbonTablePath.DataPathUtil.getSegmentId(fileSplit.getPath().toString());
        if (segmentId.equals(CarbonCommonConstants.INVALID_SEGMENT_ID)) {
            continue;
        }
        carbonSplits.add(CarbonInputSplit.from(segmentId, fileSplit, ColumnarFormatVersion.valueOf(CarbonCommonConstants.CARBON_DATA_FILE_DEFAULT_VERSION)));
    }
    return carbonSplits;
}
Also used : FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
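
The same wrap-or-skip pattern, stripped to a sketch against a plain TextInputFormat: ask the parent FileInputFormat for splits, inspect each FileSplit's path, and drop the ones that do not belong. The "/Segment_" path marker is a made-up stand-in for the segment-id check in the snippet above.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SegmentFilteringInputFormat extends TextInputFormat {

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        List<InputSplit> splits = super.getSplits(job);
        List<InputSplit> kept = new ArrayList<>(splits.size());
        for (InputSplit split : splits) {
            FileSplit fileSplit = (FileSplit) split;
            // Derive an identifier from the file path, analogous to extracting the
            // segment id in the CarbonData snippet; the marker here is hypothetical.
            if (!fileSplit.getPath().toString().contains("/Segment_")) {
                continue;
            }
            kept.add(fileSplit);
        }
        return kept;
    }
}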

Example 68 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project carbondata by apache.

the class IndexedSegment method getSplits.

@Override
public List<InputSplit> getSplits(JobContext job, FilterResolverIntf filterResolver) throws IOException {
    // do the following:
    // 1. create the index, or get it from the cache, keyed by the filter name in the configuration
    // 2. filter through the index to get the matching blocks
    // 3. create an input split from each filtered block
    List<InputSplit> output = new LinkedList<>();
    Index index = loader.load(this);
    List<Block> blocks = index.filter(job, filterResolver);
    for (Block block : blocks) {
        output.add(makeInputSplit(block));
    }
    return output;
}
Also used : Block(org.apache.carbondata.hadoop.internal.index.Block) Index(org.apache.carbondata.hadoop.internal.index.Index) InputSplit(org.apache.hadoop.mapreduce.InputSplit) LinkedList(java.util.LinkedList)
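
A reduced sketch of the last step, turning index-filtered blocks into splits. BlockRef is a hypothetical stand-in for the internal Block type; the real code delegates to makeInputSplit(block), while here each block is wrapped directly in a standard FileSplit.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/** Hypothetical block descriptor standing in for the internal Block type above. */
final class BlockRef {
    final String filePath;
    final long offset;
    final long length;

    BlockRef(String filePath, long offset, long length) {
        this.filePath = filePath;
        this.offset = offset;
        this.length = length;
    }
}

public class BlockSplitBuilder {

    /** Wrap each index-filtered block into a standard FileSplit-backed InputSplit. */
    static List<InputSplit> toSplits(List<BlockRef> blocks) {
        List<InputSplit> output = new ArrayList<>(blocks.size());
        for (BlockRef block : blocks) {
            // FileSplit(path, start, length, hosts); no locality hints supplied here.
            output.add(new FileSplit(new Path(block.filePath), block.offset,
                    block.length, new String[0]));
        }
        return output;
    }
}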

Example 69 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project gora by apache.

the class FileBackedDataStoreBase method getPartitions.

@Override
public List<PartitionQuery<K, T>> getPartitions(Query<K, T> query) {
    List<InputSplit> splits = null;
    List<PartitionQuery<K, T>> queries = null;
    try {
        splits = GoraMapReduceUtils.getSplits(getConf(), inputPath);
        queries = new ArrayList<>(splits.size());
        for (InputSplit split : splits) {
            queries.add(new FileSplitPartitionQuery<>(query, (FileSplit) split));
        }
    } catch (IOException ex) {
        LOG.error(ex.getMessage(), ex);
    }
    return queries;
}
Also used : IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit) PartitionQuery(org.apache.gora.query.PartitionQuery) FileSplitPartitionQuery(org.apache.gora.query.impl.FileSplitPartitionQuery)
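
A generic sketch of the same idea outside Gora: plan splits with a stock FileInputFormat and wrap each one in a per-partition work item. PartitionTask is a hypothetical analogue of FileSplitPartitionQuery, and the input directory is whatever the caller supplies.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

/** Hypothetical per-partition work item, analogous to FileSplitPartitionQuery. */
final class PartitionTask {
    final FileSplit split;

    PartitionTask(FileSplit split) {
        this.split = split;
    }
}

public class PartitionPlanner {

    static List<PartitionTask> plan(Configuration conf, String inputDir) throws IOException {
        Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, new Path(inputDir));
        // One work item per planned split, mirroring getPartitions() above.
        List<InputSplit> splits = new TextInputFormat().getSplits(job);
        List<PartitionTask> tasks = new ArrayList<>(splits.size());
        for (InputSplit split : splits) {
            tasks.add(new PartitionTask((FileSplit) split));
        }
        return tasks;
    }
}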

Example 70 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project gora by apache.

the class TestGoraInputFormat method testGetSplits.

/**
   * First asserts that the attempt to obtain splits results in
   * more than zero splits which can be used for computation.
   * Then checks that the partition query (obtained from the
   * first split) has the same fields as we would expect by
   * directly accessing the fields of an Employee object.
   * @throws IOException
   * @throws InterruptedException
   */
@Test
@SuppressWarnings("rawtypes")
public void testGetSplits() throws IOException, InterruptedException {
    List<InputSplit> splits = getInputSplits();
    assertTrue(splits.size() > 0);
    InputSplit split = splits.get(0);
    PartitionQuery query = ((GoraInputSplit) split).getQuery();
    assertTrue(Arrays.equals(getEmployeeFieldNames(), query.getFields()));
}
Also used : InputSplit(org.apache.hadoop.mapreduce.InputSplit) PartitionQuery(org.apache.gora.query.PartitionQuery) Test(org.junit.Test)
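
A self-contained sketch of the same kind of assertion against a stock TextInputFormat: write a tiny local file, plan splits for it, and check that at least one non-empty split comes back. It assumes the local filesystem is the default in the test Configuration; class and method names are illustrative.

import static org.junit.Assert.assertTrue;

import java.io.File;
import java.nio.file.Files;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.junit.Test;

public class SplitCountTest {

    @Test
    public void testAtLeastOneSplit() throws Exception {
        // Write a tiny local input file so the format has something to split.
        File data = File.createTempFile("split-test", ".txt");
        data.deleteOnExit();
        Files.write(data.toPath(), "one line of data\n".getBytes("UTF-8"));

        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path(data.getAbsolutePath()));

        List<InputSplit> splits = new TextInputFormat().getSplits(job);
        assertTrue(splits.size() > 0);              // at least one split was planned
        assertTrue(splits.get(0).getLength() > 0);  // and it covers some bytes
    }
}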

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit) 160
Configuration (org.apache.hadoop.conf.Configuration) 70
Test (org.junit.Test) 68
ArrayList (java.util.ArrayList) 51
Path (org.apache.hadoop.fs.Path) 43
Job (org.apache.hadoop.mapreduce.Job) 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext) 38
IOException (java.io.IOException) 33
JobContext (org.apache.hadoop.mapreduce.JobContext) 20
LongWritable (org.apache.hadoop.io.LongWritable) 19
FileSystem (org.apache.hadoop.fs.FileSystem) 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl) 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit) 13
List (java.util.List) 13
Text (org.apache.hadoop.io.Text) 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit) 13
DBObject (com.mongodb.DBObject) 10
File (java.io.File) 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest) 9