
Example 21 with Segment

use of org.apache.carbondata.core.datamap.Segment in project carbondata by apache.

From the class CarbonTableInputFormat, method getFilteredSegment:

/**
 * Return the list of segments to read, obtained by filtering the valid segments
 * against the segments requested by the user through `INPUT_SEGMENT_NUMBERS`
 * in the job configuration.
 */
private List<Segment> getFilteredSegment(JobContext job, List<Segment> validSegments, boolean validationRequired) {
    Segment[] segmentsToAccess = getSegmentsToAccess(job);
    List<Segment> segmentToAccessSet = new ArrayList<>(new HashSet<>(Arrays.asList(segmentsToAccess)));
    List<Segment> filteredSegmentToAccess = new ArrayList<>();
    if (segmentsToAccess.length == 0 || segmentsToAccess[0].getSegmentNo().equalsIgnoreCase("*")) {
        filteredSegmentToAccess.addAll(validSegments);
    } else {
        for (Segment validSegment : validSegments) {
            int index = segmentToAccessSet.indexOf(validSegment);
            if (index > -1) {
                // For an in-progress segment being read, the segment file name is carried
                // in the user-set property, so prefer that entry over the valid segment
                if (segmentToAccessSet.get(index).getSegmentFileName() != null && validSegment.getSegmentFileName() == null) {
                    filteredSegmentToAccess.add(segmentToAccessSet.get(index));
                } else {
                    filteredSegmentToAccess.add(validSegment);
                }
            }
        }
        if (filteredSegmentToAccess.size() != segmentToAccessSet.size() && !validationRequired) {
            for (Segment segment : segmentToAccessSet) {
                if (!filteredSegmentToAccess.contains(segment)) {
                    filteredSegmentToAccess.add(segment);
                }
            }
        }
        if (!filteredSegmentToAccess.containsAll(segmentToAccessSet)) {
            // log the requested segments that were not selected for access
            List<Segment> ignoredSegments = new ArrayList<>(segmentToAccessSet);
            ignoredSegments.removeAll(filteredSegmentToAccess);
            LOG.info("Segments ignored are : " + Arrays.toString(ignoredSegments.toArray()));
        }
    }
    return filteredSegmentToAccess;
}
Also used : ArrayList(java.util.ArrayList) Segment(org.apache.carbondata.core.datamap.Segment)
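
A minimal driver-side sketch of how the INPUT_SEGMENT_NUMBERS restriction used by getFilteredSegment might be populated before a query. It assumes setSegmentsToAccess is reachable as a static helper on CarbonTableInputFormat (the snippet above calls it with a Configuration and a List<Segment>) and that the two-argument Segment constructor (segment number, segment file name) seen elsewhere on this page is usable; the import path and visibility are not verified here.

import java.util.Arrays;
import java.util.List;

import org.apache.carbondata.core.datamap.Segment;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.hadoop.conf.Configuration;

public class SegmentSelectionSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Request only segments "0" and "2"; getFilteredSegment(...) later intersects
        // this list with the table's valid segments (passing "*" selects all of them).
        List<Segment> requested = Arrays.asList(new Segment("0", null), new Segment("2", null));
        // Assumed to be a static setter, mirroring the call on job.getConfiguration() above.
        CarbonTableInputFormat.setSegmentsToAccess(conf, requested);
    }
}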

Example 22 with Segment

use of org.apache.carbondata.core.datamap.Segment in project carbondata by apache.

From the class CarbonTableInputFormat, method getSplits:

/**
 * {@inheritDoc}
 * The configuration FileInputFormat.INPUT_DIR is used to get the table path to read.
 *
 * @param job the job context
 * @return list of CarbonInputSplit as List<InputSplit>
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    AbsoluteTableIdentifier identifier = getAbsoluteTableIdentifier(job.getConfiguration());
    LoadMetadataDetails[] loadMetadataDetails = SegmentStatusManager.readTableStatusFile(CarbonTablePath.getTableStatusFilePath(identifier.getTablePath()));
    CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
    if (null == carbonTable) {
        throw new IOException("Missing/Corrupt schema file for table.");
    }
    SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails);
    List<Segment> invalidSegments = new ArrayList<>();
    List<UpdateVO> invalidTimestampsList = new ArrayList<>();
    List<Segment> streamSegments = null;
    // get all valid segments and set them into the configuration
    SegmentStatusManager segmentStatusManager = new SegmentStatusManager(identifier);
    SegmentStatusManager.ValidAndInvalidSegmentsInfo segments = segmentStatusManager.getValidAndInvalidSegments(loadMetadataDetails);
    if (getValidateSegmentsToAccess(job.getConfiguration())) {
        List<Segment> validSegments = segments.getValidSegments();
        streamSegments = segments.getStreamSegments();
        streamSegments = getFilteredSegment(job, streamSegments, true);
        if (validSegments.size() == 0) {
            return getSplitsOfStreaming(job, identifier, streamSegments);
        }
        List<Segment> filteredSegmentToAccess = getFilteredSegment(job, segments.getValidSegments(), true);
        if (filteredSegmentToAccess.size() == 0) {
            return getSplitsOfStreaming(job, identifier, streamSegments);
        } else {
            setSegmentsToAccess(job.getConfiguration(), filteredSegmentToAccess);
        }
        // remove entry in the segment index if there are invalid segments
        invalidSegments.addAll(segments.getInvalidSegments());
        for (Segment invalidSegmentId : invalidSegments) {
            invalidTimestampsList.add(updateStatusManager.getInvalidTimestampRange(invalidSegmentId.getSegmentNo()));
        }
        if (invalidSegments.size() > 0) {
            DataMapStoreManager.getInstance().clearInvalidSegments(getOrCreateCarbonTable(job.getConfiguration()), invalidSegments);
        }
    }
    ArrayList<Segment> validAndInProgressSegments = new ArrayList<>(segments.getValidSegments());
    // Also add in-progress segments to the filter, because an aggregate table load
    // reads data from a table that is still in progress.
    validAndInProgressSegments.addAll(segments.getListOfInProgressSegments());
    // get updated filtered list
    List<Segment> filteredSegmentToAccess = getFilteredSegment(job, new ArrayList<>(validAndInProgressSegments), false);
    // Clear updated segments from memory when an update has happened on them
    List<Segment> toBeCleanedSegments = new ArrayList<>();
    for (SegmentUpdateDetails segmentUpdateDetail : updateStatusManager.getUpdateStatusDetails()) {
        boolean refreshNeeded = DataMapStoreManager.getInstance().getTableSegmentRefresher(carbonTable).isRefreshNeeded(segmentUpdateDetail.getSegmentName(), updateStatusManager);
        if (refreshNeeded) {
            toBeCleanedSegments.add(new Segment(segmentUpdateDetail.getSegmentName(), null));
        }
    }
    // Clean segments if refresh is needed
    for (Segment segment : filteredSegmentToAccess) {
        if (DataMapStoreManager.getInstance().getTableSegmentRefresher(carbonTable).isRefreshNeeded(segment.getSegmentNo())) {
            toBeCleanedSegments.add(segment);
        }
    }
    if (toBeCleanedSegments.size() > 0) {
        DataMapStoreManager.getInstance().clearInvalidSegments(getOrCreateCarbonTable(job.getConfiguration()), toBeCleanedSegments);
    }
    // process and resolve the expression
    Expression filter = getFilterPredicates(job.getConfiguration());
    TableProvider tableProvider = new SingleTableProvider(carbonTable);
    // this will be null in case of corrupt schema file.
    PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getTableName());
    carbonTable.processFilterExpression(filter, null, null);
    // prune partitions for filter query on partition table
    BitSet matchedPartitions = null;
    if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
        matchedPartitions = setMatchedPartitions(null, filter, partitionInfo, null);
        if (matchedPartitions != null) {
            if (matchedPartitions.cardinality() == 0) {
                return new ArrayList<InputSplit>();
            } else if (matchedPartitions.cardinality() == partitionInfo.getNumPartitions()) {
                matchedPartitions = null;
            }
        }
    }
    FilterResolverIntf filterInterface = carbonTable.resolveFilter(filter, tableProvider);
    // do block filtering and get split
    List<InputSplit> splits = getSplits(job, filterInterface, filteredSegmentToAccess, matchedPartitions, partitionInfo, null, updateStatusManager);
    // pass the invalid segments to the task side so that their index entries can be removed there
    if (invalidSegments.size() > 0) {
        for (InputSplit split : splits) {
            ((org.apache.carbondata.hadoop.CarbonInputSplit) split).setInvalidSegments(invalidSegments);
            ((org.apache.carbondata.hadoop.CarbonInputSplit) split).setInvalidTimestampRange(invalidTimestampsList);
        }
    }
    // add all splits of streaming
    List<InputSplit> splitsOfStreaming = getSplitsOfStreaming(job, identifier, streamSegments);
    if (!splitsOfStreaming.isEmpty()) {
        splits.addAll(splitsOfStreaming);
    }
    return splits;
}
Also used : LoadMetadataDetails(org.apache.carbondata.core.statusmanager.LoadMetadataDetails) ArrayList(java.util.ArrayList) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) UpdateVO(org.apache.carbondata.core.mutate.UpdateVO) Segment(org.apache.carbondata.core.datamap.Segment) PartitionInfo(org.apache.carbondata.core.metadata.schema.PartitionInfo) InputSplit(org.apache.hadoop.mapreduce.InputSplit) SegmentUpdateStatusManager(org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) BitSet(java.util.BitSet) SegmentStatusManager(org.apache.carbondata.core.statusmanager.SegmentStatusManager) IOException(java.io.IOException) TableProvider(org.apache.carbondata.core.scan.filter.TableProvider) SingleTableProvider(org.apache.carbondata.core.scan.filter.SingleTableProvider) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) SegmentUpdateDetails(org.apache.carbondata.core.mutate.SegmentUpdateDetails) AbsoluteTableIdentifier(org.apache.carbondata.core.metadata.AbsoluteTableIdentifier) Expression(org.apache.carbondata.core.scan.expression.Expression) FilterResolverIntf(org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf)
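
As a usage sketch for getSplits, the driver below sets FileInputFormat.INPUT_DIR (the property the javadoc above names) and asks the input format to plan splits. The path "/store/default/sample_table" is a placeholder, and a real job would also need the database/table identification that getAbsoluteTableIdentifier and getOrCreateCarbonTable read from the configuration; those details are omitted here.

import java.util.List;

import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class GetSplitsSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // INPUT_DIR is how getSplits(...) locates the table path; placeholder location.
        FileInputFormat.addInputPath(job, new Path("/store/default/sample_table"));
        CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
        // Plans one CarbonInputSplit per surviving block, plus any streaming splits.
        List<InputSplit> splits = format.getSplits(job);
        System.out.println("planned splits: " + splits.size());
    }
}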

Example 23 with Segment

use of org.apache.carbondata.core.datamap.Segment in project carbondata by apache.

From the class CarbonTableInputFormat, method getSplitsOfOneSegment:

/**
 * Read data from one segment. Used for the ALTER TABLE ... PARTITION statement.
 * @param job the job context
 * @param targetSegment the segment to read
 * @param oldPartitionIdList the old partition ids from before partitionInfo was changed
 * @return list of InputSplit for the target segment
 */
public List<InputSplit> getSplitsOfOneSegment(JobContext job, String targetSegment, List<Integer> oldPartitionIdList, PartitionInfo partitionInfo) {
    List<Segment> invalidSegments = new ArrayList<>();
    List<UpdateVO> invalidTimestampsList = new ArrayList<>();
    List<Segment> segmentList = new ArrayList<>();
    segmentList.add(new Segment(targetSegment, null));
    setSegmentsToAccess(job.getConfiguration(), segmentList);
    try {
        // process and resolve the expression
        Expression filter = getFilterPredicates(job.getConfiguration());
        CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
        // this will be null in case of corrupt schema file.
        if (null == carbonTable) {
            throw new IOException("Missing/Corrupt schema file for table.");
        }
        carbonTable.processFilterExpression(filter, null, null);
        TableProvider tableProvider = new SingleTableProvider(carbonTable);
        // prune partitions for filter query on partition table
        String partitionIds = job.getConfiguration().get(ALTER_PARTITION_ID);
        // matchedPartitions records partitionIndex, not partitionId
        BitSet matchedPartitions = null;
        if (partitionInfo != null) {
            matchedPartitions = setMatchedPartitions(partitionIds, filter, partitionInfo, oldPartitionIdList);
            if (matchedPartitions != null) {
                if (matchedPartitions.cardinality() == 0) {
                    return new ArrayList<InputSplit>();
                } else if (matchedPartitions.cardinality() == partitionInfo.getNumPartitions()) {
                    matchedPartitions = null;
                }
            }
        }
        FilterResolverIntf filterInterface = carbonTable.resolveFilter(filter, tableProvider);
        // do block filtering and get split
        List<InputSplit> splits = getSplits(job, filterInterface, segmentList, matchedPartitions, partitionInfo, oldPartitionIdList, new SegmentUpdateStatusManager(carbonTable));
        // pass the invalid segment to task side in order to remove index entry in task side
        if (invalidSegments.size() > 0) {
            for (InputSplit split : splits) {
                ((CarbonInputSplit) split).setInvalidSegments(invalidSegments);
                ((CarbonInputSplit) split).setInvalidTimestampRange(invalidTimestampsList);
            }
        }
        return splits;
    } catch (IOException e) {
        throw new RuntimeException("Can't get splits of the target segment ", e);
    }
}
Also used : SegmentUpdateStatusManager(org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet) IOException(java.io.IOException) TableProvider(org.apache.carbondata.core.scan.filter.TableProvider) SingleTableProvider(org.apache.carbondata.core.scan.filter.SingleTableProvider) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) UpdateVO(org.apache.carbondata.core.mutate.UpdateVO) Segment(org.apache.carbondata.core.datamap.Segment) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) Expression(org.apache.carbondata.core.scan.expression.Expression) InputSplit(org.apache.hadoop.mapreduce.InputSplit) FilterResolverIntf(org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf)
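
A sketch of how getSplitsOfOneSegment might be driven from an ALTER TABLE ... PARTITION flow. The segment number "1" and the helper name planOneSegment are placeholders, partitionInfo and oldPartitionIds are assumed to come from the table schema and the pre-alter metadata, and the job is assumed to already carry the table path plus the ALTER_PARTITION_ID property that the method reads.

import java.util.List;

import org.apache.carbondata.core.metadata.schema.PartitionInfo;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;

public class AlterPartitionSplitsSketch {
    // Hypothetical helper: plan splits for the single segment being repartitioned.
    static List<InputSplit> planOneSegment(Job job, PartitionInfo partitionInfo,
            List<Integer> oldPartitionIds) {
        CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
        // "1" is a placeholder for the target segment number.
        return format.getSplitsOfOneSegment(job, "1", oldPartitionIds, partitionInfo);
    }
}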

Aggregations

Segment (org.apache.carbondata.core.datamap.Segment): 23
ArrayList (java.util.ArrayList): 10
IOException (java.io.IOException): 8
LoadMetadataDetails (org.apache.carbondata.core.statusmanager.LoadMetadataDetails): 8
SegmentStatusManager (org.apache.carbondata.core.statusmanager.SegmentStatusManager): 8
AbsoluteTableIdentifier (org.apache.carbondata.core.metadata.AbsoluteTableIdentifier): 6
CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable): 6
CarbonFile (org.apache.carbondata.core.datastore.filesystem.CarbonFile): 5
SegmentUpdateStatusManager (org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager): 5
CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit): 4
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 4
SegmentFileStore (org.apache.carbondata.core.metadata.SegmentFileStore): 3
Expression (org.apache.carbondata.core.scan.expression.Expression): 3
SingleTableProvider (org.apache.carbondata.core.scan.filter.SingleTableProvider): 3
TableProvider (org.apache.carbondata.core.scan.filter.TableProvider): 3
FilterResolverIntf (org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf): 3
BitSet (java.util.BitSet): 2
HashSet (java.util.HashSet): 2
FileFactory (org.apache.carbondata.core.datastore.impl.FileFactory): 2
ICarbonLock (org.apache.carbondata.core.locks.ICarbonLock): 2