
Example 16 with Segment

use of org.apache.carbondata.core.index.Segment in project carbondata by apache.

the class CarbonTableInputFormat method getFilteredSegment.

/**
 * Return the segment list obtained by filtering the valid segments against the
 * segments the user set via `INPUT_SEGMENT_NUMBERS` in the job configuration.
 */
private List<Segment> getFilteredSegment(JobContext job, List<Segment> validSegments, boolean validationRequired, ReadCommittedScope readCommittedScope) throws IOException {
    Segment[] segmentsToAccess = getSegmentsToAccess(job, readCommittedScope);
    if (segmentsToAccess.length == 0 || segmentsToAccess[0].getSegmentNo().equalsIgnoreCase("*")) {
        return validSegments;
    }
    Map<String, Segment> segmentToAccessMap = Arrays.stream(segmentsToAccess).collect(Collectors.toMap(Segment::getSegmentNo, segment -> segment, (e1, e2) -> e1));
    Map<String, Segment> filteredSegmentToAccess = new HashMap<>(segmentToAccessMap.size());
    for (Segment validSegment : validSegments) {
        String segmentNoOfValidSegment = validSegment.getSegmentNo();
        if (segmentToAccessMap.containsKey(segmentNoOfValidSegment)) {
            Segment segmentToAccess = segmentToAccessMap.get(segmentNoOfValidSegment);
            if (segmentToAccess.getSegmentFileName() != null && validSegment.getSegmentFileName() == null) {
                validSegment = segmentToAccess;
            }
            filteredSegmentToAccess.put(segmentNoOfValidSegment, validSegment);
        }
    }
    if (!validationRequired && filteredSegmentToAccess.size() != segmentToAccessMap.size()) {
        for (Segment segment : segmentToAccessMap.values()) {
            boolean isSegmentValid = true;
            LoadMetadataDetails[] segmentList = readCommittedScope.getSegmentList();
            for (LoadMetadataDetails validSegment : segmentList) {
                if (validSegment.getLoadName().equals(segment.getSegmentNo()) && (validSegment.getSegmentStatus().equals(SegmentStatus.MARKED_FOR_DELETE) || validSegment.getSegmentStatus().equals(SegmentStatus.COMPACTED))) {
                    isSegmentValid = false;
                    break;
                }
            }
            if (isSegmentValid && !filteredSegmentToAccess.containsKey(segment.getSegmentNo())) {
                filteredSegmentToAccess.put(segment.getSegmentNo(), segment);
            }
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Segments ignored are : " + Arrays.toString(Sets.difference(new HashSet<>(filteredSegmentToAccess.values()), new HashSet<>(segmentToAccessMap.values())).toArray()));
    }
    return new ArrayList<>(filteredSegmentToAccess.values());
}
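
For reference, the heart of this method is a set intersection between the user-requested segment numbers and the currently valid segments, with a "*" wildcard short-circuit. Below is a minimal standalone sketch of that core logic, using plain strings in place of CarbonData's Segment objects; all names here are illustrative, not CarbonData API.

import java.util.*;
import java.util.stream.Collectors;

public class SegmentFilterSketch {
    // Returns validSegments unchanged for the "*" wildcard, otherwise the
    // intersection of the requested segment numbers with the valid ones.
    static List<String> filter(List<String> requested, List<String> validSegments) {
        if (requested.isEmpty() || requested.get(0).equalsIgnoreCase("*")) {
            return validSegments;
        }
        Set<String> requestedSet = new HashSet<>(requested);
        return validSegments.stream()
            .filter(requestedSet::contains)
            .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        List<String> valid = Arrays.asList("0", "1", "2");
        System.out.println(filter(Arrays.asList("*"), valid));      // [0, 1, 2]
        System.out.println(filter(Arrays.asList("1", "5"), valid)); // [1] ("5" is ignored)
    }
}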

Example 17 with Segment

use of org.apache.carbondata.core.index.Segment in project carbondata by apache.

the class CarbonTableInputFormat method getSplits.

/**
 * Get the list of blocks/blocklets and wrap them as CarbonInputSplits.
 * @param job JobContext with the job Configuration
 * @return list of CarbonInputSplit
 * @throws IOException if the schema is missing or segment metadata cannot be read
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    carbonTable = getOrCreateCarbonTable(job.getConfiguration());
    if (null == carbonTable) {
        throw new IOException("Missing/Corrupt schema file for table.");
    }
    // global dictionary is not supported since 2.0
    if (carbonTable.getTableInfo().getFactTable().getTableProperties().containsKey(CarbonCommonConstants.DICTIONARY_INCLUDE)) {
        DeprecatedFeatureException.globalDictNotSupported();
    }
    List<InputSplit> splits = new LinkedList<>();
    if (CarbonProperties.isQueryStageInputEnabled()) {
        // collect splits from stage files so they are included in the query
        try {
            List<InputSplit> stageInputSplits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration());
            splits.addAll(stageInputSplits);
        } catch (ExecutionException | InterruptedException e) {
            LOG.error("Failed to create input splits from stage files", e);
            throw new IOException(e);
        }
    }
    this.readCommittedScope = getReadCommitted(job, carbonTable.getAbsoluteTableIdentifier());
    LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
    String updateDeltaVersion = job.getConfiguration().get(UPDATE_DELTA_VERSION);
    SegmentUpdateStatusManager updateStatusManager;
    if (updateDeltaVersion != null) {
        updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails, updateDeltaVersion);
    } else {
        updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails);
    }
    List<String> invalidSegmentIds = new ArrayList<>();
    List<Segment> streamSegments = null;
    // get all valid segments and set them into the configuration
    SegmentStatusManager segmentStatusManager = new SegmentStatusManager(carbonTable.getAbsoluteTableIdentifier(), readCommittedScope.getConfiguration());
    SegmentStatusManager.ValidAndInvalidSegmentsInfo segments = segmentStatusManager.getValidAndInvalidSegments(carbonTable.isMV(), loadMetadataDetails, this.readCommittedScope);
    if (getValidateSegmentsToAccess(job.getConfiguration())) {
        List<Segment> validSegments = segments.getValidSegments();
        streamSegments = segments.getStreamSegments();
        streamSegments = getFilteredSegment(job, streamSegments, true, readCommittedScope);
        if (validSegments.size() == 0) {
            splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
            return splits;
        }
        List<Segment> filteredSegmentToAccess = getFilteredSegment(job, segments.getValidSegments(), true, readCommittedScope);
        if (filteredSegmentToAccess.size() == 0) {
            splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
            return splits;
        } else {
            setSegmentsToAccess(job.getConfiguration(), filteredSegmentToAccess);
        }
        // remove entry in the segment index if there are invalid segments
        for (Segment segment : segments.getInvalidSegments()) {
            invalidSegmentIds.add(segment.getSegmentNo());
        }
        if (invalidSegmentIds.size() > 0) {
            IndexStoreManager.getInstance().clearInvalidSegments(getOrCreateCarbonTable(job.getConfiguration()), invalidSegmentIds);
        }
    }
    List<Segment> validAndInProgressSegments = new ArrayList<>(segments.getValidSegments());
    // Also include in-progress segments: in the Secondary Index table load case,
    // data is read from segments that are still in progress.
    validAndInProgressSegments.addAll(segments.getListOfInProgressSegments());
    List<Segment> segmentToAccess = getFilteredSegment(job, validAndInProgressSegments, false, readCommittedScope);
    String segmentFileName = job.getConfiguration().get(CarbonCommonConstants.CURRENT_SEGMENTFILE);
    if (segmentFileName != null) {
        // each segment has only one segment file ("current.segment")
        segmentToAccess.get(0).setSegmentFileName(segmentFileName + CarbonTablePath.SEGMENT_EXT);
    }
    // process and resolve the expression
    IndexFilter indexFilter = getFilterPredicates(job.getConfiguration());
    if (indexFilter != null) {
        indexFilter.resolve(false);
    }
    // do block filtering and get split
    List<InputSplit> batchSplits = getSplits(job, indexFilter, segmentToAccess, updateStatusManager, segments.getInvalidSegments());
    splits.addAll(batchSplits);
    // add all splits of streaming
    List<InputSplit> splitsOfStreaming = getSplitsOfStreaming(job, streamSegments, carbonTable);
    if (!splitsOfStreaming.isEmpty()) {
        splits.addAll(splitsOfStreaming);
    }
    return splits;
}
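
For context, a driver obtains these splits through the standard Hadoop InputFormat contract. The following is a hedged sketch of that call, assuming the CarbonData 2.x package layout and that the table schema and path have already been registered in the Configuration by the usual CarbonData setup code (omitted here); without that setup, getSplits would fail with the "Missing/Corrupt schema file" error shown above.

import java.io.IOException;
import java.util.List;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;

public class SplitPlanningSketch {
    public static void main(String[] args) throws IOException {
        // Sketch only: real jobs must first register the table schema/path in
        // the Configuration via CarbonData's setup APIs.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
        List<InputSplit> splits = format.getSplits(job);
        System.out.println("Planned " + splits.size() + " splits");
    }
}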

Example 18 with Segment

use of org.apache.carbondata.core.index.Segment in project carbondata by apache.

the class CarbonTableInputFormat method getBlockRowCount.

/**
 * Get the row count of the Block and mapping of segment and Block count.
 */
public BlockMappingVO getBlockRowCount(Job job, CarbonTable table, List<PartitionSpec> partitions, boolean isUpdateFlow) throws IOException {
    // The normal query flow goes through CarbonInputFormat#getPrunedBlocklets, which
    // initializes the pruning info for the queried table. A count(*) query without a filter
    // uses a different plan, so no pruning info is initialized, and an exception would occur
    // while setting pruning info when the default index prunes with a null filter.
    // Since such a query yields no useful block/blocklet pruning information (there is
    // effectively no pruning), the explain collector is disabled here.
    ExplainCollector.remove();
    AbsoluteTableIdentifier identifier = table.getAbsoluteTableIdentifier();
    ReadCommittedScope readCommittedScope = getReadCommitted(job, identifier);
    LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
    SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(table, loadMetadataDetails);
    SegmentStatusManager.ValidAndInvalidSegmentsInfo allSegments = new SegmentStatusManager(identifier, readCommittedScope.getConfiguration()).getValidAndInvalidSegments(table.isMV(), loadMetadataDetails, readCommittedScope);
    Map<String, Long> blockRowCountMapping = new HashMap<>();
    Map<String, Long> segmentAndBlockCountMapping = new HashMap<>();
    Map<String, String> blockToSegmentMapping = new HashMap<>();
    // TODO: currently only batch segment is supported, add support for streaming table
    List<Segment> filteredSegment = getFilteredSegment(job, allSegments.getValidSegments(), false, readCommittedScope);
    boolean isIUDTable = (updateStatusManager.getUpdateStatusDetails().length != 0);
    /* In the select * flow, getSplits() clears the segment map when a segment needs
    refreshing; the same is needed for the select count(*) flow. For a NonTransactional
    table, one reason for a segment refresh is the following scenario: the SDK writes one
    set of files with a UUID and can write again with the same UUID, so the segment must
    be refreshed for the latest file content to be reflected in the new count. */
    List<String> toBeCleanedSegments = new ArrayList<>();
    for (Segment segment : filteredSegment) {
        boolean refreshNeeded = IndexStoreManager.getInstance().getTableSegmentRefresher(getOrCreateCarbonTable(job.getConfiguration())).isRefreshNeeded(segment, SegmentUpdateStatusManager.getInvalidTimestampRange(segment.getLoadMetadataDetails()));
        if (refreshNeeded) {
            toBeCleanedSegments.add(segment.getSegmentNo());
        }
    }
    for (Segment segment : allSegments.getInvalidSegments()) {
        // remove entry in the segment index if there are invalid segments
        toBeCleanedSegments.add(segment.getSegmentNo());
    }
    if (toBeCleanedSegments.size() > 0) {
        IndexStoreManager.getInstance().clearInvalidSegments(getOrCreateCarbonTable(job.getConfiguration()), toBeCleanedSegments);
    }
    IndexExprWrapper indexExprWrapper = IndexChooser.getDefaultIndex(getOrCreateCarbonTable(job.getConfiguration()), null);
    IndexUtil.loadIndexes(table, indexExprWrapper, filteredSegment);
    if (isIUDTable || isUpdateFlow) {
        Map<String, Long> blockletToRowCountMap = new HashMap<>();
        if (CarbonProperties.getInstance().isDistributedPruningEnabled(table.getDatabaseName(), table.getTableName())) {
            try {
                List<ExtendedBlocklet> extendedBlocklets = getDistributedBlockRowCount(table, partitions, filteredSegment, allSegments.getInvalidSegments(), toBeCleanedSegments, job.getConfiguration());
                for (ExtendedBlocklet blocklet : extendedBlocklets) {
                    String filePath = blocklet.getFilePath().replace("\\", "/");
                    String blockName = filePath.substring(filePath.lastIndexOf("/") + 1);
                    blockletToRowCountMap.put(blocklet.getSegmentId() + "," + blockName, blocklet.getRowCount());
                }
            } catch (Exception e) {
                // Distributed pruning failed: rethrow if fallback is disabled,
                // otherwise fall back to driver-side pruning via the default index.
                if (CarbonProperties.getInstance().isFallBackDisabled()) {
                    throw e;
                }
                TableIndex defaultIndex = IndexStoreManager.getInstance().getDefaultIndex(table);
                blockletToRowCountMap.putAll(defaultIndex.getBlockRowCount(filteredSegment, partitions, defaultIndex));
            }
        } else {
            TableIndex defaultIndex = IndexStoreManager.getInstance().getDefaultIndex(table);
            blockletToRowCountMap.putAll(defaultIndex.getBlockRowCount(filteredSegment, partitions, defaultIndex));
        }
        // the key is (segmentId + "," + blockName) and the value is the row count of that blocklet
        for (Map.Entry<String, Long> eachBlocklet : blockletToRowCountMap.entrySet()) {
            String[] segmentIdAndPath = eachBlocklet.getKey().split(",", 2);
            String segmentId = segmentIdAndPath[0];
            String blockName = segmentIdAndPath[1];
            long rowCount = eachBlocklet.getValue();
            String key = CarbonUpdateUtil.getSegmentBlockNameKey(segmentId, blockName, table.isHivePartitionTable());
            // if block is invalid then don't add the count
            SegmentUpdateDetails details = updateStatusManager.getDetailsForABlock(key);
            if (null == details || !CarbonUpdateUtil.isBlockInvalid(details.getSegmentStatus())) {
                Long blockCount = blockRowCountMapping.get(key);
                if (blockCount == null) {
                    blockCount = 0L;
                    Long count = segmentAndBlockCountMapping.get(segmentId);
                    if (count == null) {
                        count = 0L;
                    }
                    segmentAndBlockCountMapping.put(segmentId, count + 1);
                }
                blockToSegmentMapping.put(key, segmentId);
                blockCount += rowCount;
                blockRowCountMapping.put(key, blockCount);
            }
        }
    } else {
        long totalRowCount;
        if (CarbonProperties.getInstance().isDistributedPruningEnabled(table.getDatabaseName(), table.getTableName())) {
            totalRowCount = getDistributedCount(table, partitions, filteredSegment, job.getConfiguration());
        } else {
            TableIndex defaultIndex = IndexStoreManager.getInstance().getDefaultIndex(table);
            totalRowCount = defaultIndex.getRowCount(filteredSegment, partitions, defaultIndex);
        }
        blockRowCountMapping.put(CarbonCommonConstantsInternal.ROW_COUNT, totalRowCount);
    }
    BlockMappingVO blockMappingVO = new BlockMappingVO(blockRowCountMapping, segmentAndBlockCountMapping);
    blockMappingVO.setBlockToSegmentMapping(blockToSegmentMapping);
    return blockMappingVO;
}
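
The IUD branch above folds per-blocklet row counts into per-block totals and per-segment block counts keyed by segmentId + "," + blockName. Here is a minimal standalone sketch of that aggregation using plain maps instead of CarbonData types; the file names and the block-key format are illustrative.

import java.util.HashMap;
import java.util.Map;

public class BlockRowCountSketch {
    public static void main(String[] args) {
        Map<String, Long> blockletToRowCount = new HashMap<>();
        blockletToRowCount.put("0,part-0-0.carbondata", 100L);
        blockletToRowCount.put("0,part-0-1.carbondata", 50L);
        blockletToRowCount.put("1,part-1-0.carbondata", 75L);

        Map<String, Long> blockRowCount = new HashMap<>();     // block key -> total rows
        Map<String, Long> segmentBlockCount = new HashMap<>(); // segment id -> block count
        for (Map.Entry<String, Long> entry : blockletToRowCount.entrySet()) {
            String[] idAndName = entry.getKey().split(",", 2);
            String segmentId = idAndName[0];
            // stand-in for CarbonUpdateUtil.getSegmentBlockNameKey
            String blockKey = segmentId + "/" + idAndName[1];
            if (!blockRowCount.containsKey(blockKey)) {
                segmentBlockCount.merge(segmentId, 1L, Long::sum);
            }
            blockRowCount.merge(blockKey, entry.getValue(), Long::sum);
        }
        System.out.println(blockRowCount);     // per-block row totals
        System.out.println(segmentBlockCount); // {0=2, 1=1}
    }
}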

Example 19 with Segment

use of org.apache.carbondata.core.index.Segment in project carbondata by apache.

the class CarbonDataMergerUtil method identifySegmentsToBeMergedBasedOnSegCount.

/**
 * Identify the segments to be merged based on segment count; a segment whose data size
 * exceeds the minor compaction size threshold will not be compacted.
 *
 * @param compactionSize minor compaction size threshold in MB (a value of 0 or less
 *        disables the size check)
 * @param listOfSegmentsAfterPreserve the list of segments after
 *        preserve and before filtering by minor compaction level
 * @param tblProps table properties, which may override the system-level thresholds
 * @param carbonLoadModel load model used to resolve the CarbonTable
 * @return the list of segments to be merged after filtering by minor compaction level
 */
private static List<LoadMetadataDetails> identifySegmentsToBeMergedBasedOnSegCount(long compactionSize, List<LoadMetadataDetails> listOfSegmentsAfterPreserve, Map<String, String> tblProps, CarbonLoadModel carbonLoadModel) throws IOException {
    List<LoadMetadataDetails> mergedSegments = new ArrayList<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
    List<LoadMetadataDetails> unMergedSegments = new ArrayList<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
    int[] noOfSegmentLevelsCount = CarbonProperties.getInstance().getCompactionSegmentLevelCount();
    // override the system-level option with the table-level option if one exists
    if (tblProps.containsKey(CarbonCommonConstants.TABLE_COMPACTION_LEVEL_THRESHOLD)) {
        noOfSegmentLevelsCount = CarbonProperties.getInstance().getIntArray(tblProps.get(CarbonCommonConstants.TABLE_COMPACTION_LEVEL_THRESHOLD));
        if (0 == noOfSegmentLevelsCount.length) {
            noOfSegmentLevelsCount = CarbonProperties.getInstance().getCompactionSegmentLevelCount();
        }
    }
    int level1Size = 0;
    int level2Size = 0;
    int size = noOfSegmentLevelsCount.length;
    if (size >= 2) {
        level1Size = noOfSegmentLevelsCount[0];
        level2Size = noOfSegmentLevelsCount[1];
        /*
         Example: if the segments are 0.1, 2, 3 and the threshold is 2,1, then during the
         second compaction mergeCounter becomes 1, the mergeCounter == level2Size check
         passes, and mergedSegments would contain only 0.1. Since a single segment (0.1)
         cannot be compacted on its own, no compaction would happen. So change the
         second-level threshold to 0 if it is 1.
        */
        level2Size = level2Size == 1 ? 0 : level2Size;
    } else if (size == 1) {
        level1Size = noOfSegmentLevelsCount[0];
    }
    int unMergeCounter = 0;
    int mergeCounter = 0;
    CarbonTable carbonTable = carbonLoadModel.getCarbonDataLoadSchema().getCarbonTable();
    // check the size of each segment, summed across partitions
    for (LoadMetadataDetails segment : listOfSegmentsAfterPreserve) {
        long sizeOfOneSegmentAcrossPartition;
        if (segment.getSegmentFile() != null) {
            // If LoadMetadataDetails already carries the data size, there is no need to
            // calculate it from index files; otherwise read the index file and calculate size.
            if (!StringUtils.isEmpty(segment.getDataSize())) {
                sizeOfOneSegmentAcrossPartition = Long.parseLong(segment.getDataSize());
            } else {
                sizeOfOneSegmentAcrossPartition = CarbonUtil.getSizeOfSegment(carbonTable.getTablePath(), new Segment(segment.getLoadName(), segment.getSegmentFile()));
            }
        } else {
            sizeOfOneSegmentAcrossPartition = getSizeOfSegment(carbonTable.getTablePath(), segment.getLoadName());
        }
        // skip streaming segments and, when a threshold is configured, segments whose
        // size already exceeds the minor compaction size threshold
        if (segment.getSegmentStatus() == SegmentStatus.STREAMING || segment.getSegmentStatus() == SegmentStatus.STREAMING_FINISH || (compactionSize > 0 && sizeOfOneSegmentAcrossPartition / (1024 * 1024) >= compactionSize)) {
            continue;
        }
        String segName = segment.getLoadName();
        // if a segment is already merged 2 or more levels (possible with custom compaction),
        // it must be excluded from minor compaction.
        // if a segment has been major compacted, it should not be considered for minor compaction.
        boolean isMoreThanOrEqualsToLevel2 = false;
        if (segName.contains(".")) {
            if (Integer.parseInt(segName.substring(segName.lastIndexOf(".") + 1)) >= 2) {
                isMoreThanOrEqualsToLevel2 = true;
            }
        }
        if (isMoreThanOrEqualsToLevel2 || (segment.isMajorCompacted() != null && segment.isMajorCompacted().equalsIgnoreCase("true"))) {
            continue;
        }
        // check if the segment is merged or not, consider only non-compacted segments for merge.
        if ((segment.getSegmentStatus() == SegmentStatus.SUCCESS) || (segment.getSegmentStatus() == SegmentStatus.LOAD_PARTIAL_SUCCESS)) {
            if (!isMergedSegment(segName)) {
                // if it is an unmerged segment then increment counter
                unMergeCounter++;
                unMergedSegments.add(segment);
                if (unMergeCounter == (level1Size)) {
                    return unMergedSegments;
                }
            } else {
                mergeCounter++;
                mergedSegments.add(segment);
                if (mergeCounter == (level2Size)) {
                    return mergedSegments;
                }
            }
        }
    }
    return new ArrayList<>(0);
}
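
The level check in this method relies on CarbonData's segment naming convention, in which a compacted segment carries its merge level after a dot ("0.1" after one minor compaction, "0.2" after two). A small standalone sketch of that exclusion rule follows; the method name is illustrative.

public class CompactionLevelSketch {
    // True when the segment name indicates it was already merged two or more
    // levels (e.g. "4.2") and should be excluded from minor compaction.
    static boolean isLevel2OrAbove(String segName) {
        int dot = segName.lastIndexOf('.');
        return dot >= 0 && Integer.parseInt(segName.substring(dot + 1)) >= 2;
    }

    public static void main(String[] args) {
        System.out.println(isLevel2OrAbove("3"));   // false: never compacted
        System.out.println(isLevel2OrAbove("0.1")); // false: one minor compaction
        System.out.println(isLevel2OrAbove("0.2")); // true: excluded from minor compaction
    }
}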

Example 20 with Segment

use of org.apache.carbondata.core.index.Segment in project carbondata by apache.

the class CarbonDataMergerUtil method identifySegmentsToBeMergedBasedOnSize.

/**
 * Identify the segments to be merged based on the Size in case of Major compaction.
 *
 * @param compactionSize major compaction size threshold in MB
 * @param listOfSegmentsAfterPreserve  the segments list after
 *        preserving the configured number of latest loads
 * @param carbonLoadModel carbon load model
 * @return the list of segments that need to be merged
 *         based on the Size in case of Major compaction
 */
private static List<LoadMetadataDetails> identifySegmentsToBeMergedBasedOnSize(long compactionSize, List<LoadMetadataDetails> listOfSegmentsAfterPreserve, CarbonLoadModel carbonLoadModel) throws IOException {
    List<LoadMetadataDetails> segmentsToBeMerged = new ArrayList<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
    CarbonTable carbonTable = carbonLoadModel.getCarbonDataLoadSchema().getCarbonTable();
    // total length
    long totalLength = 0;
    // check the size of each segment, summed across partitions
    for (LoadMetadataDetails segment : listOfSegmentsAfterPreserve) {
        // compaction should skip streaming segments
        if (segment.getSegmentStatus() == SegmentStatus.STREAMING || segment.getSegmentStatus() == SegmentStatus.STREAMING_FINISH) {
            continue;
        }
        String segId = segment.getLoadName();
        // size of one segment summed across partitions
        long sizeOfOneSegmentAcrossPartition;
        if (segment.getSegmentFile() != null) {
            // If LoadMetadataDetails already carries the data size, there is no need to
            // calculate it from index files; otherwise read the index file and calculate size.
            if (!StringUtils.isEmpty(segment.getDataSize())) {
                sizeOfOneSegmentAcrossPartition = Long.parseLong(segment.getDataSize());
            } else {
                sizeOfOneSegmentAcrossPartition = CarbonUtil.getSizeOfSegment(carbonTable.getTablePath(), new Segment(segId, segment.getSegmentFile()));
            }
        } else {
            sizeOfOneSegmentAcrossPartition = getSizeOfSegment(carbonTable.getTablePath(), segId);
        }
        // if the size of a segment is greater than the major compaction size, then ignore it
        if (sizeOfOneSegmentAcrossPartition > (compactionSize * 1024 * 1024)) {
            // if already 2 segments have been found for merging then stop scan here and merge.
            if (segmentsToBeMerged.size() > 1) {
                break;
            } else {
                // if only one segment is found then remove the earlier one in list.
                // reset the total length to 0.
                segmentsToBeMerged = new ArrayList<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
                totalLength = 0;
                continue;
            }
        }
        totalLength += sizeOfOneSegmentAcrossPartition;
        // keep adding segments while the accumulated size stays below the major compaction size
        if (totalLength < (compactionSize * 1024 * 1024)) {
            segmentsToBeMerged.add(segment);
        } else {
            // if already 2 segments have been found for merging then stop scan here and merge.
            if (segmentsToBeMerged.size() > 1) {
                break;
            } else {
                // if only one segment is found then remove the earlier one in list and put this.
                // reset the total length to the current identified segment.
                segmentsToBeMerged = new ArrayList<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
                segmentsToBeMerged.add(segment);
                totalLength = sizeOfOneSegmentAcrossPartition;
            }
        }
    }
    return segmentsToBeMerged;
}
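
To make the selection rule concrete, here is a toy re-implementation over (name, sizeMB) pairs that mirrors the scan-and-reset logic above: an oversized segment either ends the scan (if two candidates were already found) or restarts it. Plain Java collections stand in for CarbonData types, and the sizes are illustrative.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class MajorCompactionSketch {
    static List<String> select(Map<String, Long> segmentSizesMb, long compactionSizeMb) {
        List<String> merged = new ArrayList<>();
        long total = 0;
        for (Map.Entry<String, Long> seg : segmentSizesMb.entrySet()) {
            long size = seg.getValue();
            if (size > compactionSizeMb) {      // oversized segment
                if (merged.size() > 1) break;   // enough candidates found: stop and merge
                merged = new ArrayList<>();     // otherwise restart the scan
                total = 0;
                continue;
            }
            total += size;
            if (total < compactionSizeMb) {
                merged.add(seg.getKey());
            } else {
                if (merged.size() > 1) break;
                merged = new ArrayList<>();     // restart from the current segment
                merged.add(seg.getKey());
                total = size;
            }
        }
        return merged;
    }

    public static void main(String[] args) {
        Map<String, Long> sizes = new LinkedHashMap<>();
        sizes.put("0", 40L);
        sizes.put("1", 30L);
        sizes.put("2", 500L); // exceeds the 100 MB threshold, ends the scan
        sizes.put("3", 20L);
        System.out.println(select(sizes, 100)); // [0, 1]
    }
}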
