Example 6 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonFileInputFormat method getSplits.

/**
 * Get the list of blocks/blocklets and wrap each of them as a CarbonInputSplit.
 * @param job JobContext with the Configuration
 * @return list of CarbonInputSplit
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
    if (null == carbonTable) {
        throw new IOException("Missing/Corrupt schema file for table.");
    }
    AbsoluteTableIdentifier identifier = carbonTable.getAbsoluteTableIdentifier();
    // get all valid segments and set them into the configuration
    // check for externalTable segment (Segment_null)
    // process and resolve the expression
    ReadCommittedScope readCommittedScope;
    if (carbonTable.isTransactionalTable()) {
        readCommittedScope = new LatestFilesReadCommittedScope(identifier.getTablePath() + "/Fact/Part0/Segment_null/", job.getConfiguration());
    } else {
        readCommittedScope = getReadCommittedScope(job.getConfiguration());
        if (readCommittedScope == null) {
            readCommittedScope = new LatestFilesReadCommittedScope(identifier.getTablePath(), job.getConfiguration());
        } else {
            readCommittedScope.setConfiguration(job.getConfiguration());
        }
    }
    // the filter will be null when no filter expression is configured for the query
    IndexFilter filter = getFilterPredicates(job.getConfiguration());
    // if external table segments are found, add them to the list
    List<Segment> externalTableSegments = new ArrayList<>();
    Segment seg;
    if (carbonTable.isTransactionalTable()) {
        // In some cases the SDK writes into the segment path instead of the
        // table path, i.e. inside "Fact/Part0/Segment_null"; the segment is
        // then named "null". Such a table is treated as a transactional table
        // by default and goes through CarbonFileInputFormat. The code below
        // handles that scenario.
        seg = new Segment("null", null, readCommittedScope);
        externalTableSegments.add(seg);
    } else {
        LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
        for (LoadMetadataDetails load : loadMetadataDetails) {
            seg = new Segment(load.getLoadName(), null, readCommittedScope);
            if (fileLists != null) {
                for (Object fileList : fileLists) {
                    String timestamp = CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileList.toString());
                    if (timestamp.equals(seg.getSegmentNo())) {
                        externalTableSegments.add(seg);
                        break;
                    }
                }
            } else {
                externalTableSegments.add(seg);
            }
        }
    }
    List<InputSplit> splits = new ArrayList<>();
    boolean useBlockIndex = job.getConfiguration().getBoolean("filter_blocks", true);
    // useBlockIndex is false when the SDK user has not provided any filter; in
    // that case we do not want to load the block/blocklet index. It is true in
    // all other scenarios.
    if (filter != null) {
        filter.resolve(false);
    }
    if (useBlockIndex) {
        // do block filtering and get split
        splits = getSplits(job, filter, externalTableSegments);
    } else {
        List<CarbonFile> carbonFiles;
        if (null != this.fileLists) {
            carbonFiles = getAllCarbonDataFiles(this.fileLists);
        } else {
            carbonFiles = getAllCarbonDataFiles(carbonTable.getTablePath());
        }
        List<String> allDeleteDeltaFiles = getAllDeleteDeltaFiles(carbonTable.getTablePath());
        for (CarbonFile carbonFile : carbonFiles) {
            // The segment id is set to "null" because the SDK does not write
            // carbondata files segment-wise, so there is no segment name for this load.
            CarbonInputSplit split = new CarbonInputSplit("null", carbonFile.getAbsolutePath(), 0, carbonFile.getLength(), carbonFile.getLocations(), FileFormat.COLUMNAR_V3);
            split.setVersion(ColumnarFormatVersion.V3);
            BlockletDetailInfo info = new BlockletDetailInfo();
            split.setDetailInfo(info);
            info.setBlockSize(carbonFile.getLength());
            info.setVersionNumber(split.getVersion().number());
            info.setUseMinMaxForPruning(false);
            if (CollectionUtils.isNotEmpty(allDeleteDeltaFiles)) {
                split.setDeleteDeltaFiles(getDeleteDeltaFiles(carbonFile.getAbsolutePath(), allDeleteDeltaFiles));
            }
            splits.add(split);
        }
        splits.sort(Comparator.comparing(o -> ((CarbonInputSplit) o).getFilePath()));
    }
    setAllColumnProjectionIfNotConfigured(job, carbonTable);
    return splits;
}
Also used : Segment(org.apache.carbondata.core.index.Segment) BlockletDetailInfo(org.apache.carbondata.core.indexstore.BlockletDetailInfo) TableInfo(org.apache.carbondata.core.metadata.schema.table.TableInfo) FileFactory(org.apache.carbondata.core.datastore.impl.FileFactory) LoadMetadataDetails(org.apache.carbondata.core.statusmanager.LoadMetadataDetails) ArrayList(java.util.ArrayList) CarbonCommonConstants(org.apache.carbondata.core.constants.CarbonCommonConstants) ColumnarFormatVersion(org.apache.carbondata.core.metadata.ColumnarFormatVersion) CollectionUtils(org.apache.commons.collections.CollectionUtils) Configuration(org.apache.hadoop.conf.Configuration) LinkedList(java.util.LinkedList) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) ReadCommittedScope(org.apache.carbondata.core.readcommitter.ReadCommittedScope) InterfaceAudience(org.apache.carbondata.common.annotations.InterfaceAudience) CarbonFile(org.apache.carbondata.core.datastore.filesystem.CarbonFile) InputSplit(org.apache.hadoop.mapreduce.InputSplit) InterfaceStability(org.apache.carbondata.common.annotations.InterfaceStability) CarbonTablePath(org.apache.carbondata.core.util.path.CarbonTablePath) IOException(java.io.IOException) File(java.io.File) Serializable(java.io.Serializable) LatestFilesReadCommittedScope(org.apache.carbondata.core.readcommitter.LatestFilesReadCommittedScope) List(java.util.List) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) FileFormat(org.apache.carbondata.core.statusmanager.FileFormat) AbsoluteTableIdentifier(org.apache.carbondata.core.metadata.AbsoluteTableIdentifier) JobContext(org.apache.hadoop.mapreduce.JobContext) Pattern(java.util.regex.Pattern) IndexFilter(org.apache.carbondata.core.index.IndexFilter) Comparator(java.util.Comparator) CarbonFileFilter(org.apache.carbondata.core.datastore.filesystem.CarbonFileFilter) SchemaReader(org.apache.carbondata.core.metadata.schema.SchemaReader) ArrayUtils(org.apache.commons.lang.ArrayUtils)
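
For context, here is a minimal sketch of driving this getSplits method from a plain Hadoop Job. It assumes the table path and schema have already been placed in the Configuration (that setup is omitted here); the "filter_blocks" key is the same one read near the end of the method above.

import java.util.List;
import org.apache.carbondata.hadoop.api.CarbonFileInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;

public final class SplitPlanner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // false skips block-index pruning, so getSplits enumerates the raw
        // carbondata files instead (the else branch in the method above)
        conf.setBoolean("filter_blocks", false);
        Job job = Job.getInstance(conf);
        CarbonFileInputFormat<Object> format = new CarbonFileInputFormat<>();
        // Job implements JobContext, so it can be passed to getSplits directly
        List<InputSplit> splits = format.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println(split);
        }
    }
}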

Example 7 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonTableInputFormat method updateLoadMetaDataDetailsToSegments.

public void updateLoadMetaDataDetailsToSegments(List<Segment> validSegments, List<org.apache.carbondata.hadoop.CarbonInputSplit> prunedSplits) {
    Map<String, Segment> validSegmentsMap = validSegments.stream().collect(Collectors.toMap(Segment::getSegmentNo, segment -> segment, (e1, e2) -> e1));
    for (CarbonInputSplit split : prunedSplits) {
        Segment segment = split.getSegment();
        if (segment.getLoadMetadataDetails() == null || segment.getReadCommittedScope() == null) {
            if (validSegmentsMap.containsKey(segment.getSegmentNo())) {
                segment.setLoadMetadataDetails(validSegmentsMap.get(segment.getSegmentNo()).getLoadMetadataDetails());
                segment.setReadCommittedScope(validSegmentsMap.get(segment.getSegmentNo()).getReadCommittedScope());
            }
        }
    }
}
Also used : Arrays(java.util.Arrays) BlockLocation(org.apache.hadoop.fs.BlockLocation) FileSystem(org.apache.hadoop.fs.FileSystem) ExplainCollector(org.apache.carbondata.core.profiler.ExplainCollector) FileStatus(org.apache.hadoop.fs.FileStatus) FilterResolverIntf(org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) IndexChooser(org.apache.carbondata.core.index.IndexChooser) ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) CarbonCommonConstants(org.apache.carbondata.core.constants.CarbonCommonConstants) Logger(org.apache.log4j.Logger) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) SegmentUpdateStatusManager(org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) ReadCommittedScope(org.apache.carbondata.core.readcommitter.ReadCommittedScope) UpdateVO(org.apache.carbondata.core.mutate.UpdateVO) TableStatusReadCommittedScope(org.apache.carbondata.core.readcommitter.TableStatusReadCommittedScope) DeprecatedFeatureException(org.apache.carbondata.common.exceptions.DeprecatedFeatureException) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) BlockMappingVO(org.apache.carbondata.core.mutate.data.BlockMappingVO) List(java.util.List) Job(org.apache.hadoop.mapreduce.Job) IndexUtil(org.apache.carbondata.core.index.IndexUtil) CarbonProperties(org.apache.carbondata.core.util.CarbonProperties) CarbonUtil(org.apache.carbondata.core.util.CarbonUtil) Segment(org.apache.carbondata.core.index.Segment) HashMap(java.util.HashMap) StreamFile(org.apache.carbondata.core.stream.StreamFile) FileFactory(org.apache.carbondata.core.datastore.impl.FileFactory) IndexExprWrapper(org.apache.carbondata.core.index.dev.expr.IndexExprWrapper) SegmentStatus(org.apache.carbondata.core.statusmanager.SegmentStatus) LoadMetadataDetails(org.apache.carbondata.core.statusmanager.LoadMetadataDetails) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) StageInputCollector(org.apache.carbondata.core.statusmanager.StageInputCollector) CarbonUpdateUtil(org.apache.carbondata.core.mutate.CarbonUpdateUtil) LinkedList(java.util.LinkedList) LogServiceFactory(org.apache.carbondata.common.logging.LogServiceFactory) IndexStoreManager(org.apache.carbondata.core.index.IndexStoreManager) SegmentStatusManager(org.apache.carbondata.core.statusmanager.SegmentStatusManager) StreamPruner(org.apache.carbondata.core.stream.StreamPruner) CarbonCommonConstantsInternal(org.apache.carbondata.core.constants.CarbonCommonConstantsInternal) SegmentUpdateDetails(org.apache.carbondata.core.mutate.SegmentUpdateDetails) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonTablePath(org.apache.carbondata.core.util.path.CarbonTablePath) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) LatestFilesReadCommittedScope(org.apache.carbondata.core.readcommitter.LatestFilesReadCommittedScope) PartitionSpec(org.apache.carbondata.core.indexstore.PartitionSpec) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) FileFormat(org.apache.carbondata.core.statusmanager.FileFormat) AbsoluteTableIdentifier(org.apache.carbondata.core.metadata.AbsoluteTableIdentifier) JobContext(org.apache.hadoop.mapreduce.JobContext) IndexFilter(org.apache.carbondata.core.index.IndexFilter) TableIndex(org.apache.carbondata.core.index.TableIndex) ArrayUtils(org.apache.commons.lang.ArrayUtils) 
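
The Collectors.toMap call above passes a merge function, (e1, e2) -> e1, so duplicate segment numbers keep the first entry instead of throwing an IllegalStateException. A standalone sketch of that idiom, with made-up segment numbers:

import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

public final class ToMapMergeDemo {
    public static void main(String[] args) {
        List<String> segmentNos = List.of("0", "1", "1", "2");
        // Without the third argument, the duplicate key "1" would throw an
        // IllegalStateException; with it, the first occurrence wins.
        Map<String, String> bySegmentNo = segmentNos.stream()
                .collect(Collectors.toMap(Function.identity(), s -> s, (e1, e2) -> e1));
        System.out.println(bySegmentNo); // {0=0, 1=1, 2=2}
    }
}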

Example 8 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonInputFormat method checkAndAddImplicitExpression.

/**
 * Create an implicit expression and set it as the right child of the given
 * expression.
 */
private void checkAndAddImplicitExpression(IndexFilter indexFilter, InputSplit inputSplit, boolean hasColumnDrift) {
    if (inputSplit instanceof CarbonMultiBlockSplit) {
        CarbonMultiBlockSplit split = (CarbonMultiBlockSplit) inputSplit;
        List<CarbonInputSplit> splits = split.getAllSplits();
        // iterate over all the splits and create a block to blocklet mapping
        Map<String, Set<Integer>> blockIdToBlockletIdMapping = new HashMap<>();
        for (CarbonInputSplit carbonInputSplit : splits) {
            Set<Integer> validBlockletIds = carbonInputSplit.getValidBlockletIds();
            if (null != validBlockletIds && !validBlockletIds.isEmpty()) {
                String uniqueBlockPath = carbonInputSplit.getFilePath();
                String shortBlockPath = CarbonTablePath.getShortBlockId(uniqueBlockPath.substring(uniqueBlockPath.lastIndexOf("/Part") + 1));
                blockIdToBlockletIdMapping.put(shortBlockPath, validBlockletIds);
            }
        }
        if (!blockIdToBlockletIdMapping.isEmpty()) {
            // create implicit expression and set as right child
            FilterUtil.createImplicitExpressionAndSetAsRightChild(indexFilter.getExpression(), blockIdToBlockletIdMapping);
            // then the filter expression will be processed during QueryModelBuilder.build()
            if (hasColumnDrift) {
                indexFilter.processFilterExpression();
            }
        }
    }
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) CarbonMultiBlockSplit(org.apache.carbondata.hadoop.CarbonMultiBlockSplit) HashMap(java.util.HashMap) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit)
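
The path handling is the subtle part of this method: lastIndexOf("/Part") + 1 keeps everything from the partition directory onward, and only then is the result compacted by CarbonTablePath.getShortBlockId. A sketch of just the substring step, using a made-up file path:

public final class ShortBlockPathDemo {
    public static void main(String[] args) {
        // hypothetical absolute path of a carbondata file
        String uniqueBlockPath =
                "/tmp/store/db/tbl/Fact/Part0/Segment_0/part-0-0_batchno0-0-123.carbondata";
        // keep the path from the partition directory onward; the search string
        // "/Part" is case-sensitive, so the lowercase "part-0-0" file name
        // does not match it
        String tail = uniqueBlockPath.substring(uniqueBlockPath.lastIndexOf("/Part") + 1);
        System.out.println(tail);
        // prints: Part0/Segment_0/part-0-0_batchno0-0-123.carbondata
    }
}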

Example 9 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonInputSplitTaskInfo method getLocations.

@Override
public String[] getLocations() {
    Set<String> locations = new HashSet<>();
    for (CarbonInputSplit splitInfo : carbonBlockInfoList) {
        try {
            locations.addAll(Arrays.asList(splitInfo.getLocations()));
        } catch (IOException e) {
            throw new RuntimeException("Failed to get location of split: " + splitInfo, e);
        }
    }
    // prefer the node(s) hosting the most blocks over the raw location set
    // collected above
    List<String> nodes = CarbonInputSplitTaskInfo.maxNoNodes(carbonBlockInfoList);
    return nodes.toArray(new String[nodes.size()]);
}
Also used : CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) IOException(java.io.IOException) HashSet(java.util.HashSet)

Example 10 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonInputSplitTaskInfo method maxNoNodes.

/**
 * Find the node that hosts the maximum number of blocks. If every node hosts
 * the same number of blocks, the complete node list is returned instead.
 */
public static List<String> maxNoNodes(List<CarbonInputSplit> splitList) {
    boolean useIndex = true;
    Integer maxOccurrence = 0;
    String maxNode = null;
    Map<String, Integer> nodeAndOccurrenceMapping = new TreeMap<>();
    // populate the map of node and number of occurrences of that node.
    for (CarbonInputSplit split : splitList) {
        try {
            for (String node : split.getLocations()) {
                // count how many splits each node hosts
                nodeAndOccurrenceMapping.merge(node, 1, Integer::sum);
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to get location of split: " + split, e);
        }
    }
    Integer previousValueOccurrence = null;
    // check which node is occurred maximum times.
    for (Map.Entry<String, Integer> entry : nodeAndOccurrenceMapping.entrySet()) {
        // finding the maximum node.
        if (entry.getValue() > maxOccurrence) {
            maxOccurrence = entry.getValue();
            maxNode = entry.getKey();
        }
        // first time scenario. initializing the previous value.
        if (null == previousValueOccurrence) {
            previousValueOccurrence = entry.getValue();
        } else {
            // we need to return complete list instead of max node.
            if (!Objects.equals(previousValueOccurrence, entry.getValue())) {
                useIndex = false;
            }
        }
    }
    // if all the nodes have equal occurrence then returning the complete key set.
    if (useIndex) {
        return new ArrayList<>(nodeAndOccurrenceMapping.keySet());
    }
    // if any max node is found then returning the max node.
    List<String> node = new ArrayList<>(1);
    node.add(maxNode);
    return node;
}
Also used : ArrayList(java.util.ArrayList) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Map(java.util.Map)
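
The tally-and-argmax pattern above, extracted to plain strings so it can run standalone (the node names are made up). Note the tie rule: if every node appears equally often, the whole node list is returned.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public final class MaxNodeDemo {
    static List<String> maxNodes(List<String> locations) {
        Map<String, Integer> counts = new TreeMap<>();
        for (String node : locations) {
            counts.merge(node, 1, Integer::sum); // occurrences per node
        }
        int maxOccurrence = 0;
        String maxNode = null;
        boolean allEqual = true;
        Integer previous = null;
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            if (entry.getValue() > maxOccurrence) {
                maxOccurrence = entry.getValue();
                maxNode = entry.getKey();
            }
            if (previous != null && !previous.equals(entry.getValue())) {
                allEqual = false; // counts differ, so there is a real maximum
            }
            previous = entry.getValue();
        }
        if (allEqual) {
            return new ArrayList<>(counts.keySet()); // full tie: return all nodes
        }
        List<String> result = new ArrayList<>(1);
        result.add(maxNode);
        return result;
    }

    public static void main(String[] args) {
        System.out.println(maxNodes(List.of("n1", "n2", "n1"))); // [n1]
        System.out.println(maxNodes(List.of("n1", "n2")));       // [n1, n2]
    }
}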

Aggregations

CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit) 33
ArrayList (java.util.ArrayList) 17
IOException (java.io.IOException) 15
InputSplit (org.apache.hadoop.mapreduce.InputSplit) 10
CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable) 8
LinkedList (java.util.LinkedList) 6
CarbonMultiBlockSplit (org.apache.carbondata.hadoop.CarbonMultiBlockSplit) 6
IndexFilter (org.apache.carbondata.core.index.IndexFilter) 5
CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath) 5
HashMap (java.util.HashMap) 4
HashSet (java.util.HashSet) 4
List (java.util.List) 4
TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo) 4
PartitionSpec (org.apache.carbondata.core.indexstore.PartitionSpec) 4
LoadMetadataDetails (org.apache.carbondata.core.statusmanager.LoadMetadataDetails) 4
SegmentUpdateStatusManager (org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) 4
CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat) 4
Configuration (org.apache.hadoop.conf.Configuration) 4
Path (org.apache.hadoop.fs.Path) 4
Gson (com.google.gson.Gson) 3