use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonInputSplitTaskInfo method getLocations.
@Override
public String[] getLocations() {
  // collect the distinct locations of all splits belonging to this task,
  // surfacing any IO failure while reading split locations
  Set<String> locations = new HashSet<>();
  for (CarbonInputSplit splitInfo : carbonBlockInfoList) {
    try {
      locations.addAll(Arrays.asList(splitInfo.getLocations()));
    } catch (IOException e) {
      throw new RuntimeException("Fail to get location of split: " + splitInfo, e);
    }
  }
  // return only the node(s) hosting the maximum number of blocks for this task
  List<String> nodes = CarbonInputSplitTaskInfo.maxNoNodes(carbonBlockInfoList);
  return nodes.toArray(new String[nodes.size()]);
}
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonStreamInputFormatTest method buildInputSplit.
private InputSplit buildInputSplit() throws IOException {
  // wrap a single (empty) CarbonInputSplit into a multi-block split pinned to localhost
  CarbonInputSplit carbonInputSplit = new CarbonInputSplit();
  List<CarbonInputSplit> splitList = new ArrayList<>();
  splitList.add(carbonInputSplit);
  return new CarbonMultiBlockSplit(splitList, new String[] { "localhost" }, FileFormat.ROW_V1);
}
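If the test class wanted to verify this helper directly, a minimal check might look like the sketch below. It assumes CarbonMultiBlockSplit reports back the location array passed to its constructor and that JUnit's @Test and Assert are already available in the test class; the assertion values are illustrative, not taken from the original test.

// Hypothetical usage of buildInputSplit(); expected values are assumptions for illustration.
@Test
public void testBuildInputSplitReportsLocalhost() throws Exception {
  InputSplit split = buildInputSplit();
  // getLocations() is declared by org.apache.hadoop.mapreduce.InputSplit
  String[] locations = split.getLocations();
  Assert.assertEquals(1, locations.length);
  Assert.assertEquals("localhost", locations[0]);
}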
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonInputSplitTaskInfo method maxNoNodes.
/**
 * Finds the node that hosts the maximum number of blocks among the given splits.
 *
 * @param splitList splits whose block locations are to be examined
 * @return a single-element list containing the node with the most blocks, or the
 *         complete set of nodes if every node hosts the same number of blocks
 */
public static List<String> maxNoNodes(List<CarbonInputSplit> splitList) {
  boolean useIndex = true;
  Integer maxOccurence = 0;
  String maxNode = null;
  Map<String, Integer> nodeAndOccurenceMapping = new TreeMap<>();
  // populate the map of node and number of occurrences of that node.
  for (CarbonInputSplit split : splitList) {
    try {
      for (String node : split.getLocations()) {
        Integer nodeOccurence = nodeAndOccurenceMapping.get(node);
        if (null == nodeOccurence) {
          nodeAndOccurenceMapping.put(node, 1);
        } else {
          // write the incremented count back into the map
          nodeAndOccurenceMapping.put(node, nodeOccurence + 1);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Fail to get location of split: " + split, e);
    }
  }
  Integer previousValueOccurence = null;
  // check which node occurs the maximum number of times.
  for (Map.Entry<String, Integer> entry : nodeAndOccurenceMapping.entrySet()) {
    // track the node with the maximum count.
    if (entry.getValue() > maxOccurence) {
      maxOccurence = entry.getValue();
      maxNode = entry.getKey();
    }
    // first-time scenario: initializing the previous value.
    if (null == previousValueOccurence) {
      previousValueOccurence = entry.getValue();
    } else {
      // counts differ, so only the max node should be returned instead of the complete list.
      if (!Objects.equals(previousValueOccurence, entry.getValue())) {
        useIndex = false;
      }
    }
  }
  // if all the nodes have an equal number of occurrences, return the complete key set.
  if (useIndex) {
    return new ArrayList<>(nodeAndOccurenceMapping.keySet());
  }
  // otherwise return only the node with the maximum number of blocks.
  List<String> node = new ArrayList<>(1);
  node.add(maxNode);
  return node;
}
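The tie-handling behaviour above is easier to see with plain data. The following standalone sketch reproduces the same counting logic with hard-coded location arrays instead of real CarbonInputSplit objects; the node names and split layout are made up for illustration.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Standalone illustration of the maxNoNodes counting logic; the data is hypothetical.
public class MaxNodeSketch {
  public static void main(String[] args) {
    // pretend these are the location arrays reported by three splits
    String[][] splitLocations = {
        { "node1", "node2" },
        { "node1", "node3" },
        { "node1" }
    };
    Map<String, Integer> counts = new TreeMap<>();
    for (String[] locations : splitLocations) {
      for (String node : locations) {
        counts.merge(node, 1, Integer::sum);
      }
    }
    // counts: node1 -> 3, node2 -> 1, node3 -> 1
    boolean allEqual = counts.values().stream().distinct().count() == 1;
    List<String> result;
    if (allEqual) {
      // every node hosts the same number of blocks: return all of them
      result = new ArrayList<>(counts.keySet());
    } else {
      // otherwise return only the most loaded node
      result = new ArrayList<>(1);
      result.add(counts.entrySet().stream().max(Map.Entry.comparingByValue()).get().getKey());
    }
    System.out.println(result); // prints [node1]
  }
}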
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonFileInputFormat method getSplits.
/**
 * {@inheritDoc}
 * Configurations FileInputFormat.INPUT_DIR and CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS
 * are used to get the table path to read.
 *
 * @return a list of splits for the blocks that match the filter in the given segments
 * @throws IOException if block or delete-delta information cannot be read
 */
private List<InputSplit> getSplits(JobContext job, FilterResolverIntf filterResolver,
    List<Segment> validSegments, BitSet matchedPartitions, PartitionInfo partitionInfo,
    List<Integer> oldPartitionIdList) throws IOException {
  numSegments = validSegments.size();
  List<InputSplit> result = new LinkedList<InputSplit>();
  UpdateVO invalidBlockVOForSegmentId = null;
  boolean isIUDTable = false;
  SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(carbonTable);
  isIUDTable = (updateStatusManager.getUpdateStatusDetails().length != 0);
  // for each segment, fetch the blocks matching the filter in the driver BTree
  List<CarbonInputSplit> dataBlocksOfSegment = getDataBlocksOfSegment(job, carbonTable,
      filterResolver, matchedPartitions, validSegments, partitionInfo, oldPartitionIdList);
  numBlocks = dataBlocksOfSegment.size();
  for (CarbonInputSplit inputSplit : dataBlocksOfSegment) {
    // get the UpdateVO for tables on which IUD operations are being performed.
    if (isIUDTable) {
      invalidBlockVOForSegmentId =
          updateStatusManager.getInvalidTimestampRange(inputSplit.getSegmentId());
    }
    String[] deleteDeltaFilePath = null;
    if (isIUDTable) {
      // skip invalidated blocks.
      if (CarbonUtil.isInvalidTableBlock(inputSplit.getSegmentId(),
          inputSplit.getPath().toString(), invalidBlockVOForSegmentId, updateStatusManager)) {
        continue;
      }
      // only when IUD has been performed, fetch the delete delta files for the block
      try {
        deleteDeltaFilePath = updateStatusManager.getDeleteDeltaFilePath(
            inputSplit.getPath().toString(), inputSplit.getSegmentId());
      } catch (Exception e) {
        throw new IOException(e);
      }
    }
    inputSplit.setDeleteDeltaFiles(deleteDeltaFilePath);
    result.add(inputSplit);
  }
  return result;
}
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonFileInputFormat method getSplits.
/**
 * {@inheritDoc}
 * Configuration FileInputFormat.INPUT_DIR is used to get the table path to read.
 *
 * @param job the job context carrying the configuration
 * @return List<InputSplit> list of CarbonInputSplit
 * @throws IOException if the schema file or segment files cannot be read
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  AbsoluteTableIdentifier identifier = getAbsoluteTableIdentifier(job.getConfiguration());
  CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
  if (null == carbonTable) {
    throw new IOException("Missing/Corrupt schema file for table.");
  }
  if (getValidateSegmentsToAccess(job.getConfiguration())) {
    // get all valid segments and set them into the configuration
    // check for the external table segment (Segment_null)
    // process and resolve the filter expression
    Expression filter = getFilterPredicates(job.getConfiguration());
    TableProvider tableProvider = new SingleTableProvider(carbonTable);
    // this will be null in case of a corrupt schema file.
    PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getTableName());
    carbonTable.processFilterExpression(filter, null, null);
    FilterResolverIntf filterInterface = carbonTable.resolveFilter(filter, tableProvider);
    String segmentDir = CarbonTablePath.getSegmentPath(identifier.getTablePath(), "null");
    FileFactory.FileType fileType = FileFactory.getFileType(segmentDir);
    if (FileFactory.isFileExist(segmentDir, fileType)) {
      // if the external table segment directory exists, add the segment to the list
      List<Segment> externalTableSegments = new ArrayList<Segment>();
      Segment seg = new Segment("null", null);
      externalTableSegments.add(seg);
      Map<String, String> indexFiles =
          new SegmentIndexFileStore().getIndexFilesFromSegment(segmentDir);
      if (indexFiles.size() == 0) {
        throw new RuntimeException("Index file not present to read the carbondata file");
      }
      // do block filtering and get the splits
      return getSplits(job, filterInterface, externalTableSegments, null, partitionInfo, null);
    }
  }
  // nothing to read: segment validation disabled or no segment directory found
  return null;
}
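For context, a driver that asks this input format for splits would typically look something like the sketch below. The table path, the generic parameter, and the direct use of CarbonFileInputFormat's no-argument constructor are assumptions for illustration; a real job usually needs additional CarbonData-specific configuration (database and table name, segment settings) before the table can be resolved.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.carbondata.hadoop.api.CarbonFileInputFormat;

// Hypothetical driver sketch: the table path and raw use of the format are assumptions.
public class SplitDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // FileInputFormat.INPUT_DIR supplies the table path, as described in the Javadoc above
    FileInputFormat.addInputPath(job, new Path("/path/to/carbon/table"));
    CarbonFileInputFormat<Object> format = new CarbonFileInputFormat<>();
    List<InputSplit> splits = format.getSplits(job);
    if (splits != null) {
      for (InputSplit split : splits) {
        System.out.println(split);
      }
    }
  }
}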