
Example 31 with CarbonInputSplit

Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

The class PaginationCarbonReader, method getRows.

private Object[] getRows(long fromRowNumber, long toRowNumber) throws IOException, InterruptedException {
    int rowCount = 0;
    Object[] rows = new Object[(int) (toRowNumber - fromRowNumber + 1)];
    // get the matching split index (blocklets) range for the input range.
    Range blockletIndexRange = getBlockletIndexRange(fromRowNumber, toRowNumber);
    for (int i = blockletIndexRange.getFrom(); i <= blockletIndexRange.getTo(); i++) {
        String blockletUniqueId = String.valueOf(i);
        // check the blocklet cache first; a single lookup avoids querying the cache twice.
        BlockletRows blockletRows = (BlockletRows) cache.get(blockletUniqueId);
        if (blockletRows == null) {
            BlockletDetailInfo detailInfo = ((CarbonInputSplit) allBlockletSplits.get(i)).getDetailInfo();
            List<Object> rowsInBlocklet = new ArrayList<>();
            // read the rows from the blocklet
            // TODO: read blocklets in multi-thread if there is a performance requirement.
            readerBuilder.setInputSplit(allBlockletSplits.get(i));
            CarbonReader<Object> carbonReader = readerBuilder.build();
            while (carbonReader.hasNext()) {
                rowsInBlocklet.add(carbonReader.readNextRow());
            }
            carbonReader.close();
            long fromRowId;
            if (i == 0) {
                fromRowId = 1;
            } else {
                // the previous index holds the cumulative row count up to and including the previous blocklet.
                fromRowId = rowCountInSplits.get(i - 1) + 1;
            }
            blockletRows = new BlockletRows(fromRowId, detailInfo.getBlockSize(), rowsInBlocklet.toArray());
            // add entry to cache with no expiry time
            // key: unique blocklet id
            // value: BlockletRows
            cache.put(blockletUniqueId, blockletRows, blockletRows.getMemorySize(), Integer.MAX_VALUE);
        }
        long fromBlockletRow = blockletRows.getRowIdStartIndex();
        long toBlockletRow = fromBlockletRow + blockletRows.getRowsCount();
        Object[] rowsInBlocklet = blockletRows.getRows();
        if (toRowNumber >= toBlockletRow) {
            if (fromRowNumber >= fromBlockletRow) {
                // only fromRowNumber lies in this blocklet;
                // read from fromRowNumber to the end of the blocklet.
                // subtracting the 1-based start row id gives a 0-based array offset.
                int start = (int) (fromRowNumber - blockletRows.getRowIdStartIndex());
                int end = blockletRows.getRowsCount();
                while (start < end) {
                    rows[rowCount++] = rowsInBlocklet[start++];
                }
            } else {
                // neither fromRowNumber nor toRowNumber lies in this blocklet;
                // the whole blocklet falls inside the requested range, so read all of it.
                System.arraycopy(rowsInBlocklet, 0, rows, rowCount, rowsInBlocklet.length);
                rowCount += rowsInBlocklet.length;
            }
        } else {
            if (fromRowNumber >= fromBlockletRow) {
                // both fromRowNumber and toRowNumber lie in this blocklet;
                // prune it and fill the results.
                int start = (int) (fromRowNumber - blockletRows.getRowIdStartIndex());
                int end = (int) (start + (toRowNumber + 1 - fromRowNumber));
                while (start < end) {
                    rows[rowCount++] = rowsInBlocklet[start++];
                }
            } else {
                // only toRowNumber lies in this blocklet; read from the start of the blocklet to toRowNumber.
                int start = 0;
                int end = (int) (toRowNumber + 1 - blockletRows.getRowIdStartIndex());
                while (start < end) {
                    rows[rowCount++] = rowsInBlocklet[start++];
                }
            }
        }
    }
    return rows;
}
Also used : BlockletDetailInfo(org.apache.carbondata.core.indexstore.BlockletDetailInfo) ArrayList(java.util.ArrayList) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) BlockletRows(org.apache.carbondata.sdk.file.cache.BlockletRows)
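
For context, here is a minimal usage sketch of the pagination path that ends up in the getRows() method above. It is a sketch, not project code: it assumes the SDK's CarbonReaderBuilder exposes withPaginationSupport() and that PaginationCarbonReader exposes getTotalRows() and read(from, to), as in recent CarbonData releases, and the store path is hypothetical.

import org.apache.carbondata.sdk.file.CarbonReader;
import org.apache.carbondata.sdk.file.PaginationCarbonReader;

public class PaginationReadSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical table path; point it at an existing carbon store
        String tablePath = "/tmp/carbon/store";
        PaginationCarbonReader<Object> reader =
                (PaginationCarbonReader<Object>) CarbonReader.builder(tablePath)
                        .withPaginationSupport()
                        .build();
        try {
            long total = reader.getTotalRows();
            // row numbers are 1-based; fetch the first page of up to 100 rows.
            // internally this calls getRows(), which caches whole blocklets
            // and slices the requested [from, to] range out of them.
            Object[] page = reader.read(1, Math.min(100L, total));
            System.out.println("fetched " + page.length + " of " + total + " rows");
        } finally {
            reader.close();
        }
    }
}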

Example 32 with CarbonInputSplit

Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

The class CarbonStreamRecordReaderTest, method buildInputSplit.

private InputSplit buildInputSplit() throws IOException {
    CarbonInputSplit carbonInputSplit = new CarbonInputSplit();
    List<CarbonInputSplit> splitList = new ArrayList<>();
    splitList.add(carbonInputSplit);
    return new CarbonMultiBlockSplit(splitList, new String[] { "localhost" }, FileFormat.ROW_V1);
}
Also used : CarbonMultiBlockSplit(org.apache.carbondata.hadoop.CarbonMultiBlockSplit) ArrayList(java.util.ArrayList) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit)
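
As a follow-up to the test helper above, the sketch below groups several CarbonInputSplits into one CarbonMultiBlockSplit, the unit of work a single task consumes. The accessors getAllSplits() and getFileFormat() are assumptions based on the CarbonData 2.x API; verify them against your version.

import java.util.ArrayList;
import java.util.List;
import org.apache.carbondata.core.statusmanager.FileFormat;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.hadoop.CarbonMultiBlockSplit;

public class MultiBlockSplitSketch {
    public static void main(String[] args) {
        // empty splits, as in the test above; real code would obtain them
        // from the input format's split planning
        List<CarbonInputSplit> splits = new ArrayList<>();
        splits.add(new CarbonInputSplit());
        splits.add(new CarbonInputSplit());
        CarbonMultiBlockSplit multi = new CarbonMultiBlockSplit(
                splits, new String[] { "localhost" }, FileFormat.ROW_V1);
        System.out.println("splits: " + multi.getAllSplits().size());  // 2
        System.out.println("format: " + multi.getFileFormat());        // ROW_V1
    }
}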

Example 33 with CarbonInputSplit

Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

The class ExtendedBlocklet, method deserializeFields.

/**
 * Method to deserialize the extended blocklet and its input split for the index server.
 * @param in data input stream to read the primitives of the extended blocklet
 * @param locations locations of the input split
 * @param tablePath carbon table path
 * @param isCountJob true if only the row count and segment number need to be read
 * @param cdcVO CDC metadata; when non-null, per-column min/max values are read instead of the split
 * @throws IOException if reading from the stream fails
 */
public void deserializeFields(DataInput in, String[] locations, String tablePath, boolean isCountJob, CdcVO cdcVO) throws IOException {
    super.readFields(in);
    if (isCountJob) {
        count = in.readLong();
        segmentNo = in.readUTF();
        return;
    } else if (cdcVO != null) {
        filePath = in.readUTF();
        this.columnToMinMaxMapping = new HashMap<>();
        for (String column : cdcVO.getColumnToIndexMap().keySet()) {
            List<FilePathMinMaxVO> minMaxOfColumnInList = new ArrayList<>();
            int minLength = in.readInt();
            byte[] minValuesForBlocklets = new byte[minLength];
            in.readFully(minValuesForBlocklets);
            int maxLength = in.readInt();
            byte[] maxValuesForBlocklets = new byte[maxLength];
            in.readFully(maxValuesForBlocklets);
            minMaxOfColumnInList.add(new FilePathMinMaxVO(filePath, minValuesForBlocklets, maxValuesForBlocklets));
            this.columnToMinMaxMapping.put(column, minMaxOfColumnInList);
        }
        return;
    }
    if (in.readBoolean()) {
        indexUniqueId = in.readUTF();
    }
    boolean isSplitPresent = in.readBoolean();
    if (isSplitPresent) {
        String filePath = getPath();
        boolean isExternalPath = in.readBoolean();
        if (!isExternalPath) {
            setFilePath(tablePath + filePath);
        } else {
            setFilePath(filePath);
        }
        // getting the length of the data
        final int serializeLen = in.readInt();
        this.inputSplit = new CarbonInputSplit(serializeLen, in, getFilePath(), locations, getBlockletId());
    }
}
Also used : FilePathMinMaxVO(org.apache.carbondata.core.mutate.FilePathMinMaxVO) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) List(java.util.List) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit)
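
To make the wire format that deserializeFields() expects explicit, here is a hypothetical mirror of the write side. The real writer lives in ExtendedBlocklet on the index server side and may differ; this standalone sketch only illustrates the count-job and split branches, with each write matched to the read it feeds above.

import java.io.ByteArrayOutputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;

public class ExtendedBlockletWireFormatSketch {

    // count-job branch: after the superclass Blocklet fields (super.write(out)
    // on the real class), the reader consumes a long row count and a UTF
    // segment number, then returns.
    static void writeCountJobFields(DataOutput out, long count, String segmentNo)
            throws IOException {
        out.writeLong(count);      // matches count = in.readLong()
        out.writeUTF(segmentNo);   // matches segmentNo = in.readUTF()
    }

    // split branch: flags first, then the serialized CarbonInputSplit bytes
    // prefixed with their length so the reader knows how much to consume.
    static void writeSplitFields(DataOutput out, String indexUniqueId,
            boolean isExternalPath, byte[] serializedSplit) throws IOException {
        out.writeBoolean(indexUniqueId != null);  // matches in.readBoolean()
        if (indexUniqueId != null) {
            out.writeUTF(indexUniqueId);
        }
        out.writeBoolean(true);                   // isSplitPresent
        out.writeBoolean(isExternalPath);
        out.writeInt(serializedSplit.length);     // matches serializeLen = in.readInt()
        out.write(serializedSplit);               // consumed by new CarbonInputSplit(...)
    }

    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        writeCountJobFields(new DataOutputStream(buffer), 42L, "0");
        System.out.println("count-job payload: " + buffer.size() + " bytes");
    }
}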

Aggregations

CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit): 33
ArrayList (java.util.ArrayList): 17
IOException (java.io.IOException): 15
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 10
CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable): 8
LinkedList (java.util.LinkedList): 6
CarbonMultiBlockSplit (org.apache.carbondata.hadoop.CarbonMultiBlockSplit): 6
IndexFilter (org.apache.carbondata.core.index.IndexFilter): 5
CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath): 5
HashMap (java.util.HashMap): 4
HashSet (java.util.HashSet): 4
List (java.util.List): 4
TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo): 4
PartitionSpec (org.apache.carbondata.core.indexstore.PartitionSpec): 4
LoadMetadataDetails (org.apache.carbondata.core.statusmanager.LoadMetadataDetails): 4
SegmentUpdateStatusManager (org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager): 4
CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat): 4
Configuration (org.apache.hadoop.conf.Configuration): 4
Path (org.apache.hadoop.fs.Path): 4
Gson (com.google.gson.Gson): 3