
Example 6 with DataChunk2

Use of org.apache.carbondata.format.DataChunk2 in project carbondata by Apache.

In the class CarbonMetadataUtil, method getDataChunk2.

/**
   * Below method will be used to get the list of serialized DataChunk2 objects
   *
   * @param nodeHolder        node holder
   * @param columnSchema      list of table column schemas
   * @param segmentProperties segment properties
   * @param isDimensionColumn true to build chunks for the dimension columns, false for the measure columns
   * @return list of serialized DataChunk2 byte arrays
   * @throws IOException
   */
public static List<byte[]> getDataChunk2(NodeHolder nodeHolder, List<ColumnSchema> columnSchema, SegmentProperties segmentProperties, boolean isDimensionColumn) throws IOException {
    List<byte[]> dataChunkBuffer = new ArrayList<>();
    if (isDimensionColumn) {
        for (int i = 0; i < nodeHolder.getKeyArray().length; i++) {
            DataChunk2 dataChunk = new DataChunk2();
            dataChunk.min_max = new BlockletMinMaxIndex();
            dataChunk.setChunk_meta(getChunkCompressionMeta());
            dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
            List<Encoding> encodings = new ArrayList<Encoding>();
            dataChunk.setData_page_length(nodeHolder.getKeyLengths()[i]);
            if (containsEncoding(i, Encoding.DICTIONARY, columnSchema, segmentProperties)) {
                encodings.add(Encoding.DICTIONARY);
            }
            if (containsEncoding(i, Encoding.DIRECT_DICTIONARY, columnSchema, segmentProperties)) {
                encodings.add(Encoding.DIRECT_DICTIONARY);
            }
            dataChunk.setRowMajor(nodeHolder.getColGrpBlocks()[i]);
            if (nodeHolder.getAggBlocks()[i]) {
                dataChunk.setRle_page_length(nodeHolder.getDataIndexMapLength()[i]);
                encodings.add(Encoding.RLE);
            }
            dataChunk.setSort_state(nodeHolder.getIsSortedKeyBlock()[i] ? SortState.SORT_EXPLICIT : SortState.SORT_NATIVE);
            if (!nodeHolder.getIsSortedKeyBlock()[i]) {
                dataChunk.setRowid_page_length(nodeHolder.getKeyBlockIndexLength()[i]);
                encodings.add(Encoding.INVERTED_INDEX);
            }
            dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getColumnMaxData()[i]));
            dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getColumnMinData()[i]));
            dataChunk.setEncoders(encodings);
            dataChunkBuffer.add(CarbonUtil.getByteArray(dataChunk));
        }
    } else {
        for (int i = 0; i < nodeHolder.getDataArray().length; i++) {
            DataChunk2 dataChunk = new DataChunk2();
            dataChunk.min_max = new BlockletMinMaxIndex();
            dataChunk.setChunk_meta(getChunkCompressionMeta());
            dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
            dataChunk.setData_page_length(nodeHolder.getDataArray()[i].length);
            List<Encoding> encodings = new ArrayList<Encoding>();
            // TODO: the encodings are currently decided at runtime; populate this
            // list from the encoders that were actually applied.
            encodings.add(Encoding.DELTA);
            dataChunk.setEncoders(encodings);
            dataChunk.setRowMajor(false);
            // TODO writing dummy presence meta need to set actual presence
            // meta
            PresenceMeta presenceMeta = new PresenceMeta();
            presenceMeta.setPresent_bit_streamIsSet(true);
            presenceMeta.setPresent_bit_stream(CompressorFactory.getInstance().getCompressor().compressByte(nodeHolder.getMeasureNullValueIndex()[i].toByteArray()));
            dataChunk.setPresence(presenceMeta);
            List<ByteBuffer> encoderMetaList = new ArrayList<ByteBuffer>();
            encoderMetaList.add(ByteBuffer.wrap(serializeEncodeMetaUsingByteBuffer(createValueEncoderMeta(nodeHolder.getCompressionModel(), i))));
            dataChunk.setEncoder_meta(encoderMetaList);
            dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMaxData()[i]));
            dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMinData()[i]));
            dataChunkBuffer.add(CarbonUtil.getByteArray(dataChunk));
        }
    }
    return dataChunkBuffer;
}
Also used : DataChunk2(org.apache.carbondata.format.DataChunk2) BlockletMinMaxIndex(org.apache.carbondata.format.BlockletMinMaxIndex) ArrayList(java.util.ArrayList) Encoding(org.apache.carbondata.format.Encoding) PresenceMeta(org.apache.carbondata.format.PresenceMeta) ByteBuffer(java.nio.ByteBuffer)
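For orientation, a hedged round-trip sketch (not part of the CarbonData sources; it assumes CarbonUtil resolves to org.apache.carbondata.core.util.CarbonUtil) showing how one of the byte arrays returned by getDataChunk2 can be wrapped in a ByteBuffer and read back with CarbonUtil.readDataChunk, the same call the V2 reader uses in Example 9 below:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;

import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.format.DataChunk2;
import org.apache.carbondata.format.Encoding;

public final class DataChunk2RoundTripSketch {

    /** Deserialize the first chunk produced by getDataChunk2 and return its encodings. */
    static List<Encoding> readBackEncodings(List<byte[]> serializedChunks) throws IOException {
        byte[] first = serializedChunks.get(0);
        // readDataChunk(ByteBuffer, offset, length) mirrors the call made by the V2 reader
        DataChunk2 chunk = CarbonUtil.readDataChunk(ByteBuffer.wrap(first), 0, first.length);
        // the thrift-generated field holds the encodings set by getDataChunk2
        return chunk.encoders;
    }
}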

Example 7 with DataChunk2

Use of org.apache.carbondata.format.DataChunk2 in project carbondata by Apache.

In the class CarbonMetadataUtil, method getDatachunk2.

/**
   * Below method will be used to get the DataChunk2 objects for the given column
   * across all the blocklets (node holders)
   *
   * @param nodeHolderList    list of node holders (blocklet info)
   * @param columnSchema      list of table column schemas
   * @param segmentProperties segment properties
   * @param index             index of the column for which the chunks are built
   * @param isDimensionColumn true for a dimension column, false for a measure column
   * @return list of data chunks, one per blocklet
   * @throws IOException
   */
private static List<DataChunk2> getDatachunk2(List<NodeHolder> nodeHolderList, List<ColumnSchema> columnSchema, SegmentProperties segmentProperties, int index, boolean isDimensionColumn) throws IOException {
    List<DataChunk2> colDataChunks = new ArrayList<DataChunk2>();
    DataChunk2 dataChunk = null;
    NodeHolder nodeHolder = null;
    for (int i = 0; i < nodeHolderList.size(); i++) {
        nodeHolder = nodeHolderList.get(i);
        dataChunk = new DataChunk2();
        dataChunk.min_max = new BlockletMinMaxIndex();
        dataChunk.setChunk_meta(getChunkCompressionMeta());
        dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
        List<Encoding> encodings = new ArrayList<Encoding>();
        if (isDimensionColumn) {
            dataChunk.setData_page_length(nodeHolder.getKeyLengths()[index]);
            if (containsEncoding(index, Encoding.DICTIONARY, columnSchema, segmentProperties)) {
                encodings.add(Encoding.DICTIONARY);
            }
            if (containsEncoding(index, Encoding.DIRECT_DICTIONARY, columnSchema, segmentProperties)) {
                encodings.add(Encoding.DIRECT_DICTIONARY);
            }
            dataChunk.setRowMajor(nodeHolder.getColGrpBlocks()[index]);
            if (nodeHolder.getAggBlocks()[index]) {
                dataChunk.setRle_page_length(nodeHolder.getDataIndexMapLength()[index]);
                encodings.add(Encoding.RLE);
            }
            dataChunk.setSort_state(nodeHolder.getIsSortedKeyBlock()[index] ? SortState.SORT_EXPLICIT : SortState.SORT_NATIVE);
            if (!nodeHolder.getIsSortedKeyBlock()[index]) {
                dataChunk.setRowid_page_length(nodeHolder.getKeyBlockIndexLength()[index]);
                encodings.add(Encoding.INVERTED_INDEX);
            }
            dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getColumnMaxData()[index]));
            dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getColumnMinData()[index]));
        } else {
            dataChunk.setData_page_length(nodeHolder.getDataArray()[index].length);
            // TODO: the encodings are currently decided at runtime; populate this
            // list from the encoders that were actually applied.
            encodings.add(Encoding.DELTA);
            dataChunk.setRowMajor(false);
            // TODO writing dummy presence meta need to set actual presence
            // meta
            PresenceMeta presenceMeta = new PresenceMeta();
            presenceMeta.setPresent_bit_streamIsSet(true);
            presenceMeta.setPresent_bit_stream(CompressorFactory.getInstance().getCompressor().compressByte(nodeHolder.getMeasureNullValueIndex()[index].toByteArray()));
            dataChunk.setPresence(presenceMeta);
            List<ByteBuffer> encoderMetaList = new ArrayList<ByteBuffer>();
            encoderMetaList.add(ByteBuffer.wrap(serializeEncodeMetaUsingByteBuffer(createValueEncoderMeta(nodeHolder.getCompressionModel(), index))));
            dataChunk.setEncoder_meta(encoderMetaList);
            dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMaxData()[index]));
            dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMinData()[index]));
        }
        dataChunk.setEncoders(encodings);
        colDataChunks.add(dataChunk);
    }
    return colDataChunks;
}
Also used : DataChunk2(org.apache.carbondata.format.DataChunk2) BlockletMinMaxIndex(org.apache.carbondata.format.BlockletMinMaxIndex) ArrayList(java.util.ArrayList) Encoding(org.apache.carbondata.format.Encoding) PresenceMeta(org.apache.carbondata.format.PresenceMeta) ByteBuffer(java.nio.ByteBuffer)
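The min/max values recorded above per column are later used for blocklet and page pruning. A hedged sketch (not from the CarbonData sources; the unsigned comparison and the min_values / max_values field names are inferred from the addToMin_values / addToMax_values calls above) of how a filter value could be tested against a chunk's min-max index:

import java.nio.ByteBuffer;

import org.apache.carbondata.format.DataChunk2;

public final class MinMaxPruneSketch {

    /** Unsigned lexicographic comparison of two byte arrays. */
    static int compareUnsigned(byte[] a, byte[] b) {
        int len = Math.min(a.length, b.length);
        for (int i = 0; i < len; i++) {
            int diff = (a[i] & 0xFF) - (b[i] & 0xFF);
            if (diff != 0) {
                return diff;
            }
        }
        return a.length - b.length;
    }

    /** True if the filter value can possibly occur in the page described by this chunk. */
    static boolean mightContain(DataChunk2 chunk, byte[] filterValue) {
        ByteBuffer min = chunk.min_max.min_values.get(0);
        ByteBuffer max = chunk.min_max.max_values.get(0);
        byte[] minBytes = new byte[min.remaining()];
        min.duplicate().get(minBytes);
        byte[] maxBytes = new byte[max.remaining()];
        max.duplicate().get(maxBytes);
        return compareUnsigned(filterValue, minBytes) >= 0
            && compareUnsigned(filterValue, maxBytes) <= 0;
    }
}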

Example 8 with DataChunk2

Use of org.apache.carbondata.format.DataChunk2 in project carbondata by Apache.

In the class CarbonFactDataWriterImplV2, method writeBlockletData.

/**
   * Below method will be used to write the blocklet data to the carbon data file
   *
   * @param holder node holder containing the blocklet data
   * @throws CarbonDataWriterException any problem in writing operation
   */
@Override
public void writeBlockletData(NodeHolder holder) throws CarbonDataWriterException {
    if (holder.getEntryCount() == 0) {
        return;
    }
    // size to calculate the size of the blocklet
    int size = 0;
    // get the blocklet info object
    BlockletInfoColumnar blockletInfo = getBlockletInfo(holder, 0);
    List<DataChunk2> datachunks = null;
    try {
        // get all the data chunks
        datachunks = CarbonMetadataUtil.getDatachunk2(blockletInfo, thriftColumnSchemaList, dataWriterVo.getSegmentProperties());
    } catch (IOException e) {
        throw new CarbonDataWriterException("Problem while getting the data chunks", e);
    }
    // data chunk byte array
    byte[][] dataChunkByteArray = new byte[datachunks.size()][];
    for (int i = 0; i < dataChunkByteArray.length; i++) {
        dataChunkByteArray[i] = CarbonUtil.getByteArray(datachunks.get(i));
        // add the data chunk size
        size += dataChunkByteArray[i].length;
    }
    // add row id index length
    for (int i = 0; i < holder.getKeyBlockIndexLength().length; i++) {
        size += holder.getKeyBlockIndexLength()[i];
    }
    // add rle index length
    for (int i = 0; i < holder.getDataIndexMapLength().length; i++) {
        size += holder.getDataIndexMapLength()[i];
    }
    // add dimension column data page and measure column data page size
    long blockletDataSize = holder.getTotalDimensionArrayLength() + holder.getTotalMeasureArrayLength() + size;
    // if size of the file already reached threshold size then create a new file and get the file
    // channel object
    updateBlockletFileChannel(blockletDataSize);
    // write the format version header for a new file so the carbondata file can be read independently
    try {
        if (fileChannel.size() == 0) {
            ColumnarFormatVersion version = CarbonProperties.getInstance().getFormatVersion();
            byte[] header = (CarbonCommonConstants.CARBON_DATA_VERSION_HEADER + version).getBytes();
            ByteBuffer buffer = ByteBuffer.allocate(header.length);
            buffer.put(header);
            buffer.rewind();
            fileChannel.write(buffer);
        }
    } catch (IOException e) {
        throw new CarbonDataWriterException("Problem while getting the file channel size", e);
    }
    // write data to file and get its offset
    writeDataToFile(holder, dataChunkByteArray, fileChannel);
    // add blocklet info to list
    blockletInfoList.add(blockletInfo);
    LOGGER.info("A new blocklet is added, its data size is: " + blockletDataSize + " Byte");
}
Also used : BlockletInfoColumnar(org.apache.carbondata.core.metadata.BlockletInfoColumnar) DataChunk2(org.apache.carbondata.format.DataChunk2) IOException(java.io.IOException) CarbonDataWriterException(org.apache.carbondata.processing.store.writer.exception.CarbonDataWriterException) ByteBuffer(java.nio.ByteBuffer) ColumnarFormatVersion(org.apache.carbondata.core.metadata.ColumnarFormatVersion)
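The version header written above when the file is still empty lets a reader identify the format before parsing any blocklet. A hedged sketch of how that header might be verified when a carbondata file is opened; the class and method names are illustrative, and the CarbonCommonConstants and CarbonProperties calls are the same ones the writer uses above, with their package names assumed:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;

import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.metadata.ColumnarFormatVersion;
import org.apache.carbondata.core.util.CarbonProperties;

public final class VersionHeaderCheckSketch {

    /** Returns true if the file starts with the same version header the writer emits. */
    static boolean hasExpectedVersionHeader(String filePath) throws IOException {
        ColumnarFormatVersion version = CarbonProperties.getInstance().getFormatVersion();
        byte[] expected = (CarbonCommonConstants.CARBON_DATA_VERSION_HEADER + version).getBytes();
        try (FileChannel channel = FileChannel.open(Paths.get(filePath), StandardOpenOption.READ)) {
            ByteBuffer buffer = ByteBuffer.allocate(expected.length);
            // read the first header-length bytes of the file
            channel.read(buffer, 0);
            return Arrays.equals(buffer.array(), expected);
        }
    }
}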

Example 9 with DataChunk2

Use of org.apache.carbondata.format.DataChunk2 in project carbondata by Apache.

In the class CompressedDimensionChunkFileBasedReaderV2, method convertToDimensionChunk.

public DimensionColumnDataChunk convertToDimensionChunk(DimensionRawColumnChunk dimensionRawColumnChunk, int pageNumber) throws IOException {
    byte[] dataPage = null;
    int[] invertedIndexes = null;
    int[] invertedIndexesReverse = null;
    int[] rlePage = null;
    DataChunk2 dimensionColumnChunk = null;
    int copySourcePoint = dimensionRawColumnChunk.getOffSet();
    int blockIndex = dimensionRawColumnChunk.getBlockletId();
    ByteBuffer rawData = dimensionRawColumnChunk.getRawData();
    if (dimensionChunksOffset.size() - 1 == blockIndex) {
        dimensionColumnChunk = CarbonUtil.readDataChunk(rawData, copySourcePoint, dimensionRawColumnChunk.getLength());
        int totalDimensionDataLength = dimensionColumnChunk.data_page_length + dimensionColumnChunk.rle_page_length + dimensionColumnChunk.rowid_page_length;
        synchronized (dimensionRawColumnChunk.getFileReader()) {
            rawData = dimensionRawColumnChunk.getFileReader().readByteBuffer(filePath, dimensionChunksOffset.get(blockIndex) + dimensionChunksLength.get(blockIndex), totalDimensionDataLength);
        }
    } else {
        dimensionColumnChunk = CarbonUtil.readDataChunk(rawData, copySourcePoint, dimensionChunksLength.get(blockIndex));
        copySourcePoint += dimensionChunksLength.get(blockIndex);
    }
    // first read the data page and uncompress it
    dataPage = COMPRESSOR.unCompressByte(rawData.array(), copySourcePoint, dimensionColumnChunk.data_page_length);
    copySourcePoint += dimensionColumnChunk.data_page_length;
    // if row id block is present then read the row id chunk and uncompress it
    if (hasEncoding(dimensionColumnChunk.encoders, Encoding.INVERTED_INDEX)) {
        byte[] dataInv = new byte[dimensionColumnChunk.rowid_page_length];
        rawData.position(copySourcePoint);
        rawData.get(dataInv);
        invertedIndexes = CarbonUtil.getUnCompressColumnIndex(dimensionColumnChunk.rowid_page_length, dataInv, numberComressor, 0);
        copySourcePoint += dimensionColumnChunk.rowid_page_length;
        // get the reverse index
        invertedIndexesReverse = getInvertedReverseIndex(invertedIndexes);
    }
    // then decode the actual data based on the RLE block, if present
    if (hasEncoding(dimensionColumnChunk.encoders, Encoding.RLE)) {
        byte[] dataRle = new byte[dimensionColumnChunk.rle_page_length];
        rawData.position(copySourcePoint);
        rawData.get(dataRle);
        rlePage = numberComressor.unCompress(dataRle, 0, dimensionColumnChunk.rle_page_length);
        // uncompress the data with rle indexes
        dataPage = UnBlockIndexer.uncompressData(dataPage, rlePage, eachColumnValueSize[blockIndex]);
    }
    // fill chunk attributes
    DimensionColumnDataChunk columnDataChunk = null;
    if (dimensionColumnChunk.isRowMajor()) {
        // to store row-major (column group) chunk values
        columnDataChunk = new ColumnGroupDimensionDataChunk(dataPage, eachColumnValueSize[blockIndex], numberOfRows);
    } else if (!hasEncoding(dimensionColumnChunk.encoders, Encoding.DICTIONARY)) {
        // to store variable length column chunk values
        columnDataChunk = new VariableLengthDimensionDataChunk(dataPage, invertedIndexes, invertedIndexesReverse, numberOfRows);
    } else {
        // to store fixed length column chunk values
        columnDataChunk = new FixedLengthDimensionDataChunk(dataPage, invertedIndexes, invertedIndexesReverse, numberOfRows, eachColumnValueSize[blockIndex]);
    }
    return columnDataChunk;
}
Also used : FixedLengthDimensionDataChunk(org.apache.carbondata.core.datastore.chunk.impl.FixedLengthDimensionDataChunk) DataChunk2(org.apache.carbondata.format.DataChunk2) DimensionColumnDataChunk(org.apache.carbondata.core.datastore.chunk.DimensionColumnDataChunk) ColumnGroupDimensionDataChunk(org.apache.carbondata.core.datastore.chunk.impl.ColumnGroupDimensionDataChunk) ByteBuffer(java.nio.ByteBuffer) VariableLengthDimensionDataChunk(org.apache.carbondata.core.datastore.chunk.impl.VariableLengthDimensionDataChunk)
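The hasEncoding checks above decide which optional pages follow the data page, and the *_page_length fields give their sizes. A small illustrative sketch (the real helper lives in the reader; this version just scans the thrift encoders list) of how the page offsets inside a raw column chunk can be derived from the DataChunk2 metadata:

import java.util.List;

import org.apache.carbondata.format.DataChunk2;
import org.apache.carbondata.format.Encoding;

public final class ChunkPageOffsetSketch {

    /** True if the column chunk metadata lists the given encoding. */
    static boolean hasEncoding(List<Encoding> encoders, Encoding encoding) {
        return encoders != null && encoders.contains(encoding);
    }

    /**
     * Page layout mirrored from the reader above: the data page comes first,
     * the inverted-index (rowid) page follows when INVERTED_INDEX is set, and
     * the RLE page comes last when RLE is set. Offsets are relative to the
     * start of the chunk's page data.
     */
    static int[] pageOffsets(DataChunk2 chunk) {
        int dataPageOffset = 0;
        int rowIdPageOffset = dataPageOffset + chunk.data_page_length;
        int rlePageOffset = rowIdPageOffset
            + (hasEncoding(chunk.encoders, Encoding.INVERTED_INDEX) ? chunk.rowid_page_length : 0);
        return new int[] { dataPageOffset, rowIdPageOffset, rlePageOffset };
    }
}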

Aggregations

DataChunk2 (org.apache.carbondata.format.DataChunk2): 9
ByteBuffer (java.nio.ByteBuffer): 8
ArrayList (java.util.ArrayList): 6
DataChunk3 (org.apache.carbondata.format.DataChunk3): 3
Encoding (org.apache.carbondata.format.Encoding): 3
PresenceMeta (org.apache.carbondata.format.PresenceMeta): 3
DimensionColumnDataChunk (org.apache.carbondata.core.datastore.chunk.DimensionColumnDataChunk): 2
MeasureColumnDataChunk (org.apache.carbondata.core.datastore.chunk.MeasureColumnDataChunk): 2
ColumnGroupDimensionDataChunk (org.apache.carbondata.core.datastore.chunk.impl.ColumnGroupDimensionDataChunk): 2
FixedLengthDimensionDataChunk (org.apache.carbondata.core.datastore.chunk.impl.FixedLengthDimensionDataChunk): 2
VariableLengthDimensionDataChunk (org.apache.carbondata.core.datastore.chunk.impl.VariableLengthDimensionDataChunk): 2
ValueCompressionHolder (org.apache.carbondata.core.datastore.compression.ValueCompressionHolder): 2
WriterCompressModel (org.apache.carbondata.core.datastore.compression.WriterCompressModel): 2
CarbonReadDataHolder (org.apache.carbondata.core.datastore.dataholder.CarbonReadDataHolder): 2
ValueEncoderMeta (org.apache.carbondata.core.metadata.ValueEncoderMeta): 2
BlockletMinMaxIndex (org.apache.carbondata.format.BlockletMinMaxIndex): 2
IOException (java.io.IOException): 1
BlockletInfoColumnar (org.apache.carbondata.core.metadata.BlockletInfoColumnar): 1
ColumnarFormatVersion (org.apache.carbondata.core.metadata.ColumnarFormatVersion): 1
CarbonDataWriterException (org.apache.carbondata.processing.store.writer.exception.CarbonDataWriterException): 1