Use of org.apache.carbondata.format.DataChunk2 in project carbondata by apache.
The class CarbonMetadataUtil, method getDataChunk2.
/**
 * Below method will be used to get the list of serialized DataChunk2 objects
 *
 * @param nodeHolder node holder
 * @param columnSchema table columns
 * @param segmentProperties segment properties
 * @param isDimensionColumn true to build the chunks for dimension columns, false for measure columns
 * @return list of serialized data chunk2
 * @throws IOException
 */
public static List<byte[]> getDataChunk2(NodeHolder nodeHolder, List<ColumnSchema> columnSchema,
    SegmentProperties segmentProperties, boolean isDimensionColumn) throws IOException {
  List<byte[]> dataChunkBuffer = new ArrayList<>();
  if (isDimensionColumn) {
    for (int i = 0; i < nodeHolder.getKeyArray().length; i++) {
      DataChunk2 dataChunk = new DataChunk2();
      dataChunk.min_max = new BlockletMinMaxIndex();
      dataChunk.setChunk_meta(getChunkCompressionMeta());
      dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
      List<Encoding> encodings = new ArrayList<Encoding>();
      dataChunk.setData_page_length(nodeHolder.getKeyLengths()[i]);
      if (containsEncoding(i, Encoding.DICTIONARY, columnSchema, segmentProperties)) {
        encodings.add(Encoding.DICTIONARY);
      }
      if (containsEncoding(i, Encoding.DIRECT_DICTIONARY, columnSchema, segmentProperties)) {
        encodings.add(Encoding.DIRECT_DICTIONARY);
      }
      dataChunk.setRowMajor(nodeHolder.getColGrpBlocks()[i]);
      if (nodeHolder.getAggBlocks()[i]) {
        dataChunk.setRle_page_length(nodeHolder.getDataIndexMapLength()[i]);
        encodings.add(Encoding.RLE);
      }
      dataChunk.setSort_state(
          nodeHolder.getIsSortedKeyBlock()[i] ? SortState.SORT_EXPLICIT : SortState.SORT_NATIVE);
      if (!nodeHolder.getIsSortedKeyBlock()[i]) {
        dataChunk.setRowid_page_length(nodeHolder.getKeyBlockIndexLength()[i]);
        encodings.add(Encoding.INVERTED_INDEX);
      }
      dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getColumnMaxData()[i]));
      dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getColumnMinData()[i]));
      dataChunk.setEncoders(encodings);
      dataChunkBuffer.add(CarbonUtil.getByteArray(dataChunk));
    }
  } else {
    for (int i = 0; i < nodeHolder.getDataArray().length; i++) {
      DataChunk2 dataChunk = new DataChunk2();
      dataChunk.min_max = new BlockletMinMaxIndex();
      dataChunk.setChunk_meta(getChunkCompressionMeta());
      dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
      dataChunk.setData_page_length(nodeHolder.getDataArray()[i].length);
      List<Encoding> encodings = new ArrayList<Encoding>();
      dataChunk.setRowMajor(false);
      // TODO : Right now the encodings are happening at runtime. change as
      // per this encoders.
      encodings.add(Encoding.DELTA);
      dataChunk.setEncoders(encodings);
      // TODO writing dummy presence meta need to set actual presence
      // meta
      PresenceMeta presenceMeta = new PresenceMeta();
      presenceMeta.setPresent_bit_streamIsSet(true);
      presenceMeta.setPresent_bit_stream(CompressorFactory.getInstance().getCompressor()
          .compressByte(nodeHolder.getMeasureNullValueIndex()[i].toByteArray()));
      dataChunk.setPresence(presenceMeta);
      List<ByteBuffer> encoderMetaList = new ArrayList<ByteBuffer>();
      encoderMetaList.add(ByteBuffer.wrap(serializeEncodeMetaUsingByteBuffer(
          createValueEncoderMeta(nodeHolder.getCompressionModel(), i))));
      dataChunk.setEncoder_meta(encoderMetaList);
      dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMaxData()[i]));
      dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMinData()[i]));
      dataChunkBuffer.add(CarbonUtil.getByteArray(dataChunk));
    }
  }
  return dataChunkBuffer;
}
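A minimal sketch of how a writer-side caller might use this method, assuming the NodeHolder, the table's ColumnSchema list and the SegmentProperties are already available from the data-load flow; the helper name and local variable names below are illustrative and not part of the project.

// Illustrative helper, not project code: serialize the DataChunk2 headers for the
// dimension and measure columns of one blocklet and return their total size in bytes.
private static int totalChunkHeaderSize(NodeHolder nodeHolder, List<ColumnSchema> columnSchemas,
    SegmentProperties segmentProperties) throws IOException {
  List<byte[]> dimensionHeaders =
      CarbonMetadataUtil.getDataChunk2(nodeHolder, columnSchemas, segmentProperties, true);
  List<byte[]> measureHeaders =
      CarbonMetadataUtil.getDataChunk2(nodeHolder, columnSchemas, segmentProperties, false);
  int totalSize = 0;
  for (byte[] serializedChunk : dimensionHeaders) {
    totalSize += serializedChunk.length;
  }
  for (byte[] serializedChunk : measureHeaders) {
    totalSize += serializedChunk.length;
  }
  return totalSize;
}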
Use of org.apache.carbondata.format.DataChunk2 in project carbondata by apache.
The class CarbonMetadataUtil, method getDatachunk2.
/**
 * Below method will be used to get the data chunk object for a given column
 * across all the node holders
 *
 * @param nodeHolderList blocklet info
 * @param columnSchema list of columns
 * @param segmentProperties segment properties
 * @return list of data chunks
 * @throws IOException
 */
private static List<DataChunk2> getDatachunk2(List<NodeHolder> nodeHolderList,
    List<ColumnSchema> columnSchema, SegmentProperties segmentProperties, int index,
    boolean isDimensionColumn) throws IOException {
  List<DataChunk2> colDataChunks = new ArrayList<DataChunk2>();
  DataChunk2 dataChunk = null;
  NodeHolder nodeHolder = null;
  for (int i = 0; i < nodeHolderList.size(); i++) {
    nodeHolder = nodeHolderList.get(i);
    dataChunk = new DataChunk2();
    dataChunk.min_max = new BlockletMinMaxIndex();
    dataChunk.setChunk_meta(getChunkCompressionMeta());
    dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
    List<Encoding> encodings = new ArrayList<Encoding>();
    if (isDimensionColumn) {
      dataChunk.setData_page_length(nodeHolder.getKeyLengths()[index]);
      if (containsEncoding(index, Encoding.DICTIONARY, columnSchema, segmentProperties)) {
        encodings.add(Encoding.DICTIONARY);
      }
      if (containsEncoding(index, Encoding.DIRECT_DICTIONARY, columnSchema, segmentProperties)) {
        encodings.add(Encoding.DIRECT_DICTIONARY);
      }
      dataChunk.setRowMajor(nodeHolder.getColGrpBlocks()[index]);
      if (nodeHolder.getAggBlocks()[index]) {
        dataChunk.setRle_page_length(nodeHolder.getDataIndexMapLength()[index]);
        encodings.add(Encoding.RLE);
      }
      dataChunk.setSort_state(nodeHolder.getIsSortedKeyBlock()[index]
          ? SortState.SORT_EXPLICIT : SortState.SORT_NATIVE);
      if (!nodeHolder.getIsSortedKeyBlock()[index]) {
        dataChunk.setRowid_page_length(nodeHolder.getKeyBlockIndexLength()[index]);
        encodings.add(Encoding.INVERTED_INDEX);
      }
      dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getColumnMaxData()[index]));
      dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getColumnMinData()[index]));
    } else {
      dataChunk.setData_page_length(nodeHolder.getDataArray()[index].length);
      dataChunk.setRowMajor(false);
      // TODO : Right now the encodings are happening at runtime. change as
      // per this encoders.
      encodings.add(Encoding.DELTA);
      dataChunk.setEncoders(encodings);
      // TODO writing dummy presence meta need to set actual presence
      // meta
      PresenceMeta presenceMeta = new PresenceMeta();
      presenceMeta.setPresent_bit_streamIsSet(true);
      presenceMeta.setPresent_bit_stream(CompressorFactory.getInstance().getCompressor()
          .compressByte(nodeHolder.getMeasureNullValueIndex()[index].toByteArray()));
      dataChunk.setPresence(presenceMeta);
      List<ByteBuffer> encoderMetaList = new ArrayList<ByteBuffer>();
      encoderMetaList.add(ByteBuffer.wrap(serializeEncodeMetaUsingByteBuffer(
          createValueEncoderMeta(nodeHolder.getCompressionModel(), index))));
      dataChunk.setEncoder_meta(encoderMetaList);
      dataChunk.min_max.addToMax_values(
          ByteBuffer.wrap(nodeHolder.getMeasureColumnMaxData()[index]));
      dataChunk.min_max.addToMin_values(
          ByteBuffer.wrap(nodeHolder.getMeasureColumnMinData()[index]));
    }
    dataChunk.setEncoders(encodings);
    colDataChunks.add(dataChunk);
  }
  return colDataChunks;
}
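Since this method is keyed by a single column index, a caller inside CarbonMetadataUtil could collect the page metadata column by column roughly as in the sketch below; dimensionColumnCount is an assumed variable, not taken from the project.

// Hypothetical sketch: build the DataChunk2 list for every dimension column
// across all node holders of the blocklet.
List<List<DataChunk2>> allDimensionChunks = new ArrayList<List<DataChunk2>>();
for (int columnIndex = 0; columnIndex < dimensionColumnCount; columnIndex++) {
  allDimensionChunks.add(
      getDatachunk2(nodeHolderList, columnSchema, segmentProperties, columnIndex, true));
}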
Use of org.apache.carbondata.format.DataChunk2 in project carbondata by apache.
The class CarbonFactDataWriterImplV2, method writeBlockletData.
/**
 * Below method will be used to write the data to the carbon data file
 *
 * @param holder node holder containing the blocklet data
 * @throws CarbonDataWriterException any problem in writing operation
 */
@Override
public void writeBlockletData(NodeHolder holder) throws CarbonDataWriterException {
  if (holder.getEntryCount() == 0) {
    return;
  }
  // running total used to calculate the size of the blocklet
  int size = 0;
  // get the blocklet info object
  BlockletInfoColumnar blockletInfo = getBlockletInfo(holder, 0);
  List<DataChunk2> datachunks = null;
  try {
    // get all the data chunks
    datachunks = CarbonMetadataUtil.getDatachunk2(blockletInfo, thriftColumnSchemaList,
        dataWriterVo.getSegmentProperties());
  } catch (IOException e) {
    throw new CarbonDataWriterException("Problem while getting the data chunks", e);
  }
  // data chunk byte array
  byte[][] dataChunkByteArray = new byte[datachunks.size()][];
  for (int i = 0; i < dataChunkByteArray.length; i++) {
    dataChunkByteArray[i] = CarbonUtil.getByteArray(datachunks.get(i));
    // add the data chunk size
    size += dataChunkByteArray[i].length;
  }
  // add row id index length
  for (int i = 0; i < holder.getKeyBlockIndexLength().length; i++) {
    size += holder.getKeyBlockIndexLength()[i];
  }
  // add rle index length
  for (int i = 0; i < holder.getDataIndexMapLength().length; i++) {
    size += holder.getDataIndexMapLength()[i];
  }
  // add dimension column data page and measure column data page size
  long blockletDataSize =
      holder.getTotalDimensionArrayLength() + holder.getTotalMeasureArrayLength() + size;
  // if size of the file already reached threshold size then create a new file and get the file
  // channel object
  updateBlockletFileChannel(blockletDataSize);
  // write the version header when a new file is started, this is done so the
  // carbondata file can be read separately
  try {
    if (fileChannel.size() == 0) {
      ColumnarFormatVersion version = CarbonProperties.getInstance().getFormatVersion();
      byte[] header = (CarbonCommonConstants.CARBON_DATA_VERSION_HEADER + version).getBytes();
      ByteBuffer buffer = ByteBuffer.allocate(header.length);
      buffer.put(header);
      buffer.rewind();
      fileChannel.write(buffer);
    }
  } catch (IOException e) {
    throw new CarbonDataWriterException("Problem while getting the file channel size", e);
  }
  // write data to file and get its offset
  writeDataToFile(holder, dataChunkByteArray, fileChannel);
  // add blocklet info to list
  blockletInfoList.add(blockletInfo);
  LOGGER.info("A new blocklet is added, its data size is: " + blockletDataSize + " Byte");
}
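The size bookkeeping above reduces to a simple sum; the numbers in the sketch below are purely illustrative and not measurements from a real load.

// Illustrative arithmetic only: assume the serialized DataChunk2 headers add up to
// 1200 bytes, the row id indexes to 800 bytes, the RLE indexes to 300 bytes, and the
// dimension/measure data pages to 40000 and 25000 bytes respectively.
int size = 1200 + 800 + 300;                     // chunk headers + row id + rle = 2300
long blockletDataSize = 40000L + 25000L + size;  // data pages + size = 67300
// updateBlockletFileChannel(blockletDataSize) then decides whether the current file
// still has room or a new carbondata file has to be started.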
Use of org.apache.carbondata.format.DataChunk2 in project carbondata by apache.
The class CompressedDimensionChunkFileBasedReaderV2, method convertToDimensionChunk.
public DimensionColumnDataChunk convertToDimensionChunk(
    DimensionRawColumnChunk dimensionRawColumnChunk, int pageNumber) throws IOException {
  byte[] dataPage = null;
  int[] invertedIndexes = null;
  int[] invertedIndexesReverse = null;
  int[] rlePage = null;
  DataChunk2 dimensionColumnChunk = null;
  int copySourcePoint = dimensionRawColumnChunk.getOffSet();
  int blockIndex = dimensionRawColumnChunk.getBlockletId();
  ByteBuffer rawData = dimensionRawColumnChunk.getRawData();
  if (dimensionChunksOffset.size() - 1 == blockIndex) {
    dimensionColumnChunk =
        CarbonUtil.readDataChunk(rawData, copySourcePoint, dimensionRawColumnChunk.getLength());
    int totalDimensionDataLength = dimensionColumnChunk.data_page_length
        + dimensionColumnChunk.rle_page_length + dimensionColumnChunk.rowid_page_length;
    synchronized (dimensionRawColumnChunk.getFileReader()) {
      rawData = dimensionRawColumnChunk.getFileReader().readByteBuffer(filePath,
          dimensionChunksOffset.get(blockIndex) + dimensionChunksLength.get(blockIndex),
          totalDimensionDataLength);
    }
  } else {
    dimensionColumnChunk =
        CarbonUtil.readDataChunk(rawData, copySourcePoint, dimensionChunksLength.get(blockIndex));
    copySourcePoint += dimensionChunksLength.get(blockIndex);
  }
  // first read the data page and uncompress it
  dataPage = COMPRESSOR.unCompressByte(rawData.array(), copySourcePoint,
      dimensionColumnChunk.data_page_length);
  copySourcePoint += dimensionColumnChunk.data_page_length;
  // if row id block is present then read the row id chunk and uncompress it
  if (hasEncoding(dimensionColumnChunk.encoders, Encoding.INVERTED_INDEX)) {
    byte[] dataInv = new byte[dimensionColumnChunk.rowid_page_length];
    rawData.position(copySourcePoint);
    rawData.get(dataInv);
    invertedIndexes = CarbonUtil.getUnCompressColumnIndex(
        dimensionColumnChunk.rowid_page_length, dataInv, numberComressor, 0);
    copySourcePoint += dimensionColumnChunk.rowid_page_length;
    // get the reverse index
    invertedIndexesReverse = getInvertedReverseIndex(invertedIndexes);
  }
  // then expand the actual data based on the rle block
  if (hasEncoding(dimensionColumnChunk.encoders, Encoding.RLE)) {
    byte[] dataRle = new byte[dimensionColumnChunk.rle_page_length];
    rawData.position(copySourcePoint);
    rawData.get(dataRle);
    rlePage = numberComressor.unCompress(dataRle, 0, dimensionColumnChunk.rle_page_length);
    // uncompress the data with rle indexes
    dataPage = UnBlockIndexer.uncompressData(dataPage, rlePage, eachColumnValueSize[blockIndex]);
  }
  // fill chunk attributes and set the data to the right data chunk instance
  DimensionColumnDataChunk columnDataChunk = null;
  if (dimensionColumnChunk.isRowMajor()) {
    // to store column group (row major) column chunk values
    columnDataChunk = new ColumnGroupDimensionDataChunk(dataPage,
        eachColumnValueSize[blockIndex], numberOfRows);
  } else if (!hasEncoding(dimensionColumnChunk.encoders, Encoding.DICTIONARY)) {
    // to store variable length (no dictionary) column chunk values
    columnDataChunk = new VariableLengthDimensionDataChunk(dataPage, invertedIndexes,
        invertedIndexesReverse, numberOfRows);
  } else {
    // to store fixed length column chunk values
    columnDataChunk = new FixedLengthDimensionDataChunk(dataPage, invertedIndexes,
        invertedIndexesReverse, numberOfRows, eachColumnValueSize[blockIndex]);
  }
  return columnDataChunk;
}
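The read order above implies the on-disk layout of a V2 dimension column chunk: the DataChunk2 header, then the compressed data page, then the optional row id (inverted index) page, then the optional RLE page. Below is a hedged sketch of how the read offset advances, where chunkHeaderLength stands in for dimensionChunksLength.get(blockIndex) and the other names match the method above.

// Sketch of the offset arithmetic only, not project code.
int offset = dimensionRawColumnChunk.getOffSet();
offset += chunkHeaderLength;                        // serialized DataChunk2 header
offset += dimensionColumnChunk.data_page_length;    // compressed data page
if (hasEncoding(dimensionColumnChunk.encoders, Encoding.INVERTED_INDEX)) {
  offset += dimensionColumnChunk.rowid_page_length; // compressed row id page
}
if (hasEncoding(dimensionColumnChunk.encoders, Encoding.RLE)) {
  offset += dimensionColumnChunk.rle_page_length;   // compressed RLE page
}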