Use of org.apache.carbondata.format.Encoding in project carbondata by apache.
In the class DictDimensionIndexCodec, the method createEncoder:
@Override
public ColumnPageEncoder createEncoder(Map<String, String> parameter) {
  return new IndexStorageEncoder() {
    @Override
    void encodeIndexStorage(ColumnPage inputPage) {
      IndexStorage indexStorage;
      byte[][] data = inputPage.getByteArrayPage();
      if (isInvertedIndex) {
        indexStorage = new BlockIndexerStorageForShort(data, true, false, isSort);
      } else {
        indexStorage = new BlockIndexerStorageForNoInvertedIndexForShort(data, false);
      }
      byte[] flattened = ByteUtil.flatten(indexStorage.getDataPage());
      super.compressedDataPage = compressor.compressByte(flattened);
      super.indexStorage = indexStorage;
    }

    @Override
    protected List<Encoding> getEncodingList() {
      List<Encoding> encodings = new ArrayList<>();
      encodings.add(Encoding.DICTIONARY);
      encodings.add(Encoding.RLE);
      if (isInvertedIndex) {
        encodings.add(Encoding.INVERTED_INDEX);
      }
      return encodings;
    }
  };
}
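Here encodeIndexStorage flattens the two-dimensional data page into one contiguous byte array before compressing it. Below is a minimal stand-in for ByteUtil.flatten, assuming it simply concatenates the row arrays in order; this is our assumption for illustration, not the project's actual implementation.

import java.io.ByteArrayOutputStream;

public class FlattenSketch {
  // Hedged stand-in for ByteUtil.flatten: concatenate all rows of the page in order.
  static byte[] flatten(byte[][] dataPage) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    for (byte[] row : dataPage) {
      out.write(row, 0, row.length);
    }
    return out.toByteArray();
  }

  public static void main(String[] args) {
    byte[][] page = { {1, 2}, {3}, {4, 5, 6} };
    System.out.println(flatten(page).length); // 6 bytes, rows concatenated in order
  }
}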
Use of org.apache.carbondata.format.Encoding in project carbondata by apache.
In the class CarbonMetadataUtil, the method getDataChunk2:
/**
 * Below method will be used to get the list of serialized DataChunk2 objects
 *
 * @param nodeHolder node holder
 * @param columnSchema table columns
 * @param segmentProperties segment properties
 * @param isDimensionColumn true to build the list for dimension columns, false for measure columns
 * @return list of serialized DataChunk2 objects
 * @throws IOException if serialization fails
 */
public static List<byte[]> getDataChunk2(NodeHolder nodeHolder, List<ColumnSchema> columnSchema,
    SegmentProperties segmentProperties, boolean isDimensionColumn) throws IOException {
  List<byte[]> dataChunkBuffer = new ArrayList<>();
  if (isDimensionColumn) {
    for (int i = 0; i < nodeHolder.getKeyArray().length; i++) {
      DataChunk2 dataChunk = new DataChunk2();
      dataChunk.min_max = new BlockletMinMaxIndex();
      dataChunk.setChunk_meta(getChunkCompressionMeta());
      dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
      List<Encoding> encodings = new ArrayList<Encoding>();
      dataChunk.setData_page_length(nodeHolder.getKeyLengths()[i]);
      if (containsEncoding(i, Encoding.DICTIONARY, columnSchema, segmentProperties)) {
        encodings.add(Encoding.DICTIONARY);
      }
      if (containsEncoding(i, Encoding.DIRECT_DICTIONARY, columnSchema, segmentProperties)) {
        encodings.add(Encoding.DIRECT_DICTIONARY);
      }
      dataChunk.setRowMajor(nodeHolder.getColGrpBlocks()[i]);
      if (nodeHolder.getAggBlocks()[i]) {
        dataChunk.setRle_page_length(nodeHolder.getDataIndexMapLength()[i]);
        encodings.add(Encoding.RLE);
      }
      dataChunk.setSort_state(
          nodeHolder.getIsSortedKeyBlock()[i] ? SortState.SORT_EXPLICIT : SortState.SORT_NATIVE);
      if (!nodeHolder.getIsSortedKeyBlock()[i]) {
        dataChunk.setRowid_page_length(nodeHolder.getKeyBlockIndexLength()[i]);
        encodings.add(Encoding.INVERTED_INDEX);
      }
      dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getColumnMaxData()[i]));
      dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getColumnMinData()[i]));
      dataChunk.setEncoders(encodings);
      dataChunkBuffer.add(CarbonUtil.getByteArray(dataChunk));
    }
  } else {
    for (int i = 0; i < nodeHolder.getDataArray().length; i++) {
      DataChunk2 dataChunk = new DataChunk2();
      dataChunk.min_max = new BlockletMinMaxIndex();
      dataChunk.setChunk_meta(getChunkCompressionMeta());
      dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
      dataChunk.setData_page_length(nodeHolder.getDataArray()[i].length);
      // TODO: right now the encodings are decided at runtime; change to honor
      // these encoders.
      List<Encoding> encodings = new ArrayList<Encoding>();
      encodings.add(Encoding.DELTA);
      dataChunk.setEncoders(encodings);
      dataChunk.setRowMajor(false);
      // TODO: writing dummy presence meta; the actual presence meta needs to be set.
      PresenceMeta presenceMeta = new PresenceMeta();
      presenceMeta.setPresent_bit_streamIsSet(true);
      presenceMeta.setPresent_bit_stream(CompressorFactory.getInstance().getCompressor()
          .compressByte(nodeHolder.getMeasureNullValueIndex()[i].toByteArray()));
      dataChunk.setPresence(presenceMeta);
      List<ByteBuffer> encoderMetaList = new ArrayList<ByteBuffer>();
      encoderMetaList.add(ByteBuffer.wrap(serializeEncodeMetaUsingByteBuffer(
          createValueEncoderMeta(nodeHolder.getCompressionModel(), i))));
      dataChunk.setEncoder_meta(encoderMetaList);
      dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMaxData()[i]));
      dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMinData()[i]));
      dataChunkBuffer.add(CarbonUtil.getByteArray(dataChunk));
    }
  }
  return dataChunkBuffer;
}
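The dimension branch above derives the Encoding list for each key column from four independent flags. Here is a distilled, self-contained illustration of that selection logic; the local Encoding enum and all names are stand-ins for illustration, not the Thrift-generated org.apache.carbondata.format.Encoding.

import java.util.ArrayList;
import java.util.List;

public class EncodingSelectionSketch {
  // Local stand-in enum, not the Thrift-generated type.
  enum Encoding { DICTIONARY, DIRECT_DICTIONARY, RLE, INVERTED_INDEX }

  // Mirrors the dimension branch above: each flag independently contributes an encoding.
  static List<Encoding> dimensionEncodings(boolean dictionary, boolean directDictionary,
      boolean rleBlock, boolean sortedKeyBlock) {
    List<Encoding> encodings = new ArrayList<>();
    if (dictionary) encodings.add(Encoding.DICTIONARY);
    if (directDictionary) encodings.add(Encoding.DIRECT_DICTIONARY);
    if (rleBlock) encodings.add(Encoding.RLE);                   // agg/RLE block was written
    if (!sortedKeyBlock) encodings.add(Encoding.INVERTED_INDEX); // row-id page was written
    return encodings;
  }

  public static void main(String[] args) {
    // A dictionary column with an RLE block whose key block is not sorted:
    System.out.println(dimensionEncodings(true, false, true, false));
    // -> [DICTIONARY, RLE, INVERTED_INDEX]
  }
}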
Use of org.apache.carbondata.format.Encoding in project carbondata by apache.
In the class CarbonMetadataUtil, the method getBlockletInfo:
private static BlockletInfo getBlockletInfo(BlockletInfoColumnar blockletInfoColumnar,
    List<ColumnSchema> columnSchema, SegmentProperties segmentProperties) throws IOException {
  BlockletInfo blockletInfo = new BlockletInfo();
  blockletInfo.setNum_rows(blockletInfoColumnar.getNumberOfKeys());
  List<DataChunk> colDataChunks = new ArrayList<DataChunk>();
  int j = 0;
  int aggregateIndex = 0;
  boolean[] isSortedKeyColumn = blockletInfoColumnar.getIsSortedKeyColumn();
  boolean[] aggKeyBlock = blockletInfoColumnar.getAggKeyBlock();
  boolean[] colGrpblock = blockletInfoColumnar.getColGrpBlocks();
  for (int i = 0; i < blockletInfoColumnar.getKeyLengths().length; i++) {
    DataChunk dataChunk = new DataChunk();
    dataChunk.setChunk_meta(getChunkCompressionMeta());
    List<Encoding> encodings = new ArrayList<Encoding>();
    if (containsEncoding(i, Encoding.DICTIONARY, columnSchema, segmentProperties)) {
      encodings.add(Encoding.DICTIONARY);
    }
    if (containsEncoding(i, Encoding.DIRECT_DICTIONARY, columnSchema, segmentProperties)) {
      encodings.add(Encoding.DIRECT_DICTIONARY);
    }
    dataChunk.setRowMajor(colGrpblock[i]);
    // TODO: once the schema PR is merged, the column information needs to be
    // passed here.
    dataChunk.setColumn_ids(new ArrayList<Integer>());
    dataChunk.setData_page_length(blockletInfoColumnar.getKeyLengths()[i]);
    dataChunk.setData_page_offset(blockletInfoColumnar.getKeyOffSets()[i]);
    if (aggKeyBlock[i]) {
      dataChunk.setRle_page_offset(blockletInfoColumnar.getDataIndexMapOffsets()[aggregateIndex]);
      dataChunk.setRle_page_length(blockletInfoColumnar.getDataIndexMapLength()[aggregateIndex]);
      encodings.add(Encoding.RLE);
      aggregateIndex++;
    }
    dataChunk.setSort_state(
        isSortedKeyColumn[i] ? SortState.SORT_EXPLICIT : SortState.SORT_NATIVE);
    if (!isSortedKeyColumn[i]) {
      dataChunk.setRowid_page_offset(blockletInfoColumnar.getKeyBlockIndexOffSets()[j]);
      dataChunk.setRowid_page_length(blockletInfoColumnar.getKeyBlockIndexLength()[j]);
      if (!encodings.contains(Encoding.INVERTED_INDEX)) {
        encodings.add(Encoding.INVERTED_INDEX);
      }
      j++;
    }
    // TODO: right now the encodings are decided at runtime; change to honor
    // these encoders.
    dataChunk.setEncoders(encodings);
    colDataChunks.add(dataChunk);
  }
  for (int i = 0; i < blockletInfoColumnar.getMeasureLength().length; i++) {
    DataChunk dataChunk = new DataChunk();
    dataChunk.setChunk_meta(getChunkCompressionMeta());
    dataChunk.setRowMajor(false);
    // TODO: once the schema PR is merged, the column information needs to be
    // passed here.
    dataChunk.setColumn_ids(new ArrayList<Integer>());
    dataChunk.setData_page_length(blockletInfoColumnar.getMeasureLength()[i]);
    dataChunk.setData_page_offset(blockletInfoColumnar.getMeasureOffset()[i]);
    // TODO: right now the encodings are decided at runtime; change to honor
    // these encoders.
    List<Encoding> encodings = new ArrayList<Encoding>();
    encodings.add(Encoding.DELTA);
    dataChunk.setEncoders(encodings);
    // TODO: writing dummy presence meta; the actual presence meta needs to be set.
    PresenceMeta presenceMeta = new PresenceMeta();
    presenceMeta.setPresent_bit_streamIsSet(true);
    presenceMeta.setPresent_bit_stream(
        blockletInfoColumnar.getMeasureNullValueIndex()[i].toByteArray());
    dataChunk.setPresence(presenceMeta);
    // TODO: the ValueCompression meta needs to be written here.
    List<ByteBuffer> encoderMetaList = new ArrayList<ByteBuffer>();
    encoderMetaList.add(ByteBuffer.wrap(serializeEncoderMeta(
        createValueEncoderMeta(blockletInfoColumnar.getCompressionModel(), i))));
    dataChunk.setEncoder_meta(encoderMetaList);
    colDataChunks.add(dataChunk);
  }
  blockletInfo.setColumn_data_chunks(colDataChunks);
  return blockletInfo;
}
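Note the two independent running counters: aggregateIndex advances only for columns that have an RLE (agg) block, and j only for unsorted key columns, so the RLE and row-id page arrays are packed densely rather than indexed by column position. A toy illustration of that bookkeeping, with invented flag values:

public class PageIndexBookkeeping {
  public static void main(String[] args) {
    boolean[] aggKeyBlock       = { true, false, true  }; // columns with an RLE block
    boolean[] isSortedKeyColumn = { true, false, false }; // columns without a row-id page
    int aggregateIndex = 0;
    int j = 0;
    for (int i = 0; i < aggKeyBlock.length; i++) {
      if (aggKeyBlock[i]) {
        System.out.println("column " + i + " -> rle page slot " + aggregateIndex++);
      }
      if (!isSortedKeyColumn[i]) {
        System.out.println("column " + i + " -> rowid page slot " + j++);
      }
    }
    // column 0 -> rle page slot 0
    // column 1 -> rowid page slot 0
    // column 2 -> rle page slot 1
    // column 2 -> rowid page slot 1
  }
}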
Use of org.apache.carbondata.format.Encoding in project carbondata by apache.
In the class CarbonMetadataUtil, the method getDatachunk2:
/**
 * Below method will be used to get the DataChunk2 object of a single column
 * for every page (node holder) in the blocklet
 *
 * @param nodeHolderList blocklet info
 * @param columnSchema list of columns
 * @param segmentProperties segment properties
 * @param index column index within the dimension or measure arrays
 * @param isDimensionColumn true if the column is a dimension, false for a measure
 * @return list of data chunks
 * @throws IOException if serialization of the encoder meta fails
 */
private static List<DataChunk2> getDatachunk2(List<NodeHolder> nodeHolderList,
    List<ColumnSchema> columnSchema, SegmentProperties segmentProperties, int index,
    boolean isDimensionColumn) throws IOException {
  List<DataChunk2> colDataChunks = new ArrayList<DataChunk2>();
  DataChunk2 dataChunk = null;
  NodeHolder nodeHolder = null;
  for (int i = 0; i < nodeHolderList.size(); i++) {
    nodeHolder = nodeHolderList.get(i);
    dataChunk = new DataChunk2();
    dataChunk.min_max = new BlockletMinMaxIndex();
    dataChunk.setChunk_meta(getChunkCompressionMeta());
    dataChunk.setNumberOfRowsInpage(nodeHolder.getEntryCount());
    List<Encoding> encodings = new ArrayList<Encoding>();
    if (isDimensionColumn) {
      dataChunk.setData_page_length(nodeHolder.getKeyLengths()[index]);
      if (containsEncoding(index, Encoding.DICTIONARY, columnSchema, segmentProperties)) {
        encodings.add(Encoding.DICTIONARY);
      }
      if (containsEncoding(index, Encoding.DIRECT_DICTIONARY, columnSchema, segmentProperties)) {
        encodings.add(Encoding.DIRECT_DICTIONARY);
      }
      dataChunk.setRowMajor(nodeHolder.getColGrpBlocks()[index]);
      // TODO: once the schema PR is merged, the column information needs to be
      // passed here.
      if (nodeHolder.getAggBlocks()[index]) {
        dataChunk.setRle_page_length(nodeHolder.getDataIndexMapLength()[index]);
        encodings.add(Encoding.RLE);
      }
      dataChunk.setSort_state(nodeHolder.getIsSortedKeyBlock()[index]
          ? SortState.SORT_EXPLICIT : SortState.SORT_NATIVE);
      if (!nodeHolder.getIsSortedKeyBlock()[index]) {
        dataChunk.setRowid_page_length(nodeHolder.getKeyBlockIndexLength()[index]);
        encodings.add(Encoding.INVERTED_INDEX);
      }
      dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getColumnMaxData()[index]));
      dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getColumnMinData()[index]));
    } else {
      dataChunk.setData_page_length(nodeHolder.getDataArray()[index].length);
      // TODO: right now the encodings are decided at runtime; change to honor
      // these encoders.
      encodings.add(Encoding.DELTA);
      dataChunk.setRowMajor(false);
      // TODO: writing dummy presence meta; the actual presence meta needs to be set.
      PresenceMeta presenceMeta = new PresenceMeta();
      presenceMeta.setPresent_bit_streamIsSet(true);
      presenceMeta.setPresent_bit_stream(CompressorFactory.getInstance().getCompressor()
          .compressByte(nodeHolder.getMeasureNullValueIndex()[index].toByteArray()));
      dataChunk.setPresence(presenceMeta);
      List<ByteBuffer> encoderMetaList = new ArrayList<ByteBuffer>();
      encoderMetaList.add(ByteBuffer.wrap(serializeEncodeMetaUsingByteBuffer(
          createValueEncoderMeta(nodeHolder.getCompressionModel(), index))));
      dataChunk.setEncoder_meta(encoderMetaList);
      dataChunk.min_max.addToMax_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMaxData()[index]));
      dataChunk.min_max.addToMin_values(ByteBuffer.wrap(nodeHolder.getMeasureColumnMinData()[index]));
    }
    dataChunk.setEncoders(encodings);
    colDataChunks.add(dataChunk);
  }
  return colDataChunks;
}
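The measure branch serializes a per-page null bitmap into PresenceMeta. The .toByteArray() call suggests getMeasureNullValueIndex() exposes a java.util.BitSet per measure; that reading is our assumption. A small self-contained sketch of the presence-bitmap idea, with the compression step omitted:

import java.util.BitSet;

public class PresenceBitmapSketch {
  public static void main(String[] args) {
    BitSet nullRows = new BitSet();
    nullRows.set(3); // row 3 of the measure page is null
    nullRows.set(7); // row 7 of the measure page is null
    byte[] bitStream = nullRows.toByteArray(); // what the writer would store (uncompressed here)
    System.out.println(BitSet.valueOf(bitStream)); // {3, 7} recovered on the read path
  }
}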
Use of org.apache.carbondata.format.Encoding in project carbondata by apache.
In the class CompressedMeasureChunkFileBasedReaderV3, the method decodeMeasure:
/**
 * Decode measure column page with page header and raw data starting from offset
 */
protected ColumnPage decodeMeasure(DataChunk2 pageMetadata, ByteBuffer pageData, int offset)
    throws MemoryException, IOException {
  List<Encoding> encodings = pageMetadata.getEncoders();
  List<ByteBuffer> encoderMetas = pageMetadata.getEncoder_meta();
  ColumnPageDecoder codec = encodingFactory.createDecoder(encodings, encoderMetas);
  return codec.decode(pageData.array(), offset, pageMetadata.data_page_length);
}
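This is the read-side counterpart of the writer methods above: the Encoding list and encoder metadata recorded per page select the matching decoder. A toy, self-contained illustration of that factory symmetry; all types here are local stand-ins, not the CarbonData EncodingFactory API.

import java.util.Arrays;
import java.util.List;

public class DecoderFactorySketch {
  enum Encoding { DELTA }

  interface Decoder { int[] decode(int[] page); }

  // The encoding recorded at write time selects the matching decoder at read time.
  static Decoder createDecoder(List<Encoding> encodings) {
    if (encodings.contains(Encoding.DELTA)) {
      return page -> {                 // undo delta coding: prefix-sum the deltas
        int[] out = page.clone();
        for (int i = 1; i < out.length; i++) out[i] += out[i - 1];
        return out;
      };
    }
    throw new IllegalArgumentException("unsupported encodings: " + encodings);
  }

  public static void main(String[] args) {
    int[] deltas = { 10, 2, 3 };       // written page: base 10, then +2, +3
    Decoder d = createDecoder(Arrays.asList(Encoding.DELTA));
    System.out.println(Arrays.toString(d.decode(deltas))); // [10, 12, 15]
  }
}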