Use of org.apache.carbondata.format.LocalDictionaryChunk in project carbondata by apache.
From the class ColumnPageEncoder, method encodeDictionary.
/**
 * Encodes the supplied dictionary column page and packages the encoded bytes
 * together with their encoding metadata into a thrift LocalDictionaryChunk.
 *
 * @param dictionaryPage dictionary column page to encode
 * @return local dictionary chunk carrying the encoded data and its metadata
 * @throws IOException if encoding the page fails
 */
public LocalDictionaryChunk encodeDictionary(ColumnPage dictionaryPage) throws IOException {
  LocalDictionaryChunk dictionaryChunk = new LocalDictionaryChunk();
  // encode the raw dictionary data first; metadata is derived afterwards
  dictionaryChunk.setDictionary_data(encodeData(dictionaryPage));
  LocalDictionaryChunkMeta chunkMeta = new LocalDictionaryChunkMeta();
  // record which encodings were applied plus the serialized encoder metadata,
  // so readers can reconstruct a matching decoder
  chunkMeta.setEncoders(getEncodingList());
  chunkMeta.setEncoder_meta(buildEncoderMeta(dictionaryPage));
  dictionaryChunk.setDictionary_meta(chunkMeta);
  return dictionaryChunk;
}
Use of org.apache.carbondata.format.LocalDictionaryChunk in project carbondata by apache.
From the class TestPageLevelDictionary, method testPageLevelDictionaryContainsOnlyUsedDictionaryValuesWhenMultiplePagesUseSameDictionary.
/**
 * Verifies that when two pages share the same local dictionary generator,
 * the blocklet-level dictionary chunk built from page 1 contains exactly the
 * ten values page 1 used, even though page 2 added other values in between.
 */
@Test
public void testPageLevelDictionaryContainsOnlyUsedDictionaryValuesWhenMultiplePagesUseSameDictionary() {
  LocalDictionaryGenerator generator = new ColumnLocalDictionaryGenerator(1000, 2);
  String columnName = "column1";
  PageLevelDictionary pageLevelDictionary1 =
      new PageLevelDictionary(generator, columnName, DataTypes.STRING, false, compressorName);
  byte[][] validateData = new byte[10][];
  int index = 0;
  try {
    for (int i = 1; i <= 5; i++) {
      byte[] data = ("vishal" + i).getBytes();
      // LV (length-value) encode: 2-byte length prefix followed by the bytes
      ByteBuffer byteBuffer = ByteBuffer.allocate(data.length + 2);
      byteBuffer.putShort((short) data.length);
      byteBuffer.put(data);
      validateData[index++] = data;
      pageLevelDictionary1.getDictionaryValue(byteBuffer.array());
    }
  } catch (DictionaryThresholdReachedException e) {
    // 5 distinct values is far below the 1000-value threshold
    Assert.fail("unexpected dictionary threshold breach: " + e.getMessage());
  }
  PageLevelDictionary pageLevelDictionary2 =
      new PageLevelDictionary(generator, columnName, DataTypes.STRING, false, compressorName);
  try {
    for (int i = 1; i <= 5; i++) {
      byte[] data = ("vikas" + i).getBytes();
      ByteBuffer byteBuffer = ByteBuffer.allocate(data.length + 2);
      byteBuffer.putShort((short) data.length);
      byteBuffer.put(data);
      pageLevelDictionary2.getDictionaryValue(byteBuffer.array());
    }
  } catch (DictionaryThresholdReachedException e) {
    Assert.fail("unexpected dictionary threshold breach: " + e.getMessage());
  }
  try {
    for (int i = 6; i <= 10; i++) {
      byte[] data = ("vishal" + i).getBytes();
      ByteBuffer byteBuffer = ByteBuffer.allocate(data.length + 2);
      byteBuffer.putShort((short) data.length);
      byteBuffer.put(data);
      validateData[index++] = data;
      pageLevelDictionary1.getDictionaryValue(byteBuffer.array());
    }
  } catch (DictionaryThresholdReachedException e) {
    Assert.fail("unexpected dictionary threshold breach: " + e.getMessage());
  }
  try {
    for (int i = 6; i <= 10; i++) {
      byte[] data = ("vikas" + i).getBytes();
      ByteBuffer byteBuffer = ByteBuffer.allocate(data.length + 2);
      byteBuffer.putShort((short) data.length);
      byteBuffer.put(data);
      pageLevelDictionary2.getDictionaryValue(byteBuffer.array());
    }
  } catch (DictionaryThresholdReachedException e) {
    Assert.fail("unexpected dictionary threshold breach: " + e.getMessage());
  }
  try {
    LocalDictionaryChunk localDictionaryChunkForBlocklet =
        pageLevelDictionary1.getLocalDictionaryChunkForBlocklet();
    List<Encoding> encodings = localDictionaryChunkForBlocklet.getDictionary_meta().getEncoders();
    EncodingFactory encodingFactory = DefaultEncodingFactory.getInstance();
    List<ByteBuffer> encoderMetas =
        localDictionaryChunkForBlocklet.getDictionary_meta().getEncoder_meta();
    ColumnPageDecoder decoder = encodingFactory.createDecoder(encodings, encoderMetas, compressorName);
    ColumnPage decode = decoder.decode(localDictionaryChunkForBlocklet.getDictionary_data(), 0,
        localDictionaryChunkForBlocklet.getDictionary_data().length);
    BitSet bitSet = BitSet.valueOf(CompressorFactory.getInstance().getCompressor(compressorName)
        .unCompressByte(localDictionaryChunkForBlocklet.getDictionary_values()));
    // page 1 used exactly the 10 "vishal" values; page 2's values must be absent
    Assert.assertEquals(validateData.length, bitSet.cardinality());
    for (int i = 0; i < validateData.length; i++) {
      Assert.assertArrayEquals(validateData[i], decode.getBytes(i));
    }
  } catch (IOException e) {
    Assert.fail("decoding local dictionary failed: " + e.getMessage());
  }
}
Use of org.apache.carbondata.format.LocalDictionaryChunk in project carbondata by apache.
From the class TestPageLevelDictionary, method testPageLevelDictionaryContainsOnlyUsedDictionaryValues.
/**
 * Verifies that a page-level dictionary holds only the values its own page
 * used: page 1's blocklet dictionary chunk must decode back to the 500
 * "vishal" values even though a second page added 500 "vikas" values to the
 * same shared generator.
 *
 * Fixes the original validation loop, which called Arrays.equals and
 * discarded the result, so decoded values were never actually asserted.
 */
@Test
public void testPageLevelDictionaryContainsOnlyUsedDictionaryValues() {
  LocalDictionaryGenerator generator = new ColumnLocalDictionaryGenerator(1000, 2);
  String columnName = "column1";
  PageLevelDictionary pageLevelDictionary1 =
      new PageLevelDictionary(generator, columnName, DataTypes.STRING, false, compressorName);
  byte[][] validateData = new byte[500][];
  try {
    for (int i = 1; i <= 500; i++) {
      byte[] data = ("vishal" + i).getBytes();
      // LV (length-value) encode: 2-byte length prefix followed by the bytes
      ByteBuffer byteBuffer = ByteBuffer.allocate(data.length + 2);
      byteBuffer.putShort((short) data.length);
      byteBuffer.put(data);
      validateData[i - 1] = data;
      pageLevelDictionary1.getDictionaryValue(byteBuffer.array());
    }
  } catch (DictionaryThresholdReachedException e) {
    // 500 distinct values stays below the 1000-value threshold
    Assert.fail("unexpected dictionary threshold breach: " + e.getMessage());
  }
  PageLevelDictionary pageLevelDictionary2 =
      new PageLevelDictionary(generator, columnName, DataTypes.STRING, false, compressorName);
  try {
    for (int i = 1; i <= 500; i++) {
      byte[] data = ("vikas" + i).getBytes();
      ByteBuffer byteBuffer = ByteBuffer.allocate(data.length + 2);
      byteBuffer.putShort((short) data.length);
      byteBuffer.put(data);
      pageLevelDictionary2.getDictionaryValue(byteBuffer.array());
    }
  } catch (DictionaryThresholdReachedException e) {
    Assert.fail("unexpected dictionary threshold breach: " + e.getMessage());
  }
  try {
    LocalDictionaryChunk localDictionaryChunkForBlocklet =
        pageLevelDictionary1.getLocalDictionaryChunkForBlocklet();
    List<Encoding> encodings = localDictionaryChunkForBlocklet.getDictionary_meta().getEncoders();
    EncodingFactory encodingFactory = DefaultEncodingFactory.getInstance();
    List<ByteBuffer> encoderMetas =
        localDictionaryChunkForBlocklet.getDictionary_meta().getEncoder_meta();
    ColumnPageDecoder decoder = encodingFactory.createDecoder(encodings, encoderMetas, compressorName);
    ColumnPage decode = decoder.decode(localDictionaryChunkForBlocklet.getDictionary_data(), 0,
        localDictionaryChunkForBlocklet.getDictionary_data().length);
    for (int i = 0; i < 500; i++) {
      // previously Arrays.equals(...) was called with its result ignored
      Assert.assertArrayEquals(validateData[i], decode.getBytes(i));
    }
  } catch (IOException e) {
    Assert.fail("decoding local dictionary failed: " + e.getMessage());
  }
}
Use of org.apache.carbondata.format.LocalDictionaryChunk in project carbondata by apache.
From the class PageLevelDictionary, method getLocalDictionaryChunkForBlocklet.
/**
 * Below method will be used to get the local dictionary chunk for writing.
 * Builds a byte-array column page holding only the dictionary values whose
 * bits are set in {@code usedDictionaryValues}, encodes it, and attaches the
 * compressed used-values bitmap so readers can map page rows back to
 * dictionary surrogates.
 * @TODO Support for numeric data type dictionary exclude columns
 * @return encoded local dictionary chunk
 * @throws IOException
 * in case of problem in encoding
 */
public LocalDictionaryChunk getLocalDictionaryChunkForBlocklet() throws IOException {
// TODO support for actual data type dictionary ColumnSPEC
ColumnType columnType = ColumnType.PLAIN_VALUE;
boolean isVarcharType = false;
// size of the length prefix for LV (length-value) encoding; short by default
int lvSize = CarbonCommonConstants.SHORT_SIZE_IN_BYTE;
if (DataTypes.VARCHAR == dataType) {
// VARCHAR uses an int length prefix and the long-value column type
columnType = ColumnType.PLAIN_LONG_VALUE;
lvSize = CarbonCommonConstants.INT_SIZE_IN_BYTE;
isVarcharType = true;
}
TableSpec.ColumnSpec spec = TableSpec.ColumnSpec.newInstance(columnName, DataTypes.BYTE_ARRAY, columnType);
// page is sized to only the dictionary values actually used by this blocklet
ColumnPage dictionaryColumnPage = ColumnPage.newPage(new ColumnPageEncoderMeta(spec, DataTypes.BYTE_ARRAY, columnCompressor), usedDictionaryValues.cardinality());
// TODO support data type specific stats collector for numeric data types
dictionaryColumnPage.setStatsCollector(new DummyStatsCollector());
int rowId = 0;
ByteBuffer byteBuffer = null;
// iterate set bits only, i.e. the dictionary values used in this blocklet
for (int i = usedDictionaryValues.nextSetBit(0); i >= 0; i = usedDictionaryValues.nextSetBit(i + 1)) {
if (!isComplexTypePrimitive) {
// non-complex columns: store the generator's key bytes directly
// (presumably already LV encoded by the generator — TODO confirm)
dictionaryColumnPage.putData(rowId++, localDictionaryGenerator.getDictionaryKeyBasedOnValue(i));
} else {
// complex-primitive columns: prepend an explicit length prefix
// (short, or int for VARCHAR) before storing the key bytes
byte[] dictionaryKeyBasedOnValue = localDictionaryGenerator.getDictionaryKeyBasedOnValue(i);
byteBuffer = ByteBuffer.allocate(lvSize + dictionaryKeyBasedOnValue.length);
if (!isVarcharType) {
byteBuffer.putShort((short) dictionaryKeyBasedOnValue.length);
} else {
byteBuffer.putInt(dictionaryKeyBasedOnValue.length);
}
byteBuffer.put(dictionaryKeyBasedOnValue);
dictionaryColumnPage.putData(rowId++, byteBuffer.array());
}
}
// creating a encoder
ColumnPageEncoder encoder = new DirectCompressCodec(DataTypes.BYTE_ARRAY).createEncoder(null);
// get encoded dictionary values
LocalDictionaryChunk localDictionaryChunk = encoder.encodeDictionary(dictionaryColumnPage);
// set compressed dictionary values (the used-values bitmap, compressed with
// the same compressor as the column data)
localDictionaryChunk.setDictionary_values(CompressorFactory.getInstance().getCompressor(columnCompressor).compressByte(usedDictionaryValues.toByteArray()));
// free the dictionary page memory
dictionaryColumnPage.freeMemory();
return localDictionaryChunk;
}
Use of org.apache.carbondata.format.LocalDictionaryChunk in project carbondata by apache.
From the class CarbonTestUtil, method validateDictionary.
/**
 * Checks whether every value in {@code data} is present in the local
 * dictionary carried by the given raw column chunk.
 *
 * @param rawColumnPage raw dimension chunk whose V3 data chunk may carry a local dictionary
 * @param data expected dictionary values, encoded with the default charset constant
 * @return true when a local dictionary exists and contains all values, false otherwise
 * @throws IOException if decoding the dictionary page fails
 */
public static Boolean validateDictionary(DimensionRawColumnChunk rawColumnPage, String[] data) throws IOException {
  LocalDictionaryChunk localDictionary = rawColumnPage.getDataChunkV3().local_dictionary;
  if (null == localDictionary) {
    // no local dictionary was written for this chunk
    return false;
  }
  String compressorName = CarbonMetadataUtil.getCompressorNameFromChunkMeta(
      rawColumnPage.getDataChunkV3().getData_chunk_list().get(0).getChunk_meta());
  List<org.apache.carbondata.format.Encoding> encodings = localDictionary.getDictionary_meta().encoders;
  DefaultEncodingFactory encodingFactory = (DefaultEncodingFactory) DefaultEncodingFactory.getInstance();
  ColumnPageDecoder decoder = encodingFactory.createDecoder(
      encodings, localDictionary.getDictionary_meta().getEncoder_meta(), compressorName);
  LazyColumnPage dictionaryPage = (LazyColumnPage) decoder.decode(
      localDictionary.getDictionary_data(), 0, localDictionary.getDictionary_data().length);
  BitSet usedValues = BitSet.valueOf(CompressorFactory.getInstance()
      .getCompressor(compressorName).unCompressByte(localDictionary.getDictionary_values()));
  // map each decoded dictionary entry (page row order) to its surrogate,
  // which is the bit position in the used-values bitmap
  HashMap<DictionaryByteArrayWrapper, Integer> dictionaryMap = new HashMap<>();
  int pageRow = 0;
  for (int bit = usedValues.nextSetBit(0); bit >= 0; bit = usedValues.nextSetBit(bit + 1)) {
    dictionaryMap.put(new DictionaryByteArrayWrapper(dictionaryPage.getBytes(pageRow++)), bit);
  }
  // every expected value must have an entry in the dictionary
  for (String value : data) {
    byte[] key = value.getBytes(Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET));
    if (null == dictionaryMap.get(new DictionaryByteArrayWrapper(key))) {
      return false;
    }
  }
  return true;
}
Aggregations