Search in sources :

Example 1 with DictionaryPage

use of com.facebook.presto.parquet.DictionaryPage in project presto by prestodb.

the class TestValuesDecoders method testBinaryRLEDictionary.

/**
 * Exercises the binary RLE-dictionary decoder against a generated dictionary page
 * and a generated page of dictionary ids.
 *
 * <p>The expected output is built by looking up every generated dictionary id in the
 * generated dictionary values. The same data page is then decoded several times, each
 * time with a freshly created decoder and a different pair of leading helper arguments
 * (presumably batch size and skip size - confirm against the helper's signature).
 *
 * @throws IOException declared by the test; the visible calls do not show its origin
 */
@Test
public void testBinaryRLEDictionary() throws IOException {
    final int valueCount = 2048;
    final int dictionarySize = 29;
    // Fixed seed keeps the generated pages reproducible across runs.
    Random random = new Random(83);

    List<Object> dictionaryValues = new ArrayList<>();
    byte[] dictionaryPage = TestParquetUtils.generatePlainValuesPage(dictionarySize, -1, random, dictionaryValues);

    List<Integer> ids = new ArrayList<>();
    byte[] dataPage = TestParquetUtils.generateDictionaryIdPage2048(dictionarySize - 1, random, ids);

    // Resolve every id to its dictionary entry to obtain the values the decoder must produce.
    List<Object> expectedValues = new ArrayList<>();
    ids.forEach(id -> expectedValues.add(dictionaryValues.get(id)));

    BinaryBatchDictionary batchDictionary = new BinaryBatchDictionary(
            new DictionaryPage(Slices.wrappedBuffer(dictionaryPage), dictionarySize, PLAIN_DICTIONARY));

    // Each row holds the first two helper arguments for one run.
    int[][] helperArguments = {
            {valueCount, 0},
            {29, 0},
            {89, 0},
            {1024, 0},
            {256, 29},
            {89, 29},
            {1024, 1024},
    };
    for (int[] arguments : helperArguments) {
        // A new decoder is created per run so every run starts from the beginning of the page.
        binaryBatchReadWithSkipHelper(
                arguments[0],
                arguments[1],
                valueCount,
                binaryDictionary(dataPage, dictionarySize, batchDictionary),
                expectedValues);
    }
}
Also used : Random(java.util.Random) ArrayList(java.util.ArrayList) DictionaryPage(com.facebook.presto.parquet.DictionaryPage) BinaryBatchDictionary(com.facebook.presto.parquet.batchreader.dictionary.BinaryBatchDictionary) Test(org.testng.annotations.Test)

Example 2 with DictionaryPage

use of com.facebook.presto.parquet.DictionaryPage in project presto by prestodb.

the class TestValuesDecoders method testInt64RLEDictionary.

/**
 * Exercises the int64 RLE-dictionary decoder and the int64 timestamp-micros
 * RLE-dictionary decoder against the same generated dictionary and data pages.
 *
 * <p>The plain int64 expectations are the dictionary entries selected by the generated
 * ids. The timestamp expectations are those same values divided by 1000. Every run
 * uses a freshly created decoder and a different pair of leading helper arguments
 * (presumably batch size and skip size - confirm against the helper's signature).
 *
 * @throws IOException declared by the test; the visible calls do not show its origin
 */
@Test
public void testInt64RLEDictionary() throws IOException {
    final int valueCount = 2048;
    final int dictionarySize = 29;
    // Fixed seed keeps the generated pages reproducible across runs.
    Random random = new Random(83);

    List<Object> dictionaryValues = new ArrayList<>();
    byte[] dictionaryPage = generatePlainValuesPage(dictionarySize, 64, random, dictionaryValues);

    List<Integer> ids = new ArrayList<>();
    byte[] dataPage = generateDictionaryIdPage2048(dictionarySize - 1, random, ids);

    // Resolve every id to its dictionary entry to obtain the values the decoder must produce.
    List<Object> expectedValues = new ArrayList<>();
    ids.forEach(id -> expectedValues.add(dictionaryValues.get(id)));

    LongDictionary dictionary = new LongDictionary(
            new DictionaryPage(Slices.wrappedBuffer(dictionaryPage), dictionarySize, PLAIN_DICTIONARY));

    // Each row holds the first two helper arguments for one run; both decoder kinds share the table.
    int[][] helperArguments = {
            {valueCount, 0},
            {29, 0},
            {89, 0},
            {1024, 0},
            {256, 29},
            {89, 29},
            {1024, 1024},
    };

    for (int[] arguments : helperArguments) {
        int64BatchReadWithSkipHelper(
                arguments[0],
                arguments[1],
                valueCount,
                int64Dictionary(dataPage, dictionarySize, dictionary),
                expectedValues);
    }

    // The timestamp decoder is expected to yield the raw values divided by 1000.
    List<Object> expectedTimestampValues = expectedValues.stream().map(v -> (long) v / 1000L).collect(Collectors.toList());

    for (int[] arguments : helperArguments) {
        int64BatchReadWithSkipHelper(
                arguments[0],
                arguments[1],
                valueCount,
                int64TimestampMicrosDictionary(dataPage, dictionarySize, dictionary),
                expectedTimestampValues);
    }
}
Also used : Int64TimestampMicrosPlainValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.plain.Int64TimestampMicrosPlainValuesDecoder) BooleanValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.ValuesDecoder.BooleanValuesDecoder) LongDictionary(com.facebook.presto.parquet.dictionary.LongDictionary) Arrays(java.util.Arrays) BinaryBatchDictionary(com.facebook.presto.parquet.batchreader.dictionary.BinaryBatchDictionary) Int32ValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.ValuesDecoder.Int32ValuesDecoder) Int64TimestampMicrosRLEDictionaryValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.rle.Int64TimestampMicrosRLEDictionaryValuesDecoder) BytesUtils.getWidthFromMaxInt(org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt) Assert.assertEquals(org.testng.Assert.assertEquals) Test(org.testng.annotations.Test) Random(java.util.Random) ByteBuffer(java.nio.ByteBuffer) TestParquetUtils.generatePlainValuesPage(com.facebook.presto.parquet.batchreader.decoders.TestParquetUtils.generatePlainValuesPage) ArrayList(java.util.ArrayList) ByteArrayInputStream(java.io.ByteArrayInputStream) Int64ValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.ValuesDecoder.Int64ValuesDecoder) BooleanRLEValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.rle.BooleanRLEValuesDecoder) Int32RLEDictionaryValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.rle.Int32RLEDictionaryValuesDecoder) PLAIN_DICTIONARY(com.facebook.presto.parquet.ParquetEncoding.PLAIN_DICTIONARY) Slices(io.airlift.slice.Slices) UTF8(org.apache.parquet.bytes.BytesUtils.UTF8) Int64TimestampMicrosValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.ValuesDecoder.Int64TimestampMicrosValuesDecoder) TimestampRLEDictionaryValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.rle.TimestampRLEDictionaryValuesDecoder) TimestampValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.ValuesDecoder.TimestampValuesDecoder) 
BinaryPlainValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.plain.BinaryPlainValuesDecoder) BooleanPlainValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.plain.BooleanPlainValuesDecoder) BinaryRLEDictionaryValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.rle.BinaryRLEDictionaryValuesDecoder) IOException(java.io.IOException) Int64PlainValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.plain.Int64PlainValuesDecoder) Math.min(java.lang.Math.min) Collectors(java.util.stream.Collectors) TestParquetUtils.generateDictionaryIdPage2048(com.facebook.presto.parquet.batchreader.decoders.TestParquetUtils.generateDictionaryIdPage2048) TimestampPlainValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.plain.TimestampPlainValuesDecoder) List(java.util.List) IntegerDictionary(com.facebook.presto.parquet.dictionary.IntegerDictionary) Int32PlainValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.plain.Int32PlainValuesDecoder) Int64RLEDictionaryValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.rle.Int64RLEDictionaryValuesDecoder) BinaryValuesDecoder(com.facebook.presto.parquet.batchreader.decoders.ValuesDecoder.BinaryValuesDecoder) TimestampDictionary(com.facebook.presto.parquet.batchreader.dictionary.TimestampDictionary) DictionaryPage(com.facebook.presto.parquet.DictionaryPage) Random(java.util.Random) LongDictionary(com.facebook.presto.parquet.dictionary.LongDictionary) ArrayList(java.util.ArrayList) DictionaryPage(com.facebook.presto.parquet.DictionaryPage) Test(org.testng.annotations.Test)

Example 3 with DictionaryPage

use of com.facebook.presto.parquet.DictionaryPage in project presto by prestodb.

the class BinaryFlatBatchReader method init.

/**
 * Initializes this batch reader with its page source and field.
 *
 * <p>Stores the page reader and the field, then reads the dictionary page from the
 * page reader. When a dictionary page is present, a dictionary is created from it and
 * the reader's column descriptor; otherwise the dictionary is left untouched.
 *
 * @param pageReader source of pages; must be non-null and report a total value count above zero
 * @param field field being read; must be non-null
 * @param rowRanges not used by this method
 */
@Override
public void init(PageReader pageReader, Field field, RowRanges rowRanges) {
    checkArgument(!isInitialized(), "Parquet batch reader already initialized");

    this.pageReader = requireNonNull(pageReader, "pageReader is null");
    checkArgument(this.pageReader.getTotalValueCount() > 0, "page is empty");

    this.field = requireNonNull(field, "field is null");

    final DictionaryPage page = this.pageReader.readDictionaryPage();
    if (page == null) {
        // No dictionary page available: nothing more to set up.
        return;
    }
    dictionary = Dictionaries.createDictionary(columnDescriptor, page);
}
Also used : DictionaryPage(com.facebook.presto.parquet.DictionaryPage)

Example 4 with DictionaryPage

use of com.facebook.presto.parquet.DictionaryPage in project presto by prestodb.

the class ParquetColumnChunk method readAllPages.

/**
 * Reads page headers in a loop for as long as {@code hasMorePages} reports more pages,
 * and collects what was read into a {@link PageReader}.
 *
 * <p>A dictionary page is read at most once; encountering a second one is reported as
 * corruption. V1 and V2 data pages are appended to the page list and their returned
 * counts are added to the running value count. Pages of any other type are skipped by
 * advancing the stream past their compressed size.
 *
 * @return a page reader over the collected data pages and the dictionary page (null when none was read)
 * @throws IOException declared by this method; a second dictionary page is reported as a {@code ParquetCorruptionException}
 */
public PageReader readAllPages() throws IOException {
    LinkedList<DataPage> dataPages = new LinkedList<>();
    DictionaryPage dictionary = null;
    long valuesRead = 0;
    int dataPagesRead = 0;

    while (hasMorePages(valuesRead, dataPagesRead)) {
        PageHeader header = readPageHeader();
        int uncompressedSize = header.getUncompressed_page_size();
        int compressedSize = header.getCompressed_page_size();

        switch (header.type) {
            case DICTIONARY_PAGE: {
                if (dictionary != null) {
                    throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                }
                dictionary = readDictionaryPage(header, uncompressedSize, compressedSize);
                break;
            }
            case DATA_PAGE: {
                // The first row index is looked up by the ordinal of this data page.
                long firstRowIndex = PageReader.getFirstRowIndex(dataPagesRead, offsetIndex);
                valuesRead += readDataPageV1(header, uncompressedSize, compressedSize, firstRowIndex, dataPages);
                dataPagesRead++;
                break;
            }
            case DATA_PAGE_V2: {
                long firstRowIndex = PageReader.getFirstRowIndex(dataPagesRead, offsetIndex);
                valuesRead += readDataPageV2(header, uncompressedSize, compressedSize, firstRowIndex, dataPages);
                dataPagesRead++;
                break;
            }
            default: {
                // Any other page type: skip its payload so the next header can be read.
                stream.skipFully(compressedSize);
                break;
            }
        }
    }

    return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), dataPages, dictionary, offsetIndex);
}
Also used : ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) DataPage(com.facebook.presto.parquet.DataPage) DictionaryPageHeader(org.apache.parquet.format.DictionaryPageHeader) DataPageHeader(org.apache.parquet.format.DataPageHeader) PageHeader(org.apache.parquet.format.PageHeader) LinkedList(java.util.LinkedList) DictionaryPage(com.facebook.presto.parquet.DictionaryPage)

Example 5 with DictionaryPage

use of com.facebook.presto.parquet.DictionaryPage in project presto by prestodb.

the class PredicateUtils method readDictionaryPage.

/**
 * Tries to read a dictionary page from the start of {@code data}.
 *
 * <p>A page header is parsed from the beginning of the array. If the header does not
 * describe a dictionary page, the result is empty. Otherwise the bytes that follow the
 * header (as many as the header's compressed page size) are decompressed with the given
 * codec and wrapped in a {@link DictionaryPage} together with the value count and
 * encoding taken from the dictionary page header.
 *
 * @param data bytes beginning with a page header
 * @param codecName codec passed to {@code decompress} for the page payload
 * @return the dictionary page, or empty when the page is not a dictionary page or an
 *         {@link IOException} occurs while reading it
 */
private static Optional<DictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName) {
    try {
        ByteArrayInputStream input = new ByteArrayInputStream(data);
        PageHeader header = Util.readPageHeader(input);
        if (header.type != PageType.DICTIONARY_PAGE) {
            return Optional.empty();
        }

        // Bytes consumed so far are exactly the header; the payload starts right after it.
        int headerLength = data.length - input.available();
        Slice compressedPayload = wrappedBuffer(data, headerLength, header.getCompressed_page_size());

        DictionaryPageHeader dictionaryHeader = header.getDictionary_page_header();
        ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dictionaryHeader.getEncoding().name()));
        int dictionarySize = dictionaryHeader.getNum_values();

        Slice payload = decompress(codecName, compressedPayload, header.getUncompressed_page_size());
        return Optional.of(new DictionaryPage(payload, dictionarySize, encoding));
    } catch (IOException ignored) {
        // A page that cannot be read is reported as absent rather than as a failure.
        return Optional.empty();
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) DictionaryPageHeader(org.apache.parquet.format.DictionaryPageHeader) PageHeader(org.apache.parquet.format.PageHeader) Slice(io.airlift.slice.Slice) DictionaryPageHeader(org.apache.parquet.format.DictionaryPageHeader) IOException(java.io.IOException) DictionaryPage(com.facebook.presto.parquet.DictionaryPage) ParquetEncoding(com.facebook.presto.parquet.ParquetEncoding) ParquetTypeUtils.getParquetEncoding(com.facebook.presto.parquet.ParquetTypeUtils.getParquetEncoding)

Aggregations

DictionaryPage (com.facebook.presto.parquet.DictionaryPage)11 ArrayList (java.util.ArrayList)5 Random (java.util.Random)4 Test (org.testng.annotations.Test)4 IOException (java.io.IOException)3 ParquetCorruptionException (com.facebook.presto.parquet.ParquetCorruptionException)2 BinaryBatchDictionary (com.facebook.presto.parquet.batchreader.dictionary.BinaryBatchDictionary)2 TimestampDictionary (com.facebook.presto.parquet.batchreader.dictionary.TimestampDictionary)2 IntegerDictionary (com.facebook.presto.parquet.dictionary.IntegerDictionary)2 ByteArrayInputStream (java.io.ByteArrayInputStream)2 DictionaryPageHeader (org.apache.parquet.format.DictionaryPageHeader)2 PageHeader (org.apache.parquet.format.PageHeader)2 DataPage (com.facebook.presto.parquet.DataPage)1 ParquetEncoding (com.facebook.presto.parquet.ParquetEncoding)1 PLAIN_DICTIONARY (com.facebook.presto.parquet.ParquetEncoding.PLAIN_DICTIONARY)1 ParquetTypeUtils.getParquetEncoding (com.facebook.presto.parquet.ParquetTypeUtils.getParquetEncoding)1 RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor)1 TestParquetUtils.generateDictionaryIdPage2048 (com.facebook.presto.parquet.batchreader.decoders.TestParquetUtils.generateDictionaryIdPage2048)1 TestParquetUtils.generatePlainValuesPage (com.facebook.presto.parquet.batchreader.decoders.TestParquetUtils.generatePlainValuesPage)1 BinaryValuesDecoder (com.facebook.presto.parquet.batchreader.decoders.ValuesDecoder.BinaryValuesDecoder)1