Search in sources:

Example 11 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

In class TestColumnChunkPageWriteStore, method test.

/**
 * Writes a single V2 data page through a {@code ColumnChunkPageWriteStore}, flushes it into a
 * Parquet file, then reads the page back and checks that every field of the page matches what
 * was written.
 *
 * @throws Exception if the file system, the file writer, or the file reader fails
 */
@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    // Recreate the output directory from scratch before writing.
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    // d, r and v are arbitrary marker values: each one is wrapped as a BytesInput and written as
    // the definition levels, repetition levels and data, then compared after the read below.
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    // Write phase: one block containing one V2 page.
    {
        ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
        writer.start();
        writer.startBlock(rowCount);
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    // Read phase: load the first row group and compare the page field by field.
    {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        // try-with-resources closes the reader even when one of the assertions below fails;
        // previously close() was only reached when every assertion passed.
        try (ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns())) {
            PageReadStore rowGroup = reader.readNextRowGroup();
            PageReader pageReader = rowGroup.getPageReader(col);
            DataPageV2 page = (DataPageV2) pageReader.readPage();
            assertEquals(rowCount, page.getRowCount());
            assertEquals(nullCount, page.getNullCount());
            assertEquals(valueCount, page.getValueCount());
            assertEquals(d, intValue(page.getDefinitionLevels()));
            assertEquals(r, intValue(page.getRepetitionLevels()));
            assertEquals(dataEncoding, page.getDataEncoding());
            assertEquals(v, intValue(page.getData()));
            assertEquals(statistics.toString(), page.getStatistics().toString());
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BytesInput(org.apache.parquet.bytes.BytesInput) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Encoding(org.apache.parquet.column.Encoding) DataPageV2(org.apache.parquet.column.page.DataPageV2) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 12 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

In class DeltaBinaryPackingValuesWriterForIntegerTest, method shouldConsumePageDataInInitialization.

/**
 * Embeds the encoded values at a non-zero offset inside a larger, otherwise zero-filled page
 * buffer and checks that the reader stops exactly at the end of the encoded values after
 * {@code initFromPage}, and that the values then decode correctly.
 *
 * @throws IOException if writing or reading the encoded values fails
 */
@Test
public void shouldConsumePageDataInInitialization() throws IOException {
    final int payloadOffset = 33;

    int[] expected = new int[2 * blockSize + 3];
    for (int idx = 0; idx < expected.length; idx++) {
        expected[idx] = idx * 32;
    }
    writeData(expected);
    reader = new DeltaBinaryPackingValuesReader();

    // Copy the encoded bytes into a page ten times their size, starting at payloadOffset.
    byte[] encoded = writer.getBytes().toByteArray();
    byte[] page = new byte[encoded.length * 10];
    System.arraycopy(encoded, 0, page, payloadOffset, encoded.length);

    ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(page));
    stream.skipFully(payloadOffset);
    reader.initFromPage(100, stream);

    // After initialization the stream must sit right behind the encoded values.
    assertEquals(encoded.length + payloadOffset, stream.position());

    // The reader must hand back every value in the order it was written.
    for (int idx = 0; idx < expected.length; idx++) {
        assertEquals(expected[idx], reader.readInteger());
    }
}
Also used : BytesInput(org.apache.parquet.bytes.BytesInput) ByteBufferInputStream(org.apache.parquet.bytes.ByteBufferInputStream) Test(org.junit.Test)

Example 13 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

In class DeltaBinaryPackingValuesWriterForLongTest, method shouldReturnCorrectOffsetAfterInitialization.

/**
 * Embeds the encoded long values at a non-zero offset inside a larger, otherwise zero-filled
 * page buffer and checks that the stream position after {@code initFromPage} equals the offset
 * plus the encoded length, and that the values then decode correctly.
 *
 * @throws IOException if writing or reading the encoded values fails
 */
@Test
public void shouldReturnCorrectOffsetAfterInitialization() throws IOException {
    final int payloadOffset = 33;

    long[] expected = new long[2 * blockSize + 3];
    for (int idx = 0; idx < expected.length; idx++) {
        expected[idx] = idx * 32;
    }
    writeData(expected);
    reader = new DeltaBinaryPackingValuesReader();

    // Copy the encoded bytes into a page ten times their size, starting at payloadOffset.
    byte[] encoded = writer.getBytes().toByteArray();
    byte[] page = new byte[encoded.length * 10];
    System.arraycopy(encoded, 0, page, payloadOffset, encoded.length);

    ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(page));
    stream.skipFully(payloadOffset);
    reader.initFromPage(100, stream);

    // After initialization the stream must sit right behind the encoded values.
    assertEquals(encoded.length + payloadOffset, stream.position());

    // The reader must hand back every value in the order it was written.
    for (int idx = 0; idx < expected.length; idx++) {
        assertEquals(expected[idx], reader.readLong());
    }
}
Also used : BytesInput(org.apache.parquet.bytes.BytesInput) ByteBufferInputStream(org.apache.parquet.bytes.ByteBufferInputStream) Test(org.junit.Test)

Example 14 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

In class ParquetFileReader, method readCompressedDictionary.

/**
 * Reads the bytes of a dictionary page from {@code fin} and wraps them, without decompressing
 * them here, in a {@link DictionaryPage}.
 *
 * @param pageHeader header supplying the compressed and uncompressed page sizes and the
 *                   dictionary page header
 * @param fin        stream from which exactly the compressed page size in bytes is read
 * @return a dictionary page holding the bytes read, together with the uncompressed size, value
 *         count and converted encoding taken from the header
 * @throws IOException if the page bytes cannot be read in full
 */
private DictionaryPage readCompressedDictionary(PageHeader pageHeader, SeekableInputStream fin) throws IOException {
    byte[] rawPage = new byte[pageHeader.getCompressed_page_size()];
    fin.readFully(rawPage);

    DictionaryPageHeader header = pageHeader.getDictionary_page_header();
    return new DictionaryPage(
        BytesInput.from(rawPage),
        pageHeader.getUncompressed_page_size(),
        header.getNum_values(),
        converter.getEncoding(header.getEncoding()));
}
Also used : BytesInput(org.apache.parquet.bytes.BytesInput) DictionaryPageHeader(org.apache.parquet.format.DictionaryPageHeader) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Example 15 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

In class TestDictionary, method testFirstPageFallBack.

/**
 * Checks that the writer reports PLAIN encoding for a first page of distinct values, still
 * reports PLAIN for a following page of repeated values, and that both pages can be read back
 * with a plain binary reader.
 *
 * @throws IOException if writing or reading the values fails
 */
@Test
public void testFirstPageFallBack() throws IOException {
    final int valueCount = 1000;
    ValuesWriter dictWriter = newPlainBinaryDictionaryValuesWriter(10000, 10000);

    // First page: distinct values make the dictionary inefficient, so the writer falls back.
    writeDistinct(valueCount, dictWriter, "a");
    BytesInput distinctPage = getBytesAndCheckEncoding(dictWriter, PLAIN);

    // Second page: still plain, because the fallback already happened on the first page.
    writeRepeated(valueCount, dictWriter, "b");
    BytesInput repeatedPage = getBytesAndCheckEncoding(dictWriter, PLAIN);

    ValuesReader plainReader = new BinaryPlainValuesReader();
    checkDistinct(valueCount, distinctPage, plainReader, "a");
    checkRepeated(valueCount, repeatedPage, plainReader, "b");
}
Also used : ValuesReader(org.apache.parquet.column.values.ValuesReader) PlainValuesReader(org.apache.parquet.column.values.plain.PlainValuesReader) BinaryPlainValuesReader(org.apache.parquet.column.values.plain.BinaryPlainValuesReader) BytesInput(org.apache.parquet.bytes.BytesInput) PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) PlainIntegerDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter) PlainFloatDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) PlainBinaryDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter) PlainLongDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter) FallbackValuesWriter(org.apache.parquet.column.values.fallback.FallbackValuesWriter) PlainDoubleDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter) Test(org.junit.Test)

Aggregations

BytesInput (org.apache.parquet.bytes.BytesInput)19 Test (org.junit.Test)13 PlainValuesWriter (org.apache.parquet.column.values.plain.PlainValuesWriter)8 ValuesWriter (org.apache.parquet.column.values.ValuesWriter)5 PlainDoubleDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter)5 PlainFloatDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter)5 PlainIntegerDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter)5 PlainLongDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter)5 PlainBinaryDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter)4 FallbackValuesWriter (org.apache.parquet.column.values.fallback.FallbackValuesWriter)4 BinaryPlainValuesReader (org.apache.parquet.column.values.plain.BinaryPlainValuesReader)4 IOException (java.io.IOException)3 ByteBufferInputStream (org.apache.parquet.bytes.ByteBufferInputStream)3 HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator)3 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)3 PageWriter (org.apache.parquet.column.page.PageWriter)3 ValuesReader (org.apache.parquet.column.values.ValuesReader)3 Encoding (org.apache.parquet.column.Encoding)2 BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics)2 PlainValuesReader (org.apache.parquet.column.values.plain.PlainValuesReader)2