Search in sources :

Example 6 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class TestDictionary method getBytesAndCheckEncoding.

/**
 * Copies the bytes currently held by the writer, asserts the encoding the writer reports,
 * and then resets the writer.
 *
 * @param cw the values writer to read from and reset
 * @param encoding the encoding the writer is expected to report
 * @return a copy of the writer's bytes, taken before the writer is reset
 * @throws IOException propagated from obtaining or copying the writer's bytes
 */
private BytesInput getBytesAndCheckEncoding(ValuesWriter cw, Encoding encoding) throws IOException {
    // Copy before anything else: the writer is reset at the end of this method.
    final BytesInput snapshot = BytesInput.copy(cw.getBytes());
    assertEquals(encoding, cw.getEncoding());
    cw.reset();
    return snapshot;
}
Also used : BytesInput(org.apache.parquet.bytes.BytesInput)

Example 7 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class TestDictionary method testDoubleDictionary.

/**
 * Writes two pages of doubles (each value folded into the range [0, 50)) through a
 * dictionary-backed writer, then reads both pages back and checks every value.
 */
@Test
public void testDoubleDictionary() throws IOException {
    final int firstPageCount = 1000;
    final int secondPageCount = 2000;
    final FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> writer =
        newPlainDoubleDictionaryValuesWriter(10000, 10000);

    // First page: ascending counter.
    for (double v = 0; v < firstPageCount; v++) {
        writer.writeDouble(v % 50);
    }
    final BytesInput firstPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
    assertEquals(50, writer.initialWriter.getDictionarySize());

    // Second page: descending counter; the dictionary size is expected to stay at 50.
    for (double v = secondPageCount; v > 0; v--) {
        writer.writeDouble(v % 50);
    }
    final BytesInput secondPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
    assertEquals(50, writer.initialWriter.getDictionarySize());

    // Read both pages back with the same reader and compare value by value.
    final DictionaryValuesReader reader = initDicReader(writer, DOUBLE);
    reader.initFromPage(firstPageCount, firstPage.toInputStream());
    for (double v = 0; v < firstPageCount; v++) {
        assertEquals(v % 50, reader.readDouble(), 0.0);
    }
    reader.initFromPage(secondPageCount, secondPage.toInputStream());
    for (double v = secondPageCount; v > 0; v--) {
        assertEquals(v % 50, reader.readDouble(), 0.0);
    }
}
Also used : PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) BytesInput(org.apache.parquet.bytes.BytesInput) PlainDoubleDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter) Test(org.junit.Test)

Example 8 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class ColumnReaderImpl method readPageV1.

/**
 * Prepares the repetition-level, definition-level and data readers for a V1 data page.
 *
 * <p>All three readers are initialised from the same input stream, in that order; the
 * stream position is logged before the definition levels and before the data.
 *
 * @param page the V1 data page to read
 * @throws ParquetDecodingException if an {@link IOException} occurs while reading the page
 */
private void readPageV1(DataPageV1 page) {
    final ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL);
    final ValuesReader definitionReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL);
    this.repetitionLevelColumn = new ValuesReaderIntIterator(repetitionReader);
    this.definitionLevelColumn = new ValuesReaderIntIterator(definitionReader);
    try {
        final BytesInput pageBytes = page.getBytes();
        LOG.debug("page size {} bytes and {} records", pageBytes.size(), pageValueCount);
        LOG.debug("reading repetition levels at 0");
        final ByteBufferInputStream stream = pageBytes.toInputStream();
        repetitionReader.initFromPage(pageValueCount, stream);
        LOG.debug("reading definition levels at {}", stream.position());
        definitionReader.initFromPage(pageValueCount, stream);
        LOG.debug("reading data at {}", stream.position());
        initDataReader(page.getValueEncoding(), stream, page.getValueCount());
    } catch (IOException ioe) {
        throw new ParquetDecodingException("could not read page " + page + " in col " + path, ioe);
    }
}
Also used : ValuesReader(org.apache.parquet.column.values.ValuesReader) ParquetDecodingException(org.apache.parquet.io.ParquetDecodingException) BytesInput(org.apache.parquet.bytes.BytesInput) ByteBufferInputStream(org.apache.parquet.bytes.ByteBufferInputStream) IOException(java.io.IOException)

Example 9 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class TestCorruptDeltaByteArrays method testColumnReaderImplWithCorruptPage.

/**
 * Writes two delta-byte-array pages of ten strings each, the second after resetting the
 * writer and passing it to {@code corruptWriter}, then checks that a
 * {@code ColumnReaderImpl} created for writer version parquet-mr 1.6.0 returns all
 * twenty values as written.
 */
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
    final int valuesPerPage = 10;
    ColumnDescriptor descriptor = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
    MemPageStore pageStore = new MemPageStore(0);
    PageWriter pageWriter = pageStore.getPageWriter(descriptor);
    ParquetProperties props = ParquetProperties.builder().withDictionaryEncoding(false).build();

    // get generic repetition and definition level bytes to use for pages
    ValuesWriter levelWriter = props.newDefinitionLevelWriter(descriptor);
    for (int i = 0; i < 10; i++) {
        levelWriter.writeInteger(0);
    }
    // use a byte array backed BytesInput because it is reused
    BytesInput levelBytes = BytesInput.from(levelWriter.getBytes().toByteArray());

    DeltaByteArrayWriter deltaWriter = getDeltaByteArrayWriter();
    String lastValue = null;
    List<String> expected = new ArrayList<String>();

    // First page: values 0..9.
    for (int i = 0; i < 10; i++) {
        lastValue = str(i);
        deltaWriter.writeBytes(Binary.fromString(lastValue));
        expected.add(lastValue);
    }
    pageWriter.writePage(
        BytesInput.concat(levelBytes, levelBytes, deltaWriter.getBytes()),
        valuesPerPage,
        new BinaryStatistics(),
        levelWriter.getEncoding(),
        levelWriter.getEncoding(),
        deltaWriter.getEncoding());
    pageStore.addRowCount(10);

    // sets previous to new byte[0]
    deltaWriter.reset();
    corruptWriter(deltaWriter, lastValue);

    // Second page: values 10..19, written after the writer has been corrupted.
    for (int i = 10; i < 20; i++) {
        String current = str(i);
        deltaWriter.writeBytes(Binary.fromString(current));
        expected.add(current);
    }
    pageWriter.writePage(
        BytesInput.concat(levelBytes, levelBytes, deltaWriter.getBytes()),
        valuesPerPage,
        new BinaryStatistics(),
        levelWriter.getEncoding(),
        levelWriter.getEncoding(),
        deltaWriter.getEncoding());
    pageStore.addRowCount(10);

    // Collect every binary value handed to the converter as a UTF-8 string.
    final List<String> actual = new ArrayList<String>();
    PrimitiveConverter collector = new PrimitiveConverter() {

        @Override
        public void addBinary(Binary value) {
            actual.add(value.toStringUsingUTF8());
        }
    };
    ColumnReaderImpl columnReader = new ColumnReaderImpl(
        descriptor,
        pageStore.getPageReader(descriptor),
        collector,
        new ParsedVersion("parquet-mr", "1.6.0", "abcd"));
    while (actual.size() < columnReader.getTotalValueCount()) {
        columnReader.writeCurrentValueToConverter();
        columnReader.consume();
    }
    Assert.assertEquals(expected, actual);
}
Also used : BytesInput(org.apache.parquet.bytes.BytesInput) DeltaByteArrayWriter(org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) ParquetProperties(org.apache.parquet.column.ParquetProperties) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) Binary(org.apache.parquet.io.api.Binary) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 10 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class TestColumnChunkPageWriteStore method testColumnOrderV1.

/**
 * Writes one fake page per schema column into a {@code ColumnChunkPageWriteStore}, flushes
 * the store to a mocked {@code ParquetFileWriter}, and verifies that {@code startColumn}
 * is invoked once per column in the order returned by {@code schema.getColumns()}.
 */
@Test
public void testColumnOrderV1() throws IOException {
    ParquetFileWriter fileWriterMock = Mockito.mock(ParquetFileWriter.class);
    InOrder callOrder = inOrder(fileWriterMock);
    MessageType schema = Types.buildMessage()
        .required(BINARY).as(UTF8).named("a_string")
        .required(INT32).named("an_int")
        .required(INT64).named("a_long")
        .required(FLOAT).named("a_float")
        .required(DOUBLE).named("a_double")
        .named("order_test");

    // The same placeholder page content, value count and statistics are used for every column.
    BytesInput pagePayload = BytesInput.fromInt(34);
    int valueCount = 3;
    BinaryStatistics pageStats = new BinaryStatistics();

    // TODO - look back at this, an allocator was being passed here in the ByteBuffer changes
    // see comment at this constructor
    ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator());
    for (ColumnDescriptor descriptor : schema.getColumns()) {
        store.getPageWriter(descriptor).writePage(pagePayload, valueCount, pageStats, RLE, RLE, PLAIN);
    }

    // flush to the mock writer
    store.flushToFileWriter(fileWriterMock);

    for (ColumnDescriptor descriptor : schema.getColumns()) {
        callOrder.verify(fileWriterMock).startColumn(eq(descriptor), eq((long) valueCount), eq(UNCOMPRESSED));
    }
}
Also used : InOrder(org.mockito.InOrder) BytesInput(org.apache.parquet.bytes.BytesInput) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) MessageType(org.apache.parquet.schema.MessageType) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Aggregations

BytesInput (org.apache.parquet.bytes.BytesInput): 19; Test (org.junit.Test): 13; PlainValuesWriter (org.apache.parquet.column.values.plain.PlainValuesWriter): 8; ValuesWriter (org.apache.parquet.column.values.ValuesWriter): 5; PlainDoubleDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter): 5; PlainFloatDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter): 5; PlainIntegerDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter): 5; PlainLongDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter): 5; PlainBinaryDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter): 4; FallbackValuesWriter (org.apache.parquet.column.values.fallback.FallbackValuesWriter): 4; BinaryPlainValuesReader (org.apache.parquet.column.values.plain.BinaryPlainValuesReader): 4; IOException (java.io.IOException): 3; ByteBufferInputStream (org.apache.parquet.bytes.ByteBufferInputStream): 3; HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator): 3; ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 3; PageWriter (org.apache.parquet.column.page.PageWriter): 3; ValuesReader (org.apache.parquet.column.values.ValuesReader): 3; Encoding (org.apache.parquet.column.Encoding): 2; BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 2; PlainValuesReader (org.apache.parquet.column.values.plain.PlainValuesReader): 2