Search in sources:

Example 1 with ValuesWriter

use of org.apache.parquet.column.values.ValuesWriter in project parquet-mr by apache.

the class SmallRangeWritingBenchmarkTest method writeRLEWithSmallBitWidthTest.

/**
 * Benchmarks {@code runWriteTest} against a
 * {@link RunLengthBitPackingHybridValuesWriter} backed by a
 * {@link DirectByteBufferAllocator}.
 *
 * <p>NOTE(review): the leading constructor argument (2) is presumably the bit
 * width the test name refers to — confirm against the writer's constructor.
 */
@BenchmarkOptions(benchmarkRounds = 10, warmupRounds = 2)
@Test
public void writeRLEWithSmallBitWidthTest() {
    final DirectByteBufferAllocator allocator = new DirectByteBufferAllocator();
    // Keep the static type as ValuesWriter so the same runWriteTest overload is selected.
    final ValuesWriter rleWriter = new RunLengthBitPackingHybridValuesWriter(2, 100, 20000, allocator);
    runWriteTest(rleWriter);
}
Also used: DirectByteBufferAllocator(org.apache.parquet.bytes.DirectByteBufferAllocator) RunLengthBitPackingHybridValuesWriter(org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) Test(org.junit.Test) BenchmarkOptions(com.carrotsearch.junitbenchmarks.BenchmarkOptions)

Example 2 with ValuesWriter

use of org.apache.parquet.column.values.ValuesWriter in project parquet-mr by apache.

the class TestDictionary method testBinaryDictionary.

/**
 * Writes three batches of binary values through one dictionary writer and
 * checks the encoding reported after each batch: {@code PLAIN_DICTIONARY} for
 * the repeated "a" and "b" batches, then {@code PLAIN} for the distinct "c"
 * batch. Finally each captured batch is read back and verified.
 *
 * @throws IOException propagated from the helper methods
 */
@Test
public void testBinaryDictionary() throws IOException {
    final int valueCount = 100;
    final ValuesWriter dictWriter = newPlainBinaryDictionaryValuesWriter(200, 10000);

    // Two batches of repeated values: both are expected to stay dictionary encoded.
    writeRepeated(valueCount, dictWriter, "a");
    final BytesInput repeatedA = getBytesAndCheckEncoding(dictWriter, PLAIN_DICTIONARY);
    writeRepeated(valueCount, dictWriter, "b");
    final BytesInput repeatedB = getBytesAndCheckEncoding(dictWriter, PLAIN_DICTIONARY);

    // now we will fall back
    writeDistinct(valueCount, dictWriter, "c");
    final BytesInput distinctC = getBytesAndCheckEncoding(dictWriter, PLAIN);

    // The dictionary-encoded batches are read through a dictionary reader.
    final DictionaryValuesReader dictReader = initDicReader(dictWriter, BINARY);
    checkRepeated(valueCount, repeatedA, dictReader, "a");
    checkRepeated(valueCount, repeatedB, dictReader, "b");

    // The fallback batch is read with a plain binary reader.
    final BinaryPlainValuesReader plainReader = new BinaryPlainValuesReader();
    checkDistinct(valueCount, distinctC, plainReader, "c");
}
Also used : BinaryPlainValuesReader(org.apache.parquet.column.values.plain.BinaryPlainValuesReader) BytesInput(org.apache.parquet.bytes.BytesInput) PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) PlainIntegerDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter) PlainFloatDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) PlainBinaryDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter) PlainLongDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter) FallbackValuesWriter(org.apache.parquet.column.values.fallback.FallbackValuesWriter) PlainDoubleDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter) Test(org.junit.Test)

Example 3 with ValuesWriter

use of org.apache.parquet.column.values.ValuesWriter in project parquet-mr by apache.

the class TestDictionary method testBinaryDictionaryChangedValues.

/**
 * Same flow as a plain dictionary round trip, but the repeated "a" and "b"
 * batches are written with {@code writeRepeatedWithReuse}. The encoding is
 * checked after each batch ({@code PLAIN_DICTIONARY}, {@code PLAIN_DICTIONARY},
 * then {@code PLAIN} for the distinct "c" batch) and every batch is read back.
 *
 * <p>NOTE(review): presumably {@code writeRepeatedWithReuse} reuses and mutates
 * the same value object between writes — confirm against the helper.
 *
 * @throws IOException propagated from the helper methods
 */
@Test
public void testBinaryDictionaryChangedValues() throws IOException {
    final int valueCount = 100;
    final ValuesWriter dictWriter = newPlainBinaryDictionaryValuesWriter(200, 10000);

    // Two batches of repeated values: both are expected to stay dictionary encoded.
    writeRepeatedWithReuse(valueCount, dictWriter, "a");
    final BytesInput repeatedA = getBytesAndCheckEncoding(dictWriter, PLAIN_DICTIONARY);
    writeRepeatedWithReuse(valueCount, dictWriter, "b");
    final BytesInput repeatedB = getBytesAndCheckEncoding(dictWriter, PLAIN_DICTIONARY);

    // now we will fall back
    writeDistinct(valueCount, dictWriter, "c");
    final BytesInput distinctC = getBytesAndCheckEncoding(dictWriter, PLAIN);

    // The dictionary-encoded batches are read through a dictionary reader.
    final DictionaryValuesReader dictReader = initDicReader(dictWriter, BINARY);
    checkRepeated(valueCount, repeatedA, dictReader, "a");
    checkRepeated(valueCount, repeatedB, dictReader, "b");

    // The fallback batch is read with a plain binary reader.
    final BinaryPlainValuesReader plainReader = new BinaryPlainValuesReader();
    checkDistinct(valueCount, distinctC, plainReader, "c");
}
Also used : BinaryPlainValuesReader(org.apache.parquet.column.values.plain.BinaryPlainValuesReader) BytesInput(org.apache.parquet.bytes.BytesInput) PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) PlainIntegerDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter) PlainFloatDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) PlainBinaryDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter) PlainLongDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter) FallbackValuesWriter(org.apache.parquet.column.values.fallback.FallbackValuesWriter) PlainDoubleDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter) Test(org.junit.Test)

Example 4 with ValuesWriter

use of org.apache.parquet.column.values.ValuesWriter in project parquet-mr by apache.

the class TestCorruptDeltaByteArrays method testColumnReaderImplWithCorruptPage.

/**
 * Writes two pages of ten strings each into an in-memory page store with a
 * {@code DeltaByteArrayWriter}, calling {@code reset()} and then
 * {@code corruptWriter(writer, lastValue)} before the second page. The column
 * is then read back through {@code ColumnReaderImpl}, and the test asserts
 * that the values delivered to the converter equal the values written, in
 * order.
 *
 * <p>NOTE(review): {@code corruptWriter} is defined outside this snippet;
 * presumably it puts the writer into the state that produces a corrupt second
 * page — confirm against the helper.
 *
 * @throws Exception propagated from the writers, page store or column reader
 */
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
    // Single BINARY column at path "s"; the trailing 0, 0 are presumably the
    // max repetition and definition levels — TODO confirm against ColumnDescriptor.
    ColumnDescriptor column = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
    MemPageStore pages = new MemPageStore(0);
    PageWriter memWriter = pages.getPageWriter(column);
    // Dictionary encoding is switched off for the properties used below.
    ParquetProperties parquetProps = ParquetProperties.builder().withDictionaryEncoding(false).build();
    // get generic repetition and definition level bytes to use for pages
    ValuesWriter rdValues = parquetProps.newDefinitionLevelWriter(column);
    for (int i = 0; i < 10; i += 1) {
        rdValues.writeInteger(0);
    }
    // use a byte array backed BytesInput because it is reused
    BytesInput rd = BytesInput.from(rdValues.getBytes().toByteArray());
    DeltaByteArrayWriter writer = getDeltaByteArrayWriter();
    // lastValue tracks the final string of the first page; it is handed to corruptWriter below.
    String lastValue = null;
    // Every value written, across both pages, in write order (the expected result).
    List<String> values = new ArrayList<String>();
    // First page: values str(0) .. str(9).
    for (int i = 0; i < 10; i += 1) {
        lastValue = str(i);
        writer.writeBytes(Binary.fromString(lastValue));
        values.add(lastValue);
    }
    // Page payload is rd twice (used for both level streams) followed by the value bytes.
    memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()), 10, /* number of values in the page */
    new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
    pages.addRowCount(10);
    // sets previous to new byte[0]
    writer.reset();
    // Must follow reset(): see the NOTE on corruptWriter in the method Javadoc.
    corruptWriter(writer, lastValue);
    // Second page: values str(10) .. str(19).
    for (int i = 10; i < 20; i += 1) {
        String value = str(i);
        writer.writeBytes(Binary.fromString(value));
        values.add(value);
    }
    memWriter.writePage(BytesInput.concat(rd, rd, writer.getBytes()), 10, /* number of values in the page */
    new BinaryStatistics(), rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
    pages.addRowCount(10);
    // Collects what the reader hands to the converter.
    final List<String> actualValues = new ArrayList<String>();
    PrimitiveConverter converter = new PrimitiveConverter() {

        @Override
        public void addBinary(Binary value) {
            actualValues.add(value.toStringUsingUTF8());
        }
    };
    // NOTE(review): the "parquet-mr" 1.6.0 writer version is presumably what makes the
    // reader apply its handling for corrupt delta byte arrays — confirm against ColumnReaderImpl.
    ColumnReaderImpl columnReader = new ColumnReaderImpl(column, pages.getPageReader(column), converter, new ParsedVersion("parquet-mr", "1.6.0", "abcd"));
    // Drain the reader until the converter has received as many values as the reader reports.
    while (actualValues.size() < columnReader.getTotalValueCount()) {
        columnReader.writeCurrentValueToConverter();
        columnReader.consume();
    }
    Assert.assertEquals(values, actualValues);
}
Also used : BytesInput(org.apache.parquet.bytes.BytesInput) DeltaByteArrayWriter(org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) ParquetProperties(org.apache.parquet.column.ParquetProperties) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) Binary(org.apache.parquet.io.api.Binary) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 5 with ValuesWriter

use of org.apache.parquet.column.values.ValuesWriter in project parquet-mr by apache.

the class RandomWritingBenchmarkTest method writeRLETest.

/**
 * Benchmarks {@code runWriteTest} against a
 * {@link RunLengthBitPackingHybridValuesWriter} backed by a
 * {@link DirectByteBufferAllocator}.
 *
 * <p>NOTE(review): the leading constructor argument (32) is presumably the bit
 * width — confirm against the writer's constructor.
 */
@BenchmarkOptions(benchmarkRounds = 10, warmupRounds = 2)
@Test
public void writeRLETest() {
    final DirectByteBufferAllocator allocator = new DirectByteBufferAllocator();
    // Keep the static type as ValuesWriter so the same runWriteTest overload is selected.
    final ValuesWriter rleWriter = new RunLengthBitPackingHybridValuesWriter(32, 100, 20000, allocator);
    runWriteTest(rleWriter);
}
Also used: DirectByteBufferAllocator(org.apache.parquet.bytes.DirectByteBufferAllocator) RunLengthBitPackingHybridValuesWriter(org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) DeltaBinaryPackingValuesWriter(org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriter) Test(org.junit.Test) BenchmarkOptions(com.carrotsearch.junitbenchmarks.BenchmarkOptions)

Aggregations

ValuesWriter (org.apache.parquet.column.values.ValuesWriter): 11; Test (org.junit.Test): 8; FallbackValuesWriter (org.apache.parquet.column.values.fallback.FallbackValuesWriter): 6; PlainValuesWriter (org.apache.parquet.column.values.plain.PlainValuesWriter): 6; BytesInput (org.apache.parquet.bytes.BytesInput): 5; PlainBinaryDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter): 5; PlainDoubleDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter): 5; PlainFloatDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter): 5; PlainIntegerDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter): 5; PlainLongDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter): 5; BinaryPlainValuesReader (org.apache.parquet.column.values.plain.BinaryPlainValuesReader): 5; ValuesReader (org.apache.parquet.column.values.ValuesReader): 4; RunLengthBitPackingHybridValuesWriter (org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter): 4; DirectByteBufferAllocator (org.apache.parquet.bytes.DirectByteBufferAllocator): 3; PlainValuesReader (org.apache.parquet.column.values.plain.PlainValuesReader): 3; BenchmarkOptions (com.carrotsearch.junitbenchmarks.BenchmarkOptions): 2; ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 2; DeltaBinaryPackingValuesWriter (org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriter): 2; Binary (org.apache.parquet.io.api.Binary): 2; ArrayList (java.util.ArrayList): 1