Example 1 with ParquetProperties

Use of org.apache.parquet.column.ParquetProperties in project parquet-mr (by apache), from class TestCorruptDeltaByteArrays, method testColumnReaderImplWithCorruptPage:

@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
    ColumnDescriptor column = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
    MemPageStore pages = new MemPageStore(0);
    PageWriter memWriter = pages.getPageWriter(column);
    ParquetProperties parquetProps = ParquetProperties.builder().withDictionaryEncoding(false).build();
    // get generic repetition and definition level bytes to use for pages
    ValuesWriter rdValues = parquetProps.newDefinitionLevelWriter(column);
    for (int i = 0; i < 10; i += 1) {
        rdValues.writeInteger(0);
    }
    // use a byte array backed BytesInput because it is reused
    BytesInput rd = BytesInput.from(rdValues.getBytes().toByteArray());
    DeltaByteArrayWriter writer = getDeltaByteArrayWriter();
    String lastValue = null;
    List<String> values = new ArrayList<String>();
    for (int i = 0; i < 10; i += 1) {
        lastValue = str(i);
        writer.writeBytes(Binary.fromString(lastValue));
        values.add(lastValue);
    }
    memWriter.writePage(
        BytesInput.concat(rd, rd, writer.getBytes()),
        10, /* number of values in the page */
        new BinaryStatistics(),
        rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
    pages.addRowCount(10);
    // sets previous to new byte[0]
    writer.reset();
    corruptWriter(writer, lastValue);
    for (int i = 10; i < 20; i += 1) {
        String value = str(i);
        writer.writeBytes(Binary.fromString(value));
        values.add(value);
    }
    memWriter.writePage(
        BytesInput.concat(rd, rd, writer.getBytes()),
        10, /* number of values in the page */
        new BinaryStatistics(),
        rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
    pages.addRowCount(10);
    final List<String> actualValues = new ArrayList<String>();
    PrimitiveConverter converter = new PrimitiveConverter() {

        @Override
        public void addBinary(Binary value) {
            actualValues.add(value.toStringUsingUTF8());
        }
    };
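    // "1.6.0" is presumably chosen as a writer version affected by the
    // delta byte array corruption, so ColumnReaderImpl's version-based
    // workaround is exercised while decoding.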
    ColumnReaderImpl columnReader = new ColumnReaderImpl(column, pages.getPageReader(column), converter, new ParsedVersion("parquet-mr", "1.6.0", "abcd"));
    while (actualValues.size() < columnReader.getTotalValueCount()) {
        columnReader.writeCurrentValueToConverter();
        columnReader.consume();
    }
    Assert.assertEquals(values, actualValues);
}
Also used: org.apache.parquet.bytes.BytesInput, org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter, org.apache.parquet.column.ColumnDescriptor, java.util.ArrayList, org.apache.parquet.column.ParquetProperties, org.apache.parquet.column.statistics.BinaryStatistics, org.apache.parquet.io.api.PrimitiveConverter, org.apache.parquet.column.page.mem.MemPageStore, org.apache.parquet.io.api.Binary, org.apache.parquet.column.values.ValuesWriter, org.apache.parquet.VersionParser.ParsedVersion, org.apache.parquet.column.page.PageWriter, org.junit.Test
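
The test above calls three helpers defined elsewhere in TestCorruptDeltaByteArrays: str, getDeltaByteArrayWriter, and corruptWriter. A minimal sketch of what they might look like follows; the DeltaByteArrayWriter constructor arguments, the HeapByteBufferAllocator, and in particular the private field name "previous" are assumptions about parquet-mr internals, not code taken from the test.

import java.lang.reflect.Field;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter;
import org.apache.parquet.io.api.Binary;

// Hypothetical sketches of the helpers referenced by the test.
private static String str(int i) {
    // Any deterministic value works; shared prefixes are what make the
    // delta-string encoding of one value depend on the previous one.
    return "value-" + i;
}

private static DeltaByteArrayWriter getDeltaByteArrayWriter() {
    // Slab and page sizes are illustrative only.
    return new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new HeapByteBufferAllocator());
}

private static void corruptWriter(DeltaByteArrayWriter writer, String data) throws Exception {
    // reset() cleared the writer's remembered previous value. Restoring the
    // last value written to the first page makes the second page's first
    // entry decodable only relative to the first page, which is exactly the
    // cross-page corruption the test wants to reproduce.
    Field previous = DeltaByteArrayWriter.class.getDeclaredField("previous");
    previous.setAccessible(true);
    previous.set(writer, Binary.fromString(data).getBytesUnsafe());
}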

Example 2 with ParquetProperties

Use of org.apache.parquet.column.ParquetProperties in project parquet-mr (by apache), from class ParquetOutputFormat, method getRecordWriter:

public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, CompressionCodecName codec) throws IOException, InterruptedException {
    final WriteSupport<T> writeSupport = getWriteSupport(conf);
    ParquetProperties props = ParquetProperties.builder()
        .withPageSize(getPageSize(conf))
        .withDictionaryPageSize(getDictionaryPageSize(conf))
        .withDictionaryEncoding(getEnableDictionary(conf))
        .withWriterVersion(getWriterVersion(conf))
        .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
        .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
        .withMaxRowCountForPageSizeCheck(getMaxRowCountForPageSizeCheck(conf))
        .build();
    long blockSize = getLongBlockSize(conf);
    int maxPaddingSize = getMaxPaddingSize(conf);
    boolean validating = getValidation(conf);
    if (LOG.isInfoEnabled()) {
        LOG.info("Parquet block size to {}", blockSize);
        LOG.info("Parquet page size to {}", props.getPageSizeThreshold());
        LOG.info("Parquet dictionary page size to {}", props.getDictionaryPageSizeThreshold());
        LOG.info("Dictionary is {}", (props.isEnableDictionary() ? "on" : "off"));
        LOG.info("Validation is {}", (validating ? "on" : "off"));
        LOG.info("Writer version is: {}", props.getWriterVersion());
        LOG.info("Maximum row group padding size is {} bytes", maxPaddingSize);
        LOG.info("Page size checking is: {}", (props.estimateNextSizeCheck() ? "estimated" : "constant"));
        LOG.info("Min row count for page size check is: {}", props.getMinRowCountForPageSizeCheck());
        LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck());
    }
    WriteContext init = writeSupport.init(conf);
    ParquetFileWriter w = new ParquetFileWriter(HadoopOutputFile.fromPath(file, conf), init.getSchema(), Mode.CREATE, blockSize, maxPaddingSize);
    w.start();
    float maxLoad = conf.getFloat(ParquetOutputFormat.MEMORY_POOL_RATIO, MemoryManager.DEFAULT_MEMORY_POOL_RATIO);
    long minAllocation = conf.getLong(ParquetOutputFormat.MIN_MEMORY_ALLOCATION, MemoryManager.DEFAULT_MIN_MEMORY_ALLOCATION);
    synchronized (ParquetOutputFormat.class) {
        if (memoryManager == null) {
            memoryManager = new MemoryManager(maxLoad, minAllocation);
        }
    }
    if (memoryManager.getMemoryPoolRatio() != maxLoad) {
        LOG.warn("The configuration " + MEMORY_POOL_RATIO + " has been set. It should not " + "be reset by the new value: " + maxLoad);
    }
    return new ParquetRecordWriter<T>(w, writeSupport, init.getSchema(), init.getExtraMetaData(), blockSize, codec, validating, props, memoryManager, conf);
}
Also used: org.apache.parquet.column.ParquetProperties, org.apache.parquet.hadoop.api.WriteSupport.WriteContext
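
In normal use getRecordWriter is invoked by the MapReduce framework, and every value it reads (block size, page size, dictionary settings, codec) comes from the job configuration. A minimal driver sketch that sets those values through ParquetOutputFormat's static setters follows; MyWriteSupport and the output path are placeholders, not part of the original example.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class ParquetWriteDriver {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "parquet-write");
        job.setOutputFormatClass(ParquetOutputFormat.class);
        // Sizes are in bytes; they feed the getLongBlockSize(conf) and
        // getPageSize(conf) reads in getRecordWriter above.
        ParquetOutputFormat.setBlockSize(job, 128 * 1024 * 1024);
        ParquetOutputFormat.setPageSize(job, 1024 * 1024);
        ParquetOutputFormat.setDictionaryPageSize(job, 1024 * 1024);
        ParquetOutputFormat.setEnableDictionary(job, true);
        ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
        // MyWriteSupport stands in for a concrete WriteSupport<T>, the object
        // getWriteSupport(conf) returns inside getRecordWriter.
        ParquetOutputFormat.setWriteSupportClass(job, MyWriteSupport.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/parquet-out"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}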

Aggregations

org.apache.parquet.column.ParquetProperties: 2 examples
java.util.ArrayList: 1 example
org.apache.parquet.VersionParser.ParsedVersion: 1 example
org.apache.parquet.bytes.BytesInput: 1 example
org.apache.parquet.column.ColumnDescriptor: 1 example
org.apache.parquet.column.page.PageWriter: 1 example
org.apache.parquet.column.page.mem.MemPageStore: 1 example
org.apache.parquet.column.statistics.BinaryStatistics: 1 example
org.apache.parquet.column.values.ValuesWriter: 1 example
org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter: 1 example
org.apache.parquet.hadoop.api.WriteSupport.WriteContext: 1 example
org.apache.parquet.io.api.Binary: 1 example
org.apache.parquet.io.api.PrimitiveConverter: 1 example
org.junit.Test: 1 example