Example 1 with WriteContext

Usage of org.apache.parquet.hadoop.api.WriteSupport.WriteContext in the project parquet-mr by apache.

From the class ParquetOutputFormat, the method getRecordWriter:

public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, CompressionCodecName codec) throws IOException, InterruptedException {
    final WriteSupport<T> writeSupport = getWriteSupport(conf);
    ParquetProperties props = ParquetProperties.builder()
            .withPageSize(getPageSize(conf))
            .withDictionaryPageSize(getDictionaryPageSize(conf))
            .withDictionaryEncoding(getEnableDictionary(conf))
            .withWriterVersion(getWriterVersion(conf))
            .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
            .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
            .withMaxRowCountForPageSizeCheck(getMaxRowCountForPageSizeCheck(conf))
            .build();
    long blockSize = getLongBlockSize(conf);
    int maxPaddingSize = getMaxPaddingSize(conf);
    boolean validating = getValidation(conf);
    if (LOG.isInfoEnabled()) {
        LOG.info("Parquet block size to {}", blockSize);
        LOG.info("Parquet page size to {}", props.getPageSizeThreshold());
        LOG.info("Parquet dictionary page size to {}", props.getDictionaryPageSizeThreshold());
        LOG.info("Dictionary is {}", (props.isEnableDictionary() ? "on" : "off"));
        LOG.info("Validation is {}", (validating ? "on" : "off"));
        LOG.info("Writer version is: {}", props.getWriterVersion());
        LOG.info("Maximum row group padding size is {} bytes", maxPaddingSize);
        LOG.info("Page size checking is: {}", (props.estimateNextSizeCheck() ? "estimated" : "constant"));
        LOG.info("Min row count for page size check is: {}", props.getMinRowCountForPageSizeCheck());
        LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck());
    }
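    // WriteSupport.init(conf) produces the WriteContext: the file schema plus
    // application-specific key/value metadata that ends up in the file footer.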
    WriteContext init = writeSupport.init(conf);
    ParquetFileWriter w = new ParquetFileWriter(HadoopOutputFile.fromPath(file, conf), init.getSchema(), Mode.CREATE, blockSize, maxPaddingSize);
    w.start();
    float maxLoad = conf.getFloat(ParquetOutputFormat.MEMORY_POOL_RATIO, MemoryManager.DEFAULT_MEMORY_POOL_RATIO);
    long minAllocation = conf.getLong(ParquetOutputFormat.MIN_MEMORY_ALLOCATION, MemoryManager.DEFAULT_MIN_MEMORY_ALLOCATION);
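    // The MemoryManager is shared by all Parquet writers in this JVM, so it is
    // created at most once, guarded by the class-level lock.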
    synchronized (ParquetOutputFormat.class) {
        if (memoryManager == null) {
            memoryManager = new MemoryManager(maxLoad, minAllocation);
        }
    }
    if (memoryManager.getMemoryPoolRatio() != maxLoad) {
        LOG.warn("The configuration " + MEMORY_POOL_RATIO + " has been set. It should not " + "be reset by the new value: " + maxLoad);
    }
    return new ParquetRecordWriter<T>(w, writeSupport, init.getSchema(), init.getExtraMetaData(), blockSize, codec, validating, props, memoryManager, conf);
}
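The WriteContext consumed above is produced by a WriteSupport implementation. As a minimal sketch (the class name StringWriteSupport and its one-column schema are illustrative assumptions, not part of parquet-mr), a WriteSupport for plain string records could build its WriteContext like this:

import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Hypothetical single-column WriteSupport, shown only to illustrate where
// the WriteContext used by getRecordWriter() comes from.
public class StringWriteSupport extends WriteSupport<String> {

    private static final MessageType SCHEMA = MessageTypeParser.parseMessageType(
            "message example { required binary value (UTF8); }");

    private RecordConsumer recordConsumer;

    @Override
    public WriteContext init(Configuration configuration) {
        // The WriteContext carries the file schema plus arbitrary extra metadata;
        // getRecordWriter() passes both into ParquetFileWriter and ParquetRecordWriter.
        return new WriteContext(SCHEMA, Collections.<String, String>emptyMap());
    }

    @Override
    public void prepareForWrite(RecordConsumer recordConsumer) {
        this.recordConsumer = recordConsumer;
    }

    @Override
    public void write(String record) {
        recordConsumer.startMessage();
        recordConsumer.startField("value", 0);
        recordConsumer.addBinary(Binary.fromString(record));
        recordConsumer.endField("value", 0);
        recordConsumer.endMessage();
    }
}

Registered on the job, this class is what getWriteSupport(conf) instantiates; its init(conf) result supplies both the schema handed to ParquetFileWriter and the extra metadata handed to ParquetRecordWriter.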
Also used: ParquetProperties (org.apache.parquet.column.ParquetProperties), WriteContext (org.apache.parquet.hadoop.api.WriteSupport.WriteContext)
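
On the caller side, a hedged sketch of wiring this into a MapReduce job, so the accessors above (getPageSize(conf), getEnableDictionary(conf), and so on) find their values; the sizes and output path are placeholder choices, and StringWriteSupport is the hypothetical class from the previous sketch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class ParquetJobSetup {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setOutputFormatClass(ParquetOutputFormat.class);
        // Hypothetical WriteSupport from the sketch above.
        ParquetOutputFormat.setWriteSupportClass(job, StringWriteSupport.class);
        // These setters populate the Configuration keys that getRecordWriter() reads back.
        ParquetOutputFormat.setBlockSize(job, 128 * 1024 * 1024); // row group size in bytes
        ParquetOutputFormat.setPageSize(job, 1024 * 1024);        // page size threshold in bytes
        ParquetOutputFormat.setEnableDictionary(job, true);
        ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/parquet-out")); // placeholder path
    }
}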

Aggregations

ParquetProperties (org.apache.parquet.column.ParquetProperties): 1
WriteContext (org.apache.parquet.hadoop.api.WriteSupport.WriteContext): 1