Use of org.apache.parquet.hadoop.api.WriteSupport.WriteContext in the project parquet-mr by Apache.
Shown below: the method getRecordWriter of the class ParquetOutputFormat.
/**
 * Creates a {@link RecordWriter} that writes records to {@code file} using the given codec.
 *
 * <p>All writer settings (page and dictionary page sizes, dictionary encoding, writer version,
 * page-size-check row counts, block size, padding size and validation) are read from
 * {@code conf}. The target file is opened in {@code Mode.CREATE} and {@code start()} is called
 * on it before this method returns.
 *
 * @param conf configuration from which the write support and all writer settings are read
 * @param file path of the file to write
 * @param codec compression codec handed to the returned record writer
 * @return a record writer backed by an already started {@code ParquetFileWriter}
 * @throws IOException declared by the signature; presumably raised when the file cannot be
 *     created or started
 * @throws InterruptedException declared by the signature; nothing in this body visibly raises it
 */
public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, CompressionCodecName codec)
    throws IOException, InterruptedException {
  final WriteSupport<T> writeSupport = getWriteSupport(conf);

  // Column-writer settings, each one resolved from the configuration.
  ParquetProperties props = ParquetProperties.builder()
      .withPageSize(getPageSize(conf))
      .withDictionaryPageSize(getDictionaryPageSize(conf))
      .withDictionaryEncoding(getEnableDictionary(conf))
      .withWriterVersion(getWriterVersion(conf))
      .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
      .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
      .withMaxRowCountForPageSizeCheck(getMaxRowCountForPageSizeCheck(conf))
      .build();

  long blockSize = getLongBlockSize(conf);
  int maxPaddingSize = getMaxPaddingSize(conf);
  boolean validating = getValidation(conf);

  // Dump the effective settings once per writer; skipped entirely when INFO is disabled.
  if (LOG.isInfoEnabled()) {
    LOG.info("Parquet block size to {}", blockSize);
    LOG.info("Parquet page size to {}", props.getPageSizeThreshold());
    LOG.info("Parquet dictionary page size to {}", props.getDictionaryPageSizeThreshold());
    LOG.info("Dictionary is {}", (props.isEnableDictionary() ? "on" : "off"));
    LOG.info("Validation is {}", (validating ? "on" : "off"));
    LOG.info("Writer version is: {}", props.getWriterVersion());
    LOG.info("Maximum row group padding size is {} bytes", maxPaddingSize);
    LOG.info("Page size checking is: {}", (props.estimateNextSizeCheck() ? "estimated" : "constant"));
    LOG.info("Min row count for page size check is: {}", props.getMinRowCountForPageSizeCheck());
    LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck());
  }

  // The write context supplies the schema and the extra metadata used below.
  WriteContext init = writeSupport.init(conf);
  ParquetFileWriter w = new ParquetFileWriter(
      HadoopOutputFile.fromPath(file, conf), init.getSchema(), Mode.CREATE, blockSize, maxPaddingSize);
  w.start();

  float maxLoad = conf.getFloat(
      ParquetOutputFormat.MEMORY_POOL_RATIO, MemoryManager.DEFAULT_MEMORY_POOL_RATIO);
  long minAllocation = conf.getLong(
      ParquetOutputFormat.MIN_MEMORY_ALLOCATION, MemoryManager.DEFAULT_MIN_MEMORY_ALLOCATION);

  // The memory manager is created at most once, under the class lock; later calls reuse it.
  synchronized (ParquetOutputFormat.class) {
    if (memoryManager == null) {
      memoryManager = new MemoryManager(maxLoad, minAllocation);
    }
  }

  // An already existing manager is reused as-is, so a different ratio requested by this
  // configuration is not applied; warn about the mismatch.
  if (memoryManager.getMemoryPoolRatio() != maxLoad) {
    LOG.warn("The configuration {} has been set. It should not be reset by the new value: {}",
        MEMORY_POOL_RATIO, maxLoad);
  }

  return new ParquetRecordWriter<T>(
      w,
      writeSupport,
      init.getSchema(),
      init.getExtraMetaData(),
      blockSize,
      codec,
      validating,
      props,
      memoryManager,
      conf);
}
Aggregations