Example 1 with Mode

Use of org.apache.parquet.hadoop.ParquetFileWriter.Mode in project parquet-mr by apache. Mode is a two-value enum (CREATE, OVERWRITE) that selects whether the writer fails or replaces the file when the output path already exists.

From the class ParquetOutputFormat, the method getRecordWriter:

public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, CompressionCodecName codec, Mode mode) throws IOException, InterruptedException {
    final WriteSupport<T> writeSupport = getWriteSupport(conf);
    // Collect page, dictionary, bloom filter and checksum settings from the Hadoop configuration.
    ParquetProperties.Builder propsBuilder = ParquetProperties.builder()
            .withPageSize(getPageSize(conf))
            .withDictionaryPageSize(getDictionaryPageSize(conf))
            .withDictionaryEncoding(getEnableDictionary(conf))
            .withWriterVersion(getWriterVersion(conf))
            .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
            .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
            .withMaxRowCountForPageSizeCheck(getMaxRowCountForPageSizeCheck(conf))
            .withColumnIndexTruncateLength(getColumnIndexTruncateLength(conf))
            .withStatisticsTruncateLength(getStatisticsTruncateLength(conf))
            .withMaxBloomFilterBytes(getBloomFilterMaxBytes(conf))
            .withBloomFilterEnabled(getBloomFilterEnabled(conf))
            .withPageRowCountLimit(getPageRowCountLimit(conf))
            .withPageWriteChecksumEnabled(getPageWriteChecksumEnabled(conf));
    // Apply per-column overrides of dictionary encoding and bloom filter settings on top of the defaults.
    new ColumnConfigParser()
            .withColumnConfig(ENABLE_DICTIONARY, key -> conf.getBoolean(key, false), propsBuilder::withDictionaryEncoding)
            .withColumnConfig(BLOOM_FILTER_ENABLED, key -> conf.getBoolean(key, false), propsBuilder::withBloomFilterEnabled)
            .withColumnConfig(BLOOM_FILTER_EXPECTED_NDV, key -> conf.getLong(key, -1L), propsBuilder::withBloomFilterNDV)
            .parseConfig(conf);
    ParquetProperties props = propsBuilder.build();
    long blockSize = getLongBlockSize(conf);
    int maxPaddingSize = getMaxPaddingSize(conf);
    boolean validating = getValidation(conf);
    LOG.info("ParquetRecordWriter [block size: {}b, row group padding size: {}b, validating: {}]", blockSize, maxPaddingSize, validating);
    LOG.debug("Parquet properties are:\n{}", props);
    WriteContext fileWriteContext = writeSupport.init(conf);
    FileEncryptionProperties encryptionProperties = createEncryptionProperties(conf, file, fileWriteContext);
    ParquetFileWriter w = new ParquetFileWriter(
            HadoopOutputFile.fromPath(file, conf), fileWriteContext.getSchema(),
            mode, // CREATE fails if the file already exists; OVERWRITE replaces it
            blockSize, maxPaddingSize, props.getColumnIndexTruncateLength(),
            props.getStatisticsTruncateLength(), props.getPageWriteChecksumEnabled(),
            encryptionProperties);
    w.start();
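    // A single JVM-wide MemoryManager is shared by all writers so their combined buffers stay within the configured heap fraction.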
    float maxLoad = conf.getFloat(ParquetOutputFormat.MEMORY_POOL_RATIO, MemoryManager.DEFAULT_MEMORY_POOL_RATIO);
    long minAllocation = conf.getLong(ParquetOutputFormat.MIN_MEMORY_ALLOCATION, MemoryManager.DEFAULT_MIN_MEMORY_ALLOCATION);
    synchronized (ParquetOutputFormat.class) {
        if (memoryManager == null) {
            memoryManager = new MemoryManager(maxLoad, minAllocation);
        }
    }
    if (memoryManager.getMemoryPoolRatio() != maxLoad) {
        LOG.warn("The configuration " + MEMORY_POOL_RATIO + " has been set. It should not " + "be reset by the new value: " + maxLoad);
    }
    return new ParquetRecordWriter<T>(w, writeSupport, fileWriteContext.getSchema(), fileWriteContext.getExtraMetaData(), blockSize, codec, validating, props, memoryManager, conf);
}
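For context, here is a minimal job-side sketch (not part of the parquet-mr source) of how the Configuration values read above are typically populated. It uses the GroupWriteSupport example class that ships with parquet-mr; the schema, sizes, codec, and output path are invented for illustration.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageTypeParser;

public class ParquetJobSetup {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setOutputFormatClass(ParquetOutputFormat.class);
        // getWriteSupport(conf) in the method above instantiates the class registered here.
        ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
        GroupWriteSupport.setSchema(
                MessageTypeParser.parseMessageType("message example { required int64 id; }"),
                job.getConfiguration());
        ParquetOutputFormat.setBlockSize(job, 128 * 1024 * 1024); // read back by getLongBlockSize(conf)
        ParquetOutputFormat.setPageSize(job, 1024 * 1024);        // read back by getPageSize(conf)
        ParquetOutputFormat.setEnableDictionary(job, true);       // read back by getEnableDictionary(conf)
        ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/parquet-out"));
        // Mapper/reducer wiring is omitted; job.waitForCompletion(true) would then run it.
    }
}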
Also used :

RecordWriter (org.apache.hadoop.mapreduce.RecordWriter)
LoggerFactory (org.slf4j.LoggerFactory)
EncryptionPropertiesFactory (org.apache.parquet.crypto.EncryptionPropertiesFactory)
Configuration (org.apache.hadoop.conf.Configuration)
Path (org.apache.hadoop.fs.Path)
ConfigurationUtil (org.apache.parquet.hadoop.util.ConfigurationUtil)
Mode (org.apache.parquet.hadoop.ParquetFileWriter.Mode)
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)
ParquetProperties (org.apache.parquet.column.ParquetProperties)
FileEncryptionProperties (org.apache.parquet.crypto.FileEncryptionProperties)
CodecConfig (org.apache.parquet.hadoop.codec.CodecConfig)
Logger (org.slf4j.Logger)
WriterVersion (org.apache.parquet.column.ParquetProperties.WriterVersion)
HadoopOutputFile (org.apache.parquet.hadoop.util.HadoopOutputFile)
IOException (java.io.IOException)
WriteContext (org.apache.parquet.hadoop.api.WriteSupport.WriteContext)
DEFAULT_BLOOM_FILTER_ENABLED (org.apache.parquet.column.ParquetProperties.DEFAULT_BLOOM_FILTER_ENABLED)
Objects (java.util.Objects)
JobConf (org.apache.hadoop.mapred.JobConf)
FileOutputFormat (org.apache.hadoop.mapreduce.lib.output.FileOutputFormat)
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter)
Job (org.apache.hadoop.mapreduce.Job)
JobContext (org.apache.hadoop.mapreduce.JobContext)
ContextUtil.getConfiguration (org.apache.parquet.hadoop.util.ContextUtil.getConfiguration)
WriteSupport (org.apache.parquet.hadoop.api.WriteSupport)
CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName)
DEFAULT_BLOCK_SIZE (org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE)
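Since this page is indexed under Mode: outside the MapReduce path, the same enum is usually supplied through the ParquetWriter builder API via withWriteMode. A minimal sketch using the example writer bundled with parquet-mr (the schema and file path are invented for illustration):

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ModeSketch {
    public static void main(String[] args) throws Exception {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message example { required binary name (UTF8); }");
        try (ParquetWriter<Group> writer = ExampleParquetWriter
                .builder(new Path("/tmp/mode-example.parquet"))
                .withType(schema)
                // The default mode, CREATE, throws if the file exists; OVERWRITE truncates it.
                .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
                .build()) {
            writer.write(new SimpleGroupFactory(schema).newGroup().append("name", "parquet"));
        }
    }
}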
