Search in sources:

Example 11 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project presto by prestodb.

From the class ParquetTester, method assertRoundTrip:

void assertRoundTrip(List<ObjectInspector> objectInspectors, Iterable<?>[] writeValues, Iterable<?>[] readValues, List<String> columnNames, List<Type> columnTypes, Optional<MessageType> parquetSchema, boolean singleLevelArray) throws Exception {
    for (WriterVersion version : versions) {
        for (CompressionCodecName compressionCodecName : compressions) {
            for (ConnectorSession session : sessions) {
                try (TempFile tempFile = new TempFile("test", "parquet")) {
                    JobConf jobConf = new JobConf();
                    jobConf.setEnum(COMPRESSION, compressionCodecName);
                    jobConf.setBoolean(ENABLE_DICTIONARY, true);
                    jobConf.setEnum(WRITER_VERSION, version);
                    writeParquetColumn(jobConf, tempFile.getFile(), compressionCodecName, createTableProperties(columnNames, objectInspectors), getStandardStructObjectInspector(columnNames, objectInspectors), getIterators(writeValues), parquetSchema, singleLevelArray);
                    assertFileContents(session, tempFile.getFile(), getIterators(readValues), columnNames, columnTypes);
                }
            }
        }
    }
    // write presto parquet
    for (CompressionCodecName compressionCodecName : writerCompressions) {
        for (ConnectorSession session : sessions) {
            try (TempFile tempFile = new TempFile("test", "parquet")) {
                OptionalInt min = stream(writeValues).mapToInt(Iterables::size).min();
                checkState(min.isPresent());
                writeParquetFileFromPresto(tempFile.getFile(), columnTypes, columnNames, getIterators(readValues), min.getAsInt(), compressionCodecName);
                assertFileContents(session, tempFile.getFile(), getIterators(readValues), columnNames, columnTypes);
            }
        }
    }
}
Also used: CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ConnectorSession(com.facebook.presto.spi.ConnectorSession) TestingConnectorSession(com.facebook.presto.testing.TestingConnectorSession) OptionalInt(java.util.OptionalInt) JobConf(org.apache.hadoop.mapred.JobConf) WriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion)
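
For context, here is a minimal, self-contained sketch of the JobConf wiring used above. It is hypothetical, not part of ParquetTester, and it assumes the statically imported COMPRESSION, ENABLE_DICTIONARY, and WRITER_VERSION constants resolve to the ParquetOutputFormat configuration keys.

import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

// Hypothetical helper: builds a JobConf the way assertRoundTrip configures its writer.
public class CodecJobConfSketch {

    public static JobConf withCodec(CompressionCodecName codec, ParquetProperties.WriterVersion version) {
        JobConf jobConf = new JobConf();
        // "parquet.compression": the key the Parquet Hadoop writer reads for the codec
        jobConf.setEnum(ParquetOutputFormat.COMPRESSION, codec);
        // "parquet.enable.dictionary": enable dictionary encoding, as in the test above
        jobConf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, true);
        // "parquet.writer.version": PARQUET_1_0 or PARQUET_2_0
        jobConf.setEnum(ParquetOutputFormat.WRITER_VERSION, version);
        return jobConf;
    }
}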

Example 12 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project presto by prestodb.

From the class ParquetTester, method assertNonHiveWriterRoundTrip:

void assertNonHiveWriterRoundTrip(List<ObjectInspector> objectInspectors, Iterable<?>[] writeValues, Iterable<?>[] readValues, List<String> columnNames, List<Type> columnTypes, org.apache.parquet.schema.MessageType parquetSchema) throws Exception {
    for (WriterVersion version : versions) {
        for (CompressionCodecName compression : compressions) {
            org.apache.parquet.hadoop.metadata.CompressionCodecName compressionCodecName = org.apache.parquet.hadoop.metadata.CompressionCodecName.valueOf(compression.name());
            for (ConnectorSession session : sessions) {
                try (TempFile tempFile = new TempFile("test", "parquet")) {
                    JobConf jobConf = new JobConf();
                    jobConf.setEnum(COMPRESSION, compressionCodecName);
                    jobConf.setBoolean(ENABLE_DICTIONARY, true);
                    jobConf.setEnum(WRITER_VERSION, version);
                    nonHiveParquetWriter(jobConf, tempFile.getFile(), compressionCodecName, getStandardStructObjectInspector(columnNames, objectInspectors), getIterators(writeValues), parquetSchema);
                    assertFileContents(session, tempFile.getFile(), getIterators(readValues), columnNames, columnTypes);
                }
            }
        }
    }
}
Also used: CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ConnectorSession(com.facebook.presto.spi.ConnectorSession) TestingConnectorSession(com.facebook.presto.testing.TestingConnectorSession) JobConf(org.apache.hadoop.mapred.JobConf) WriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion)
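
The valueOf(compression.name()) conversion above maps between two codec enums purely by constant name. A generic sketch of that pattern follows; the class and method names are illustrative, not from the Presto source.

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecNameMapping {

    // Maps any enum constant to Parquet's CompressionCodecName by name.
    // Throws IllegalArgumentException when the constant names do not line up
    // (for example a source enum that uses NONE where Parquet uses UNCOMPRESSED).
    public static CompressionCodecName toParquetCodec(Enum<?> compression) {
        return CompressionCodecName.valueOf(compression.name());
    }
}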

Example 13 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project presto by prestodb.

From the class ParquetTester, method assertMaxReadBytes:

void assertMaxReadBytes(List<ObjectInspector> objectInspectors, Iterable<?>[] writeValues, Iterable<?>[] readValues, List<String> columnNames, List<Type> columnTypes, Optional<MessageType> parquetSchema, DataSize maxReadBlockSize) throws Exception {
    WriterVersion version = PARQUET_1_0;
    CompressionCodecName compressionCodecName = UNCOMPRESSED;
    HiveClientConfig config = new HiveClientConfig().setHiveStorageFormat(HiveStorageFormat.PARQUET).setUseParquetColumnNames(false).setParquetMaxReadBlockSize(maxReadBlockSize);
    ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig(), new CacheConfig()).getSessionProperties());
    try (TempFile tempFile = new TempFile("test", "parquet")) {
        JobConf jobConf = new JobConf();
        jobConf.setEnum(COMPRESSION, compressionCodecName);
        jobConf.setBoolean(ENABLE_DICTIONARY, true);
        jobConf.setEnum(WRITER_VERSION, version);
        writeParquetColumn(jobConf, tempFile.getFile(), compressionCodecName, createTableProperties(columnNames, objectInspectors), getStandardStructObjectInspector(columnNames, objectInspectors), getIterators(writeValues), parquetSchema, false);
        Iterator<?>[] expectedValues = getIterators(readValues);
        try (ConnectorPageSource pageSource = getFileFormat().createFileFormatReader(session, HDFS_ENVIRONMENT, tempFile.getFile(), columnNames, columnTypes)) {
            assertPageSource(columnTypes, expectedValues, pageSource, Optional.of(getParquetMaxReadBlockSize(session).toBytes()));
            assertFalse(stream(expectedValues).allMatch(Iterator::hasNext));
        }
    }
}
Also used: TestingConnectorSession(com.facebook.presto.testing.TestingConnectorSession) OrcFileWriterConfig(com.facebook.presto.hive.OrcFileWriterConfig) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) HiveSessionProperties(com.facebook.presto.hive.HiveSessionProperties) WriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) AbstractIterator(com.google.common.collect.AbstractIterator) Iterator(java.util.Iterator) ConnectorSession(com.facebook.presto.spi.ConnectorSession) CacheConfig(com.facebook.presto.cache.CacheConfig) JobConf(org.apache.hadoop.mapred.JobConf) ParquetFileWriterConfig(com.facebook.presto.hive.ParquetFileWriterConfig) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig)
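
assertMaxReadBytes pins the codec to UNCOMPRESSED so the read-size assertion is not skewed by decompression. As a side note, CompressionCodecName also exposes accessors that are useful when wiring a codec into Hadoop; the standalone sketch below (not from the Presto test) shows two of them, under the assumption that the usual parquet-hadoop enum methods are available.

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecIntrospectionSketch {

    public static void main(String[] args) {
        CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
        // File name suffix Parquet associates with the codec (empty for UNCOMPRESSED, ".gz" for GZIP, ...)
        System.out.println("extension: " + codec.getExtension());
        // Fully qualified Hadoop codec class name, or null for UNCOMPRESSED
        System.out.println("hadoop codec: " + codec.getHadoopCompressionCodecClassName());
    }
}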

Example 14 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project nifi by apache.

From the class PutParquet, method applyCommonConfig:

private void applyCommonConfig(final ParquetWriter.Builder<?, ?> builder, final ProcessContext context, final FlowFile flowFile, final Configuration conf) {
    builder.withConf(conf);
    // Required properties
    final boolean overwrite = context.getProperty(OVERWRITE).asBoolean();
    final ParquetFileWriter.Mode mode = overwrite ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE;
    builder.withWriteMode(mode);
    final PropertyDescriptor compressionTypeDescriptor = getPropertyDescriptor(COMPRESSION_TYPE.getName());
    final String compressionTypeValue = context.getProperty(compressionTypeDescriptor).getValue();
    final CompressionCodecName codecName = CompressionCodecName.valueOf(compressionTypeValue);
    builder.withCompressionCodec(codecName);
    if (context.getProperty(ROW_GROUP_SIZE).isSet()) {
        try {
            final Double rowGroupSize = context.getProperty(ROW_GROUP_SIZE).evaluateAttributeExpressions(flowFile).asDataSize(DataUnit.B);
            if (rowGroupSize != null) {
                builder.withRowGroupSize(rowGroupSize.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + ROW_GROUP_SIZE.getDisplayName(), e);
        }
    }
    if (context.getProperty(PAGE_SIZE).isSet()) {
        try {
            final Double pageSize = context.getProperty(PAGE_SIZE).evaluateAttributeExpressions(flowFile).asDataSize(DataUnit.B);
            if (pageSize != null) {
                builder.withPageSize(pageSize.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + PAGE_SIZE.getDisplayName(), e);
        }
    }
    if (context.getProperty(DICTIONARY_PAGE_SIZE).isSet()) {
        try {
            final Double dictionaryPageSize = context.getProperty(DICTIONARY_PAGE_SIZE).evaluateAttributeExpressions(flowFile).asDataSize(DataUnit.B);
            if (dictionaryPageSize != null) {
                builder.withDictionaryPageSize(dictionaryPageSize.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + DICTIONARY_PAGE_SIZE.getDisplayName(), e);
        }
    }
    if (context.getProperty(MAX_PADDING_SIZE).isSet()) {
        try {
            final Double maxPaddingSize = context.getProperty(MAX_PADDING_SIZE).evaluateAttributeExpressions(flowFile).asDataSize(DataUnit.B);
            if (maxPaddingSize != null) {
                builder.withMaxPaddingSize(maxPaddingSize.intValue());
            }
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Invalid data size for " + MAX_PADDING_SIZE.getDisplayName(), e);
        }
    }
    if (context.getProperty(ENABLE_DICTIONARY_ENCODING).isSet()) {
        final boolean enableDictionaryEncoding = context.getProperty(ENABLE_DICTIONARY_ENCODING).asBoolean();
        builder.withDictionaryEncoding(enableDictionaryEncoding);
    }
    if (context.getProperty(ENABLE_VALIDATION).isSet()) {
        final boolean enableValidation = context.getProperty(ENABLE_VALIDATION).asBoolean();
        builder.withValidation(enableValidation);
    }
    if (context.getProperty(WRITER_VERSION).isSet()) {
        final String writerVersionValue = context.getProperty(WRITER_VERSION).getValue();
        builder.withWriterVersion(ParquetProperties.WriterVersion.valueOf(writerVersionValue));
    }
}
Also used: ParquetFileWriter(org.apache.parquet.hadoop.ParquetFileWriter) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName)
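
applyCommonConfig can call CompressionCodecName.valueOf directly because the processor's compression property is presumably restricted to a fixed set of allowable values. When the string comes from a less controlled source, a defensive variant of the same lookup might look like this (hypothetical helper, not NiFi code):

import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CompressionTypeParsing {

    // Resolves a user-supplied codec name, falling back instead of letting valueOf() throw.
    public static CompressionCodecName parseOrDefault(String value, CompressionCodecName fallback) {
        if (value == null) {
            return fallback;
        }
        try {
            return CompressionCodecName.valueOf(value.trim().toUpperCase());
        } catch (IllegalArgumentException e) {
            return fallback;
        }
    }
}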

Example 15 with CompressionCodecName

Use of org.apache.parquet.hadoop.metadata.CompressionCodecName in project parquet-mr by apache.

From the class ConvertCommand, method run:

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() == 1, "A data file is required.");
    String source = targets.get(0);
    CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);
    Schema schema;
    if (avroSchemaFile != null) {
        schema = Schemas.fromAvsc(open(avroSchemaFile));
    } else {
        schema = getAvroSchema(source);
    }
    Schema projection = filterSchema(schema, columns);
    Path outPath = qualifiedPath(outputPath);
    FileSystem outFS = outPath.getFileSystem(getConf());
    if (overwrite && outFS.exists(outPath)) {
        console.debug("Deleting output file {} (already exists)", outPath);
        outFS.delete(outPath);
    }
    Iterable<Record> reader = openDataFile(source, projection);
    boolean threw = true;
    long count = 0;
    try {
        try (ParquetWriter<Record> writer = AvroParquetWriter.<Record>builder(qualifiedPath(outputPath))
                .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
                .withConf(getConf())
                .withCompressionCodec(codec)
                .withRowGroupSize(rowGroupSize)
                .withDictionaryPageSize(dictionaryPageSize < 64 ? 64 : dictionaryPageSize)
                .withDictionaryEncoding(dictionaryPageSize != 0)
                .withPageSize(pageSize)
                .withDataModel(GenericData.get())
                .withSchema(projection)
                .build()) {
            for (Record record : reader) {
                writer.write(record);
                count += 1;
            }
        }
        threw = false;
    } catch (RuntimeException e) {
        throw new RuntimeException("Failed on record " + count, e);
    } finally {
        if (reader instanceof Closeable) {
            Closeables.close((Closeable) reader, threw);
        }
    }
    return 0;
}
Also used: Path(org.apache.hadoop.fs.Path) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Schema(org.apache.avro.Schema) Expressions.filterSchema(org.apache.parquet.cli.util.Expressions.filterSchema) FileSystem(org.apache.hadoop.fs.FileSystem) Closeable(java.io.Closeable) Record(org.apache.avro.generic.GenericData.Record)
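
Reduced to just the codec-related pieces, the writer construction in run() follows the standard parquet-avro builder pattern. A minimal sketch, assuming parquet-avro and hadoop-common on the classpath; the class and method names here are illustrative, and the codec could come from Codecs.parquetCodec(name) as in ConvertCommand above.

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class AvroWriterSketch {

    public static ParquetWriter<GenericRecord> openWriter(Path out, Schema schema, CompressionCodecName codec) throws IOException {
        return AvroParquetWriter.<GenericRecord>builder(out)
                .withSchema(schema)
                // e.g. CompressionCodecName.SNAPPY
                .withCompressionCodec(codec)
                .withConf(new Configuration())
                .build();
    }
}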

Aggregations

CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 25
DrillBuf (io.netty.buffer.DrillBuf): 6
ByteBuffer (java.nio.ByteBuffer): 6
Configuration (org.apache.hadoop.conf.Configuration): 6
Path (org.apache.hadoop.fs.Path): 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 5
ConnectorSession (com.facebook.presto.spi.ConnectorSession): 4
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch): 4
FileSystem (org.apache.hadoop.fs.FileSystem): 4
MessageType (org.apache.parquet.schema.MessageType): 4
TestingConnectorSession (com.facebook.presto.testing.TestingConnectorSession): 3
IOException (java.io.IOException): 3
JobConf (org.apache.hadoop.mapred.JobConf): 3
WriterVersion (org.apache.parquet.column.ParquetProperties.WriterVersion): 3
ParquetFileWriter (org.apache.parquet.hadoop.ParquetFileWriter): 3
HiveClientConfig (com.facebook.presto.hive.HiveClientConfig): 2
Stopwatch (com.google.common.base.Stopwatch): 2
HashMap (java.util.HashMap): 2
HashSet (java.util.HashSet): 2
List (java.util.List): 2