Search in sources:

Example 11 with GroupWriteSupport

Use of org.apache.parquet.hadoop.example.GroupWriteSupport in the project incubator-gobblin by Apache.

From the class ParquetDataWriterBuilder, method getVersionSpecificWriter.

/**
 * Build a version-specific {@link ParquetWriter} for the given {@link ParquetWriterConfiguration},
 * wrapped in a {@link ParquetWriterShim} so callers can write and close without knowing which
 * concrete writer (Group, Avro, or Protobuf) was created.
 *
 * @param writerConfiguration writer settings: record format, codec, block/page sizes,
 *                            dictionary/validation flags, writer version, and staging file path
 * @return a {@link ParquetWriterShim} delegating to the format-appropriate {@link ParquetWriter}
 * @throws IOException if the underlying Parquet writer cannot be created
 * @throws RuntimeException if the configured record format is not GROUP, AVRO, or PROTOBUF
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration) throws IOException {
    CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
    ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion.fromString(writerConfiguration.getWriterVersion());
    Configuration conf = new Configuration();
    // Blank final: each switch arm assigns exactly once and default throws, so the local is
    // definitely assigned and effectively final — capturable by the anonymous shim below
    // without a separate alias variable.
    final ParquetWriter versionSpecificWriter;
    switch (writerConfiguration.getRecordFormat()) {
        case GROUP:
            {
                // GroupWriteSupport reads the schema from the Hadoop Configuration, so it must
                // be set before the writer is constructed.
                GroupWriteSupport.setSchema((MessageType) this.schema, conf);
                WriteSupport<Group> support = new GroupWriteSupport();
                versionSpecificWriter = new ParquetWriter<Group>(writerConfiguration.getAbsoluteStagingFile(), support, codecName, writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(), writerConfiguration.getDictPageSize(), writerConfiguration.isDictionaryEnabled(), writerConfiguration.isValidate(), writerVersion, conf);
                break;
            }
        case AVRO:
            {
                versionSpecificWriter = new AvroParquetWriter(writerConfiguration.getAbsoluteStagingFile(), (Schema) this.schema, codecName, writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(), writerConfiguration.isDictionaryEnabled(), conf);
                break;
            }
        case PROTOBUF:
            {
                versionSpecificWriter = new ProtoParquetWriter(writerConfiguration.getAbsoluteStagingFile(), (Class<? extends Message>) this.schema, codecName, writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(), writerConfiguration.isDictionaryEnabled(), writerConfiguration.isValidate());
                break;
            }
        default:
            // Include the offending format in the message so the failure is diagnosable.
            throw new RuntimeException("Record format not supported: " + writerConfiguration.getRecordFormat());
    }
    return new ParquetWriterShim() {

        @Override
        public void write(Object record) throws IOException {
            versionSpecificWriter.write(record);
        }

        @Override
        public void close() throws IOException {
            versionSpecificWriter.close();
        }
    };
}
Also used : ParquetWriterConfiguration(org.apache.gobblin.parquet.writer.ParquetWriterConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ProtoParquetWriter(org.apache.parquet.proto.ProtoParquetWriter) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter) ParquetProperties(org.apache.parquet.column.ParquetProperties) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) WriteSupport(org.apache.parquet.hadoop.api.WriteSupport) AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter) ProtoParquetWriter(org.apache.parquet.proto.ProtoParquetWriter) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) ParquetWriterShim(org.apache.gobblin.parquet.writer.ParquetWriterShim) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) MessageType(org.apache.parquet.schema.MessageType)

Example 12 with GroupWriteSupport

Use of org.apache.parquet.hadoop.example.GroupWriteSupport in the project parquet-mr by Apache.

From the class TestParquetFileWriter, method testWriteReadStatisticsAllNulls.

@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
    // This test only makes sense when statistics will actually be read back for the version/type.
    Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));
    File testFile = temp.newFile();
    // Remove the empty file created by the temp rule: ParquetWriter requires the target not to
    // exist. The return value is deliberately ignored — newFile() just created it.
    testFile.delete();
    writeSchema = "message example {\n" + "required binary content (UTF8);\n" + "}";
    Path path = new Path(testFile.toURI());
    MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
    Configuration configuration = new Configuration();
    configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
    // GroupWriteSupport picks the schema up from the Configuration.
    GroupWriteSupport.setSchema(schema, configuration);
    // try-with-resources guarantees the writer (and its file handle) is closed and the footer
    // flushed even if write() throws — the original leaked the writer on failure.
    try (ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport())) {
        // A group with no value set for the required binary column yields a null in statistics.
        Group r1 = new SimpleGroup(schema);
        writer.write(r1);
    }
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    // assert the statistics object is not empty
    org.apache.parquet.column.statistics.Statistics stats = readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
    assertFalse("is empty: " + stats, stats.isEmpty());
    // assert the number of nulls is correct for the first (only) block
    assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) Configuration(org.apache.hadoop.conf.Configuration) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Aggregations

GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport)12 Group (org.apache.parquet.example.data.Group)11 Configuration (org.apache.hadoop.conf.Configuration)10 ParquetWriter (org.apache.parquet.hadoop.ParquetWriter)10 MessageType (org.apache.parquet.schema.MessageType)9 Path (org.apache.hadoop.fs.Path)7 SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory)7 File (java.io.File)6 MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType)4 AvroParquetWriter (org.apache.parquet.avro.AvroParquetWriter)3 Test (org.junit.Test)3 ParquetProperties (org.apache.parquet.column.ParquetProperties)2 SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup)2 CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName)2 HashMap (java.util.HashMap)1 OperatorTest (org.apache.drill.categories.OperatorTest)1 UnlikelyTest (org.apache.drill.categories.UnlikelyTest)1 ParquetWriterConfiguration (org.apache.gobblin.parquet.writer.ParquetWriterConfiguration)1 ParquetWriterShim (org.apache.gobblin.parquet.writer.ParquetWriterShim)1 FileSystem (org.apache.hadoop.fs.FileSystem)1