Use of org.apache.parquet.hadoop.example.GroupWriteSupport in project incubator-gobblin by apache.
The class ParquetDataWriterBuilder, method getVersionSpecificWriter.
/**
 * Build a version-specific {@link ParquetWriter} for given {@link ParquetWriterConfiguration}
 * @param writerConfiguration
 * @return
 * @throws IOException
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration) throws IOException {
  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion =
      ParquetProperties.WriterVersion.fromString(writerConfiguration.getWriterVersion());
  Configuration conf = new Configuration();
  ParquetWriter versionSpecificWriter = null;
  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      versionSpecificWriter = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(), support, codecName,
          writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(), writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(), writerVersion, conf);
      break;
    }
    case AVRO: {
      versionSpecificWriter = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(), (Schema) this.schema, codecName,
          writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(), conf);
      break;
    }
    case PROTOBUF: {
      versionSpecificWriter = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(), (Class<? extends Message>) this.schema, codecName,
          writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(), writerConfiguration.isValidate());
      break;
    }
    default:
      throw new RuntimeException("Record format not supported");
  }
  ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;
  return new ParquetWriterShim() {
    @Override
    public void write(Object record) throws IOException {
      finalVersionSpecificWriter.write(record);
    }

    @Override
    public void close() throws IOException {
      finalVersionSpecificWriter.close();
    }
  };
}
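For reference, the GROUP branch above relies on one of the deprecated ParquetWriter constructors. On newer parquet-mr releases the same GroupWriteSupport-backed writer can be obtained through the ExampleParquetWriter builder. The following is a minimal sketch, not taken from the gobblin source; the schema string and output path are made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GroupWriterSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative schema and path, not from the original code
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required binary content (UTF8); }");

    // Builder-based equivalent of the GROUP branch: ExampleParquetWriter wires up GroupWriteSupport internally
    ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path("/tmp/example.parquet"))
        .withConf(new Configuration())
        .withType(schema)
        .withCompressionCodec(CompressionCodecName.SNAPPY)
        .withDictionaryEncoding(true)
        .build();

    // Write a single Group record and close the file
    Group record = new SimpleGroupFactory(schema).newGroup().append("content", "hello");
    writer.write(record);
    writer.close();
  }
}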
Use of org.apache.parquet.hadoop.example.GroupWriteSupport in project parquet-mr by apache.
The class TestParquetFileWriter, method testWriteReadStatisticsAllNulls.
@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));

  File testFile = temp.newFile();
  testFile.delete();

  writeSchema = "message example {\n" +
      "required binary content (UTF8);\n" +
      "}";

  Path path = new Path(testFile.toURI());

  MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  GroupWriteSupport.setSchema(schema, configuration);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport());

  Group r1 = new SimpleGroup(schema);
  writer.write(r1);
  writer.close();

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);

  // assert the statistics object is not empty
  org.apache.parquet.column.statistics.Statistics stats =
      readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
  assertFalse("is empty: " + stats, stats.isEmpty());
  // assert the number of nulls is correct for the first block
  assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}
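The record r1 is written without a value for the content column, which is why the footer statistics report one null. As a complement, a hedged read-side sketch (not part of the original test): GroupReadSupport is the reading counterpart of GroupWriteSupport, and a hypothetical helper such as readBackAllNulls below could confirm that the stored record carries no value for that column. The helper name and its reuse of the test's configuration and path are assumptions.

// Assumption: a hypothetical helper, not present in TestParquetFileWriter,
// reusing the configuration and path from the test above.
private void readBackAllNulls(Configuration configuration, Path path) throws IOException {
  try (ParquetReader<Group> reader =
           ParquetReader.builder(new GroupReadSupport(), path).withConf(configuration).build()) {
    Group first = reader.read();
    // the record was written with no value for "content", hence the null count of 1 in the statistics
    assertEquals(0, first.getFieldRepetitionCount("content"));
    assertNull("expected exactly one record", reader.read());
  }
}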