Use of org.apache.parquet.hadoop.api.WriteSupport in project flink by apache.
From the class ParquetWriterUtil, method writeParquetFile:
public static void writeParquetFile(Path path, MessageType schema, List<Row> records, int rowGroupSize) throws IOException {
    // Minimal WriteSupport that maps Flink Row fields onto the Parquet schema by position.
    WriteSupport<Row> support = new WriteSupport<Row>() {

        private RecordConsumer consumer;

        @Override
        public WriteContext init(Configuration configuration) {
            return new WriteContext(schema, new HashMap<>());
        }

        @Override
        public void prepareForWrite(RecordConsumer consumer) {
            this.consumer = consumer;
        }

        @Override
        public void write(Row row) {
            consumer.startMessage();
            for (int i = 0; i < row.getArity(); i++) {
                PrimitiveType type = schema.getColumns().get(i).getPrimitiveType();
                Object field = row.getField(i);
                if (field != null) {
                    // Field names follow the positional "f" + i convention.
                    consumer.startField("f" + i, i);
                    switch (type.getPrimitiveTypeName()) {
                        case INT64:
                            consumer.addLong(((Number) field).longValue());
                            break;
                        case INT32:
                            consumer.addInteger(((Number) field).intValue());
                            break;
                        case BOOLEAN:
                            consumer.addBoolean((Boolean) field);
                            break;
                        case BINARY:
                            if (field instanceof String) {
                                field = ((String) field).getBytes(StandardCharsets.UTF_8);
                            } else if (field instanceof BigDecimal) {
                                field = ((BigDecimal) field).unscaledValue().toByteArray();
                            }
                            consumer.addBinary(Binary.fromConstantByteArray((byte[]) field));
                            break;
                        case FLOAT:
                            consumer.addFloat(((Number) field).floatValue());
                            break;
                        case DOUBLE:
                            consumer.addDouble(((Number) field).doubleValue());
                            break;
                        case INT96:
                            consumer.addBinary(timestampToInt96((LocalDateTime) field));
                            break;
                        case FIXED_LEN_BYTE_ARRAY:
                            // Sign-extend the decimal's unscaled value to a fixed 16-byte array.
                            byte[] bytes = ((BigDecimal) field).unscaledValue().toByteArray();
                            byte signByte = (byte) (bytes[0] < 0 ? -1 : 0);
                            int numBytes = 16;
                            byte[] newBytes = new byte[numBytes];
                            Arrays.fill(newBytes, 0, numBytes - bytes.length, signByte);
                            System.arraycopy(bytes, 0, newBytes, numBytes - bytes.length, bytes.length);
                            consumer.addBinary(Binary.fromConstantByteArray(newBytes));
                            break;
                    }
                    consumer.endField("f" + i, i);
                }
            }
            consumer.endMessage();
        }
    };
    ParquetWriter<Row> writer = new ParquetWriterBuilder(new org.apache.hadoop.fs.Path(path.getPath()), support)
            .withRowGroupSize(rowGroupSize)
            .build();
    for (Row record : records) {
        writer.write(record);
    }
    writer.close();
}
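
A minimal caller sketch, not part of the Flink source: it assumes a two-column schema whose field names f0/f1 line up with the positional "f" + i names used in write() above, and an output path chosen purely for illustration.

// Hypothetical caller: writes two rows with an INT32 column and a UTF-8 BINARY column.
MessageType schema = MessageTypeParser.parseMessageType(
        "message test { optional int32 f0; optional binary f1; }");
List<Row> rows = Arrays.asList(Row.of(1, "first"), Row.of(2, "second"));
// "/tmp/rows.parquet" is an assumed location for this sketch.
ParquetWriterUtil.writeParquetFile(new Path("/tmp/rows.parquet"), schema, rows, 4096);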
Use of org.apache.parquet.hadoop.api.WriteSupport in project incubator-gobblin by apache.
From the class ParquetDataWriterBuilder, method getVersionSpecificWriter:
/**
 * Build a version-specific {@link ParquetWriter} for the given {@link ParquetWriterConfiguration}.
 * @param writerConfiguration writer configuration carrying the record format, schema, codec, and size settings
 * @return a {@link ParquetWriterShim} wrapping the underlying version-specific writer
 * @throws IOException if the underlying writer cannot be created
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration) throws IOException {
    CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
    ParquetProperties.WriterVersion writerVersion =
            ParquetProperties.WriterVersion.fromString(writerConfiguration.getWriterVersion());
    Configuration conf = new Configuration();
    ParquetWriter versionSpecificWriter = null;
    switch (writerConfiguration.getRecordFormat()) {
        case GROUP: {
            GroupWriteSupport.setSchema((MessageType) this.schema, conf);
            WriteSupport support = new GroupWriteSupport();
            versionSpecificWriter = new ParquetWriter<Group>(
                    writerConfiguration.getAbsoluteStagingFile(), support, codecName,
                    writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
                    writerConfiguration.getDictPageSize(), writerConfiguration.isDictionaryEnabled(),
                    writerConfiguration.isValidate(), writerVersion, conf);
            break;
        }
        case AVRO: {
            versionSpecificWriter = new AvroParquetWriter(
                    writerConfiguration.getAbsoluteStagingFile(), (Schema) this.schema, codecName,
                    writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
                    writerConfiguration.isDictionaryEnabled(), conf);
            break;
        }
        case PROTOBUF: {
            versionSpecificWriter = new ProtoParquetWriter(
                    writerConfiguration.getAbsoluteStagingFile(), (Class<? extends Message>) this.schema, codecName,
                    writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
                    writerConfiguration.isDictionaryEnabled(), writerConfiguration.isValidate());
            break;
        }
        default:
            throw new RuntimeException("Record format not supported");
    }
    ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;
    return new ParquetWriterShim() {
        @Override
        public void write(Object record) throws IOException {
            finalVersionSpecificWriter.write(record);
        }

        @Override
        public void close() throws IOException {
            finalVersionSpecificWriter.close();
        }
    };
}
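
A short usage sketch, not from the Gobblin source: the returned shim hides which concrete ParquetWriter was built, so a caller only deals with write and close. The builder and records variables here are assumed for illustration.

// Hypothetical caller: 'builder' is a configured ParquetDataWriterBuilder, 'records' an Iterable of records.
ParquetWriterShim shim = builder.getVersionSpecificWriter(writerConfiguration);
try {
    for (Object record : records) {
        shim.write(record);
    }
} finally {
    shim.close();
}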