
Example 1 with WriteSupport

Use of org.apache.parquet.hadoop.api.WriteSupport in the project flink by apache.

From the class ParquetWriterUtil, the method writeParquetFile:

public static void writeParquetFile(Path path, MessageType schema, List<Row> records, int rowGroupSize) throws IOException {
    // Anonymous WriteSupport that maps Flink Row fields positionally onto the Parquet schema.
    WriteSupport<Row> support = new WriteSupport<Row>() {

        private RecordConsumer consumer;

        @Override
        public WriteContext init(Configuration configuration) {
            return new WriteContext(schema, new HashMap<>());
        }

        @Override
        public void prepareForWrite(RecordConsumer consumer) {
            this.consumer = consumer;
        }

        @Override
        public void write(Row row) {
            consumer.startMessage();
            for (int i = 0; i < row.getArity(); i++) {
                PrimitiveType type = schema.getColumns().get(i).getPrimitiveType();
                Object field = row.getField(i);
                if (field != null) {
                    consumer.startField("f" + i, i);
                    switch(type.getPrimitiveTypeName()) {
                        case INT64:
                            consumer.addLong(((Number) field).longValue());
                            break;
                        case INT32:
                            consumer.addInteger(((Number) field).intValue());
                            break;
                        case BOOLEAN:
                            consumer.addBoolean((Boolean) field);
                            break;
                        case BINARY:
                            if (field instanceof String) {
                                field = ((String) field).getBytes(StandardCharsets.UTF_8);
                            } else if (field instanceof BigDecimal) {
                                field = ((BigDecimal) field).unscaledValue().toByteArray();
                            }
                            consumer.addBinary(Binary.fromConstantByteArray((byte[]) field));
                            break;
                        case FLOAT:
                            consumer.addFloat(((Number) field).floatValue());
                            break;
                        case DOUBLE:
                            consumer.addDouble(((Number) field).doubleValue());
                            break;
                        case INT96:
                            // timestampToInt96 is a helper defined elsewhere in this class;
                            // it encodes the LocalDateTime as a 12-byte INT96 value.
                            consumer.addBinary(timestampToInt96((LocalDateTime) field));
                            break;
                        case FIXED_LEN_BYTE_ARRAY:
                            // Decimals are written as the unscaled value, sign-extended
                            // to a fixed-width 16-byte big-endian array.
                            byte[] bytes = ((BigDecimal) field).unscaledValue().toByteArray();
                            byte signByte = (byte) (bytes[0] < 0 ? -1 : 0);
                            int numBytes = 16;
                            byte[] newBytes = new byte[numBytes];
                            Arrays.fill(newBytes, 0, numBytes - bytes.length, signByte);
                            System.arraycopy(bytes, 0, newBytes, numBytes - bytes.length, bytes.length);
                            consumer.addBinary(Binary.fromConstantByteArray(newBytes));
                            break;
                    }
                    consumer.endField("f" + i, i);
                }
            }
            consumer.endMessage();
        }
    };
    ParquetWriter<Row> writer = new ParquetWriterBuilder(
            new org.apache.hadoop.fs.Path(path.getPath()), support)
            .withRowGroupSize(rowGroupSize)
            .build();
    for (Row record : records) {
        writer.write(record);
    }
    writer.close();
}
Also used : LocalDateTime(java.time.LocalDateTime), BigDecimal(java.math.BigDecimal), Configuration(org.apache.hadoop.conf.Configuration), WriteSupport(org.apache.parquet.hadoop.api.WriteSupport), RecordConsumer(org.apache.parquet.io.api.RecordConsumer), PrimitiveType(org.apache.parquet.schema.PrimitiveType), Row(org.apache.flink.types.Row)
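
For context, here is a minimal usage sketch of the utility above, as it might be called from a test. It is an illustration, not project code: the file path, schema string, and row values are hypothetical, and it assumes Path is org.apache.flink.core.fs.Path (consistent with the path.getPath() conversion above). The field names must follow the "f" + i convention that the anonymous WriteSupport emits in startField/endField.

import java.util.Collections;

import org.apache.flink.core.fs.Path;
import org.apache.flink.types.Row;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriteParquetFileDemo {

    public static void main(String[] args) throws Exception {
        // Field names f0, f1, ... match the "f" + i names used by the WriteSupport.
        MessageType schema = MessageTypeParser.parseMessageType(
                "message row { optional int32 f0; optional binary f1; }");
        // The BINARY branch converts String (and BigDecimal) fields itself,
        // so the Row can carry those types directly instead of raw byte[].
        Row row = Row.of(42, "hello");
        // 128 MB row groups, matching Parquet's default block size.
        ParquetWriterUtil.writeParquetFile(
                new Path("/tmp/demo.parquet"), schema,
                Collections.singletonList(row), 128 * 1024 * 1024);
    }
}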

Example 2 with WriteSupport

Use of org.apache.parquet.hadoop.api.WriteSupport in the project incubator-gobblin by apache.

From the class ParquetDataWriterBuilder, the method getVersionSpecificWriter:

/**
 * Build a version-specific {@link ParquetWriter} for the given {@link ParquetWriterConfiguration}.
 *
 * @param writerConfiguration the configuration that selects the record format, codec, and sizes
 * @return a {@link ParquetWriterShim} delegating to the version-specific writer
 * @throws IOException if the underlying writer cannot be created
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration) throws IOException {
    CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
    ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion.fromString(writerConfiguration.getWriterVersion());
    Configuration conf = new Configuration();
    ParquetWriter versionSpecificWriter = null;
    switch(writerConfiguration.getRecordFormat()) {
        case GROUP:
            {
                GroupWriteSupport.setSchema((MessageType) this.schema, conf);
                WriteSupport support = new GroupWriteSupport();
                versionSpecificWriter = new ParquetWriter<Group>(
                        writerConfiguration.getAbsoluteStagingFile(), support, codecName,
                        writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
                        writerConfiguration.getDictPageSize(), writerConfiguration.isDictionaryEnabled(),
                        writerConfiguration.isValidate(), writerVersion, conf);
                break;
            }
        case AVRO:
            {
                versionSpecificWriter = new AvroParquetWriter(
                        writerConfiguration.getAbsoluteStagingFile(), (Schema) this.schema, codecName,
                        writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
                        writerConfiguration.isDictionaryEnabled(), conf);
                break;
            }
        case PROTOBUF:
            {
                versionSpecificWriter = new ProtoParquetWriter(
                        writerConfiguration.getAbsoluteStagingFile(), (Class<? extends Message>) this.schema,
                        codecName, writerConfiguration.getBlockSize(), writerConfiguration.getPageSize(),
                        writerConfiguration.isDictionaryEnabled(), writerConfiguration.isValidate());
                break;
            }
        default:
            throw new RuntimeException("Record format not supported");
    }
    ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;
    return new ParquetWriterShim() {

        @Override
        public void write(Object record) throws IOException {
            finalVersionSpecificWriter.write(record);
        }

        @Override
        public void close() throws IOException {
            finalVersionSpecificWriter.close();
        }
    };
}
Also used : ParquetWriterConfiguration(org.apache.gobblin.parquet.writer.ParquetWriterConfiguration), ParquetWriterShim(org.apache.gobblin.parquet.writer.ParquetWriterShim), Configuration(org.apache.hadoop.conf.Configuration), ParquetWriter(org.apache.parquet.hadoop.ParquetWriter), AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter), ProtoParquetWriter(org.apache.parquet.proto.ProtoParquetWriter), GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport), WriteSupport(org.apache.parquet.hadoop.api.WriteSupport), ParquetProperties(org.apache.parquet.column.ParquetProperties), CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName), MessageType(org.apache.parquet.schema.MessageType)
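
As a point of comparison, the GROUP branch above can be reproduced standalone with parquet-hadoop's builder API instead of the deprecated ParquetWriter constructor. This is a minimal sketch under assumed inputs (the path and schema are hypothetical), not Gobblin code:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GroupWriteDemo {

    public static void main(String[] args) throws Exception {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message example { required int32 id; required binary name (UTF8); }");
        // GroupWriteSupport reads its schema from the Configuration,
        // just as the GROUP branch calls GroupWriteSupport.setSchema above.
        Configuration conf = new Configuration();
        GroupWriteSupport.setSchema(schema, conf);
        try (ParquetWriter<Group> writer = ExampleParquetWriter
                .builder(new Path("/tmp/group-demo.parquet"))
                .withConf(conf)
                .withCompressionCodec(CompressionCodecName.SNAPPY)
                .build()) {
            Group group = new SimpleGroupFactory(schema).newGroup()
                    .append("id", 1)
                    .append("name", "alice");
            writer.write(group);
        }
    }
}

Whichever writer the switch produced, the returned ParquetWriterShim hides it behind write(Object) and close(), so callers are insulated from the format-specific constructors.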

Aggregations

Configuration (org.apache.hadoop.conf.Configuration): 2
WriteSupport (org.apache.parquet.hadoop.api.WriteSupport): 2
BigDecimal (java.math.BigDecimal): 1
LocalDateTime (java.time.LocalDateTime): 1
Row (org.apache.flink.types.Row): 1
ParquetWriterConfiguration (org.apache.gobblin.parquet.writer.ParquetWriterConfiguration): 1
ParquetWriterShim (org.apache.gobblin.parquet.writer.ParquetWriterShim): 1
AvroParquetWriter (org.apache.parquet.avro.AvroParquetWriter): 1
ParquetProperties (org.apache.parquet.column.ParquetProperties): 1
ParquetWriter (org.apache.parquet.hadoop.ParquetWriter): 1
GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport): 1
CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 1
RecordConsumer (org.apache.parquet.io.api.RecordConsumer): 1
ProtoParquetWriter (org.apache.parquet.proto.ProtoParquetWriter): 1
MessageType (org.apache.parquet.schema.MessageType): 1
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 1