
Example 1 with HoodieAvroWriteSupport

use of org.apache.hudi.avro.HoodieAvroWriteSupport in project hudi by apache.

From the class HoodieWriteableTestTable, method withInserts:

public Path withInserts(String partition, String fileId, List<HoodieRecord> records, TaskContextSupplier contextSupplier) throws Exception {
    FileCreateUtils.createPartitionMetaFile(basePath, partition);
    String fileName = baseFileName(currentInstantTime, fileId);
    Path baseFilePath = new Path(Paths.get(basePath, partition, fileName).toString());
    if (this.fs.exists(baseFilePath)) {
        LOG.warn("Deleting the existing base file " + baseFilePath);
        this.fs.delete(baseFilePath, true);
    }
    if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.PARQUET)) {
        HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
        HoodieAvroParquetConfig config = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP, ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue()));
        try (HoodieParquetWriter writer = new HoodieParquetWriter(currentInstantTime, new Path(Paths.get(basePath, partition, fileName).toString()), config, schema, contextSupplier, populateMetaFields)) {
            int seqId = 1;
            for (HoodieRecord record : records) {
                GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get();
                if (populateMetaFields) {
                    HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++));
                    HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName);
                    writer.writeAvro(record.getRecordKey(), avroRecord);
                    filter.add(record.getRecordKey());
                } else {
                    writer.writeAvro(record.getRecordKey(), avroRecord);
                }
            }
        }
    } else if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.ORC)) {
        Configuration conf = new Configuration();
        int orcStripSize = Integer.parseInt(HoodieStorageConfig.ORC_STRIPE_SIZE.defaultValue());
        int orcBlockSize = Integer.parseInt(HoodieStorageConfig.ORC_BLOCK_SIZE.defaultValue());
        int maxFileSize = Integer.parseInt(HoodieStorageConfig.ORC_FILE_MAX_SIZE.defaultValue());
        HoodieOrcConfig config = new HoodieOrcConfig(conf, CompressionKind.ZLIB, orcStripSize, orcBlockSize, maxFileSize, filter);
        try (HoodieOrcWriter writer = new HoodieOrcWriter(currentInstantTime, new Path(Paths.get(basePath, partition, fileName).toString()), config, schema, contextSupplier)) {
            int seqId = 1;
            for (HoodieRecord record : records) {
                GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get();
                HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++));
                HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName);
                writer.writeAvro(record.getRecordKey(), avroRecord);
                filter.add(record.getRecordKey());
            }
        }
    }
    return baseFilePath;
}
Also used: Path(org.apache.hadoop.fs.Path), HoodieParquetWriter(org.apache.hudi.io.storage.HoodieParquetWriter), AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter), Configuration(org.apache.hadoop.conf.Configuration), HoodieRecord(org.apache.hudi.common.model.HoodieRecord), HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport), HoodieOrcConfig(org.apache.hudi.io.storage.HoodieOrcConfig), HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload), HoodieOrcWriter(org.apache.hudi.io.storage.HoodieOrcWriter), HoodieAvroParquetConfig(org.apache.hudi.io.storage.HoodieAvroParquetConfig), GenericRecord(org.apache.avro.generic.GenericRecord)

Example 2 with HoodieAvroWriteSupport

use of org.apache.hudi.avro.HoodieAvroWriteSupport in project hudi by apache.

From the class HoodieParquetDataBlock, method serializeRecords:

@Override
protected byte[] serializeRecords(List<IndexedRecord> records) throws IOException {
    if (records.size() == 0) {
        return new byte[0];
    }
    Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(writerSchema), writerSchema, Option.empty());
    HoodieAvroParquetConfig avroParquetConfig = new HoodieAvroParquetConfig(writeSupport, compressionCodecName.get(),
            ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 1024 * 1024 * 1024, new Configuration(),
            // HoodieStorageConfig.PARQUET_COMPRESSION_RATIO.defaultValue()
            Double.parseDouble(String.valueOf(0.1)));
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try (FSDataOutputStream outputStream = new FSDataOutputStream(baos)) {
        try (HoodieParquetStreamWriter<IndexedRecord> parquetWriter = new HoodieParquetStreamWriter<>(outputStream, avroParquetConfig)) {
            for (IndexedRecord record : records) {
                String recordKey = getRecordKey(record).orElse(null);
                parquetWriter.writeAvro(recordKey, record);
            }
            outputStream.flush();
        }
    }
    return baos.toByteArray();
}
Also used: HoodieParquetStreamWriter(org.apache.hudi.io.storage.HoodieParquetStreamWriter), AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter), HoodieAvroParquetConfig(org.apache.hudi.io.storage.HoodieAvroParquetConfig), Configuration(org.apache.hadoop.conf.Configuration), IndexedRecord(org.apache.avro.generic.IndexedRecord), Schema(org.apache.avro.Schema), HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport), ByteArrayOutputStream(java.io.ByteArrayOutputStream), FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)
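
The byte array returned above is a complete in-memory parquet file; the ByteArrayOutputStream is wrapped in an FSDataOutputStream only because the parquet writer needs a Hadoop stream that can report its write position. A minimal, hedged sanity check for a test is sketched below: a non-empty parquet stream begins with the four-byte magic PAR1. The call site (dataBlock) is illustrative, since serializeRecords is a protected member of HoodieParquetDataBlock, and assertEquals is the usual JUnit assertion.

byte[] content = dataBlock.serializeRecords(records);
if (content.length > 0) {
    // Parquet files start (and end) with the ASCII magic bytes "PAR1".
    assertEquals("PAR1", new String(content, 0, 4, java.nio.charset.StandardCharsets.US_ASCII));
}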

Example 3 with HoodieAvroWriteSupport

use of org.apache.hudi.avro.HoodieAvroWriteSupport in project hudi by apache.

From the class HiveTestUtil, method generateParquetDataWithSchema:

private static void generateParquetDataWithSchema(Path filePath, Schema schema) throws IOException {
    org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
    BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, Option.of(filter));
    ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
    List<IndexedRecord> testRecords = SchemaTestUtil.generateTestRecordsForSchema(schema);
    testRecords.forEach(s -> {
        try {
            writer.write(s);
        } catch (IOException e) {
            fail("IOException while writing test records as parquet" + e.toString());
        }
    });
    writer.close();
}
Also used: AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter), IndexedRecord(org.apache.avro.generic.IndexedRecord), ParquetWriter(org.apache.parquet.hadoop.ParquetWriter), HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport), IOException(java.io.IOException), BloomFilter(org.apache.hudi.common.bloom.BloomFilter)
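
To verify what generateParquetDataWithSchema wrote, the file can be read back with the plain parquet-avro reader, since HoodieAvroWriteSupport produces ordinary Avro records. The sketch below is hedged: the helper name assertRecordCount and its expectedCount argument are illustrative and not part of HiveTestUtil, and it additionally assumes org.apache.parquet.hadoop.ParquetReader, org.apache.parquet.avro.AvroParquetReader, org.apache.avro.generic.GenericRecord, and the JUnit assertEquals already used by this test class.

private static void assertRecordCount(Path filePath, int expectedCount) throws IOException {
    int count = 0;
    // AvroParquetReader yields each row as a GenericRecord; counting rows is enough for this check.
    try (ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(filePath).build()) {
        while (reader.read() != null) {
            count++;
        }
    }
    assertEquals(expectedCount, count);
}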

Example 4 with HoodieAvroWriteSupport

use of org.apache.hudi.avro.HoodieAvroWriteSupport in project hudi by apache.

From the class TestCluster, method generateParquetData:

@SuppressWarnings({ "unchecked", "deprecation" })
private void generateParquetData(Path filePath, boolean isParquetSchemaSimple) throws IOException, URISyntaxException {
    Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema());
    org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
    BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, Option.of(filter));
    ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, dfsCluster.getFileSystem().getConf());
    List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100) : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
    testRecords.forEach(s -> {
        try {
            writer.write(s);
        } catch (IOException e) {
            fail("IOException while writing test records as parquet" + e.toString());
        }
    });
    writer.close();
}
Also used: AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter), IndexedRecord(org.apache.avro.generic.IndexedRecord), ParquetWriter(org.apache.parquet.hadoop.ParquetWriter), Schema(org.apache.avro.Schema), HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport), IOException(java.io.IOException), BloomFilter(org.apache.hudi.common.bloom.BloomFilter)

Example 5 with HoodieAvroWriteSupport

use of org.apache.hudi.avro.HoodieAvroWriteSupport in project hudi by apache.

From the class TestParquetUtils, method writeParquetFile:

private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema, boolean addPartitionPathField, String partitionPathValue, boolean useMetaFields, String recordFieldName, String partitionFieldName) throws Exception {
    // Write out a parquet file
    BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, 10000, typeCode);
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
    ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
    for (String rowKey : rowKeys) {
        GenericRecord rec = new GenericData.Record(schema);
        rec.put(useMetaFields ? HoodieRecord.RECORD_KEY_METADATA_FIELD : recordFieldName, rowKey);
        if (addPartitionPathField) {
            rec.put(useMetaFields ? HoodieRecord.PARTITION_PATH_METADATA_FIELD : partitionFieldName, partitionPathValue);
        }
        writer.write(rec);
        writeSupport.add(rowKey);
    }
    writer.close();
}
Also used: Path(org.apache.hadoop.fs.Path), AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter), ParquetWriter(org.apache.parquet.hadoop.ParquetWriter), HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport), HoodieRecord(org.apache.hudi.common.model.HoodieRecord), GenericRecord(org.apache.avro.generic.GenericRecord), BloomFilter(org.apache.hudi.common.bloom.BloomFilter)

Aggregations

HoodieAvroWriteSupport (org.apache.hudi.avro.HoodieAvroWriteSupport): 7 usages
AvroSchemaConverter (org.apache.parquet.avro.AvroSchemaConverter): 7 usages
BloomFilter (org.apache.hudi.common.bloom.BloomFilter): 5 usages
IndexedRecord (org.apache.avro.generic.IndexedRecord): 4 usages
ParquetWriter (org.apache.parquet.hadoop.ParquetWriter): 4 usages
IOException (java.io.IOException): 3 usages
Schema (org.apache.avro.Schema): 3 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 2 usages
Configuration (org.apache.hadoop.conf.Configuration): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 2 usages
HoodieAvroParquetConfig (org.apache.hudi.io.storage.HoodieAvroParquetConfig): 2 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1 usage
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 1 usage
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 1 usage
HoodieOrcConfig (org.apache.hudi.io.storage.HoodieOrcConfig): 1 usage
HoodieOrcWriter (org.apache.hudi.io.storage.HoodieOrcWriter): 1 usage
HoodieParquetStreamWriter (org.apache.hudi.io.storage.HoodieParquetStreamWriter): 1 usage
HoodieParquetWriter (org.apache.hudi.io.storage.HoodieParquetWriter): 1 usage
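
Across these examples the recurring pattern is the same: build a BloomFilter, pair it with the Avro schema inside a HoodieAvroWriteSupport, hand that write support to a ParquetWriter, and register each record key on the write support so the bloom filter (and min/max record key) land in the parquet footer metadata when the writer closes. Below is a condensed, hedged sketch of that pattern using only classes that already appear above; the helper name writeWithBloomFilter and the keyField parameter are illustrative, not Hudi API.

// Illustrative helper, not part of Hudi: the recurring write pattern from the examples above.
private static void writeWithBloomFilter(Path filePath, Schema schema, List<GenericRecord> records, String keyField) throws IOException {
    BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
    ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
    try {
        for (GenericRecord rec : records) {
            writer.write(rec);
            // Registering the record key is what makes HoodieAvroWriteSupport embed the bloom
            // filter (plus min/max record key) in the parquet footer when the writer is closed.
            writeSupport.add(rec.get(keyField).toString());
        }
    } finally {
        writer.close();
    }
}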