
Example 1 with HoodieRecordPayload

use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

the class HoodieWriteableTestTable method withInserts.

public Path withInserts(String partition, String fileId, List<HoodieRecord> records, TaskContextSupplier contextSupplier) throws Exception {
    FileCreateUtils.createPartitionMetaFile(basePath, partition);
    String fileName = baseFileName(currentInstantTime, fileId);
    Path baseFilePath = new Path(Paths.get(basePath, partition, fileName).toString());
    if (this.fs.exists(baseFilePath)) {
        LOG.warn("Deleting the existing base file " + baseFilePath);
        this.fs.delete(baseFilePath, true);
    }
    if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.PARQUET)) {
        HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
        HoodieAvroParquetConfig config = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP, ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue()));
        try (HoodieParquetWriter writer = new HoodieParquetWriter(currentInstantTime, new Path(Paths.get(basePath, partition, fileName).toString()), config, schema, contextSupplier, populateMetaFields)) {
            int seqId = 1;
            for (HoodieRecord record : records) {
                GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get();
                if (populateMetaFields) {
                    HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++));
                    HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName);
                    writer.writeAvro(record.getRecordKey(), avroRecord);
                    filter.add(record.getRecordKey());
                } else {
                    writer.writeAvro(record.getRecordKey(), avroRecord);
                }
            }
        }
    } else if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.ORC)) {
        Configuration conf = new Configuration();
        int orcStripSize = Integer.parseInt(HoodieStorageConfig.ORC_STRIPE_SIZE.defaultValue());
        int orcBlockSize = Integer.parseInt(HoodieStorageConfig.ORC_BLOCK_SIZE.defaultValue());
        int maxFileSize = Integer.parseInt(HoodieStorageConfig.ORC_FILE_MAX_SIZE.defaultValue());
        HoodieOrcConfig config = new HoodieOrcConfig(conf, CompressionKind.ZLIB, orcStripSize, orcBlockSize, maxFileSize, filter);
        try (HoodieOrcWriter writer = new HoodieOrcWriter(currentInstantTime, new Path(Paths.get(basePath, partition, fileName).toString()), config, schema, contextSupplier)) {
            int seqId = 1;
            for (HoodieRecord record : records) {
                GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get();
                HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++));
                HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName);
                writer.writeAvro(record.getRecordKey(), avroRecord);
                filter.add(record.getRecordKey());
            }
        }
    }
    return baseFilePath;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieParquetWriter(org.apache.hudi.io.storage.HoodieParquetWriter) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) Configuration(org.apache.hadoop.conf.Configuration) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport) HoodieOrcConfig(org.apache.hudi.io.storage.HoodieOrcConfig) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieOrcWriter(org.apache.hudi.io.storage.HoodieOrcWriter) HoodieAvroParquetConfig(org.apache.hudi.io.storage.HoodieAvroParquetConfig) GenericRecord(org.apache.avro.generic.GenericRecord)
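
Not from the source, a minimal usage sketch of this helper: it assumes a HoodieWriteableTestTable built elsewhere in the test, and the method name writeTestBaseFile, the partition value, and the file id are illustrative. It relies on the same imports as the "Also used" list above, plus java.util.List and Hudi's TaskContextSupplier.

static Path writeTestBaseFile(HoodieWriteableTestTable testTable, List<HoodieRecord> records,
                              TaskContextSupplier contextSupplier) throws Exception {
    // Partition and file id values here are illustrative; withInserts derives the base file name
    // from the table's current instant time and the given file id.
    return testTable.withInserts("2016/03/15", "test-file-1", records, contextSupplier);
}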

Example 2 with HoodieRecordPayload

use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

the class HDFSParquetImporter method dataImport.

protected int dataImport(JavaSparkContext jsc) throws IOException {
    try {
        if (fs.exists(new Path(cfg.targetPath)) && !isUpsert()) {
            // cleanup target directory.
            fs.delete(new Path(cfg.targetPath), true);
        }
        if (!fs.exists(new Path(cfg.targetPath))) {
            // Initialize target hoodie table.
            Properties properties = HoodieTableMetaClient.withPropertyBuilder().setTableName(cfg.tableName).setTableType(cfg.tableType).build();
            HoodieTableMetaClient.initTableAndGetMetaClient(jsc.hadoopConfiguration(), cfg.targetPath, properties);
        }
        // Get schema.
        String schemaStr = UtilHelpers.parseSchema(fs, cfg.schemaFile);
        SparkRDDWriteClient<HoodieRecordPayload> client = UtilHelpers.createHoodieClient(jsc, cfg.targetPath, schemaStr, cfg.parallelism, Option.empty(), props);
        JavaRDD<HoodieRecord<HoodieRecordPayload>> hoodieRecords = buildHoodieRecordsForImport(jsc, schemaStr);
        // Get instant time.
        String instantTime = client.startCommit();
        JavaRDD<WriteStatus> writeResponse = load(client, instantTime, hoodieRecords);
        return UtilHelpers.handleErrors(jsc, instantTime, writeResponse);
    } catch (Throwable t) {
        LOG.error("Error occurred.", t);
    }
    return -1;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Properties(java.util.Properties) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) WriteStatus(org.apache.hudi.client.WriteStatus)
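
Below is a minimal sketch, not from the source, of the commit flow the importer relies on: start a commit, write the records through the client, and inspect the resulting WriteStatus objects. It assumes the client and record RDD are built exactly as in dataImport above and uses the client's standard upsert API; the method name is illustrative.

static boolean importSucceeded(SparkRDDWriteClient<HoodieRecordPayload> client,
                               JavaRDD<HoodieRecord<HoodieRecordPayload>> records) {
    String instantTime = client.startCommit();
    JavaRDD<WriteStatus> statuses = client.upsert(records, instantTime);
    // hasErrors() is true when at least one record in that write bucket failed to be written.
    return statuses.filter(WriteStatus::hasErrors).isEmpty();
}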

Example 3 with HoodieRecordPayload

use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

the class HoodieClusteringJob method doScheduleAndCluster.

private int doScheduleAndCluster(JavaSparkContext jsc) throws Exception {
    LOG.info("Step 1: Do schedule");
    String schemaStr = getSchemaFromLatestInstant();
    try (SparkRDDWriteClient<HoodieRecordPayload> client = UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) {
        Option<String> instantTime = Option.empty();
        if (cfg.retryLastFailedClusteringJob) {
            HoodieSparkTable<HoodieRecordPayload> table = HoodieSparkTable.create(client.getConfig(), client.getEngineContext());
            HoodieTimeline inflightHoodieTimeline = table.getActiveTimeline().filterPendingReplaceTimeline().filterInflights();
            if (!inflightHoodieTimeline.empty()) {
                HoodieInstant inflightClusteringInstant = inflightHoodieTimeline.lastInstant().get();
                Date clusteringStartTime = HoodieActiveTimeline.parseDateFromInstantTime(inflightClusteringInstant.getTimestamp());
                if (clusteringStartTime.getTime() + cfg.maxProcessingTimeMs < System.currentTimeMillis()) {
                    // A clustering instant that has been inflight longer than maxProcessingTimeMs is treated as failed;
                    // reuse its instant time so the next clustering action rolls it back and re-clusters.
                    LOG.info("Found failed clustering instant at : " + inflightClusteringInstant + "; Will rollback the failed clustering and re-trigger again.");
                    instantTime = Option.of(inflightHoodieTimeline.lastInstant().get().getTimestamp());
                } else {
                    LOG.info(inflightClusteringInstant + " might still be in progress, will trigger a new clustering job.");
                }
            }
        }
        instantTime = instantTime.isPresent() ? instantTime : doSchedule(client);
        if (!instantTime.isPresent()) {
            LOG.info("Couldn't generate cluster plan");
            return -1;
        }
        LOG.info("The schedule instant time is " + instantTime.get());
        LOG.info("Step 2: Do cluster");
        Option<HoodieCommitMetadata> metadata = client.cluster(instantTime.get(), true).getCommitMetadata();
        return UtilHelpers.handleErrors(metadata.get(), instantTime.get());
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) Date(java.util.Date)
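
A stripped-down sketch of the same schedule-then-execute flow, without the retry handling, is shown below. It is not part of the job above; the scheduleClustering call with empty extra metadata and the method name are assumptions for illustration.

static boolean scheduleAndRunClustering(SparkRDDWriteClient<HoodieRecordPayload> client) {
    // Ask the client to schedule a clustering plan; an empty Option means nothing was eligible.
    Option<String> instantTime = client.scheduleClustering(Option.empty());
    if (!instantTime.isPresent()) {
        return false;
    }
    // shouldComplete = true executes the plan and commits the resulting replacecommit.
    client.cluster(instantTime.get(), true);
    return true;
}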

Example 4 with HoodieRecordPayload

use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

the class HoodieFileSliceReader method getFileSliceReader.

public static HoodieFileSliceReader getFileSliceReader(Option<HoodieFileReader> baseFileReader, HoodieMergedLogRecordScanner scanner, Schema schema, String payloadClass, String preCombineField, Option<Pair<String, String>> simpleKeyGenFieldsOpt) throws IOException {
    if (baseFileReader.isPresent()) {
        Iterator baseIterator = baseFileReader.get().getRecordIterator(schema);
        while (baseIterator.hasNext()) {
            GenericRecord record = (GenericRecord) baseIterator.next();
            HoodieRecord<? extends HoodieRecordPayload> hoodieRecord = transform(record, scanner, payloadClass, preCombineField, simpleKeyGenFieldsOpt);
            scanner.processNextRecord(hoodieRecord);
        }
        return new HoodieFileSliceReader(scanner.iterator());
    } else {
        Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = () -> scanner.iterator();
        HoodiePayloadConfig payloadConfig = HoodiePayloadConfig.newBuilder().withPayloadOrderingField(preCombineField).build();
        return new HoodieFileSliceReader(StreamSupport.stream(iterable.spliterator(), false).map(e -> {
            try {
                GenericRecord record = (GenericRecord) e.getData().getInsertValue(schema, payloadConfig.getProps()).get();
                return transform(record, scanner, payloadClass, preCombineField, simpleKeyGenFieldsOpt);
            } catch (IOException io) {
                throw new HoodieIOException("Error while creating reader for file slice with no base file.", io);
            }
        }).iterator());
    }
}
Also used : HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Iterator(java.util.Iterator) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodiePayloadConfig(org.apache.hudi.config.HoodiePayloadConfig)
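
A hedged sketch of consuming the merged reader follows. It is not from the source and assumes HoodieFileSliceReader exposes Iterator semantics over HoodieRecord, as the scanner.iterator() wrapping above suggests; parameter names simply mirror getFileSliceReader.

static void printRecordKeys(Option<HoodieFileReader> baseFileReader, HoodieMergedLogRecordScanner scanner,
                            Schema schema, String payloadClass, String preCombineField,
                            Option<Pair<String, String>> simpleKeyGenFieldsOpt) throws IOException {
    HoodieFileSliceReader reader = HoodieFileSliceReader.getFileSliceReader(
        baseFileReader, scanner, schema, payloadClass, preCombineField, simpleKeyGenFieldsOpt);
    while (reader.hasNext()) {
        // Each element merges the base file record with any pending log updates for the same key.
        HoodieRecord<? extends HoodieRecordPayload> record =
            (HoodieRecord<? extends HoodieRecordPayload>) reader.next();
        System.out.println(record.getRecordKey());
    }
}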

Example 5 with HoodieRecordPayload

use of org.apache.hudi.common.model.HoodieRecordPayload in project hudi by apache.

the class HoodieCreateHandle method write.

/**
 * Perform the actual writing of the given record into the backing file.
 */
@Override
public void write(HoodieRecord record, Option<IndexedRecord> avroRecord) {
    Option recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata();
    if (HoodieOperation.isDelete(record.getOperation())) {
        avroRecord = Option.empty();
    }
    try {
        if (avroRecord.isPresent()) {
            if (avroRecord.get().equals(IGNORE_RECORD)) {
                return;
            }
            // Rewrite the record into the writer schema that includes the Hudi commit metadata fields
            IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) avroRecord.get());
            if (preserveHoodieMetadata) {
                // do not preserve FILENAME_METADATA_FIELD
                recordWithMetadataInSchema.put(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.FILENAME_METADATA_FIELD), path.getName());
                fileWriter.writeAvro(record.getRecordKey(), recordWithMetadataInSchema);
            } else {
                fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
            }
            // update the new location of record, so we know where to find it next
            record.unseal();
            record.setNewLocation(new HoodieRecordLocation(instantTime, writeStatus.getFileId()));
            record.seal();
            recordsWritten++;
            insertRecordsWritten++;
        } else {
            recordsDeleted++;
        }
        writeStatus.markSuccess(record, recordMetadata);
        // Deflate the record payload only after recording success, so the payload is still
        // accessible to anything that inspects the record while it is being marked successful.
        record.deflate();
    } catch (Throwable t) {
        // Not throwing exception from here, since we don't want to fail the entire job
        // for a single record
        writeStatus.markFailure(record, t, recordMetadata);
        LOG.error("Error writing record " + record, t);
    }
}
Also used : IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Option(org.apache.hudi.common.util.Option) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)
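
As a minimal sketch (not from the source) of how a caller produces the Option<IndexedRecord> argument: the Avro value comes from the record's payload via getInsertValue, and may be empty for delete payloads. The handle, record, and schema are assumed to be set up elsewhere, and the method name is illustrative.

static void writeThroughHandle(HoodieCreateHandle<?, ?, ?, ?> handle, HoodieRecord record,
                               Schema schema) throws IOException {
    // getInsertValue may return Option.empty() for payloads that represent deletes.
    Option<IndexedRecord> avroValue = ((HoodieRecordPayload) record.getData()).getInsertValue(schema);
    handle.write(record, avroValue);
}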

Aggregations

HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 38 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 30 usages
Schema (org.apache.avro.Schema): 19 usages
IOException (java.io.IOException): 18 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 18 usages
IndexedRecord (org.apache.avro.generic.IndexedRecord): 14 usages
ArrayList (java.util.ArrayList): 12 usages
HashMap (java.util.HashMap): 12 usages
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 12 usages
Option (org.apache.hudi.common.util.Option): 12 usages
Map (java.util.Map): 11 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 11 usages
List (java.util.List): 9 usages
Path (org.apache.hadoop.fs.Path): 9 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 9 usages
Collectors (java.util.stream.Collectors): 8 usages
HoodieRecordSizeEstimator (org.apache.hudi.common.util.HoodieRecordSizeEstimator): 8 usages
Test (org.junit.jupiter.api.Test): 8 usages
UncheckedIOException (java.io.UncheckedIOException): 7 usages
Arrays (java.util.Arrays): 7 usages