Example 16 with HoodieUpsertException

use of org.apache.hudi.exception.HoodieUpsertException in project hudi by apache.

the class TestTableSchemaEvolution method testCopyOnWriteTable.

@Test
public void testCopyOnWriteTable() throws Exception {
    // Create the table
    HoodieTableMetaClient.withPropertyBuilder().fromMetaClient(metaClient).setTimelineLayoutVersion(VERSION_1).initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
    HoodieWriteConfig hoodieWriteConfig = getWriteConfigBuilder(TRIP_EXAMPLE_SCHEMA).withRollbackUsingMarkers(false).build();
    SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
    // Initial inserts with TRIP_EXAMPLE_SCHEMA
    int numRecords = 10;
    insertFirstBatch(hoodieWriteConfig, client, "001", initCommitTime, numRecords, SparkRDDWriteClient::insert, false, true, numRecords);
    checkReadRecords("000", numRecords);
    // Updates with the same schema are allowed
    final int numUpdateRecords = 5;
    updateBatch(hoodieWriteConfig, client, "002", "001", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, true, numUpdateRecords, numRecords, 2);
    checkReadRecords("000", numRecords);
    // Delete with same schema is allowed
    final int numDeleteRecords = 2;
    numRecords -= numDeleteRecords;
    deleteBatch(hoodieWriteConfig, client, "003", "002", initCommitTime, numDeleteRecords, SparkRDDWriteClient::delete, false, true, 0, numRecords);
    checkReadRecords("000", numRecords);
    // Insert with devolved schema is not allowed
    HoodieWriteConfig hoodieDevolvedWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA_DEVOLVED);
    client = getHoodieWriteClient(hoodieDevolvedWriteConfig);
    final List<HoodieRecord> failedRecords = generateInsertsWithSchema("004", numRecords, TRIP_EXAMPLE_SCHEMA_DEVOLVED);
    try {
        // We cannot use insertBatch directly here because we want to insert records
        // with a devolved schema.
        writeBatch(client, "004", "003", Option.empty(), "003", numRecords, (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1, false);
        fail("Insert with devolved scheme should fail");
    } catch (HoodieInsertException ex) {
        // no new commit
        HoodieTimeline curTimeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants();
        assertTrue(curTimeline.lastInstant().get().getTimestamp().equals("003"));
        client.rollback("004");
    }
    // Update with devolved schema is not allowed
    try {
        updateBatch(hoodieDevolvedWriteConfig, client, "004", "003", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, true, numUpdateRecords, 2 * numRecords, 5);
        fail("Update with devolved scheme should fail");
    } catch (HoodieUpsertException ex) {
        // no new commit
        HoodieTimeline curTimeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants();
        assertTrue(curTimeline.lastInstant().get().getTimestamp().equals("003"));
        client.rollback("004");
    }
    // Insert with evolved schema is allowed
    HoodieWriteConfig hoodieEvolvedWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA_EVOLVED);
    client = getHoodieWriteClient(hoodieEvolvedWriteConfig);
    final List<HoodieRecord> evolvedRecords = generateInsertsWithSchema("004", numRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED);
    // We cannot use insertBatch directly here because we want to insert records
    // with an evolved schema.
    writeBatch(client, "004", "003", Option.empty(), initCommitTime, numRecords, (String s, Integer a) -> evolvedRecords, SparkRDDWriteClient::insert, true, numRecords, 2 * numRecords, 4, false);
    // new commit
    HoodieTimeline curTimeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants();
    assertTrue(curTimeline.lastInstant().get().getTimestamp().equals("004"));
    checkReadRecords("000", 2 * numRecords);
    // Updates with the evolved schema are allowed
    final List<HoodieRecord> updateRecords = generateUpdatesWithSchema("005", numUpdateRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED);
    writeBatch(client, "005", "004", Option.empty(), initCommitTime, numUpdateRecords, (String s, Integer a) -> updateRecords, SparkRDDWriteClient::upsert, true, numUpdateRecords, 2 * numRecords, 5, false);
    checkReadRecords("000", 2 * numRecords);
    // Now even the original schema cannot be used for updates as it is devolved
    // in relation to the current schema of the dataset.
    client = getHoodieWriteClient(hoodieWriteConfig);
    try {
        updateBatch(hoodieWriteConfig, client, "006", "005", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, true, numUpdateRecords, numRecords, 2);
        fail("Update with original scheme should fail");
    } catch (HoodieUpsertException ex) {
        // no new commit
        curTimeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants();
        assertTrue(curTimeline.lastInstant().get().getTimestamp().equals("005"));
        client.rollback("006");
    }
    // Insert with the original schema is also not allowed as it is devolved
    // in relation to the current schema of the dataset.
    try {
        // We are not using insertBatch directly here because insertion of these
        // records will fail and we don't want to keep these records within
        // HoodieTestDataGenerator.
        failedRecords.clear();
        failedRecords.addAll(dataGen.generateInserts("006", numRecords));
        writeBatch(client, "006", "005", Option.empty(), initCommitTime, numRecords, (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1, false);
        fail("Insert with original scheme should fail");
    } catch (HoodieInsertException ex) {
        // no new commit
        curTimeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants();
        assertTrue(curTimeline.lastInstant().get().getTimestamp().equals("005"));
        client.rollback("006");
        // Remove the failed records from the data generator so that later batches
        // do not generate updates or deletes for records which do not even exist.
        for (HoodieRecord record : failedRecords) {
            assertTrue(dataGen.deleteExistingKeyIfPresent(record.getKey()));
        }
    }
    // Revert to the older commit and ensure that the original schema can now
    // be used for inserts and updates.
    client.restoreToInstant("003");
    curTimeline = metaClient.reloadActiveTimeline().getCommitTimeline().filterCompletedInstants();
    assertTrue(curTimeline.lastInstant().get().getTimestamp().equals("003"));
    checkReadRecords("000", numRecords);
    // Insert with original schema is allowed now
    insertBatch(hoodieWriteConfig, client, "007", "003", numRecords, SparkRDDWriteClient::insert, false, true, numRecords, 2 * numRecords, 1, Option.empty());
    checkReadRecords("000", 2 * numRecords);
    // Update with original schema is allowed now
    updateBatch(hoodieWriteConfig, client, "008", "007", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, true, numUpdateRecords, 2 * numRecords, 5);
    checkReadRecords("000", 2 * numRecords);
}
Also used : HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieInsertException(org.apache.hudi.exception.HoodieInsertException) Test(org.junit.jupiter.api.Test)
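What makes a schema "devolved" or "evolved" in this test is whether the writer schema drops an existing field or only adds new ones. Hudi's own check is TableSchemaResolver.isSchemaCompatible (see Example 18 below); as a rough stand-in, plain Avro SchemaCompatibility expresses the same rule. A minimal sketch, with illustrative field names rather than the real TRIP_EXAMPLE_SCHEMA fields:

import org.apache.avro.Schema;
import org.apache.avro.SchemaCompatibility;
import org.apache.avro.SchemaCompatibility.SchemaCompatibilityType;

public class SchemaEvolutionSketch {

    public static void main(String[] args) {
        // Schema the table was originally written with (illustrative fields only).
        Schema tableSchema = parse(
            "{\"type\":\"record\",\"name\":\"trip\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"string\"},"
                + "{\"name\":\"fare\",\"type\":\"double\"}]}");

        // "Devolved" writer schema: an existing field (fare) has been dropped.
        Schema devolved = parse(
            "{\"type\":\"record\",\"name\":\"trip\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"string\"}]}");

        // "Evolved" writer schema: a new nullable field with a default has been added.
        Schema evolved = parse(
            "{\"type\":\"record\",\"name\":\"trip\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"string\"},"
                + "{\"name\":\"fare\",\"type\":\"double\"},"
                + "{\"name\":\"tip\",\"type\":[\"null\",\"double\"],\"default\":null}]}");

        // Data written with the devolved schema can no longer be read with the
        // table schema (fare is gone and has no default), so the write is rejected.
        System.out.println(compat(tableSchema, devolved)); // INCOMPATIBLE
        // Data written with the evolved schema is still readable with the table
        // schema (the extra field is simply ignored), so the write is accepted.
        System.out.println(compat(tableSchema, evolved));  // COMPATIBLE
    }

    private static Schema parse(String json) {
        return new Schema.Parser().parse(json);
    }

    private static SchemaCompatibilityType compat(Schema reader, Schema writer) {
        return SchemaCompatibility.checkReaderWriterCompatibility(reader, writer).getType();
    }
}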

Example 17 with HoodieUpsertException

use of org.apache.hudi.exception.HoodieUpsertException in project hudi by apache.

the class TestTableSchemaEvolution method testMORTable.

@Test
public void testMORTable() throws Exception {
    tableType = HoodieTableType.MERGE_ON_READ;
    // Create the table
    HoodieTableMetaClient.withPropertyBuilder().fromMetaClient(metaClient).setTableType(HoodieTableType.MERGE_ON_READ).setTimelineLayoutVersion(VERSION_1).initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
    HoodieWriteConfig hoodieWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA);
    SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
    // Initial inserts with TRIP_EXAMPLE_SCHEMA
    int numRecords = 10;
    insertFirstBatch(hoodieWriteConfig, client, "001", initCommitTime, numRecords, SparkRDDWriteClient::insert, false, false, numRecords);
    checkLatestDeltaCommit("001");
    // Compact once so we can incrementally read later
    assertTrue(client.scheduleCompactionAtInstant("002", Option.empty()));
    client.compact("002");
    // Updates with the same schema are allowed
    final int numUpdateRecords = 5;
    updateBatch(hoodieWriteConfig, client, "003", "002", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, false, 0, 0, 0);
    checkLatestDeltaCommit("003");
    checkReadRecords("000", numRecords);
    // Delete with same schema is allowed
    final int numDeleteRecords = 2;
    numRecords -= numDeleteRecords;
    deleteBatch(hoodieWriteConfig, client, "004", "003", initCommitTime, numDeleteRecords, SparkRDDWriteClient::delete, false, false, 0, 0);
    checkLatestDeltaCommit("004");
    checkReadRecords("000", numRecords);
    // Insert with devolved schema is not allowed
    HoodieWriteConfig hoodieDevolvedWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA_DEVOLVED);
    client = getHoodieWriteClient(hoodieDevolvedWriteConfig);
    final List<HoodieRecord> failedRecords = generateInsertsWithSchema("004", numRecords, TRIP_EXAMPLE_SCHEMA_DEVOLVED);
    try {
        // We cannot use insertBatch directly here because we want to insert records
        // with a devolved schema and insertBatch inserts records using the TRIP_EXAMPLE_SCHEMA.
        writeBatch(client, "005", "004", Option.empty(), "003", numRecords, (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, false, 0, 0, 0, false);
        fail("Insert with devolved scheme should fail");
    } catch (HoodieInsertException ex) {
        // no new commit
        checkLatestDeltaCommit("004");
        checkReadRecords("000", numRecords);
        client.rollback("005");
    }
    // Update with devolved schema is also not allowed
    try {
        updateBatch(hoodieDevolvedWriteConfig, client, "005", "004", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, false, 0, 0, 0);
        fail("Update with devolved scheme should fail");
    } catch (HoodieUpsertException ex) {
        // no new commit
        checkLatestDeltaCommit("004");
        checkReadRecords("000", numRecords);
        client.rollback("005");
    }
    // Insert with an evolved schema is allowed
    HoodieWriteConfig hoodieEvolvedWriteConfig = getWriteConfig(TRIP_EXAMPLE_SCHEMA_EVOLVED);
    client = getHoodieWriteClient(hoodieEvolvedWriteConfig);
    // We cannot use insertBatch directly here because we want to insert records
    // with an evolved schema and insertBatch inserts records using the TRIP_EXAMPLE_SCHEMA.
    final List<HoodieRecord> evolvedRecords = generateInsertsWithSchema("005", numRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED);
    writeBatch(client, "005", "004", Option.empty(), initCommitTime, numRecords, (String s, Integer a) -> evolvedRecords, SparkRDDWriteClient::insert, false, 0, 0, 0, false);
    // new commit
    checkLatestDeltaCommit("005");
    checkReadRecords("000", 2 * numRecords);
    // Updates with the evolved schema are allowed
    final List<HoodieRecord> updateRecords = generateUpdatesWithSchema("006", numUpdateRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED);
    writeBatch(client, "006", "005", Option.empty(), initCommitTime, numUpdateRecords, (String s, Integer a) -> updateRecords, SparkRDDWriteClient::upsert, false, 0, 0, 0, false);
    // new commit
    checkLatestDeltaCommit("006");
    checkReadRecords("000", 2 * numRecords);
    // Now even the original schema cannot be used for updates as it is devolved in relation to the
    // current schema of the dataset.
    client = getHoodieWriteClient(hoodieWriteConfig);
    try {
        updateBatch(hoodieWriteConfig, client, "007", "006", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, false, 0, 0, 0);
        fail("Update with original scheme should fail");
    } catch (HoodieUpsertException ex) {
        // no new commit
        checkLatestDeltaCommit("006");
        checkReadRecords("000", 2 * numRecords);
        client.rollback("007");
    }
    // Insert with the original schema is also not allowed as it is devolved
    // in relation to the current schema of the dataset.
    try {
        // We are not using insertBatch directly here because insertion of these
        // records will fail and we don't want to keep these records within HoodieTestDataGenerator as we
        // will be testing updates later.
        failedRecords.clear();
        failedRecords.addAll(dataGen.generateInserts("007", numRecords));
        writeBatch(client, "007", "006", Option.empty(), initCommitTime, numRecords, (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, true, numRecords, numRecords, 1, false);
        fail("Insert with original scheme should fail");
    } catch (HoodieInsertException ex) {
        // no new commit
        checkLatestDeltaCommit("006");
        checkReadRecords("000", 2 * numRecords);
        client.rollback("007");
        // Remove the failed records from the data generator so that later batches
        // do not generate updates or deletes for records which do not even exist.
        for (HoodieRecord record : failedRecords) {
            assertTrue(dataGen.deleteExistingKeyIfPresent(record.getKey()));
        }
    }
    // Rollback to the original schema
    client.restoreToInstant("004");
    checkLatestDeltaCommit("004");
    // Updates with original schema are now allowed
    client = getHoodieWriteClient(hoodieWriteConfig);
    updateBatch(hoodieWriteConfig, client, "008", "004", Option.empty(), initCommitTime, numUpdateRecords, SparkRDDWriteClient::upsert, false, false, 0, 0, 0);
    // new commit
    checkLatestDeltaCommit("008");
    checkReadRecords("000", 2 * numRecords);
    // Insert with original schema is allowed now
    insertBatch(hoodieWriteConfig, client, "009", "008", numRecords, SparkRDDWriteClient::insert, false, false, 0, 0, 0, Option.empty());
    checkLatestDeltaCommit("009");
    checkReadRecords("000", 3 * numRecords);
}
Also used : HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieInsertException(org.apache.hudi.exception.HoodieInsertException) Test(org.junit.jupiter.api.Test)
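These rejections only happen because the test write configs opt in to Avro schema validation; the actual check is validateSchema() in Example 18 below, which is skipped when schema validation is left at its default (disabled). A minimal sketch of a config that enables it (base path and schema string are placeholders; the builder method name follows the Hudi test utilities and may differ across versions):

import org.apache.hudi.config.HoodieWriteConfig;

class SchemaValidationConfigSketch {

    // schemaJson would be something like TRIP_EXAMPLE_SCHEMA from the tests above.
    static HoodieWriteConfig buildConfig(String basePath, String schemaJson) {
        return HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withSchema(schemaJson)
            // Turns on the pre-write compatibility check (backed by the
            // "hoodie.avro.schema.validate" key, as we understand it).
            .withAvroSchemaValidate(true)
            .build();
    }
}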

Example 18 with HoodieUpsertException

use of org.apache.hudi.exception.HoodieUpsertException in project hudi by apache.

the class HoodieTable method validateSchema.

/**
 * Ensure that the current writerSchema is compatible with the latest schema of this dataset.
 *
 * When inserting/updating data, we read records using the last used schema and convert them to
 * GenericRecords with writerSchema. Hence, we need to ensure that this conversion can take place without errors.
 */
private void validateSchema() throws HoodieUpsertException, HoodieInsertException {
    if (!config.getAvroSchemaValidate() || getActiveTimeline().getCommitsTimeline().filterCompletedInstants().empty()) {
        // Check not required
        return;
    }
    Schema tableSchema;
    Schema writerSchema;
    boolean isValid;
    try {
        TableSchemaResolver schemaResolver = new TableSchemaResolver(getMetaClient());
        writerSchema = HoodieAvroUtils.createHoodieWriteSchema(config.getSchema());
        tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaResolver.getTableAvroSchemaWithoutMetadataFields());
        isValid = TableSchemaResolver.isSchemaCompatible(tableSchema, writerSchema);
    } catch (Exception e) {
        throw new HoodieException("Failed to read schema/check compatibility for base path " + metaClient.getBasePath(), e);
    }
    if (!isValid) {
        throw new HoodieException("Failed schema compatibility check for writerSchema :" + writerSchema + ", table schema :" + tableSchema + ", base path :" + metaClient.getBasePath());
    }
}
Also used : Schema(org.apache.avro.Schema) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieException(org.apache.hudi.exception.HoodieException) HoodieException(org.apache.hudi.exception.HoodieException) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) TimeoutException(java.util.concurrent.TimeoutException) HoodieInsertException(org.apache.hudi.exception.HoodieInsertException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)

Example 19 with HoodieUpsertException

use of org.apache.hudi.exception.HoodieUpsertException in project hudi by apache.

the class HoodieConcatHandle method write.

/**
 * Write old record as is w/o merging with incoming record.
 */
@Override
public void write(GenericRecord oldRecord) {
    String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt);
    try {
        fileWriter.writeAvro(key, oldRecord);
    } catch (IOException | RuntimeException e) {
        String errMsg = String.format("Failed to write old record into new file for key %s from old file %s to new file %s with writerSchema %s", key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true));
        LOG.debug("Old record is " + oldRecord);
        throw new HoodieUpsertException(errMsg, e);
    }
    recordsWritten++;
}
Also used : HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) IOException(java.io.IOException)
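HoodieConcatHandle comes into play when the writer is configured to allow duplicates on inserts, so records in the existing base file are copied forward untouched instead of being merged with the incoming batch. A sketch of such a config, assuming the documented key name hoodie.merge.allow.duplicate.on.inserts (placeholder path and schema):

import java.util.HashMap;
import java.util.Map;
import org.apache.hudi.config.HoodieWriteConfig;

class ConcatHandleConfigSketch {

    static HoodieWriteConfig buildConfig(String basePath, String schemaJson) {
        // Assumed key name; when enabled, inserts against an existing file group
        // take the concat (copy old records as-is) path shown above.
        Map<String, String> props = new HashMap<>();
        props.put("hoodie.merge.allow.duplicate.on.inserts", "true");

        return HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withSchema(schemaJson)
            .withProps(props)
            .build();
    }
}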

Example 20 with HoodieUpsertException

use of org.apache.hudi.exception.HoodieUpsertException in project hudi by apache.

the class HoodieMergeHandle method close.

@Override
public List<WriteStatus> close() {
    try {
        writeIncomingRecords();
        if (keyToNewRecords instanceof ExternalSpillableMap) {
            ((ExternalSpillableMap) keyToNewRecords).close();
        } else {
            keyToNewRecords.clear();
        }
        writtenRecordKeys.clear();
        if (fileWriter != null) {
            fileWriter.close();
            fileWriter = null;
        }
        long fileSizeInBytes = FSUtils.getFileSize(fs, newFilePath);
        HoodieWriteStat stat = writeStatus.getStat();
        stat.setTotalWriteBytes(fileSizeInBytes);
        stat.setFileSizeInBytes(fileSizeInBytes);
        stat.setNumWrites(recordsWritten);
        stat.setNumDeletes(recordsDeleted);
        stat.setNumUpdateWrites(updatedRecordsWritten);
        stat.setNumInserts(insertRecordsWritten);
        stat.setTotalWriteErrors(writeStatus.getTotalErrorRecords());
        RuntimeStats runtimeStats = new RuntimeStats();
        runtimeStats.setTotalUpsertTime(timer.endTimer());
        stat.setRuntimeStats(runtimeStats);
        performMergeDataValidationCheck(writeStatus);
        LOG.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(), stat.getFileId(), runtimeStats.getTotalUpsertTime()));
        return Collections.singletonList(writeStatus);
    } catch (IOException e) {
        throw new HoodieUpsertException("Failed to close UpdateHandle", e);
    }
}
Also used : HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) RuntimeStats(org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)
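The HoodieWriteStat populated here is what callers get back through the returned WriteStatus list. A small sketch of how those statuses might be inspected (the helper class and method are hypothetical; the WriteStatus/HoodieWriteStat accessors mirror the setters used above):

import java.util.List;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.exception.HoodieUpsertException;

class MergeHandleResultSketch {

    // Fail fast if any status carries per-record errors, otherwise print the
    // per-file counters recorded by HoodieMergeHandle.close().
    static void summarize(List<WriteStatus> statuses) {
        for (WriteStatus status : statuses) {
            if (status.hasErrors()) {
                throw new HoodieUpsertException(
                    "Merge produced " + status.getTotalErrorRecords()
                        + " error records for file " + status.getFileId());
            }
            HoodieWriteStat stat = status.getStat();
            System.out.printf("file %s: %d writes, %d updates, %d deletes, %d bytes%n",
                stat.getFileId(), stat.getNumWrites(), stat.getNumUpdateWrites(),
                stat.getNumDeletes(), stat.getTotalWriteBytes());
        }
    }
}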

Aggregations

HoodieUpsertException (org.apache.hudi.exception.HoodieUpsertException): 24
IOException (java.io.IOException): 13
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 6
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 6
Duration (java.time.Duration): 5
Instant (java.time.Instant): 5
List (java.util.List): 4
WriteStatus (org.apache.hudi.client.WriteStatus): 4
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 4
HashMap (java.util.HashMap): 3
LinkedList (java.util.LinkedList): 3
IndexedRecord (org.apache.avro.generic.IndexedRecord): 3
Path (org.apache.hadoop.fs.Path): 3
HoodieList (org.apache.hudi.common.data.HoodieList): 3
EmptyHoodieRecordPayload (org.apache.hudi.common.model.EmptyHoodieRecordPayload): 3
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 3
HoodieKey (org.apache.hudi.common.model.HoodieKey): 3
Pair (org.apache.hudi.common.util.collection.Pair): 3
HoodieCorruptedDataException (org.apache.hudi.exception.HoodieCorruptedDataException): 3
HoodieInsertException (org.apache.hudi.exception.HoodieInsertException): 3