Example 96 with HoodieWriteConfig

Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

The class TestHoodieBulkInsertDataInternalWriter, method testGlobalFailure.

/**
 * Issues some corrupted or wrongly schematized InternalRows after a few valid InternalRows so that a global error is thrown:
 * writes batch 1 of valid records, then batch 2 of invalid records, which is expected to throw the global error.
 * Verifies that the global error is set appropriately and that only the first batch of records is written to disk.
 */
@Test
public void testGlobalFailure() throws Exception {
    // init config and table
    HoodieWriteConfig cfg = getWriteConfig(true);
    HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
    String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
    String instantTime = "001";
    HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), STRUCT_TYPE, true, false);
    int size = 10 + RANDOM.nextInt(100);
    int totalFailures = 5;
    // Generate first batch of valid rows
    Dataset<Row> inputRows = getRandomRows(sqlContext, size / 2, partitionPath, false);
    List<InternalRow> internalRows = toInternalRows(inputRows, ENCODER);
    // generate some failure rows
    for (int i = 0; i < totalFailures; i++) {
        internalRows.add(getInternalRowWithError(partitionPath));
    }
    // generate 2nd batch of valid rows
    Dataset<Row> inputRows2 = getRandomRows(sqlContext, size / 2, partitionPath, false);
    internalRows.addAll(toInternalRows(inputRows2, ENCODER));
    // issue writes
    try {
        for (InternalRow internalRow : internalRows) {
            writer.write(internalRow);
        }
        fail("Should have failed");
    } catch (Throwable e) {
        // expected
    }
    HoodieWriterCommitMessage commitMetadata = (HoodieWriterCommitMessage) writer.commit();
    Option<List<String>> fileAbsPaths = Option.of(new ArrayList<>());
    Option<List<String>> fileNames = Option.of(new ArrayList<>());
    // verify write statuses
    assertWriteStatuses(commitMetadata.getWriteStatuses(), 1, size / 2, fileAbsPaths, fileNames);
    // verify rows
    Dataset<Row> result = sqlContext.read().parquet(fileAbsPaths.get().toArray(new String[0]));
    assertOutput(inputRows, result, instantTime, fileNames, true);
}
Also used : HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTable(org.apache.hudi.table.HoodieTable) ArrayList(java.util.ArrayList) List(java.util.List) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Row(org.apache.spark.sql.Row) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
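For reference, the lifecycle this test exercises can be reduced to a few lines. Below is a minimal sketch, assuming the same test harness is in scope (writer construction and the row list as above); the writeAllThenCommit helper name is ours, not part of the Hudi API, and only the write() and commit() calls shown in the example are used:

private HoodieWriterCommitMessage writeAllThenCommit(HoodieBulkInsertDataInternalWriter writer, List<InternalRow> rows) throws Exception {
    try {
        for (InternalRow row : rows) {
            writer.write(row);
        }
    } catch (Throwable t) {
        // A corrupted or wrongly schematized row surfaces here as the global error;
        // rows written before the failure are still reported by commit() below.
    }
    // commit() returns write statuses for the records that actually reached disk
    return (HoodieWriterCommitMessage) writer.commit();
}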

Example 97 with HoodieWriteConfig

Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

The class TestHoodieDataSourceInternalBatchWrite, method testDataSourceWriterInternal.

private void testDataSourceWriterInternal(Map<String, String> extraMetadata, Map<String, String> expectedExtraMetadata, boolean populateMetaFields) throws Exception {
    // init config and table
    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
    HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
    String instantTime = "001";
    // init writer
    HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite = new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, extraMetadata, populateMetaFields, false);
    DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(0, RANDOM.nextLong());
    String[] partitionPaths = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS;
    List<String> partitionPathsAbs = new ArrayList<>();
    for (String partitionPath : partitionPaths) {
        partitionPathsAbs.add(basePath + "/" + partitionPath + "/*");
    }
    int size = 10 + RANDOM.nextInt(1000);
    int batches = 5;
    Dataset<Row> totalInputRows = null;
    for (int j = 0; j < batches; j++) {
        String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[j % 3];
        Dataset<Row> inputRows = getRandomRows(sqlContext, size, partitionPath, false);
        writeRows(inputRows, writer);
        if (totalInputRows == null) {
            totalInputRows = inputRows;
        } else {
            totalInputRows = totalInputRows.union(inputRows);
        }
    }
    HoodieWriterCommitMessage commitMetadata = (HoodieWriterCommitMessage) writer.commit();
    List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
    commitMessages.add(commitMetadata);
    dataSourceInternalBatchWrite.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
    metaClient.reloadActiveTimeline();
    Dataset<Row> result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
    // verify output
    assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
    assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
    // verify extra metadata
    Option<HoodieCommitMetadata> commitMetadataOption = HoodieClientTestUtils.getCommitMetadataForLatestInstant(metaClient);
    assertTrue(commitMetadataOption.isPresent());
    Map<String, String> actualExtraMetadata = new HashMap<>();
    commitMetadataOption.get().getExtraMetadata().entrySet().stream().filter(entry -> !entry.getKey().equals(HoodieCommitMetadata.SCHEMA_KEY)).forEach(entry -> actualExtraMetadata.put(entry.getKey(), entry.getValue()));
    assertEquals(actualExtraMetadata, expectedExtraMetadata);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Arrays(java.util.Arrays) Dataset(org.apache.spark.sql.Dataset) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) DataWriter(org.apache.spark.sql.connector.write.DataWriter) Disabled(org.junit.jupiter.api.Disabled) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) ArrayList(java.util.ArrayList) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Map(java.util.Map) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) MethodSource(org.junit.jupiter.params.provider.MethodSource) ENCODER(org.apache.hudi.testutils.SparkDatasetTestUtils.ENCODER) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Row(org.apache.spark.sql.Row) STRUCT_TYPE(org.apache.hudi.testutils.SparkDatasetTestUtils.STRUCT_TYPE) Arguments(org.junit.jupiter.params.provider.Arguments) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) Stream(java.util.stream.Stream) SparkDatasetTestUtils.toInternalRows(org.apache.hudi.testutils.SparkDatasetTestUtils.toInternalRows) HoodieBulkInsertInternalWriterTestBase(org.apache.hudi.internal.HoodieBulkInsertInternalWriterTestBase) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SparkDatasetTestUtils.getRandomRows(org.apache.hudi.testutils.SparkDatasetTestUtils.getRandomRows) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) Collections(java.util.Collections)
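Stripped of its assertions, the example follows the Spark DataSource V2 batch-write contract. The sketch below isolates those steps, assuming the same test fixtures (cfg, STRUCT_TYPE, sqlContext, hadoopConf, RANDOM) are in scope; the parameter-name comments on the two trailing booleans are our reading of the signature, not taken from the source:

// 1. Bind a batch write to one instant time on the timeline.
HoodieDataSourceInternalBatchWrite batchWrite = new HoodieDataSourceInternalBatchWrite(
        "001", cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf,
        Collections.emptyMap(), /* populateMetaFields */ true, /* arePartitionRecordsSorted */ false);

// 2. Each Spark task obtains its own DataWriter from the factory.
DataWriter<InternalRow> taskWriter = batchWrite.createBatchWriterFactory(null)
        .createWriter(/* partitionId */ 0, RANDOM.nextLong());

// 3. The task writes its rows (elided; see writeRows(...) above), then emits a
//    commit message describing what it wrote.
HoodieWriterCommitMessage message = (HoodieWriterCommitMessage) taskWriter.commit();

// 4. The driver commits the collected messages, completing the instant.
batchWrite.commit(new HoodieWriterCommitMessage[] { message });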

Example 98 with HoodieWriteConfig

Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

The class TestHoodieDataSourceInternalBatchWrite, method testMultipleDataSourceWrites.

@ParameterizedTest
@MethodSource("bulkInsertTypeParams")
public void testMultipleDataSourceWrites(boolean populateMetaFields) throws Exception {
    // init config and table
    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
    HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
    int partitionCounter = 0;
    // execute 2 rounds
    for (int i = 0; i < 2; i++) {
        String instantTime = "00" + i;
        // init writer
        HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite = new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, populateMetaFields, false);
        List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
        Dataset<Row> totalInputRows = null;
        DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(partitionCounter++, RANDOM.nextLong());
        int size = 10 + RANDOM.nextInt(1000);
        // one batch per partition
        int batches = 3;
        for (int j = 0; j < batches; j++) {
            String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[j % 3];
            Dataset<Row> inputRows = getRandomRows(sqlContext, size, partitionPath, false);
            writeRows(inputRows, writer);
            if (totalInputRows == null) {
                totalInputRows = inputRows;
            } else {
                totalInputRows = totalInputRows.union(inputRows);
            }
        }
        HoodieWriterCommitMessage commitMetadata = (HoodieWriterCommitMessage) writer.commit();
        commitMessages.add(commitMetadata);
        dataSourceInternalBatchWrite.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
        metaClient.reloadActiveTimeline();
        Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime, populateMetaFields);
        // verify output
        assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
        assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Row(org.apache.spark.sql.Row) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
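One detail worth noting: each round writes under its own instant time ("000", then "001"), and verification reads back only that commit, which is what lets the loop assert each round independently. A minimal sketch of the per-commit read, assuming the same fixtures as the test:

// Read back only the rows committed under a single instant time; earlier
// commits on the timeline are excluded from the result.
metaClient.reloadActiveTimeline();
Dataset<Row> committedRows = HoodieClientTestUtils.readCommit(
        basePath, sqlContext, metaClient.getCommitTimeline(), "001", /* populateMetaFields */ true);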

Example 99 with HoodieWriteConfig

Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

The class HoodieBackedTableMetadataWriter, method createMetadataWriteConfig.

/**
 * Create a {@code HoodieWriteConfig} to use for the Metadata Table.
 *
 * @param writeConfig {@code HoodieWriteConfig} of the main dataset writer
 */
private HoodieWriteConfig createMetadataWriteConfig(HoodieWriteConfig writeConfig) {
    int parallelism = writeConfig.getMetadataInsertParallelism();
    int minCommitsToKeep = Math.max(writeConfig.getMetadataMinCommitsToKeep(), writeConfig.getMinCommitsToKeep());
    int maxCommitsToKeep = Math.max(writeConfig.getMetadataMaxCommitsToKeep(), writeConfig.getMaxCommitsToKeep());
    // Create the write config for the metadata table by borrowing options from the main write config.
    HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder()
        .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
            .withConsistencyCheckEnabled(writeConfig.getConsistencyGuardConfig().isConsistencyCheckEnabled())
            .withInitialConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getInitialConsistencyCheckIntervalMs())
            .withMaxConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getMaxConsistencyCheckIntervalMs())
            .withMaxConsistencyChecks(writeConfig.getConsistencyGuardConfig().getMaxConsistencyChecks())
            .build())
        .withWriteConcurrencyMode(WriteConcurrencyMode.SINGLE_WRITER)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
            .enable(false)
            .withFileListingParallelism(writeConfig.getFileListingParallelism())
            .build())
        .withAutoCommit(true)
        .withAvroSchemaValidate(true)
        .withEmbeddedTimelineServerEnabled(false)
        .withMarkersType(MarkerType.DIRECT.name())
        .withRollbackUsingMarkers(false)
        .withPath(HoodieTableMetadata.getMetadataTableBasePath(writeConfig.getBasePath()))
        .withSchema(HoodieMetadataRecord.getClassSchema().toString())
        .forTable(tableName)
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withAsyncClean(writeConfig.isMetadataAsyncClean())
            .withAutoClean(false)
            .withCleanerParallelism(parallelism)
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
            .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
            .retainCommits(writeConfig.getMetadataCleanerCommitsRetained())
            .archiveCommitsWith(minCommitsToKeep, maxCommitsToKeep)
            .withInlineCompaction(false)
            .withMaxNumDeltaCommitsBeforeCompaction(writeConfig.getMetadataCompactDeltaCommitMax())
            .withAutoArchive(false)
            .build())
        .withParallelism(parallelism, parallelism)
        .withDeleteParallelism(parallelism)
        .withRollbackParallelism(parallelism)
        .withFinalizeWriteParallelism(parallelism)
        .withAllowMultiWriteOnSameInstant(true)
        .withKeyGenerator(HoodieTableMetadataKeyGenerator.class.getCanonicalName())
        .withPopulateMetaFields(dataWriteConfig.getMetadataConfig().populateMetaFields());
    // RecordKey properties are needed for the metadata table records
    final Properties properties = new Properties();
    properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), RECORD_KEY_FIELD_NAME);
    properties.put("hoodie.datasource.write.recordkey.field", RECORD_KEY_FIELD_NAME);
    builder.withProperties(properties);
    if (writeConfig.isMetricsOn()) {
        builder.withMetricsConfig(HoodieMetricsConfig.newBuilder().withReporterType(writeConfig.getMetricsReporterType().toString()).withExecutorMetrics(writeConfig.isExecutorMetricsEnabled()).on(true).build());
        switch(writeConfig.getMetricsReporterType()) {
            case GRAPHITE:
                builder.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder().onGraphitePort(writeConfig.getGraphiteServerPort()).toGraphiteHost(writeConfig.getGraphiteServerHost()).usePrefix(writeConfig.getGraphiteMetricPrefix()).build());
                break;
            case JMX:
                builder.withMetricsJmxConfig(HoodieMetricsJmxConfig.newBuilder().onJmxPort(writeConfig.getJmxPort()).toJmxHost(writeConfig.getJmxHost()).build());
                break;
            case DATADOG:
            case PROMETHEUS:
            case PROMETHEUS_PUSHGATEWAY:
            case CONSOLE:
            case INMEMORY:
            case CLOUDWATCH:
                break;
            default:
                throw new HoodieMetadataException("Unsupported Metrics Reporter type " + writeConfig.getMetricsReporterType());
        }
    }
    return builder.build();
}
Also used : HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties)
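Note how the archival bounds are derived: minCommitsToKeep and maxCommitsToKeep each take the wider of the metadata-table and main-table settings, so the metadata table never archives more aggressively than the dataset it indexes. A toy illustration with made-up numbers (the values below are ours, not Hudi defaults):

// Hypothetical values, purely for illustration:
int metadataMinCommitsToKeep = 20;  // writeConfig.getMetadataMinCommitsToKeep()
int datasetMinCommitsToKeep = 30;   // writeConfig.getMinCommitsToKeep()

// The metadata table keeps at least as many commits as the main table:
int minCommitsToKeep = Math.max(metadataMinCommitsToKeep, datasetMinCommitsToKeep);  // 30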

Example 100 with HoodieWriteConfig

Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.

The class ListBasedHoodieBloomIndexHelper, method findMatchingFilesForRecordKeys.

@Override
public HoodiePairData<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, HoodiePairData<String, String> partitionRecordKeyPairs, HoodieData<Pair<String, HoodieKey>> fileComparisonPairs, Map<String, List<BloomIndexFileInfo>> partitionToFileInfo, Map<String, Long> recordsPerPartition) {
    List<Pair<String, HoodieKey>> fileComparisonPairList = HoodieList.getList(fileComparisonPairs).stream().sorted(Comparator.comparing(Pair::getLeft)).collect(toList());
    List<HoodieKeyLookupResult> keyLookupResults = new ArrayList<>();
    Iterator<List<HoodieKeyLookupResult>> iterator = new HoodieBaseBloomIndexCheckFunction(hoodieTable, config).apply(fileComparisonPairList.iterator());
    while (iterator.hasNext()) {
        keyLookupResults.addAll(iterator.next());
    }
    keyLookupResults = keyLookupResults.stream().filter(lr -> lr.getMatchingRecordKeys().size() > 0).collect(toList());
    return context.parallelize(keyLookupResults)
        .flatMap(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
            .map(recordKey -> new ImmutablePair<>(lookupResult, recordKey)).iterator())
        .mapToPair(pair -> {
        HoodieKeyLookupResult lookupResult = pair.getLeft();
        String recordKey = pair.getRight();
        return new ImmutablePair<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()));
    });
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodiePairData(org.apache.hudi.common.data.HoodiePairData) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieKeyLookupResult(org.apache.hudi.io.HoodieKeyLookupResult) ArrayList(java.util.ArrayList) HoodieList(org.apache.hudi.common.data.HoodieList) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Map(java.util.Map) HoodieKey(org.apache.hudi.common.model.HoodieKey) Comparator(java.util.Comparator) Pair(org.apache.hudi.common.util.collection.Pair)
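The engine-context plumbing can obscure the core transformation: each HoodieKeyLookupResult fans out into one (HoodieKey, HoodieRecordLocation) pair per matching record key. Below is a plain-Java restatement of just that step, as a sketch; the production code routes the same logic through HoodieEngineContext so it can run distributed:

// For each lookup result, emit one (key -> location) pair per matching record key.
// This mirrors the flatMap/mapToPair chain above, minus the engine context.
List<ImmutablePair<HoodieKey, HoodieRecordLocation>> pairs = keyLookupResults.stream()
        .flatMap(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
                .map(recordKey -> new ImmutablePair<>(
                        new HoodieKey(recordKey, lookupResult.getPartitionPath()),
                        new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()))))
        .collect(toList());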

Aggregations

Usage counts for types that co-occur with HoodieWriteConfig in the indexed examples:

HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 327
Test (org.junit.jupiter.api.Test): 179
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 173
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 169
ArrayList (java.util.ArrayList): 136
List (java.util.List): 133
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 126
HoodieTable (org.apache.hudi.table.HoodieTable): 117
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 111
HashMap (java.util.HashMap): 93
Path (org.apache.hadoop.fs.Path): 92
WriteStatus (org.apache.hudi.client.WriteStatus): 86
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 84
Collectors (java.util.stream.Collectors): 81
Map (java.util.Map): 76
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 76
Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals): 74
Arrays (java.util.Arrays): 73
HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable): 72
Option (org.apache.hudi.common.util.Option): 69