Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
The class TestHoodieBulkInsertDataInternalWriter, method testGlobalFailure.
/**
 * Issues some corrupted or wrongly schematized InternalRows after a few valid InternalRows so that a global error is thrown:
 * write batch 1 of valid records, then write batch 2 of invalid records, which is expected to throw a global error.
 * Verifies that the global error is set appropriately and that only the first batch of records is written to disk.
 */
@Test
public void testGlobalFailure() throws Exception {
  // init config and table
  HoodieWriteConfig cfg = getWriteConfig(true);
  HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
  String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
  String instantTime = "001";
  HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(
      table, cfg, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), STRUCT_TYPE, true, false);
  int size = 10 + RANDOM.nextInt(100);
  int totalFailures = 5;
  // generate the first batch of valid rows
  Dataset<Row> inputRows = getRandomRows(sqlContext, size / 2, partitionPath, false);
  List<InternalRow> internalRows = toInternalRows(inputRows, ENCODER);
  // generate some failure rows
  for (int i = 0; i < totalFailures; i++) {
    internalRows.add(getInternalRowWithError(partitionPath));
  }
  // generate the second batch of valid rows
  Dataset<Row> inputRows2 = getRandomRows(sqlContext, size / 2, partitionPath, false);
  internalRows.addAll(toInternalRows(inputRows2, ENCODER));
  // issue writes
  try {
    for (InternalRow internalRow : internalRows) {
      writer.write(internalRow);
    }
    fail("Should have failed");
  } catch (Throwable e) {
    // expected
  }
  HoodieWriterCommitMessage commitMetadata = (HoodieWriterCommitMessage) writer.commit();
  Option<List<String>> fileAbsPaths = Option.of(new ArrayList<>());
  Option<List<String>> fileNames = Option.of(new ArrayList<>());
  // verify write statuses
  assertWriteStatuses(commitMetadata.getWriteStatuses(), 1, size / 2, fileAbsPaths, fileNames);
  // verify rows
  Dataset<Row> result = sqlContext.read().parquet(fileAbsPaths.get().toArray(new String[0]));
  assertOutput(inputRows, result, instantTime, fileNames, true);
}
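The corrupted rows come from the getInternalRowWithError test helper. As a rough, hypothetical sketch (the real helper in Hudi's test utilities may build the row differently), such a row can simply disagree with STRUCT_TYPE in arity and field types, so the underlying Parquet writer fails once it is written:

// Hypothetical sketch of a schema-violating row; the field count and types
// intentionally do not match the STRUCT_TYPE the writer expects.
// Requires org.apache.spark.sql.catalyst.expressions.GenericInternalRow
// and org.apache.spark.unsafe.types.UTF8String.
private static InternalRow internalRowWithWrongSchema(String partitionPath) {
  Object[] values = new Object[] {
      UTF8String.fromString(partitionPath), // a string where the schema may expect something else
      -1                                    // too few fields / wrong type for the remaining columns
  };
  return new GenericInternalRow(values);
}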
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
The class TestHoodieDataSourceInternalBatchWrite, method testDataSourceWriterInternal.
private void testDataSourceWriterInternal(Map<String, String> extraMetadata, Map<String, String> expectedExtraMetadata, boolean populateMetaFields) throws Exception {
  // init config and table
  HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
  HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
  String instantTime = "001";
  // init writer
  HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite = new HoodieDataSourceInternalBatchWrite(
      instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, extraMetadata, populateMetaFields, false);
  DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(0, RANDOM.nextLong());
  String[] partitionPaths = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS;
  List<String> partitionPathsAbs = new ArrayList<>();
  for (String partitionPath : partitionPaths) {
    partitionPathsAbs.add(basePath + "/" + partitionPath + "/*");
  }
  int size = 10 + RANDOM.nextInt(1000);
  int batches = 5;
  Dataset<Row> totalInputRows = null;
  for (int j = 0; j < batches; j++) {
    String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[j % 3];
    Dataset<Row> inputRows = getRandomRows(sqlContext, size, partitionPath, false);
    writeRows(inputRows, writer);
    if (totalInputRows == null) {
      totalInputRows = inputRows;
    } else {
      totalInputRows = totalInputRows.union(inputRows);
    }
  }
  HoodieWriterCommitMessage commitMetadata = (HoodieWriterCommitMessage) writer.commit();
  List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
  commitMessages.add(commitMetadata);
  dataSourceInternalBatchWrite.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
  metaClient.reloadActiveTimeline();
  Dataset<Row> result = HoodieClientTestUtils.read(
      jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
  // verify output
  assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
  assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
  // verify extra metadata
  Option<HoodieCommitMetadata> commitMetadataOption = HoodieClientTestUtils.getCommitMetadataForLatestInstant(metaClient);
  assertTrue(commitMetadataOption.isPresent());
  Map<String, String> actualExtraMetadata = new HashMap<>();
  commitMetadataOption.get().getExtraMetadata().entrySet().stream()
      .filter(entry -> !entry.getKey().equals(HoodieCommitMetadata.SCHEMA_KEY))
      .forEach(entry -> actualExtraMetadata.put(entry.getKey(), entry.getValue()));
  assertEquals(actualExtraMetadata, expectedExtraMetadata);
}
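This helper is private, so it is driven by test entry points that supply the extra commit metadata. A minimal, hypothetical caller (the method name and metadata key below are illustrative, not the actual Hudi test) could look like:

@Test
public void testDataSourceWriterWithExtraCommitMetadata() throws Exception {
  // Arbitrary example metadata; the same map is expected back from the commit.
  Map<String, String> extraMeta = new HashMap<>();
  extraMeta.put("test.metadata.key", "test-value"); // hypothetical key/value
  testDataSourceWriterInternal(extraMeta, extraMeta, true);
}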
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
The class TestHoodieDataSourceInternalBatchWrite, method testMultipleDataSourceWrites.
@ParameterizedTest
@MethodSource("bulkInsertTypeParams")
public void testMultipleDataSourceWrites(boolean populateMetaFields) throws Exception {
  // init config and table
  HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
  HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
  int partitionCounter = 0;
  // execute N rounds
  for (int i = 0; i < 2; i++) {
    String instantTime = "00" + i;
    // init writer
    HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite = new HoodieDataSourceInternalBatchWrite(
        instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, populateMetaFields, false);
    List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
    Dataset<Row> totalInputRows = null;
    DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(partitionCounter++, RANDOM.nextLong());
    int size = 10 + RANDOM.nextInt(1000);
    // one batch per partition
    int batches = 3;
    for (int j = 0; j < batches; j++) {
      String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[j % 3];
      Dataset<Row> inputRows = getRandomRows(sqlContext, size, partitionPath, false);
      writeRows(inputRows, writer);
      if (totalInputRows == null) {
        totalInputRows = inputRows;
      } else {
        totalInputRows = totalInputRows.union(inputRows);
      }
    }
    HoodieWriterCommitMessage commitMetadata = (HoodieWriterCommitMessage) writer.commit();
    commitMessages.add(commitMetadata);
    dataSourceInternalBatchWrite.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
    metaClient.reloadActiveTimeline();
    Dataset<Row> result = HoodieClientTestUtils.readCommit(
        basePath, sqlContext, metaClient.getCommitTimeline(), instantTime, populateMetaFields);
    // verify output
    assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
    assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
  }
}
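The @MethodSource annotation above refers to a bulkInsertTypeParams provider defined in the surrounding test class. A minimal sketch of such a provider (the actual one in Hudi may be written differently) simply toggles the populateMetaFields flag:

// Requires org.junit.jupiter.params.provider.Arguments and java.util.stream.Stream.
private static Stream<Arguments> bulkInsertTypeParams() {
  return Stream.of(
      Arguments.of(true),   // write with Hudi meta fields populated
      Arguments.of(false)); // write without meta fields
}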
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
The class HoodieBackedTableMetadataWriter, method createMetadataWriteConfig.
/**
* Create a {@code HoodieWriteConfig} to use for the Metadata Table.
*
* @param writeConfig {@code HoodieWriteConfig} of the main dataset writer
*/
private HoodieWriteConfig createMetadataWriteConfig(HoodieWriteConfig writeConfig) {
  int parallelism = writeConfig.getMetadataInsertParallelism();
  int minCommitsToKeep = Math.max(writeConfig.getMetadataMinCommitsToKeep(), writeConfig.getMinCommitsToKeep());
  int maxCommitsToKeep = Math.max(writeConfig.getMetadataMaxCommitsToKeep(), writeConfig.getMaxCommitsToKeep());
  // Create the write config for the metadata table by borrowing options from the main write config.
  HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder()
      .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION)
      .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
          .withConsistencyCheckEnabled(writeConfig.getConsistencyGuardConfig().isConsistencyCheckEnabled())
          .withInitialConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getInitialConsistencyCheckIntervalMs())
          .withMaxConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getMaxConsistencyCheckIntervalMs())
          .withMaxConsistencyChecks(writeConfig.getConsistencyGuardConfig().getMaxConsistencyChecks())
          .build())
      .withWriteConcurrencyMode(WriteConcurrencyMode.SINGLE_WRITER)
      .withMetadataConfig(HoodieMetadataConfig.newBuilder()
          .enable(false)
          .withFileListingParallelism(writeConfig.getFileListingParallelism())
          .build())
      .withAutoCommit(true)
      .withAvroSchemaValidate(true)
      .withEmbeddedTimelineServerEnabled(false)
      .withMarkersType(MarkerType.DIRECT.name())
      .withRollbackUsingMarkers(false)
      .withPath(HoodieTableMetadata.getMetadataTableBasePath(writeConfig.getBasePath()))
      .withSchema(HoodieMetadataRecord.getClassSchema().toString())
      .forTable(tableName)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .withAsyncClean(writeConfig.isMetadataAsyncClean())
          .withAutoClean(false)
          .withCleanerParallelism(parallelism)
          .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
          .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
          .retainCommits(writeConfig.getMetadataCleanerCommitsRetained())
          .archiveCommitsWith(minCommitsToKeep, maxCommitsToKeep)
          .withInlineCompaction(false)
          .withMaxNumDeltaCommitsBeforeCompaction(writeConfig.getMetadataCompactDeltaCommitMax())
          .withAutoArchive(false)
          .build())
      .withParallelism(parallelism, parallelism)
      .withDeleteParallelism(parallelism)
      .withRollbackParallelism(parallelism)
      .withFinalizeWriteParallelism(parallelism)
      .withAllowMultiWriteOnSameInstant(true)
      .withKeyGenerator(HoodieTableMetadataKeyGenerator.class.getCanonicalName())
      .withPopulateMetaFields(dataWriteConfig.getMetadataConfig().populateMetaFields());
  // RecordKey properties are needed for the metadata table records
  final Properties properties = new Properties();
  properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), RECORD_KEY_FIELD_NAME);
  properties.put("hoodie.datasource.write.recordkey.field", RECORD_KEY_FIELD_NAME);
  builder.withProperties(properties);
  if (writeConfig.isMetricsOn()) {
    builder.withMetricsConfig(HoodieMetricsConfig.newBuilder()
        .withReporterType(writeConfig.getMetricsReporterType().toString())
        .withExecutorMetrics(writeConfig.isExecutorMetricsEnabled())
        .on(true)
        .build());
    switch (writeConfig.getMetricsReporterType()) {
      case GRAPHITE:
        builder.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder()
            .onGraphitePort(writeConfig.getGraphiteServerPort())
            .toGraphiteHost(writeConfig.getGraphiteServerHost())
            .usePrefix(writeConfig.getGraphiteMetricPrefix())
            .build());
        break;
      case JMX:
        builder.withMetricsJmxConfig(HoodieMetricsJmxConfig.newBuilder()
            .onJmxPort(writeConfig.getJmxPort())
            .toJmxHost(writeConfig.getJmxHost())
            .build());
        break;
      case DATADOG:
      case PROMETHEUS:
      case PROMETHEUS_PUSHGATEWAY:
      case CONSOLE:
      case INMEMORY:
      case CLOUDWATCH:
        break;
      default:
        throw new HoodieMetadataException("Unsupported Metrics Reporter type " + writeConfig.getMetricsReporterType());
    }
  }
  return builder.build();
}
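The method relies on HoodieWriteConfig's fluent builder throughout. As a minimal, illustrative sketch (the path and table name below are hypothetical placeholders, not values Hudi uses), the same builder methods can assemble a small standalone config whose settings are then readable from the resulting object:

// Illustrative only: a tiny write config built with the same builder methods used above.
HoodieWriteConfig sketchCfg = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hoodie/example_table")  // hypothetical base path
    .forTable("example_table")              // hypothetical table name
    .withSchema(HoodieMetadataRecord.getClassSchema().toString())
    .withParallelism(1, 1)
    .withAutoCommit(true)
    .build();
// Settings supplied to the builder are available via getters, e.g. getBasePath().
assert "/tmp/hoodie/example_table".equals(sketchCfg.getBasePath());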
Use of org.apache.hudi.config.HoodieWriteConfig in project hudi by apache.
The class ListBasedHoodieBloomIndexHelper, method findMatchingFilesForRecordKeys.
@Override
public HoodiePairData<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
    HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable,
    HoodiePairData<String, String> partitionRecordKeyPairs, HoodieData<Pair<String, HoodieKey>> fileComparisonPairs,
    Map<String, List<BloomIndexFileInfo>> partitionToFileInfo, Map<String, Long> recordsPerPartition) {
  List<Pair<String, HoodieKey>> fileComparisonPairList =
      HoodieList.getList(fileComparisonPairs).stream().sorted(Comparator.comparing(Pair::getLeft)).collect(toList());
  List<HoodieKeyLookupResult> keyLookupResults = new ArrayList<>();
  Iterator<List<HoodieKeyLookupResult>> iterator =
      new HoodieBaseBloomIndexCheckFunction(hoodieTable, config).apply(fileComparisonPairList.iterator());
  while (iterator.hasNext()) {
    keyLookupResults.addAll(iterator.next());
  }
  keyLookupResults = keyLookupResults.stream().filter(lr -> lr.getMatchingRecordKeys().size() > 0).collect(toList());
  return context.parallelize(keyLookupResults)
      .flatMap(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
          .map(recordKey -> new ImmutablePair<>(lookupResult, recordKey)).iterator())
      .mapToPair(pair -> {
        HoodieKeyLookupResult lookupResult = pair.getLeft();
        String recordKey = pair.getRight();
        return new ImmutablePair<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
            new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()));
      });
}
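The final flatMap/mapToPair chain expands each HoodieKeyLookupResult into one (HoodieKey, HoodieRecordLocation) pair per matching record key. Written as plain Java over a single lookupResult (a sketch of the same transform outside the engine-agnostic HoodieData API; lookupResult is assumed to be one element of keyLookupResults):

// Sketch: the per-result expansion performed by the chain above.
List<ImmutablePair<HoodieKey, HoodieRecordLocation>> expanded =
    lookupResult.getMatchingRecordKeys().stream()
        .map(recordKey -> new ImmutablePair<>(
            new HoodieKey(recordKey, lookupResult.getPartitionPath()),
            new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
        .collect(toList());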