
Example 31 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHiveSyncTool, method testNotPickingOlderParquetFileWhenLatestCommitReadFails.

@ParameterizedTest
@MethodSource("syncMode")
public void testNotPickingOlderParquetFileWhenLatestCommitReadFails(String syncMode) throws Exception {
    hiveSyncConfig.syncMode = syncMode;
    hiveSyncConfig.batchSyncNum = 2;
    final String commitTime = "100";
    HiveTestUtil.createCOWTable(commitTime, 1, true);
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    // evolve the schema
    ZonedDateTime dateTime = ZonedDateTime.now().plusDays(6);
    String commitTime2 = "101";
    HiveTestUtil.addCOWPartitions(1, false, true, dateTime, commitTime2);
    // create empty commit
    final String emptyCommitTime = "200";
    HiveTestUtil.createCommitFile(commitMetadata, emptyCommitTime, hiveSyncConfig.basePath);
    HoodieHiveClient hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
    assertFalse(hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName), "Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist initially");
    HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
    // now delete the evolved commit instant
    Path fullPath = new Path(HiveTestUtil.hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
        + hiveClient.getActiveTimeline().getInstants()
            .filter(inst -> inst.getTimestamp().equals(commitTime2))
            .findFirst().get().getFileName());
    assertTrue(HiveTestUtil.fileSystem.delete(fullPath, false));
    try {
        tool.syncHoodieTable();
    } catch (RuntimeException e) {
        // we expect the table sync to fail
    }
    // table should not be synced yet
    assertFalse(hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName), "Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist at all");
}
Also used: HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), Path (org.apache.hadoop.fs.Path), ZonedDateTime (java.time.ZonedDateTime), HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), HiveTestUtil (org.apache.hudi.hive.testutils.HiveTestUtil), Assertions.assertFalse/assertTrue (org.junit.jupiter.api.Assertions), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), MethodSource (org.junit.jupiter.params.provider.MethodSource)
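
Side note on why deleting the instant file breaks the sync: a commit's HoodieCommitMetadata is persisted as JSON inside the .hoodie metafolder, and readers parse it back with fromBytes (as Example 35 also shows). A minimal sketch of that round trip, assuming imports for IOException and StandardCharsets; the "checkpoint" key is purely illustrative, not a key the test above uses:

static HoodieCommitMetadata roundTripCommitMetadata() throws IOException {
    HoodieCommitMetadata metadata = new HoodieCommitMetadata();
    // writers can stash arbitrary key/value pairs on a commit as extra metadata
    metadata.addMetadata("checkpoint", "42"); // hypothetical key, for illustration only
    // serialize to the JSON form stored in the .hoodie metafolder, then parse it back
    byte[] bytes = metadata.toJsonString().getBytes(StandardCharsets.UTF_8);
    return HoodieCommitMetadata.fromBytes(bytes, HoodieCommitMetadata.class);
}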

Example 32 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHiveSyncTool, method testPickingOlderParquetFileIfLatestIsEmptyCommit.

@ParameterizedTest
@MethodSource("syncMode")
public void testPickingOlderParquetFileIfLatestIsEmptyCommit(String syncMode) throws Exception {
    hiveSyncConfig.syncMode = syncMode;
    hiveSyncConfig.batchSyncNum = 2;
    final String commitTime = "100";
    HiveTestUtil.createCOWTable(commitTime, 1, true);
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    // create empty commit
    final String emptyCommitTime = "200";
    HiveTestUtil.createCommitFileWithSchema(commitMetadata, emptyCommitTime, true);
    HoodieHiveClient hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
    assertFalse(hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName), "Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist initially");
    HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
    tool.syncHoodieTable();
    verifyOldParquetFileTest(hiveClient, emptyCommitTime);
}
Also used: HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), MethodSource (org.junit.jupiter.params.provider.MethodSource)
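
The difference from Example 31 is createCommitFileWithSchema: the empty commit still carries the table schema, so the sync can fall back to the older parquet file for data while reading the schema from the latest commit. A hedged sketch of what such commit metadata roughly contains (avroSchema is an assumed org.apache.avro.Schema; the helper's exact internals may differ):

HoodieCommitMetadata metadata = new HoodieCommitMetadata();
// Hudi keeps the writer schema as extra metadata under SCHEMA_KEY, which is also why
// Examples 33 and 34 filter that key out before comparing user-supplied extra metadata.
metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, avroSchema.toString());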

Example 33 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHoodieDataSourceInternalWriter, method testDataSourceWriterInternal.

private void testDataSourceWriterInternal(Map<String, String> extraMetadata, Map<String, String> expectedExtraMetadata, boolean populateMetaFields) throws Exception {
    // init config and table
    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
    String instantTime = "001";
    // init writer
    HoodieDataSourceInternalWriter dataSourceInternalWriter = new HoodieDataSourceInternalWriter(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(extraMetadata), populateMetaFields, false);
    DataWriter<InternalRow> writer = dataSourceInternalWriter.createWriterFactory().createDataWriter(0, RANDOM.nextLong(), RANDOM.nextLong());
    String[] partitionPaths = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS;
    List<String> partitionPathsAbs = new ArrayList<>();
    for (String partitionPath : partitionPaths) {
        partitionPathsAbs.add(basePath + "/" + partitionPath + "/*");
    }
    int size = 10 + RANDOM.nextInt(1000);
    int batches = 2;
    Dataset<Row> totalInputRows = null;
    for (int j = 0; j < batches; j++) {
        String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[j % 3];
        Dataset<Row> inputRows = getRandomRows(sqlContext, size, partitionPath, false);
        writeRows(inputRows, writer);
        if (totalInputRows == null) {
            totalInputRows = inputRows;
        } else {
            totalInputRows = totalInputRows.union(inputRows);
        }
    }
    HoodieWriterCommitMessage commitMetadata = (HoodieWriterCommitMessage) writer.commit();
    List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
    commitMessages.add(commitMetadata);
    dataSourceInternalWriter.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
    metaClient.reloadActiveTimeline();
    Dataset<Row> result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
    // verify output
    assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
    assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
    // verify extra metadata
    Option<HoodieCommitMetadata> commitMetadataOption = HoodieClientTestUtils.getCommitMetadataForLatestInstant(metaClient);
    assertTrue(commitMetadataOption.isPresent());
    Map<String, String> actualExtraMetadata = new HashMap<>();
    commitMetadataOption.get().getExtraMetadata().entrySet().stream()
        .filter(entry -> !entry.getKey().equals(HoodieCommitMetadata.SCHEMA_KEY))
        .forEach(entry -> actualExtraMetadata.put(entry.getKey(), entry.getValue()));
    assertEquals(expectedExtraMetadata, actualExtraMetadata);
}
Also used: HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), DataSourceOptions (org.apache.spark.sql.sources.v2.DataSourceOptions), DataWriter (org.apache.spark.sql.sources.v2.writer.DataWriter), InternalRow (org.apache.spark.sql.catalyst.InternalRow), Dataset (org.apache.spark.sql.Dataset), Row (org.apache.spark.sql.Row), Option (org.apache.hudi.common.util.Option), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), HoodieClientTestUtils (org.apache.hudi.testutils.HoodieClientTestUtils), ArrayList (java.util.ArrayList), HashMap (java.util.HashMap), Map (java.util.Map), List (java.util.List)
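
HoodieClientTestUtils.getCommitMetadataForLatestInstant is a test helper; below is a hedged sketch of the same read done directly against the timeline, using only APIs that appear elsewhere in these examples. Whether the helper is implemented exactly this way is an assumption, and the enclosing method is assumed to declare throws IOException:

HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
HoodieInstant latest = timeline.lastInstant().get(); // assumes at least one completed commit exists
// getInstantDetails returns the raw bytes of the commit file, as in Example 35
HoodieCommitMetadata metadata =
    HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(latest).get(), HoodieCommitMetadata.class);
Map<String, String> extraMetadata = metadata.getExtraMetadata();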

Example 34 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHoodieDataSourceInternalBatchWrite, method testDataSourceWriterInternal.

private void testDataSourceWriterInternal(Map<String, String> extraMetadata, Map<String, String> expectedExtraMetadata, boolean populateMetaFields) throws Exception {
    // init config and table
    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
    // NOTE: the table handle is created here but not referenced again in this method
    HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
    String instantTime = "001";
    // init writer
    HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite = new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, extraMetadata, populateMetaFields, false);
    DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(0, RANDOM.nextLong());
    String[] partitionPaths = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS;
    List<String> partitionPathsAbs = new ArrayList<>();
    for (String partitionPath : partitionPaths) {
        partitionPathsAbs.add(basePath + "/" + partitionPath + "/*");
    }
    int size = 10 + RANDOM.nextInt(1000);
    int batches = 5;
    Dataset<Row> totalInputRows = null;
    for (int j = 0; j < batches; j++) {
        String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[j % 3];
        Dataset<Row> inputRows = getRandomRows(sqlContext, size, partitionPath, false);
        writeRows(inputRows, writer);
        if (totalInputRows == null) {
            totalInputRows = inputRows;
        } else {
            totalInputRows = totalInputRows.union(inputRows);
        }
    }
    HoodieWriterCommitMessage commitMetadata = (HoodieWriterCommitMessage) writer.commit();
    List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
    commitMessages.add(commitMetadata);
    dataSourceInternalBatchWrite.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
    metaClient.reloadActiveTimeline();
    Dataset<Row> result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
    // verify output
    assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
    assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
    // verify extra metadata
    Option<HoodieCommitMetadata> commitMetadataOption = HoodieClientTestUtils.getCommitMetadataForLatestInstant(metaClient);
    assertTrue(commitMetadataOption.isPresent());
    Map<String, String> actualExtraMetadata = new HashMap<>();
    commitMetadataOption.get().getExtraMetadata().entrySet().stream()
        .filter(entry -> !entry.getKey().equals(HoodieCommitMetadata.SCHEMA_KEY))
        .forEach(entry -> actualExtraMetadata.put(entry.getKey(), entry.getValue()));
    assertEquals(expectedExtraMetadata, actualExtraMetadata);
}
Also used: HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), DataWriter (org.apache.spark.sql.connector.write.DataWriter), InternalRow (org.apache.spark.sql.catalyst.InternalRow), Dataset (org.apache.spark.sql.Dataset), Row (org.apache.spark.sql.Row), Option (org.apache.hudi.common.util.Option), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), HoodieClientTestUtils (org.apache.hudi.testutils.HoodieClientTestUtils), ArrayList (java.util.ArrayList), HashMap (java.util.HashMap), Map (java.util.Map), List (java.util.List)

Example 35 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class JavaUpsertPartitioner, method averageBytesPerRecord.

/**
 * Obtains the average record size based on records written during previous commits. Used for estimating how many
 * records pack into one file.
 */
protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, HoodieWriteConfig hoodieWriteConfig) {
    long avgSize = hoodieWriteConfig.getCopyOnWriteRecordSizeEstimate();
    long fileSizeThreshold = (long) (hoodieWriteConfig.getRecordSizeEstimationThreshold() * hoodieWriteConfig.getParquetSmallFileLimit());
    try {
        if (!commitTimeline.empty()) {
            // Go over the reverse ordered commits to get a more recent estimate of average record size.
            Iterator<HoodieInstant> instants = commitTimeline.getReverseOrderedInstants().iterator();
            while (instants.hasNext()) {
                HoodieInstant instant = instants.next();
                HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
                long totalBytesWritten = commitMetadata.fetchTotalBytesWritten();
                long totalRecordsWritten = commitMetadata.fetchTotalRecordsWritten();
                if (totalBytesWritten > fileSizeThreshold && totalRecordsWritten > 0) {
                    avgSize = (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten);
                    break;
                }
            }
        }
    } catch (Throwable t) {
        // make this fail safe.
        LOG.error("Error trying to compute average bytes/record", t);
    }
    return avgSize;
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata)
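
For context on how callers typically consume this estimate: the partitioner uses the average record size to decide how many incoming records can be packed into a file before it reaches the configured size limit. A minimal sketch under assumed names (smallFileSizeBytes and writeConfig are illustrative; getParquetMaxFileSize is an existing HoodieWriteConfig getter):

long avgRecordSize = averageBytesPerRecord(commitTimeline, writeConfig);
// remaining capacity of an existing small file before it hits the max parquet file size
long freeBytes = writeConfig.getParquetMaxFileSize() - smallFileSizeBytes;
// guard against a zero estimate and against negative remaining capacity
long recordsThatFit = Math.max(0L, freeBytes / Math.max(1L, avgRecordSize));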

Aggregations

HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 139
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 64
ArrayList (java.util.ArrayList): 54
HashMap (java.util.HashMap): 49
List (java.util.List): 48
HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat): 44
IOException (java.io.IOException): 42
Test (org.junit.jupiter.api.Test): 41
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 40
Map (java.util.Map): 38
Path (org.apache.hadoop.fs.Path): 36
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 34
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 34
File (java.io.File): 26
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26
Option (org.apache.hudi.common.util.Option): 25
Schema (org.apache.avro.Schema): 22
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 21
Collectors (java.util.stream.Collectors): 20
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20