Example 16 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class MetadataCommand, method create:

@CliCommand(value = "metadata create", help = "Create the Metadata Table if it does not exist")
public String create(@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master) throws IOException {
    HoodieCLI.getTableMetaClient();
    Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
    try {
        FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath);
        if (statuses.length > 0) {
            throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") not empty.");
        }
    } catch (FileNotFoundException e) {
        // Metadata directory does not exist yet
        HoodieCLI.fs.mkdirs(metadataPath);
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    HoodieWriteConfig writeConfig = getWriteConfig();
    initJavaSparkContext(Option.of(master));
    SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc));
    return String.format("Created Metadata Table in %s (duration=%.2f secs)", metadataPath, timer.endTimer() / 1000.0);
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) FileNotFoundException(java.io.FileNotFoundException) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) CliCommand(org.springframework.shell.core.annotation.CliCommand)
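
For orientation, here is a minimal standalone sketch of the Spark wiring these CLI commands rely on. The master URL, app name, and class name are illustrative assumptions; inside the CLI this setup is done by initJavaSparkContext(Option.of(master)).

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class EngineContextSetup {
    public static void main(String[] args) {
        // Hypothetical local setup; the CLI performs the equivalent internally.
        SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("hudi-metadata-example");
        try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
            // HoodieSparkEngineContext wraps the JavaSparkContext; it is the engine context
            // that SparkHoodieBackedTableMetadataWriter.create(...) expects.
            HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
            System.out.println("Engine context ready: " + engineContext);
        }
    }
}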

Example 17 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class MetadataCommand, method listPartitions:

@CliCommand(value = "metadata list-partitions", help = "List all partitions from metadata")
public String listPartitions(@CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master) throws IOException {
    HoodieCLI.getTableMetaClient();
    initJavaSparkContext(Option.of(master));
    HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
    // The last argument ("/tmp") is the local directory for the spillable map used while merging metadata records
    HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieSparkEngineContext(jsc), config, HoodieCLI.basePath, "/tmp");
    if (!metadata.enabled()) {
        return "[ERROR] Metadata Table not enabled/initialized\n\n";
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    List<String> partitions = metadata.getAllPartitionPaths();
    LOG.debug("Took " + timer.endTimer() + " ms");
    final List<Comparable[]> rows = new ArrayList<>();
    partitions.stream().sorted(Comparator.reverseOrder()).forEach(p -> {
        Comparable[] row = new Comparable[1];
        row[0] = p;
        rows.add(row);
    });
    TableHeader header = new TableHeader().addTableHeaderField("partition");
    return HoodiePrintHelper.print(header, new HashMap<>(), "", false, Integer.MAX_VALUE, false, rows);
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) TableHeader(org.apache.hudi.cli.TableHeader) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieBackedTableMetadata(org.apache.hudi.metadata.HoodieBackedTableMetadata) CliCommand(org.springframework.shell.core.annotation.CliCommand)
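
A related sketch (not taken from the CLI source): once the metadata table is enabled, per-partition file listings can be served from it as well. getAllFilesInPartition is part of the HoodieTableMetadata interface; the snippet reuses jsc and HoodieCLI.basePath from the example above and assumes the enclosing method declares throws Exception.

HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder().enable(true).build();
HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieSparkEngineContext(jsc), config, HoodieCLI.basePath, "/tmp");
for (String partition : metadata.getAllPartitionPaths()) {
    // The listing is answered from the metadata table, avoiding a direct file system scan
    FileStatus[] files = metadata.getAllFilesInPartition(new Path(HoodieCLI.basePath, partition));
    System.out.println(partition + " -> " + files.length + " files");
}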

Example 18 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class TestDatePartitionPathSelector, method setup:

@BeforeEach
public void setup() {
    initSparkContexts();
    initPath();
    initFileSystem();
    context = new HoodieSparkEngineContext(jsc);
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) BeforeEach(org.junit.jupiter.api.BeforeEach)
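
The engine context built in setup() is more than a holder for jsc: HoodieEngineContext exposes functional primitives that Hudi uses to parallelize work. A hedged sketch of one such primitive, using the context field from above (the input values are made up for illustration):

// map(data, func, parallelism) runs the function over the list as a Spark job
// and collects the results back to the driver.
List<Integer> lengths = context.map(Arrays.asList("2021/01/01", "2021/01/02"), p -> p.length(), 2);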

Example 19 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class TestHoodieBackedMetadata, method testReader:

/**
 * Ensure that the reader only reads completed instants.
 *
 * @throws Exception
 */
@Test
public void testReader() throws Exception {
    init(HoodieTableType.COPY_ON_WRITE);
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    List<HoodieRecord> records;
    List<WriteStatus> writeStatuses;
    String[] commitTimestamps = { HoodieActiveTimeline.createNewInstantTime(), HoodieActiveTimeline.createNewInstantTime(), HoodieActiveTimeline.createNewInstantTime(), HoodieActiveTimeline.createNewInstantTime() };
    try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
        for (int i = 0; i < commitTimestamps.length; ++i) {
            records = dataGen.generateInserts(commitTimestamps[i], 5);
            client.startCommitWithTime(commitTimestamps[i]);
            writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamps[i]).collect();
            assertNoWriteErrors(writeStatuses);
        }
        // Ensure we can see files from each commit
        Set<String> timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet());
        assertEquals(timelineTimestamps.size(), commitTimestamps.length);
        for (int i = 0; i < commitTimestamps.length; ++i) {
            assertTrue(timelineTimestamps.contains(commitTimestamps[i]));
        }
        // mark each commit as incomplete and ensure files are not seen
        for (int i = 0; i < commitTimestamps.length; ++i) {
            FileCreateUtils.deleteCommit(basePath, commitTimestamps[i]);
            timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet());
            assertEquals(timelineTimestamps.size(), commitTimestamps.length - 1);
            for (int j = 0; j < commitTimestamps.length; ++j) {
                assertTrue(j == i || timelineTimestamps.contains(commitTimestamps[j]));
            }
            FileCreateUtils.createCommit(basePath, commitTimestamps[i]);
        }
        // Test multiple incomplete commits
        FileCreateUtils.deleteCommit(basePath, commitTimestamps[0]);
        FileCreateUtils.deleteCommit(basePath, commitTimestamps[2]);
        timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet());
        assertEquals(timelineTimestamps.size(), commitTimestamps.length - 2);
        for (int j = 0; j < commitTimestamps.length; ++j) {
            assertTrue(j == 0 || j == 2 || timelineTimestamps.contains(commitTimestamps[j]));
        }
        // Test no completed commits
        for (int i = 0; i < commitTimestamps.length; ++i) {
            FileCreateUtils.deleteCommit(basePath, commitTimestamps[i]);
        }
        timelineTimestamps = getAllFiles(metadata(client)).stream().map(p -> p.getName()).map(n -> FSUtils.getCommitTime(n)).collect(Collectors.toSet());
        assertEquals(timelineTimestamps.size(), 0);
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) FileStatus(org.apache.hadoop.fs.FileStatus) Disabled(org.junit.jupiter.api.Disabled) Collections.singletonList(java.util.Collections.singletonList) Future(java.util.concurrent.Future) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Arrays.asList(java.util.Arrays.asList) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) WriteConcurrencyMode(org.apache.hudi.common.model.WriteConcurrencyMode) Tag(org.junit.jupiter.api.Tag) FileSystemViewStorageType(org.apache.hudi.common.table.view.FileSystemViewStorageType) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Pair(org.apache.hadoop.hbase.util.Pair) Schema(org.apache.avro.Schema) Set(java.util.Set) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieIndex(org.apache.hudi.index.HoodieIndex) Executors(java.util.concurrent.Executors) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieMetadataMergedLogRecordReader(org.apache.hudi.metadata.HoodieMetadataMergedLogRecordReader) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) FileSystemBackedTableMetadata(org.apache.hudi.metadata.FileSystemBackedTableMetadata) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieMetadataMetrics(org.apache.hudi.metadata.HoodieMetadataMetrics) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieBackedTableMetadataWriter(org.apache.hudi.metadata.HoodieBackedTableMetadataWriter) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) MetadataPartitionType(org.apache.hudi.metadata.MetadataPartitionType) DELETE(org.apache.hudi.common.model.WriteOperationType.DELETE) Registry(org.apache.hudi.common.metrics.Registry) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) TimelineLayoutVersion(org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion) ValueSource(org.junit.jupiter.params.provider.ValueSource) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) 
MERGE_ON_READ(org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieTableVersion(org.apache.hudi.common.table.HoodieTableVersion) INSERT(org.apache.hudi.common.model.WriteOperationType.INSERT) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieMetadataRecord(org.apache.hudi.avro.model.HoodieMetadataRecord) HoodieHFileReader(org.apache.hudi.io.storage.HoodieHFileReader) Paths(java.nio.file.Paths) HoodieKey(org.apache.hudi.common.model.HoodieKey) UPSERT(org.apache.hudi.common.model.WriteOperationType.UPSERT) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) COPY_ON_WRITE(org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) MethodSource(org.junit.jupiter.params.provider.MethodSource) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) LockConfiguration(org.apache.hudi.common.config.LockConfiguration) Collections.emptyList(java.util.Collections.emptyList) SparkUpgradeDowngradeHelper(org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper) HoodieMetadataPayload(org.apache.hudi.metadata.HoodieMetadataPayload) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) CacheConfig(org.apache.hadoop.hbase.io.hfile.CacheConfig) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) InProcessLockProvider(org.apache.hudi.client.transaction.lock.InProcessLockProvider) FileSlice(org.apache.hudi.common.model.FileSlice) EnumSource(org.junit.jupiter.params.provider.EnumSource) HashMap(java.util.HashMap) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) UpgradeDowngrade(org.apache.hudi.table.upgrade.UpgradeDowngrade) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) LinkedList(java.util.LinkedList) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) ExecutorService(java.util.concurrent.ExecutorService) 
GenericRecord(org.apache.avro.generic.GenericRecord) FILESYSTEM_LOCK_PATH_PROP_KEY(org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieLockConfig(org.apache.hudi.config.HoodieLockConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) Time(org.apache.hadoop.util.Time) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils)
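
The invariant exercised above is that the metadata reader surfaces only instants the active timeline reports as completed. A minimal sketch of inspecting that state directly, assuming the test's jsc and basePath fields and an initialized table at that path:

HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(jsc.hadoopConfiguration())
    .setBasePath(basePath)
    .build();
// Only these completed instants should be visible through the metadata table
metaClient.getActiveTimeline().filterCompletedInstants().getInstants()
    .forEach(instant -> System.out.println("completed: " + instant.getTimestamp()));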

Example 20 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class TestHoodieBackedMetadata, method testMetadataMetrics:

/**
 * Test various metrics published by metadata table.
 */
@Test
public void testMetadataMetrics() throws Exception {
    init(HoodieTableType.COPY_ON_WRITE, false);
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true).build())) {
        // Write
        String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
        client.startCommitWithTime(newCommitTime);
        List<WriteStatus> writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
        assertNoWriteErrors(writeStatuses);
        validateMetadata(client);
        Registry metricsRegistry = Registry.getRegistry("HoodieMetadata");
        assertTrue(metricsRegistry.getAllCounts().containsKey(HoodieMetadataMetrics.INITIALIZE_STR + ".count"));
        assertTrue(metricsRegistry.getAllCounts().containsKey(HoodieMetadataMetrics.INITIALIZE_STR + ".totalDuration"));
        assertTrue(metricsRegistry.getAllCounts().get(HoodieMetadataMetrics.INITIALIZE_STR + ".count") >= 1L);
        final String prefix = MetadataPartitionType.FILES.getPartitionPath() + ".";
        assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_COUNT_BASE_FILES));
        assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_COUNT_LOG_FILES));
        assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE));
        assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_TOTAL_LOG_FILE_SIZE));
    }
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Registry(org.apache.hudi.common.metrics.Registry) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
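
For debugging, every counter published to the in-memory registry can be dumped the same way the assertions read them; getAllCounts() returns a map of counter name to value.

Registry metricsRegistry = Registry.getRegistry("HoodieMetadata");
// Print each metadata-table counter and its current value
metricsRegistry.getAllCounts().forEach((name, value) -> System.out.println(name + " = " + value));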

Aggregations

HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext): 58 usages
Path (org.apache.hadoop.fs.Path): 25 usages
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 24 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 23 usages
ArrayList (java.util.ArrayList): 19 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 19 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 17 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 17 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 15 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 15 usages
IOException (java.io.IOException): 14 usages
List (java.util.List): 14 usages
Option (org.apache.hudi.common.util.Option): 14 usages
LogManager (org.apache.log4j.LogManager): 14 usages
Logger (org.apache.log4j.Logger): 14 usages
Test (org.junit.jupiter.api.Test): 14 usages
Collectors (java.util.stream.Collectors): 12 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 12 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 12 usages
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 11 usages