
Example 1 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class FunctionalTestHarness, method runBeforeEach.

@BeforeEach
public synchronized void runBeforeEach() throws Exception {
    initialized = spark != null && hdfsTestService != null;
    if (!initialized) {
        // Build a local SparkSession with Hudi's Kryo registrations and read support.
        SparkConf sparkConf = conf();
        SparkRDDWriteClient.registerClasses(sparkConf);
        HoodieReadClient.addHoodieSupport(sparkConf);
        spark = SparkSession.builder().config(sparkConf).getOrCreate();
        sqlContext = spark.sqlContext();
        jsc = new JavaSparkContext(spark.sparkContext());
        context = new HoodieSparkEngineContext(jsc);
        // Spin up an HDFS mini-cluster so tests run against a real DFS, not the local FS.
        hdfsTestService = new HdfsTestService();
        dfsCluster = hdfsTestService.start(true);
        dfs = dfsCluster.getFileSystem();
        dfs.mkdirs(dfs.getWorkingDirectory());
        // The harness is shared across tests, so tear everything down only at JVM exit.
        Runtime.getRuntime().addShutdownHook(new Thread(() -> {
            hdfsTestService.stop();
            hdfsTestService = null;
            jsc.close();
            jsc = null;
            spark.stop();
            spark = null;
        }));
    }
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) HdfsTestService(org.apache.hudi.common.testutils.minicluster.HdfsTestService) SparkConf(org.apache.spark.SparkConf) BeforeEach(org.junit.jupiter.api.BeforeEach)
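
The context built above is Hudi's Spark adapter for the engine-agnostic HoodieEngineContext API. A minimal sketch of driving a parallel operation through it, assuming the map(List, SerializableFunction, int) signature found in recent Hudi releases (verify against your version):

import java.util.Arrays;
import java.util.List;
import org.apache.hudi.client.common.HoodieSparkEngineContext;

public class EngineContextSketch {
    // Runs one task per partition path on Spark executors and collects the results;
    // HoodieSparkEngineContext backs this with JavaSparkContext#parallelize.
    static List<Integer> pathLengths(HoodieSparkEngineContext context) {
        List<String> partitions = Arrays.asList("2021/01/01", "2021/01/02", "2021/01/03");
        return context.map(partitions, String::length, partitions.size());
    }
}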

Example 2 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class HoodieClientTestHarness, method initSparkContexts.

/**
 * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with the given application name.
 *
 * @param appName The specified application name.
 */
protected void initSparkContexts(String appName) {
    // Initialize a local spark env
    jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(appName + "#" + testMethodName));
    jsc.setLogLevel("ERROR");
    // SQLContext stuff
    sqlContext = new SQLContext(jsc);
    context = new HoodieSparkEngineContext(jsc);
    // The Hadoop conf comes from the engine context (equivalent to jsc.hadoopConfiguration()).
    hadoopConf = context.getHadoopConf().get();
    sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate();
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SQLContext(org.apache.spark.sql.SQLContext)
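
HoodieClientTestUtils.getSparkConfForTest is not shown in this snippet. As a rough, hypothetical stand-in (the real utility applies Hudi-specific defaults, so treat every setting below as an assumption), a local test configuration might look like this:

import org.apache.spark.SparkConf;

public class TestSparkConfSketch {
    // Hypothetical equivalent of getSparkConfForTest: a small local master with
    // Kryo serialization and the Spark UI disabled for unit tests.
    static SparkConf localTestConf(String appName) {
        return new SparkConf()
            .setAppName(appName)
            .setMaster("local[2]")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.ui.enabled", "false");
    }
}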

Example 3 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class HoodieClientTestHarness, method validateMetadata.

/**
 * Validates the metadata table's contents to ensure they match what is on the file system.
 */
public void validateMetadata(HoodieTestTable testTable, List<String> inflightCommits, HoodieWriteConfig writeConfig, String metadataTableBasePath, boolean doFullValidation) throws IOException {
    HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
    assertNotNull(tableMetadata, "MetadataReader should have been initialized");
    if (!writeConfig.isMetadataTableEnabled()) {
        return;
    }
    if (!tableMetadata.getSyncedInstantTime().isPresent() || tableMetadata instanceof FileSystemBackedTableMetadata) {
        throw new IllegalStateException("Metadata should have synced some commits and tableMetadata should not be an instance of FileSystemBackedTableMetadata");
    }
    assertEquals(inflightCommits, testTable.inflightCommits());
    HoodieTimer timer = new HoodieTimer().startTimer();
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    // Partitions should match
    List<java.nio.file.Path> fsPartitionPaths = testTable.getAllPartitionPaths();
    List<String> fsPartitions = new ArrayList<>();
    fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString()));
    if (fsPartitions.isEmpty()) {
        fsPartitions.add("");
    }
    List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
    Collections.sort(fsPartitions);
    Collections.sort(metadataPartitions);
    assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match");
    assertEquals(fsPartitions, metadataPartitions, "Partitions should match");
    // Files within each partition should match
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieSparkTable.create(writeConfig, engineContext, true);
    TableFileSystemView tableView = table.getHoodieView();
    List<String> fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList());
    Map<String, FileStatus[]> partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths);
    assertEquals(fsPartitions.size(), partitionToFilesMap.size());
    fsPartitions.forEach(partition -> {
        try {
            validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition);
        } catch (IOException e) {
            fail("Exception should not be raised: " + e);
        }
    });
    if (doFullValidation) {
        runFullValidation(writeConfig, metadataTableBasePath, engineContext);
    }
    LOG.info("Validation time=" + timer.endTimer());
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) FileSystemBackedTableMetadata(org.apache.hudi.metadata.FileSystemBackedTableMetadata) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) IOException(java.io.IOException) ArrayList(java.util.ArrayList) Collections(java.util.Collections) List(java.util.List) Map(java.util.Map) Collectors(java.util.stream.Collectors)
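
The metadata(writeConfig, context) helper called at the top of validateMetadata belongs to the harness and is not shown. A plausible sketch, assuming the four-argument HoodieTableMetadata.create overload from Hudi 0.10.x-era releases (the helper name and exact accessors are assumptions to check against your version):

import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.metadata.HoodieTableMetadata;

public class MetadataReaderSketch {
    // Builds a metadata-table reader for the dataset; when the metadata table is
    // disabled, Hudi falls back to a file-system-backed implementation.
    static HoodieTableMetadata metadata(HoodieWriteConfig writeConfig, HoodieEngineContext context) {
        HoodieMetadataConfig metadataConfig = writeConfig.getMetadataConfig();
        return HoodieTableMetadata.create(context, metadataConfig,
            writeConfig.getBasePath(), writeConfig.getSpillableMapBasePath());
    }
}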

Example 4 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class TestMarkerBasedRollbackStrategy, method testCopyOnWriteRollback.

@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testCopyOnWriteRollback(boolean useFileListingMetadata) throws Exception {
    HoodieWriteConfig writeConfig = getConfigBuilder()
        .withRollbackUsingMarkers(true)
        .withAutoCommit(false)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(useFileListingMetadata).build())
        .withPath(basePath)
        .build();
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig)) {
        // rollback 2nd commit and ensure stats reflect the info.
        List<HoodieRollbackStat> stats = testRun(useFileListingMetadata, writeConfig, writeClient);
        assertEquals(3, stats.size());
        for (HoodieRollbackStat stat : stats) {
            assertEquals(1, stat.getSuccessDeleteFiles().size());
            assertEquals(0, stat.getFailedDeleteFiles().size());
            assertEquals(0, stat.getCommandBlocksCount().size());
        }
    }
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRollbackStat(org.apache.hudi.common.HoodieRollbackStat) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
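
The configParams() provider referenced by @MethodSource is not included in the snippet. A minimal sketch of what it must supply, namely the boolean useFileListingMetadata parameter (the shape here is an assumption, not the project's actual provider):

import java.util.stream.Stream;
import org.junit.jupiter.params.provider.Arguments;

// Feeds the test once with the metadata table enabled and once without.
private static Stream<Arguments> configParams() {
    return Stream.of(Arguments.of(true), Arguments.of(false));
}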

Example 5 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

The class TestDirectWriteMarkers, method setup.

@BeforeEach
public void setup() throws IOException {
    initPath();
    initMetaClient();
    this.jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(TestDirectWriteMarkers.class.getName()));
    this.context = new HoodieSparkEngineContext(jsc);
    this.fs = FSUtils.getFs(metaClient.getBasePath(), metaClient.getHadoopConf());
    this.markerFolderPath = new Path(metaClient.getMarkerFolderPath("000"));
    this.writeMarkers = new DirectWriteMarkers(fs, metaClient.getBasePath(), markerFolderPath.toString(), "000");
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BeforeEach(org.junit.jupiter.api.BeforeEach)
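
For context on what a test built on this setup might do with writeMarkers, here is a hedged sketch assuming the WriteMarkers#create(partitionPath, fileName, IOType), doesMarkerDirExist(), and allMarkerFilePaths() APIs; the test name and values are illustrative only:

import java.io.IOException;
import org.apache.hudi.common.model.IOType;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

// Inside TestDirectWriteMarkers, after setup() has run.
@Test
public void createsAndListsMarkers() throws IOException {
    // Writing a marker records the pending data file for instant "000".
    writeMarkers.create("2020/06/01", "file1.parquet", IOType.MERGE);
    assertTrue(writeMarkers.doesMarkerDirExist());
    assertEquals(1, writeMarkers.allMarkerFilePaths().size());
}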

Aggregations

HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext): 58 usages
Path (org.apache.hadoop.fs.Path): 25 usages
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 24 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 23 usages
ArrayList (java.util.ArrayList): 19 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 19 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 17 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 17 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 15 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 15 usages
IOException (java.io.IOException): 14 usages
List (java.util.List): 14 usages
Option (org.apache.hudi.common.util.Option): 14 usages
LogManager (org.apache.log4j.LogManager): 14 usages
Logger (org.apache.log4j.Logger): 14 usages
Test (org.junit.jupiter.api.Test): 14 usages
Collectors (java.util.stream.Collectors): 12 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 12 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 12 usages
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 11 usages