
Example 6 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

In class TestHoodieSparkMergeOnReadTableClustering, method testClustering:

@ParameterizedTest
@MethodSource
void testClustering(boolean doUpdates, boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception {
    // Set a low compaction small-file size to generate more file groups.
    HoodieWriteConfig.Builder cfgBuilder = HoodieWriteConfig.newBuilder()
        .forTable("test-trip-table")
        .withPath(basePath())
        .withSchema(TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .withDeleteParallelism(2)
        .withAutoCommit(true)
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .compactionSmallFileSize(10L)
            .withInlineCompaction(false)
            .withMaxNumDeltaCommitsBeforeCompaction(1)
            .build())
        .withStorageConfig(HoodieStorageConfig.newBuilder()
            .hfileMaxFileSize(1024 * 1024 * 1024)
            .parquetMaxFileSize(1024 * 1024 * 1024)
            .build())
        .withEmbeddedTimelineServerEnabled(true)
        .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
            .withEnableBackupForRemoteFileSystemView(false)
            .build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.BLOOM)
            .build())
        .withClusteringConfig(HoodieClusteringConfig.newBuilder()
            .withClusteringMaxNumGroups(10)
            .withClusteringTargetPartitions(0)
            .withInlineClustering(true)
            .withInlineClusteringNumCommits(1)
            .withPreserveHoodieCommitMetadata(preserveCommitMetadata)
            .build())
        .withRollbackUsingMarkers(false);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, cfg.getProps());
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 400);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records.subList(0, 200), client, cfg, newCommitTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (more inserts to create new files)
         */
        // We already set the small file size low to force inserts into new files.
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        dataFiles = insertRecordsToMORTable(metaClient, records.subList(200, 400), client, cfg, newCommitTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        if (doUpdates) {
            /*
             * Write 3 (updates)
             */
            newCommitTime = "003";
            client.startCommitWithTime(newCommitTime);
            records = dataGen.generateUpdates(newCommitTime, 100);
            updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
        }
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        hoodieTable.getHoodieView().sync();
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        // expect 2 base files for each partition
        assertEquals(dataGen.getPartitionPaths().length * 2, allFiles.length);
        String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        // verify all files are included in clustering plan.
        assertEquals(allFiles.length, hoodieTable.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getLeft).count());
        // Do the clustering and validate
        doClusteringAndValidate(client, clusteringCommitTime, metaClient, cfg, dataGen);
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Pair(org.apache.hudi.common.util.collection.Pair) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
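The Pair usage in this test is the Pair::getLeft projection over the (file group, clustering instant) pairs returned by getFileGroupsInPendingClustering(). Below is a minimal, self-contained sketch of that accessor pattern; the PendingClusteringPairSketch class name and the string values standing in for HoodieFileGroupId and HoodieInstant are made up for illustration.

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.common.util.collection.Pair;

public class PendingClusteringPairSketch {

    public static void main(String[] args) {
        // Hypothetical stand-ins for the (HoodieFileGroupId, HoodieInstant) pairs
        // produced by getFileGroupsInPendingClustering() in the test above.
        List<Pair<String, String>> pendingClustering = Arrays.asList(
            Pair.of("2020/01/01|fg-001", "20220101010101"),
            Pair.of("2020/01/01|fg-002", "20220101010101"),
            Pair.of("2020/01/02|fg-003", "20220101010101"));

        // Same shape as the assertion above: project the left element of each pair and count.
        long fileGroupsInPendingClustering = pendingClustering.stream().map(Pair::getLeft).count();
        System.out.println("file groups in pending clustering: " + fileGroupsInPendingClustering); // 3
    }
}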

Example 7 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

In class TestHoodieKeyLocationFetchHandle, method testFetchHandle:

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testFetchHandle(boolean populateMetaFields) throws Exception {
    metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
    config = getConfigBuilder().withProperties(getPropertiesForKeyGen()).withIndexConfig(HoodieIndexConfig.newBuilder().build()).build();
    List<HoodieRecord> records = dataGen.generateInserts(makeNewCommitTime(), 100);
    Map<String, List<HoodieRecord>> partitionRecordsMap = recordsToPartitionRecordsMap(records);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, AVRO_SCHEMA_WITH_METADATA_FIELDS);
    Map<Tuple2<String, String>, List<Tuple2<HoodieKey, HoodieRecordLocation>>> expectedList = writeToParquetAndGetExpectedRecordLocations(partitionRecordsMap, testTable);
    List<Tuple2<String, HoodieBaseFile>> partitionPathFileIdPairs = loadAllFilesForPartitions(new ArrayList<>(partitionRecordsMap.keySet()), context, hoodieTable);
    BaseKeyGenerator keyGenerator = (BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getPropertiesForKeyGen()));
    for (Tuple2<String, HoodieBaseFile> entry : partitionPathFileIdPairs) {
        HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2), populateMetaFields ? Option.empty() : Option.of(keyGenerator));
        Iterator<Pair<HoodieKey, HoodieRecordLocation>> result = fetcherHandle.locations().iterator();
        List<Tuple2<HoodieKey, HoodieRecordLocation>> actualList = new ArrayList<>();
        result.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight())));
        assertEquals(expectedList.get(new Tuple2<>(entry._1, entry._2.getFileId())), actualList);
    }
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Properties(java.util.Properties) TypedProperties(org.apache.hudi.common.config.TypedProperties) TypedProperties(org.apache.hudi.common.config.TypedProperties) Tuple2(scala.Tuple2) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieKey(org.apache.hudi.common.model.HoodieKey) ArrayList(java.util.ArrayList) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Pair(org.apache.hudi.common.util.collection.Pair) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
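Here Pair is the element type yielded by HoodieKeyLocationFetchHandle.locations(), and the test rewraps each Pair into a scala Tuple2 before comparing against the expected list. A minimal sketch of that Pair-to-Tuple2 conversion, assuming scala.Tuple2 is on the classpath and using plain strings in place of HoodieKey and HoodieRecordLocation:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.hudi.common.util.collection.Pair;

import scala.Tuple2;

public class PairToTupleSketch {

    public static void main(String[] args) {
        // Strings stand in for HoodieKey and HoodieRecordLocation.
        Iterator<Pair<String, String>> result = Arrays.asList(
            Pair.of("recordKey-1", "fileId-1"),
            Pair.of("recordKey-2", "fileId-2")).iterator();

        // Same conversion as the loop above: Pair -> Tuple2 via getLeft()/getRight().
        List<Tuple2<String, String>> actualList = new ArrayList<>();
        result.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight())));
        System.out.println(actualList);
    }
}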

Example 8 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

In class DeltaSync, method readFromSource:

/**
 * Read from the upstream source and apply the transformation if needed.
 *
 * @param commitTimelineOpt Timeline with completed commits
 * @return Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> Input data read from the upstream source,
 * consisting of the schemaProvider, the checkpoint string and the HoodieRecords
 * @throws IOException in case of any IOException
 */
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> readFromSource(Option<HoodieTimeline> commitTimelineOpt) throws IOException {
    // Retrieve the previous round checkpoints, if any
    Option<String> resumeCheckpointStr = Option.empty();
    if (commitTimelineOpt.isPresent()) {
        resumeCheckpointStr = getCheckpointToResume(commitTimelineOpt);
    } else {
        // initialize the table for the first time.
        String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props);
        HoodieTableMetaClient.withPropertyBuilder()
            .setTableType(cfg.tableType)
            .setTableName(cfg.targetTableName)
            .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
            .setPayloadClassName(cfg.payloadClassName)
            .setBaseFileFormat(cfg.baseFileFormat)
            .setPartitionFields(partitionColumns)
            .setRecordKeyFields(props.getProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key()))
            .setPopulateMetaFields(props.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS.key(),
                HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))
            .setKeyGeneratorClassProp(props.getProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
                SimpleKeyGenerator.class.getName()))
            .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath);
    }
    LOG.debug("Checkpoint from config: " + cfg.checkpoint);
    if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
        resumeCheckpointStr = Option.of(cfg.checkpoint);
    }
    LOG.info("Checkpoint to resume from : " + resumeCheckpointStr);
    int maxRetryCount = cfg.retryOnSourceFailures ? cfg.maxRetryCount : 1;
    int curRetryCount = 0;
    Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> sourceDataToSync = null;
    while (curRetryCount++ < maxRetryCount && sourceDataToSync == null) {
        try {
            sourceDataToSync = fetchFromSource(resumeCheckpointStr);
        } catch (HoodieSourceTimeoutException e) {
            if (curRetryCount >= maxRetryCount) {
                throw e;
            }
            try {
                LOG.error("Exception thrown while fetching data from source. Msg : " + e.getMessage() + ", class : " + e.getClass() + ", cause : " + e.getCause());
                LOG.error("Sleeping for " + (cfg.retryIntervalSecs) + " before retrying again. Current retry count " + curRetryCount + ", max retry count " + cfg.maxRetryCount);
                Thread.sleep(cfg.retryIntervalSecs * 1000);
            } catch (InterruptedException ex) {
                LOG.error("Ignoring InterruptedException while waiting to retry on source failure " + e.getMessage());
            }
        }
    }
    return sourceDataToSync;
}
Also used : HoodieSourceTimeoutException(org.apache.hudi.utilities.exception.HoodieSourceTimeoutException) Configuration(org.apache.hadoop.conf.Configuration) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) Pair(org.apache.hudi.common.util.collection.Pair)
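The nested return type Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> carries the schema provider, the resume checkpoint and the records as one value. A small sketch of how a caller unpacks it, with String and List standing in for SchemaProvider and JavaRDD<HoodieRecord> so the example needs no Spark context:

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.common.util.collection.Pair;

public class NestedPairSketch {

    public static void main(String[] args) {
        // Stand-ins: a String for the SchemaProvider, a List for the JavaRDD<HoodieRecord>.
        Pair<String, Pair<String, List<String>>> sourceDataToSync =
            Pair.of("schemaProvider", Pair.of("checkpoint-0005", Arrays.asList("record-1", "record-2")));

        // getKey()/getLeft() return the left element, getValue()/getRight() the right one.
        String schemaProvider = sourceDataToSync.getKey();
        String checkpointStr = sourceDataToSync.getRight().getLeft();
        List<String> records = sourceDataToSync.getRight().getRight();

        System.out.println(schemaProvider + ", " + checkpointStr + ", " + records);
    }
}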

Example 9 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

In class DeltaSync, method syncOnce:

/**
 * Run one round of delta sync and return the new compaction instant if one was scheduled.
 */
public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws IOException {
    Pair<Option<String>, JavaRDD<WriteStatus>> result = null;
    Timer.Context overallTimerContext = metrics.getOverallTimerContext();
    // Refresh Timeline
    refreshTimeline();
    Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> srcRecordsWithCkpt = readFromSource(commitTimelineOpt);
    if (null != srcRecordsWithCkpt) {
        // First round: no write client yet, so set it up once the schema is resolved
        if (null == writeClient) {
            this.schemaProvider = srcRecordsWithCkpt.getKey();
            // Setup HoodieWriteClient and compaction now that we decided on schema
            setupWriteClient();
        } else {
            Schema newSourceSchema = srcRecordsWithCkpt.getKey().getSourceSchema();
            Schema newTargetSchema = srcRecordsWithCkpt.getKey().getTargetSchema();
            if (!(processedSchema.isSchemaPresent(newSourceSchema)) || !(processedSchema.isSchemaPresent(newTargetSchema))) {
                LOG.info("Seeing new schema. Source :" + newSourceSchema.toString(true) + ", Target :" + newTargetSchema.toString(true));
                // We need to recreate write client with new schema and register them.
                reInitWriteClient(newSourceSchema, newTargetSchema);
                processedSchema.addSchema(newSourceSchema);
                processedSchema.addSchema(newTargetSchema);
            }
        }
        // complete the pending clustering before writing to sink
        if (cfg.retryLastPendingInlineClusteringJob && getHoodieClientConfig(this.schemaProvider).inlineClusteringEnabled()) {
            Option<String> pendingClusteringInstant = getLastPendingClusteringInstant(allCommitsTimelineOpt);
            if (pendingClusteringInstant.isPresent()) {
                writeClient.cluster(pendingClusteringInstant.get(), true);
            }
        }
        result = writeToSink(srcRecordsWithCkpt.getRight().getRight(), srcRecordsWithCkpt.getRight().getLeft(), metrics, overallTimerContext);
    }
    metrics.updateDeltaStreamerSyncMetrics(System.currentTimeMillis());
    // Clear persistent RDDs
    jssc.getPersistentRDDs().values().forEach(JavaRDD::unpersist);
    return result;
}
Also used : Timer(com.codahale.metrics.Timer) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) Option(org.apache.hudi.common.util.Option) JavaRDD(org.apache.spark.api.java.JavaRDD) Pair(org.apache.hudi.common.util.collection.Pair)
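syncOnce consumes that nested Pair (getKey() for the schema provider, getRight().getLeft()/getRight() for checkpoint and records) and itself returns Pair<Option<String>, JavaRDD<WriteStatus>>. A sketch of how a caller might inspect such a result, with a List standing in for the JavaRDD<WriteStatus> and a made-up instant time:

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

public class SyncOnceResultSketch {

    public static void main(String[] args) {
        // A List stands in for JavaRDD<WriteStatus>; the instant time is fabricated.
        Pair<Option<String>, List<String>> result =
            Pair.of(Option.of("20220101020202"), Arrays.asList("writeStatus-1", "writeStatus-2"));

        // Left: new compaction instant, if one got scheduled. Right: write statuses of the sync round.
        if (result.getLeft().isPresent()) {
            System.out.println("compaction scheduled at instant " + result.getLeft().get());
        }
        System.out.println("write statuses: " + result.getRight().size());
    }
}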

Example 10 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

In class S3EventsHoodieIncrSource, method fetchNextBatch:

@Override
public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {
    DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(HOODIE_SRC_BASE_PATH));
    String srcPath = props.getString(HOODIE_SRC_BASE_PATH);
    int numInstantsPerFetch = props.getInteger(NUM_INSTANTS_PER_FETCH, DEFAULT_NUM_INSTANTS_PER_FETCH);
    boolean readLatestOnMissingCkpt = props.getBoolean(READ_LATEST_INSTANT_ON_MISSING_CKPT, DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);
    IncrSourceHelper.MissingCheckpointStrategy missingCheckpointStrategy =
        props.containsKey(HoodieIncrSource.Config.MISSING_CHECKPOINT_STRATEGY)
            ? IncrSourceHelper.MissingCheckpointStrategy.valueOf(props.getString(HoodieIncrSource.Config.MISSING_CHECKPOINT_STRATEGY))
            : null;
    if (readLatestOnMissingCkpt) {
        missingCheckpointStrategy = IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST;
    }
    String fileFormat = props.getString(SOURCE_FILE_FORMAT, DEFAULT_SOURCE_FILE_FORMAT);
    // Use the begin instant only if it is set and non-empty
    Option<String> beginInstant =
        lastCkptStr.isPresent()
            ? (lastCkptStr.get().isEmpty() ? Option.empty() : lastCkptStr)
            : Option.empty();
    Pair<String, Pair<String, String>> queryTypeAndInstantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath, numInstantsPerFetch, beginInstant, missingCheckpointStrategy);
    if (queryTypeAndInstantEndpts.getValue().getKey().equals(queryTypeAndInstantEndpts.getValue().getValue())) {
        LOG.warn("Already caught up. Begin Checkpoint was :" + queryTypeAndInstantEndpts.getValue().getKey());
        return Pair.of(Option.empty(), queryTypeAndInstantEndpts.getValue().getKey());
    }
    Dataset<Row> source = null;
    // Do incremental pull. Set end instant if available.
    if (queryTypeAndInstantEndpts.getKey().equals(DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())) {
        source = sparkSession.read().format("org.apache.hudi")
            .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
            .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), queryTypeAndInstantEndpts.getRight().getLeft())
            .option(DataSourceReadOptions.END_INSTANTTIME().key(), queryTypeAndInstantEndpts.getRight().getRight())
            .load(srcPath);
    } else {
        // If the checkpoint is missing from the source table and the strategy is READ_UPTO_LATEST_COMMIT, issue a snapshot query
        source = sparkSession.read().format("org.apache.hudi")
            .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL())
            .load(srcPath)
            .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD,
                queryTypeAndInstantEndpts.getRight().getLeft()));
    }
    if (source.isEmpty()) {
        return Pair.of(Option.empty(), queryTypeAndInstantEndpts.getRight().getRight());
    }
    String filter = "s3.object.size > 0";
    if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_KEY_PREFIX, null))) {
        filter = filter + " and s3.object.key like '" + props.getString(Config.S3_KEY_PREFIX) + "%'";
    }
    if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_IGNORE_KEY_PREFIX, null))) {
        filter = filter + " and s3.object.key not like '" + props.getString(Config.S3_IGNORE_KEY_PREFIX) + "%'";
    }
    if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_IGNORE_KEY_SUBSTRING, null))) {
        filter = filter + " and s3.object.key not like '%" + props.getString(Config.S3_IGNORE_KEY_SUBSTRING) + "%'";
    }
    // add file format filtering by default
    filter = filter + " and s3.object.key like '%" + fileFormat + "%'";
    String s3FS = props.getString(Config.S3_FS_PREFIX, "s3").toLowerCase();
    String s3Prefix = s3FS + "://";
    // Extract distinct file keys from s3 meta hoodie table
    final List<Row> cloudMetaDf = source.filter(filter).select("s3.bucket.name", "s3.object.key").distinct().collectAsList();
    // Create S3 paths
    final boolean checkExists = props.getBoolean(Config.ENABLE_EXISTS_CHECK, Config.DEFAULT_ENABLE_EXISTS_CHECK);
    List<String> cloudFiles = new ArrayList<>();
    for (Row row : cloudMetaDf) {
        // Construct the file path; row index 0 is the bucket name and index 1 is the object key
        String bucket = row.getString(0);
        String filePath = s3Prefix + bucket + "/" + row.getString(1);
        if (checkExists) {
            FileSystem fs = FSUtils.getFs(s3Prefix + bucket, sparkSession.sparkContext().hadoopConfiguration());
            try {
                if (fs.exists(new Path(filePath))) {
                    cloudFiles.add(filePath);
                }
            } catch (IOException e) {
                LOG.error(String.format("Error while checking path exists for %s ", filePath), e);
            }
        } else {
            cloudFiles.add(filePath);
        }
    }
    Option<Dataset<Row>> dataset = Option.empty();
    if (!cloudFiles.isEmpty()) {
        dataset = Option.of(sparkSession.read().format(fileFormat).load(cloudFiles.toArray(new String[0])));
    }
    return Pair.of(dataset, queryTypeAndInstantEndpts.getRight().getRight());
}
Also used : Path(org.apache.hadoop.fs.Path) Dataset(org.apache.spark.sql.Dataset) ArrayList(java.util.ArrayList) IOException(java.io.IOException) IncrSourceHelper(org.apache.hudi.utilities.sources.helpers.IncrSourceHelper) FileSystem(org.apache.hadoop.fs.FileSystem) Row(org.apache.spark.sql.Row) Pair(org.apache.hudi.common.util.collection.Pair)
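fetchNextBatch follows the incremental-source contract of returning Pair.of(data, nextCheckpoint), with Option.empty() on the left when the source is already caught up. A compact sketch of that contract, with Option<List<String>> standing in for Option<Dataset<Row>> and made-up instants and paths:

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

public class FetchNextBatchSketch {

    // A List stands in for Dataset<Row>; instant times and file names are fabricated.
    static Pair<Option<List<String>>, String> fetchNextBatch(boolean caughtUp) {
        if (caughtUp) {
            // Already caught up: no data, checkpoint stays where it was.
            return Pair.of(Option.empty(), "20220101010101");
        }
        return Pair.of(Option.of(Arrays.asList("s3://bucket/data/file1.parquet")), "20220101020202");
    }

    public static void main(String[] args) {
        Pair<Option<List<String>>, String> batch = fetchNextBatch(false);
        if (batch.getLeft().isPresent()) {
            System.out.println("new files: " + batch.getLeft().get());
        }
        System.out.println("next checkpoint: " + batch.getRight());
    }
}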

Aggregations

Pair (org.apache.hudi.common.util.collection.Pair): 147
List (java.util.List): 98
Map (java.util.Map): 91
IOException (java.io.IOException): 89
Collectors (java.util.stream.Collectors): 87
Option (org.apache.hudi.common.util.Option): 87
ArrayList (java.util.ArrayList): 85
Path (org.apache.hadoop.fs.Path): 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 66
HashMap (java.util.HashMap): 65
LogManager (org.apache.log4j.LogManager): 64
Logger (org.apache.log4j.Logger): 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 54
Arrays (java.util.Arrays): 48
HoodieTable (org.apache.hudi.table.HoodieTable): 46
Test (org.junit.jupiter.api.Test): 46