
Example 36 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class HoodieSnapshotExporter, method exportAsNonHudi.

private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
    Partitioner defaultPartitioner = dataset -> {
        Dataset<Row> hoodieDroppedDataset = dataset.drop(
            JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
        return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
            ? hoodieDroppedDataset.write()
            : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField))
                .write()
                .partitionBy(cfg.outputPartitionField);
    };
    Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
        ? defaultPartitioner
        : ReflectionUtils.loadClass(cfg.outputPartitioner);
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    context.setJobStatus(this.getClass().getSimpleName(), "Exporting as non-HUDI dataset");
    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    Iterator<String> exportingFilePaths = jsc.parallelize(partitions, partitions.size())
        .flatMap(partition -> fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
            .map(HoodieBaseFile::getPath)
            .iterator())
        .toLocalIterator();
    Dataset<Row> sourceDataset = new SQLContext(jsc).read()
        .parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
    partitioner.partition(sourceDataset)
        .format(cfg.outputFormat)
        .mode(SaveMode.Overwrite)
        .save(cfg.targetOutputPath);
}
Also used : ParameterException(com.beust.jcommander.ParameterException) Dataset(org.apache.spark.sql.Dataset) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileUtil(org.apache.hadoop.fs.FileUtil) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) JavaConversions(scala.collection.JavaConversions) SaveMode(org.apache.spark.sql.SaveMode) Iterator(java.util.Iterator) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) Column(org.apache.spark.sql.Column) SQLContext(org.apache.spark.sql.SQLContext) JCommander(com.beust.jcommander.JCommander) SparkConf(org.apache.spark.SparkConf) HoodieSnapshotExporterException(org.apache.hudi.utilities.exception.HoodieSnapshotExporterException) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) Tuple2(scala.Tuple2) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) DataFrameWriter(org.apache.spark.sql.DataFrameWriter) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) LogManager(org.apache.log4j.LogManager) IValueValidator(com.beust.jcommander.IValueValidator) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) Column(org.apache.spark.sql.Column) Dataset(org.apache.spark.sql.Dataset) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) Row(org.apache.spark.sql.Row) SQLContext(org.apache.spark.sql.SQLContext)
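Because cfg.outputPartitioner is loaded by reflection, a custom partitioner can replace the default lambda above. The sketch below is illustrative only: it assumes the Partitioner contract is a single partition(Dataset<Row>) method returning a DataFrameWriter<Row>, mirroring defaultPartitioner, and the "event_date" column name is a made-up example.

import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Hypothetical custom partitioner; assumes HoodieSnapshotExporter.Partitioner exposes
// DataFrameWriter<Row> partition(Dataset<Row> source), as the defaultPartitioner lambda suggests.
public class DatePartitioner implements HoodieSnapshotExporter.Partitioner {
    @Override
    public DataFrameWriter<Row> partition(Dataset<Row> source) {
        // "event_date" is an illustrative column: repartition by it, then write partitioned output.
        return source.repartition(new Column("event_date"))
            .write()
            .partitionBy("event_date");
    }
}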

Example 37 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class HoodieSnapshotExporter, method export.

public void export(JavaSparkContext jsc, Config cfg) throws IOException {
    FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    if (outputPathExists(fs, cfg)) {
        throw new HoodieSnapshotExporterException("The target output path already exists.");
    }
    final String latestCommitTimestamp = getLatestCommitTimestamp(fs, cfg).<HoodieSnapshotExporterException>orElseThrow(() -> {
        throw new HoodieSnapshotExporterException("No commits present. Nothing to snapshot.");
    });
    LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.", latestCommitTimestamp));
    final List<String> partitions = getPartitions(engineContext, cfg);
    if (partitions.isEmpty()) {
        throw new HoodieSnapshotExporterException("The source dataset has 0 partition to snapshot.");
    }
    LOG.info(String.format("The job needs to export %d partitions.", partitions.size()));
    if (cfg.outputFormat.equals(OutputFormatValidator.HUDI)) {
        exportAsHudi(jsc, cfg, partitions, latestCommitTimestamp);
    } else {
        exportAsNonHudi(jsc, cfg, partitions, latestCommitTimestamp);
    }
    createSuccessTag(fs, cfg);
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieSnapshotExporterException(org.apache.hudi.utilities.exception.HoodieSnapshotExporterException)
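For context, a minimal driver sketch for invoking export is shown below. It assumes Config is the exporter's static configuration class with the sourceBasePath, targetOutputPath, and outputFormat fields referenced in the method above; the paths are placeholders, not real table locations.

// Minimal sketch, not a verbatim Hudi example; paths and app name are placeholders.
SparkConf sparkConf = new SparkConf().setAppName("hoodie-snapshot-exporter").setMaster("local[2]");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
HoodieSnapshotExporter.Config cfg = new HoodieSnapshotExporter.Config();
cfg.sourceBasePath = "hdfs:///tables/source_table";    // placeholder source table path
cfg.targetOutputPath = "hdfs:///exports/source_table"; // placeholder export path
cfg.outputFormat = "parquet";                          // non-Hudi export; a Hudi-format value routes to exportAsHudi
new HoodieSnapshotExporter().export(jsc, cfg);
jsc.stop();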

Example 38 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class HoodieWithTimelineServer, method run.

public void run(JavaSparkContext jsc) throws UnknownHostException {
    startService();
    final String driverHost = InetAddress.getLocalHost().getHostAddress();
    System.out.println("Driver Hostname is :" + driverHost);
    List<String> messages = new ArrayList<>();
    IntStream.range(0, cfg.numPartitions).forEach(i -> messages.add("Hello World"));
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    context.setJobStatus(this.getClass().getSimpleName(), "Sending requests to driver host");
    List<String> gotMessages = context.map(messages, msg -> sendRequest(driverHost, cfg.serverPort), messages.size());
    System.out.println("Got Messages :" + gotMessages);
    ValidationUtils.checkArgument(gotMessages.equals(messages), "Got expected reply from Server");
}
Also used : HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) ArrayList(java.util.ArrayList)
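The context.map(...) call above is the engine-context way of fanning work out to Spark executors. A standalone sketch of the same pattern, assuming only the map(list, function, parallelism) signature used above (the transformation is a trivial placeholder):

// Sketch of the HoodieSparkEngineContext.map(...) pattern shown above.
HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
context.setJobStatus("MapDemo", "Uppercasing messages on executors");
List<String> inputs = Arrays.asList("alpha", "beta", "gamma");
// Runs the lambda on executors with the given parallelism and collects results in order.
List<String> outputs = context.map(inputs, msg -> msg.toUpperCase(), inputs.size());
System.out.println("Mapped: " + outputs);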

Example 39 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class UtilHelpers, method createHoodieClient.

/**
 * Build Hoodie write client.
 *
 * @param jsc         Java Spark Context
 * @param basePath    Base Path
 * @param schemaStr   Schema
 * @param parallelism Parallelism
 * @param compactionStrategyClass Optional compaction strategy class, loaded by reflection
 * @param properties  Additional properties applied to the write config
 */
public static SparkRDDWriteClient<HoodieRecordPayload> createHoodieClient(JavaSparkContext jsc, String basePath, String schemaStr, int parallelism, Option<String> compactionStrategyClass, TypedProperties properties) {
    HoodieCompactionConfig compactionConfig = compactionStrategyClass
        .map(strategy -> HoodieCompactionConfig.newBuilder()
            .withInlineCompaction(false)
            .withCompactionStrategy(ReflectionUtils.loadClass(strategy))
            .build())
        .orElse(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withParallelism(parallelism, parallelism)
        .withBulkInsertParallelism(parallelism)
        .withDeleteParallelism(parallelism)
        .withSchema(schemaStr)
        .combineInput(true, true)
        .withCompactionConfig(compactionConfig)
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
        .withProps(properties)
        .build();
    return new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), config);
}
Also used : Arrays(java.util.Arrays) SchemaProviderWithPostProcessor(org.apache.hudi.utilities.schema.SchemaProviderWithPostProcessor) Connection(java.sql.Connection) Enumeration(java.util.Enumeration) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) ByteBuffer(java.nio.ByteBuffer) Logger(org.apache.log4j.Logger) DFSPropertiesConfiguration(org.apache.hudi.common.config.DFSPropertiesConfiguration) HoodieSourcePostProcessException(org.apache.hudi.utilities.exception.HoodieSourcePostProcessException) Source(org.apache.hudi.utilities.sources.Source) ResultSet(java.sql.ResultSet) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Transformer(org.apache.hudi.utilities.transform.Transformer) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) DriverRegistry(org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry) JDBCOptions(org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions) InitialCheckPointProvider(org.apache.hudi.utilities.checkpointing.InitialCheckPointProvider) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) StructType(org.apache.spark.sql.types.StructType) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) Schema(org.apache.avro.Schema) SparkAvroPostProcessor(org.apache.hudi.utilities.schema.SparkAvroPostProcessor) JdbcUtils(org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils) Function1(org.apache.hudi.common.util.Functions.Function1) PreparedStatement(java.sql.PreparedStatement) HoodieIndex(org.apache.hudi.index.HoodieIndex) Objects(java.util.Objects) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) RowBasedSchemaProvider(org.apache.hudi.utilities.schema.RowBasedSchemaProvider) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) SchemaPostProcessor(org.apache.hudi.utilities.schema.SchemaPostProcessor) JdbcDialects(org.apache.spark.sql.jdbc.JdbcDialects) JdbcDialect(org.apache.spark.sql.jdbc.JdbcDialect) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) AvroConversionUtils(org.apache.hudi.AvroConversionUtils) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) SQLException(java.sql.SQLException) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) ChainedSchemaPostProcessor(org.apache.hudi.utilities.schema.ChainedSchemaPostProcessor) JsonKafkaSourcePostProcessor(org.apache.hudi.utilities.sources.processor.JsonKafkaSourcePostProcessor) JavaRDD(org.apache.spark.api.java.JavaRDD) DriverWrapper(org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper) SparkSession(org.apache.spark.sql.SparkSession) HoodieSchemaPostProcessException(org.apache.hudi.utilities.exception.HoodieSchemaPostProcessException) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) Properties(java.util.Properties) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkLauncher(org.apache.spark.launcher.SparkLauncher) SparkConf(org.apache.spark.SparkConf) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) 
IOException(java.io.IOException) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) LongAccumulator(org.apache.spark.util.LongAccumulator) Config(org.apache.hudi.utilities.schema.SchemaPostProcessor.Config) ChainedJsonKafkaSourcePostProcessor(org.apache.hudi.utilities.sources.processor.ChainedJsonKafkaSourcePostProcessor) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) StringReader(java.io.StringReader) Driver(java.sql.Driver) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) ChainedTransformer(org.apache.hudi.utilities.transform.ChainedTransformer) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) BufferedReader(java.io.BufferedReader) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) DriverManager(java.sql.DriverManager) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig)
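A hedged usage sketch of the client returned by createHoodieClient, using only core SparkRDDWriteClient calls (startCommit, upsert, commit, close); the table path, schemaStr, and recordsRdd are placeholders, not values from the original example.

// Placeholder inputs; schemaStr must be the Avro schema of the records being written,
// and recordsRdd is an assumed JavaRDD<HoodieRecord<HoodieRecordPayload>>.
SparkRDDWriteClient<HoodieRecordPayload> client = UtilHelpers.createHoodieClient(
    jsc, "/tmp/hoodie/sample_table", schemaStr, 2, Option.empty(), new TypedProperties());
String instantTime = client.startCommit();
JavaRDD<WriteStatus> writeStatuses = client.upsert(recordsRdd, instantTime);
client.commit(instantTime, writeStatuses);
client.close();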

Example 40 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class HDFSParquetImporter, method buildHoodieRecordsForImport.

protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc, String schemaStr) throws IOException {
    Job job = Job.getInstance(jsc.hadoopConfiguration());
    // Allow recursive directories to be found
    job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    // To parallelize reading file status.
    job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
    AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
    ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));
    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    context.setJobStatus(this.getClass().getSimpleName(), "Build records for import");
    return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class, job.getConfiguration())
        .coalesce(16 * cfg.parallelism)
        .map(entry -> {
        GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
        Object partitionField = genericRecord.get(cfg.partitionKey);
        if (partitionField == null) {
            throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey);
        }
        Object rowField = genericRecord.get(cfg.rowKey);
        if (rowField == null) {
            throw new HoodieIOException("row field is missing. :" + cfg.rowKey);
        }
        String partitionPath = partitionField.toString();
        LOG.debug("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
        if (partitionField instanceof Number) {
            try {
                long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
                partitionPath = PARTITION_FORMATTER.format(Instant.ofEpochMilli(ts));
            } catch (NumberFormatException nfe) {
                LOG.warn("Unable to parse date from partition field. Assuming partition as (" + partitionField + ")");
            }
        }
        return new HoodieAvroRecord<>(new HoodieKey(rowField.toString(), partitionPath), new HoodieJsonPayload(genericRecord.toString()));
    });
}
Also used : HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) Schema(org.apache.avro.Schema) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieJsonPayload(org.apache.hudi.common.HoodieJsonPayload) Tuple2(scala.Tuple2) HoodieKey(org.apache.hudi.common.model.HoodieKey) Job(org.apache.hadoop.mapreduce.Job) GenericRecord(org.apache.avro.generic.GenericRecord) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport)
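The records built above are typically handed to a write client to perform the actual import. A short, hedged continuation sketch; the client construction is omitted and "client" is assumed to be a SparkRDDWriteClient configured for the target table.

// Continuation sketch; "client" is an assumed, pre-built SparkRDDWriteClient<HoodieRecordPayload>.
JavaRDD<HoodieRecord<HoodieRecordPayload>> hoodieRecords = buildHoodieRecordsForImport(jsc, schemaStr);
String instantTime = client.startCommit();
JavaRDD<WriteStatus> statuses = client.bulkInsert(hoodieRecords, instantTime);
client.commit(instantTime, statuses);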

Aggregations

HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext) 58
Path (org.apache.hadoop.fs.Path) 25
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) 24
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 23
ArrayList (java.util.ArrayList) 19
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 19
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 17
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 17
WriteStatus (org.apache.hudi.client.WriteStatus) 15
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 15
IOException (java.io.IOException) 14
List (java.util.List) 14
Option (org.apache.hudi.common.util.Option) 14
LogManager (org.apache.log4j.LogManager) 14
Logger (org.apache.log4j.Logger) 14
Test (org.junit.jupiter.api.Test) 14
Collectors (java.util.stream.Collectors) 12
FileStatus (org.apache.hadoop.fs.FileStatus) 12
FileSystem (org.apache.hadoop.fs.FileSystem) 12
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) 11