use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class HoodieSnapshotExporter method exportAsNonHudi.
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
  Partitioner defaultPartitioner = dataset -> {
    Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
    return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
        ? hoodieDroppedDataset.write()
        : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
  };
  Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
      ? defaultPartitioner
      : ReflectionUtils.loadClass(cfg.outputPartitioner);
  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  context.setJobStatus(this.getClass().getSimpleName(), "Exporting as non-HUDI dataset");
  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  Iterator<String> exportingFilePaths = jsc.parallelize(partitions, partitions.size())
      .flatMap(partition -> fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
          .map(HoodieBaseFile::getPath).iterator())
      .toLocalIterator();
  Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
  partitioner.partition(sourceDataset)
      .format(cfg.outputFormat)
      .mode(SaveMode.Overwrite)
      .save(cfg.targetOutputPath);
}
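When cfg.outputPartitioner is set, the exporter loads a custom partitioner by class name via ReflectionUtils. Below is a minimal sketch of such a partitioner, assuming (as the default lambda above implies) that Partitioner is a functional interface nested in HoodieSnapshotExporter with a single partition(Dataset<Row>) method returning a DataFrameWriter<Row>; the class name, column name, and CLI flag are illustrative only.

import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Hypothetical partitioner that buckets the export by an assumed "event_date" column.
// Passed to the exporter by fully qualified class name via the output-partitioner option (option name assumed).
public class DatePartitioner implements HoodieSnapshotExporter.Partitioner {

  @Override
  public DataFrameWriter<Row> partition(Dataset<Row> source) {
    // Repartition on the date column so each value is handled by one task,
    // then lay the output files out on disk partitioned by the same column.
    return source.repartition(new Column("event_date"))
        .write()
        .partitionBy("event_date");
  }
}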
use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class HoodieSnapshotExporter method export.
public void export(JavaSparkContext jsc, Config cfg) throws IOException {
  FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  if (outputPathExists(fs, cfg)) {
    throw new HoodieSnapshotExporterException("The target output path already exists.");
  }
  final String latestCommitTimestamp = getLatestCommitTimestamp(fs, cfg)
      .<HoodieSnapshotExporterException>orElseThrow(() -> {
        throw new HoodieSnapshotExporterException("No commits present. Nothing to snapshot.");
      });
  LOG.info(String.format("Starting to snapshot latest version files which are no later than %s.", latestCommitTimestamp));
  final List<String> partitions = getPartitions(engineContext, cfg);
  if (partitions.isEmpty()) {
    throw new HoodieSnapshotExporterException("The source dataset has no partitions to snapshot.");
  }
  LOG.info(String.format("The job needs to export %d partitions.", partitions.size()));
  if (cfg.outputFormat.equals(OutputFormatValidator.HUDI)) {
    exportAsHudi(jsc, cfg, partitions, latestCommitTimestamp);
  } else {
    exportAsNonHudi(jsc, cfg, partitions, latestCommitTimestamp);
  }
  createSuccessTag(fs, cfg);
}
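For context, a minimal invocation sketch follows. The Config field names (sourceBasePath, targetOutputPath, outputFormat) are taken from the snippet above; the Spark setup and the example paths are assumptions.

// Hedged sketch: wire up a JavaSparkContext and run the exporter.
SparkConf sparkConf = new SparkConf().setAppName("hoodie-snapshot-exporter");
try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
  HoodieSnapshotExporter.Config cfg = new HoodieSnapshotExporter.Config();
  cfg.sourceBasePath = "hdfs:///tables/source_table";     // Hudi table to snapshot
  cfg.targetOutputPath = "hdfs:///exports/source_table";  // must not exist yet
  cfg.outputFormat = "parquet";                           // takes the non-Hudi export path above
  new HoodieSnapshotExporter().export(jsc, cfg);
}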
use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class HoodieWithTimelineServer method run.
public void run(JavaSparkContext jsc) throws UnknownHostException {
  startService();
  final String driverHost = InetAddress.getLocalHost().getHostAddress();
  System.out.println("Driver Hostname is :" + driverHost);
  List<String> messages = new ArrayList<>();
  IntStream.range(0, cfg.numPartitions).forEach(i -> messages.add("Hello World"));
  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  context.setJobStatus(this.getClass().getSimpleName(), "Sending requests to driver host");
  List<String> gotMessages = context.map(messages, msg -> sendRequest(driverHost, cfg.serverPort), messages.size());
  System.out.println("Got Messages :" + gotMessages);
  ValidationUtils.checkArgument(gotMessages.equals(messages), "Got expected reply from Server");
}
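The context.map call above is the generic parallel-map API on HoodieEngineContext: it applies a serializable function to each element of a Java list with the requested parallelism, running as a Spark job when the context is a HoodieSparkEngineContext. A minimal sketch assuming only that signature; the list contents and job-status strings are illustrative.

// Hedged sketch: parallelize a simple transformation through the engine context.
HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
context.setJobStatus("EngineContextDemo", "Uppercasing values in parallel");
List<String> inputs = Arrays.asList("alpha", "beta", "gamma");
// map(data, serializableFunction, parallelism) returns one result per input element.
List<String> upper = context.map(inputs, s -> s.toUpperCase(), inputs.size());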
use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class UtilHelpers method createHoodieClient.
/**
 * Build a Hoodie write client.
 *
 * @param jsc Java Spark Context
 * @param basePath Base path of the Hudi table
 * @param schemaStr Record schema (Avro schema string)
 * @param parallelism Parallelism for insert, upsert, bulk-insert and delete
 * @param compactionStrategyClass Optional compaction strategy class to load
 * @param properties Additional write-config properties
 */
public static SparkRDDWriteClient<HoodieRecordPayload> createHoodieClient(JavaSparkContext jsc, String basePath, String schemaStr,
    int parallelism, Option<String> compactionStrategyClass, TypedProperties properties) {
  HoodieCompactionConfig compactionConfig = compactionStrategyClass
      .map(strategy -> HoodieCompactionConfig.newBuilder().withInlineCompaction(false)
          .withCompactionStrategy(ReflectionUtils.loadClass(strategy)).build())
      .orElse(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build());
  HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
      .withPath(basePath).withParallelism(parallelism, parallelism).withBulkInsertParallelism(parallelism)
      .withDeleteParallelism(parallelism).withSchema(schemaStr).combineInput(true, true)
      .withCompactionConfig(compactionConfig)
      .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
      .withProps(properties).build();
  return new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), config);
}
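A hedged usage sketch of the returned client follows. The table path and parallelism are placeholders, schemaStr and recordsRdd are assumed to exist in the caller, and the flow shown is the standard start-commit / upsert / commit sequence.

// Hedged sketch: build a client and run one upsert commit (placeholders: schemaStr, recordsRdd).
SparkRDDWriteClient<HoodieRecordPayload> client = UtilHelpers.createHoodieClient(
    jsc, "hdfs:///tables/target_table", schemaStr, 2, Option.empty(), new TypedProperties());
String instantTime = client.startCommit();
JavaRDD<WriteStatus> writeStatuses = client.upsert(recordsRdd, instantTime);
client.commit(instantTime, writeStatuses);
client.close();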
use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class HDFSParquetImporter method buildHoodieRecordsForImport.
protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc, String schemaStr) throws IOException {
  Job job = Job.getInstance(jsc.hadoopConfiguration());
  // Allow recursive directories to be found
  job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
  // Parallelize the listing of file statuses.
  job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
  AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), new Schema.Parser().parse(schemaStr));
  ParquetInputFormat.setReadSupportClass(job, AvroReadSupport.class);
  HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  context.setJobStatus(this.getClass().getSimpleName(), "Build records for import");
  return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class, job.getConfiguration())
      .coalesce(16 * cfg.parallelism)
      .map(entry -> {
        GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
        Object partitionField = genericRecord.get(cfg.partitionKey);
        if (partitionField == null) {
          throw new HoodieIOException("Partition key is missing: " + cfg.partitionKey);
        }
        Object rowField = genericRecord.get(cfg.rowKey);
        if (rowField == null) {
          throw new HoodieIOException("Row key field is missing: " + cfg.rowKey);
        }
        String partitionPath = partitionField.toString();
        LOG.debug("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
        if (partitionField instanceof Number) {
          try {
            long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
            partitionPath = PARTITION_FORMATTER.format(Instant.ofEpochMilli(ts));
          } catch (NumberFormatException nfe) {
            LOG.warn("Unable to parse date from partition field. Assuming partition as (" + partitionField + ")");
          }
        }
        return new HoodieAvroRecord<>(new HoodieKey(rowField.toString(), partitionPath), new HoodieJsonPayload(genericRecord.toString()));
      });
}
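The importer typically hands the RDD built above to a write client. A minimal sketch of that hand-off, assuming a client created as in the UtilHelpers example earlier; the bulk-insert commit sequence is the standard start-commit / bulk-insert / commit flow.

// Hedged sketch: write the imported records with a bulk insert (client wiring is assumed).
JavaRDD<HoodieRecord<HoodieRecordPayload>> hoodieRecords = buildHoodieRecordsForImport(jsc, schemaStr);
String instantTime = client.startCommit();
JavaRDD<WriteStatus> writeResponse = client.bulkInsert(hoodieRecords, instantTime);
client.commit(instantTime, writeResponse);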