Use of org.apache.hudi.common.config.SerializableSchema in project hudi by apache.
The class SingleSparkJobExecutionStrategy, method performClustering.
@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
  JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
  final TaskContextSupplier taskContextSupplier = getEngineContext().getTaskContextSupplier();
  final SerializableSchema serializableSchema = new SerializableSchema(schema);
  final List<ClusteringGroupInfo> clusteringGroupInfos = clusteringPlan.getInputGroups().stream()
      .map(clusteringGroup -> ClusteringGroupInfo.create(clusteringGroup))
      .collect(Collectors.toList());
  String umask = engineContext.hadoopConfiguration().get("fs.permissions.umask-mode");
  Broadcast<String> umaskBroadcastValue = engineContext.broadcast(umask);
  JavaRDD<ClusteringGroupInfo> groupInfoJavaRDD = engineContext.parallelize(clusteringGroupInfos, clusteringGroupInfos.size());
  LOG.info("number of partitions for clustering " + groupInfoJavaRDD.getNumPartitions());
  JavaRDD<WriteStatus> writeStatusRDD = groupInfoJavaRDD.mapPartitions(clusteringOps -> {
    Configuration configuration = new Configuration();
    configuration.set("fs.permissions.umask-mode", umaskBroadcastValue.getValue());
    Iterable<ClusteringGroupInfo> clusteringOpsIterable = () -> clusteringOps;
    List<ClusteringGroupInfo> groupsInPartition = StreamSupport.stream(clusteringOpsIterable.spliterator(), false)
        .collect(Collectors.toList());
    return groupsInPartition.stream()
        .flatMap(clusteringOp -> runClusteringForGroup(
            clusteringOp, clusteringPlan.getStrategy().getStrategyParams(),
            Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false),
            serializableSchema, taskContextSupplier, instantTime))
        .iterator();
  });
  HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
  writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
  return writeMetadata;
}
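Avro's Schema historically did not implement java.io.Serializable, which is why the method above wraps it in SerializableSchema before the mapPartitions closure captures it. The standalone sketch below (the job name and schema are hypothetical) shows the same capture pattern in isolation; it assumes only the constructor and get() accessor of SerializableSchema, both of which appear in the snippets on this page.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.config.SerializableSchema;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

public class SerializableSchemaClosureSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "schema-closure-sketch");
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Row\",\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}");
    // Wrap before the lambda below captures it; shipping a raw Schema with the
    // task can fail at closure-serialization time on older Avro versions.
    SerializableSchema serializableSchema = new SerializableSchema(schema);
    JavaRDD<GenericRecord> rows = jsc.parallelize(Arrays.asList("a", "b")).map(id -> {
      // get() re-materializes the schema on the executor side
      GenericRecord rec = new GenericData.Record(serializableSchema.get());
      rec.put("id", id);
      return rec;
    });
    System.out.println(rows.count()); // 2
    jsc.stop();
  }
}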
Use of org.apache.hudi.common.config.SerializableSchema in project hudi by apache.
The class TestSerializableSchema, method verifySchema.
private void verifySchema(Schema schema) throws IOException {
  SerializableSchema serializableSchema = new SerializableSchema(schema);
  // get() must return an equal but distinct copy of the original schema
  assertEquals(schema, serializableSchema.get());
  assertTrue(schema != serializableSchema.get());
  // round-trip the schema through Java object streams
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  ObjectOutputStream oos = new ObjectOutputStream(baos);
  serializableSchema.writeObjectTo(oos);
  oos.flush();
  oos.close();
  byte[] bytesWritten = baos.toByteArray();
  SerializableSchema newSchema = new SerializableSchema();
  newSchema.readObjectFrom(new ObjectInputStream(new ByteArrayInputStream(bytesWritten)));
  assertEquals(schema, newSchema.get());
}
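writeObjectTo and readObjectFrom mirror the writeObject/readObject hooks of Java serialization, so the round trip exercised by the test is also how the wrapper can be embedded in a larger Serializable class. A minimal sketch, assuming only the two methods and constructors used in the test above (the SchemaHolder class itself is hypothetical, not from Hudi):

import org.apache.avro.Schema;
import org.apache.hudi.common.config.SerializableSchema;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

public class SchemaHolder implements Serializable {
  // transient: we serialize the schema ourselves via the hooks below
  private transient SerializableSchema schema;

  public SchemaHolder(Schema avroSchema) {
    this.schema = new SerializableSchema(avroSchema);
  }

  private void writeObject(ObjectOutputStream out) throws IOException {
    out.defaultWriteObject();
    schema.writeObjectTo(out); // delegate schema serialization to the wrapper
  }

  private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
    in.defaultReadObject();
    schema = new SerializableSchema();
    schema.readObjectFrom(in); // re-parse the schema on the receiving side
  }
}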
Use of org.apache.hudi.common.config.SerializableSchema in project hudi by apache.
The class RDDSpatialCurveSortPartitioner, method repartitionRecords.
@Override
public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions) {
  SerializableSchema serializableSchema = new SerializableSchema(schema);
  JavaRDD<GenericRecord> genericRecordsRDD = records.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get());
  Dataset<Row> sourceDataset = AvroConversionUtils.createDataFrame(genericRecordsRDD.rdd(), schema.toString(), sparkEngineContext.getSqlContext().sparkSession());
  Dataset<Row> sortedDataset = reorder(sourceDataset, outputSparkPartitions);
  return HoodieSparkUtils.createRdd(sortedDataset, schema.getName(), schema.getNamespace(), false, Option.empty())
      .toJavaRDD()
      .map(record -> {
        String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
        HoodieKey hoodieKey = new HoodieKey(key, partition);
        return new HoodieAvroRecord(hoodieKey, new RewriteAvroPayload(record));
      });
}
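The final map() above is what survives the DataFrame round trip: the record key and partition path ride along in Hudi's metadata columns and are read back out to rebuild each HoodieRecord after the curve-based sort. A minimal sketch isolating that rebuild step (the helper class is hypothetical, and the import paths are assumed from the usual Hudi module layout):

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.RewriteAvroPayload;

public final class SortedRecordRebuilder {
  private SortedRecordRebuilder() {
  }

  // Mirrors the map() lambda in repartitionRecords: the _hoodie_record_key and
  // _hoodie_partition_path metadata fields carry record identity through the sort.
  public static HoodieRecord rebuild(GenericRecord sorted) {
    String key = sorted.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
    String partition = sorted.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
    return new HoodieAvroRecord(new HoodieKey(key, partition), new RewriteAvroPayload(sorted));
  }
}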
Use of org.apache.hudi.common.config.SerializableSchema in project hudi by apache.
The class RDDCustomColumnsSortPartitioner, method repartitionRecords.
@Override
public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions) {
  final String[] sortColumns = this.sortColumnNames;
  final SerializableSchema schema = this.serializableSchema;
  final boolean consistentLogicalTimestampEnabled = this.consistentLogicalTimestampEnabled;
  return records.sortBy(record -> {
    Object recordValue = HoodieAvroUtils.getRecordColumnValues(record, sortColumns, schema, consistentLogicalTimestampEnabled);
    // null values are replaced with empty string for null_first order
    if (recordValue == null) {
      return StringUtils.EMPTY_STRING;
    } else {
      return StringUtils.objToString(recordValue);
    }
  }, true, outputSparkPartitions);
}
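The sort-key contract here is simple: extract the configured columns' values from each record, fall back to the empty string for nulls so they sort first in ascending order, and otherwise sort by the value's string form. A standalone sketch of that rule (class and job names are hypothetical; plain strings stand in for HoodieRecords):

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;
import java.util.List;

public class NullFirstSortSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "null-first-sort");
    JavaRDD<String> rdd = jsc.parallelize(Arrays.asList("b", null, "a"));
    // Same rule as repartitionRecords: null -> "" places nulls ahead of any
    // non-empty key when sorting in ascending order.
    List<String> sorted = rdd.sortBy(v -> v == null ? "" : v, true, 2).collect();
    System.out.println(sorted); // [null, a, b]
    jsc.stop();
  }
}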