Use of org.apache.spark.sql.types.StructType$ in project hudi by apache.
From the class TestSqlFileBasedTransformer, method getEmptyDatasetRow.
private Dataset<Row> getEmptyDatasetRow() {
  // Create the schema struct.
  List<org.apache.spark.sql.types.StructField> listOfStructField = new ArrayList<>();
  listOfStructField.add(DataTypes.createStructField("col1", DataTypes.StringType, true));
  StructType structType = DataTypes.createStructType(listOfStructField);
  // Create an empty list of rows.
  List<Row> list = new ArrayList<>();
  // Create the empty data frame from the rows and the schema.
  return sparkSession.createDataFrame(list, structType);
}
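For context, a minimal, self-contained sketch of the same pattern outside the test class; the local SparkSession setup and the printed checks are illustrative additions, not part of the Hudi test.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class EmptyDatasetDemo {
  public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
        .master("local[1]")
        .appName("EmptyDatasetDemo")
        .getOrCreate();

    // Same pattern as getEmptyDatasetRow(): a one-column schema and no rows.
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("col1", DataTypes.StringType, true));
    StructType structType = DataTypes.createStructType(fields);
    Dataset<Row> empty = sparkSession.createDataFrame(new ArrayList<Row>(), structType);

    empty.printSchema();                     // schema is preserved
    System.out.println(empty.count() == 0);  // prints true: no rows
    sparkSession.stop();
  }
}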
Use of org.apache.spark.sql.types.StructType$ in project hudi by apache.
From the class SpaceCurveSortingHelper, method orderDataFrameByMappingValues.
/**
 * Orders the provided {@link Dataset} by mapping the values of the provided list of columns
 * {@code orderByCols} onto a specified space curve (Z-curve, Hilbert, etc.)
 *
 * <p/>
 * NOTE: Only base data types are supported: long, int, short, double, float, string,
 * timestamp, decimal, date, byte. This method is more effective than
 * {@link #orderDataFrameBySamplingValues}, which leverages data sampling instead of
 * direct mapping.
 *
 * @param df Spark {@link Dataset} holding data to be ordered
 * @param orderByCols list of columns to be ordered by
 * @param targetPartitionCount target number of output partitions
 * @param layoutOptStrategy target layout optimization strategy
 * @return a {@link Dataset} holding data ordered by mapping tuples of values from the provided
 *         columns onto a specified space curve
 */
public static Dataset<Row> orderDataFrameByMappingValues(
    Dataset<Row> df,
    HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy,
    List<String> orderByCols,
    int targetPartitionCount) {
  Map<String, StructField> columnsMap =
      Arrays.stream(df.schema().fields()).collect(Collectors.toMap(StructField::name, Function.identity()));
  List<String> checkCols =
      orderByCols.stream().filter(columnsMap::containsKey).collect(Collectors.toList());
  if (orderByCols.size() != checkCols.size()) {
    LOG.error(String.format("Trying to order by column(s) not present in the schema (%s); skipping",
        CollectionUtils.diff(orderByCols, checkCols)));
    return df;
  }
  // In case there's just one column to order by, skip space-curve
  // ordering altogether (since it will match linear ordering anyway)
  if (orderByCols.size() == 1) {
    String orderByColName = orderByCols.get(0);
    LOG.debug(String.format("Single column to order by (%s), skipping space-curve ordering", orderByColName));
    // TODO validate if we need Spark to re-partition
    return df.repartitionByRange(targetPartitionCount, new Column(orderByColName));
  }
  int fieldNum = df.schema().fields().length;
  Map<Integer, StructField> fieldMap = orderByCols.stream()
      .collect(Collectors.toMap(e -> Arrays.asList(df.schema().fields()).indexOf(columnsMap.get(e)), columnsMap::get));
  JavaRDD<Row> sortedRDD;
  switch (layoutOptStrategy) {
    case ZORDER:
      sortedRDD = createZCurveSortedRDD(df.toJavaRDD(), fieldMap, fieldNum, targetPartitionCount);
      break;
    case HILBERT:
      sortedRDD = createHilbertSortedRDD(df.toJavaRDD(), fieldMap, fieldNum, targetPartitionCount);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Not supported layout-optimization strategy (%s)", layoutOptStrategy));
  }
  // Compose a new {@code StructType} for the ordered RDD
  StructType newStructType = composeOrderedRDDStructType(df.schema());
  return df.sparkSession().createDataFrame(sortedRDD, newStructType).drop("Index");
}
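A brief usage sketch of this helper; the input frame df, the column names, and the partition count are illustrative placeholders, while the class, method, and enum names are taken from the snippet above.

// Hypothetical invocation: Z-order an input frame by two columns into 200 partitions.
// "col_a", "col_b" and the partition count are placeholders, not Hudi defaults.
Dataset<Row> ordered = SpaceCurveSortingHelper.orderDataFrameByMappingValues(
    df,
    HoodieClusteringConfig.LayoutOptimizationStrategy.ZORDER,
    Arrays.asList("col_a", "col_b"),
    200);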
Use of org.apache.spark.sql.types.StructType$ in project hudi by apache.
From the class TestDataSourceUtils, method testAutoModifyParquetWriteLegacyFormatParameter.
@ParameterizedTest
@CsvSource({ "true, false", "true, true", "false, true", "false, false" })
public void testAutoModifyParquetWriteLegacyFormatParameter(boolean smallDecimal, boolean defaultWriteValue) {
  // create test StructType
  List<StructField> structFields = new ArrayList<>();
  if (smallDecimal) {
    structFields.add(StructField.apply("d1", DecimalType$.MODULE$.apply(10, 2), false, Metadata.empty()));
  } else {
    structFields.add(StructField.apply("d1", DecimalType$.MODULE$.apply(38, 10), false, Metadata.empty()));
  }
  StructType structType = StructType$.MODULE$.apply(structFields);
  // create write options
  Map<String, String> options = new HashMap<>();
  options.put("hoodie.parquet.writelegacyformat.enabled", String.valueOf(defaultWriteValue));
  // start test
  mayBeOverwriteParquetWriteLegacyFormatProp(options, structType);
  // check result
  boolean res = Boolean.parseBoolean(options.get("hoodie.parquet.writelegacyformat.enabled"));
  if (smallDecimal) {
    // should auto modify "hoodie.parquet.writelegacyformat.enabled" = "true".
    assertEquals(true, res);
  } else {
    // should not modify the value of "hoodie.parquet.writelegacyformat.enabled".
    assertEquals(defaultWriteValue, res);
  }
}
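A minimal sketch of the behavior this test pins down, written as a plain fragment that assumes the test class's imports and its statically imported mayBeOverwriteParquetWriteLegacyFormatProp helper; the column name and the initial flag value are illustrative.

// Sketch only: mirrors the "smallDecimal" branch of the test above.
List<StructField> fields = new ArrayList<>();
// Small-precision decimal (precision 10, scale 2).
fields.add(StructField.apply("d1", DecimalType$.MODULE$.apply(10, 2), false, Metadata.empty()));
StructType schema = StructType$.MODULE$.apply(fields);

Map<String, String> options = new HashMap<>();
options.put("hoodie.parquet.writelegacyformat.enabled", "false");

// Same statically imported helper exercised by the test above.
mayBeOverwriteParquetWriteLegacyFormatProp(options, schema);

// Per the assertions above, a small-precision decimal flips the flag to "true";
// with only the (38, 10) decimal it would be left at its initial value.
System.out.println(options.get("hoodie.parquet.writelegacyformat.enabled"));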
Use of org.apache.spark.sql.types.StructType$ in project hudi by apache.
From the class TestHoodieDatasetBulkInsertHelper, method testBulkInsertPreCombine.
@ParameterizedTest
@MethodSource("providePreCombineArgs")
public void testBulkInsertPreCombine(boolean enablePreCombine) {
  HoodieWriteConfig config = getConfigBuilder(schemaStr)
      .withProps(getPropsAllSet("_row_key"))
      .combineInput(enablePreCombine, enablePreCombine)
      .withPreCombineField("ts")
      .build();
  List<Row> inserts = DataSourceTestUtils.generateRandomRows(10);
  Dataset<Row> toUpdateDataset = sqlContext.createDataFrame(inserts.subList(0, 5), structType);
  List<Row> updates = DataSourceTestUtils.updateRowsWithHigherTs(toUpdateDataset);
  List<Row> rows = new ArrayList<>();
  rows.addAll(inserts);
  rows.addAll(updates);
  Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
  Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(
      sqlContext, config, dataset, "testStructName", "testNamespace",
      new NonSortPartitionerWithRows(), false, false);
  StructType resultSchema = result.schema();
  assertEquals(result.count(), enablePreCombine ? 10 : 15);
  assertEquals(resultSchema.fieldNames().length, structType.fieldNames().length + HoodieRecord.HOODIE_META_COLUMNS.size());
  for (Map.Entry<String, Integer> entry : HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.entrySet()) {
    assertTrue(resultSchema.fieldIndex(entry.getKey()) == entry.getValue());
  }
  int metadataRecordKeyIndex = resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD);
  int metadataPartitionPathIndex = resultSchema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD);
  int metadataCommitTimeIndex = resultSchema.fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
  int metadataCommitSeqNoIndex = resultSchema.fieldIndex(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD);
  int metadataFilenameIndex = resultSchema.fieldIndex(HoodieRecord.FILENAME_METADATA_FIELD);
  result.toJavaRDD().foreach(entry -> {
    assertTrue(entry.get(metadataRecordKeyIndex).equals(entry.getAs("_row_key")));
    assertTrue(entry.get(metadataPartitionPathIndex).equals(entry.getAs("partition")));
    assertTrue(entry.get(metadataCommitSeqNoIndex).equals(""));
    assertTrue(entry.get(metadataCommitTimeIndex).equals(""));
    assertTrue(entry.get(metadataFilenameIndex).equals(""));
  });
  Dataset<Row> trimmedOutput = result
      .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD)
      .drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
      .drop(HoodieRecord.FILENAME_METADATA_FIELD)
      .drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD)
      .drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
  // find resolved input snapshot
  ExpressionEncoder encoder = getEncoder(dataset.schema());
  if (enablePreCombine) {
    Dataset<Row> inputSnapshotDf = dataset
        .groupByKey((MapFunction<Row, String>) value -> value.getAs("partition") + "+" + value.getAs("_row_key"), Encoders.STRING())
        .reduceGroups((ReduceFunction<Row>) (v1, v2) -> {
          long ts1 = v1.getAs("ts");
          long ts2 = v2.getAs("ts");
          if (ts1 >= ts2) {
            return v1;
          } else {
            return v2;
          }
        })
        .map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, encoder);
    assertTrue(inputSnapshotDf.except(trimmedOutput).count() == 0);
  } else {
    assertTrue(dataset.except(trimmedOutput).count() == 0);
  }
}
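The getEncoder helper used above is not part of this excerpt; a minimal sketch of how such an encoder is commonly built, assuming Spark's RowEncoder (the actual Hudi test helper may differ):

// Sketch only: build an ExpressionEncoder<Row> for a given schema.
// RowEncoder.apply(schema) is the usual approach in Spark 2.x and 3.x up to 3.4;
// the Hudi test's own getEncoder may be implemented differently.
private static ExpressionEncoder<Row> getEncoder(StructType schema) {
  return org.apache.spark.sql.catalyst.encoders.RowEncoder.apply(schema);
}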
Use of org.apache.spark.sql.types.StructType$ in project kubernetes by Naresh240.
From the class App, method main.
public static void main(String[] args) {
  // System.out.println( "Hello World!" );
  SparkSession spark = SparkSession.builder().appName("JSubmittingApplications").getOrCreate();
  System.out.println("You are using Spark " + spark.version());
  spark.sparkContext().setLogLevel("ERROR");
  List<Row> list = new ArrayList<Row>();
  list.add(RowFactory.create("one"));
  list.add(RowFactory.create("two"));
  list.add(RowFactory.create("three"));
  list.add(RowFactory.create("four"));
  List<org.apache.spark.sql.types.StructField> listOfStructField = new ArrayList<org.apache.spark.sql.types.StructField>();
  listOfStructField.add(DataTypes.createStructField("test", DataTypes.StringType, true));
  StructType structType = DataTypes.createStructType(listOfStructField);
  Dataset<Row> data = spark.createDataFrame(list, structType);
  data.show();
  spark.stop();
}