
Example 96 with StructType$

use of org.apache.spark.sql.types.StructType$ in project hudi by apache.

the class TestSqlFileBasedTransformer, method getEmptyDatasetRow:

private Dataset<Row> getEmptyDatasetRow() {
    // Create the schema struct.
    List<org.apache.spark.sql.types.StructField> listOfStructField = new ArrayList<>();
    listOfStructField.add(DataTypes.createStructField("col1", DataTypes.StringType, true));
    StructType structType = DataTypes.createStructType(listOfStructField);
    // Create an empty data frame that carries only the schema.
    List<Row> list = new ArrayList<>();
    return sparkSession.createDataFrame(list, structType);
}
Also used : StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) Row(org.apache.spark.sql.Row)
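
For comparison, here is a minimal sketch that builds the same empty, single-column dataset but constructs the schema through the StructType$ singleton rather than the DataTypes factory. The local SparkSession setup and the class name EmptyDatasetSketch are assumptions made for illustration, not part of the Hudi test.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.StructType$;

public class EmptyDatasetSketch {
    public static void main(String[] args) {
        // Local session, for illustration only.
        SparkSession spark = SparkSession.builder().master("local[1]").appName("EmptyDatasetSketch").getOrCreate();
        // Build the single-column schema via the StructType$ singleton.
        List<StructField> fields = new ArrayList<>();
        fields.add(new StructField("col1", DataTypes.StringType, true, Metadata.empty()));
        StructType structType = StructType$.MODULE$.apply(fields);
        // An empty row list plus the schema yields an empty but typed Dataset<Row>.
        Dataset<Row> empty = spark.createDataFrame(new ArrayList<Row>(), structType);
        empty.printSchema();
        spark.stop();
    }
}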

Example 97 with StructType$

use of org.apache.spark.sql.types.StructType$ in project hudi by apache.

the class SpaceCurveSortingHelper, method orderDataFrameByMappingValues:

/**
 * Orders provided {@link Dataset} by mapping values of the provided list of columns
 * {@code orderByCols} onto a specified space curve (Z-curve, Hilbert, etc)
 *
 * <p/>
 * NOTE: Only supports base data types: long, int, short, double, float, string, timestamp, decimal, date, byte.
 *       This method is more effective than {@link #orderDataFrameBySamplingValues}, which leverages
 *       data sampling instead of direct mapping.
 *
 * @param df Spark {@link Dataset} holding data to be ordered
 * @param orderByCols list of columns to be ordered by
 * @param targetPartitionCount target number of output partitions
 * @param layoutOptStrategy target layout optimization strategy
 * @return a {@link Dataset} holding data ordered by mapping tuple of values from provided columns
 *         onto a specified space-curve
 */
public static Dataset<Row> orderDataFrameByMappingValues(Dataset<Row> df, HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, List<String> orderByCols, int targetPartitionCount) {
    Map<String, StructField> columnsMap = Arrays.stream(df.schema().fields()).collect(Collectors.toMap(StructField::name, Function.identity()));
    List<String> checkCols = orderByCols.stream().filter(columnsMap::containsKey).collect(Collectors.toList());
    if (orderByCols.size() != checkCols.size()) {
        LOG.error(String.format("Trying to order by column(s) not present in the schema (%s); skipping", CollectionUtils.diff(orderByCols, checkCols)));
        return df;
    }
    // When there is just one column to order by, we can skip space-curve
    // ordering altogether (since it will match linear ordering anyway)
    if (orderByCols.size() == 1) {
        String orderByColName = orderByCols.get(0);
        LOG.debug(String.format("Single column to order by (%s), skipping space-curve ordering", orderByColName));
        // TODO validate if we need Spark to re-partition
        return df.repartitionByRange(targetPartitionCount, new Column(orderByColName));
    }
    int fieldNum = df.schema().fields().length;
    Map<Integer, StructField> fieldMap = orderByCols.stream().collect(Collectors.toMap(e -> Arrays.asList(df.schema().fields()).indexOf(columnsMap.get(e)), columnsMap::get));
    JavaRDD<Row> sortedRDD;
    switch(layoutOptStrategy) {
        case ZORDER:
            sortedRDD = createZCurveSortedRDD(df.toJavaRDD(), fieldMap, fieldNum, targetPartitionCount);
            break;
        case HILBERT:
            sortedRDD = createHilbertSortedRDD(df.toJavaRDD(), fieldMap, fieldNum, targetPartitionCount);
            break;
        default:
            throw new UnsupportedOperationException(String.format("Not supported layout-optimization strategy (%s)", layoutOptStrategy));
    }
    // Compose new {@code StructType} for ordered RDDs
    StructType newStructType = composeOrderedRDDStructType(df.schema());
    return df.sparkSession().createDataFrame(sortedRDD, newStructType).drop("Index");
}
Also used : BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) Arrays(java.util.Arrays) Dataset(org.apache.spark.sql.Dataset) FloatType(org.apache.spark.sql.types.FloatType) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) DecimalType(org.apache.spark.sql.types.DecimalType) Function(java.util.function.Function) RangeSampleSort$(org.apache.spark.sql.hudi.execution.RangeSampleSort$) StructType$(org.apache.spark.sql.types.StructType$) Logger(org.apache.log4j.Logger) ByteType(org.apache.spark.sql.types.ByteType) Map(java.util.Map) BinaryUtil(org.apache.hudi.common.util.BinaryUtil) DoubleType(org.apache.spark.sql.types.DoubleType) Nonnull(javax.annotation.Nonnull) JavaRDD(org.apache.spark.api.java.JavaRDD) Metadata(org.apache.spark.sql.types.Metadata) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) WrappedArray(scala.collection.mutable.WrappedArray) JavaConversions(scala.collection.JavaConversions) HilbertCurveUtils(org.apache.hudi.optimize.HilbertCurveUtils) ByteArraySorting(org.apache.spark.sql.hudi.execution.ByteArraySorting) Iterator(java.util.Iterator) Column(org.apache.spark.sql.Column) RowFactory(org.apache.spark.sql.RowFactory) IntegerType(org.apache.spark.sql.types.IntegerType) StringType(org.apache.spark.sql.types.StringType) LongType(org.apache.spark.sql.types.LongType) Row(org.apache.spark.sql.Row) TimestampType(org.apache.spark.sql.types.TimestampType) Collectors(java.util.stream.Collectors) ShortType(org.apache.spark.sql.types.ShortType) HilbertCurve(org.davidmoten.hilbert.HilbertCurve) List(java.util.List) BinaryType$(org.apache.spark.sql.types.BinaryType$) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) LogManager(org.apache.log4j.LogManager) BooleanType(org.apache.spark.sql.types.BooleanType) DateType(org.apache.spark.sql.types.DateType) StructType(org.apache.spark.sql.types.StructType) StructField(org.apache.spark.sql.types.StructField) Column(org.apache.spark.sql.Column) Row(org.apache.spark.sql.Row)
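
A usage sketch of the helper above: the input dataset, the column names, the target partition count, and the read/write paths are all assumptions for illustration. Only the method signature and the HoodieClusteringConfig.LayoutOptimizationStrategy values are taken from the snippet; the import of SpaceCurveSortingHelper itself is omitted since only its signature is known from the code shown.

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.config.HoodieClusteringConfig;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SpaceCurveOrderingSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[2]").appName("SpaceCurveOrderingSketch").getOrCreate();
        // Hypothetical input: any Dataset<Row> whose columns include c1 and c2.
        Dataset<Row> df = spark.read().parquet("/tmp/input");
        // Two or more order-by columns take the space-curve path; a single
        // column falls back to repartitionByRange, as the method above shows.
        List<String> orderByCols = Arrays.asList("c1", "c2");
        int targetPartitionCount = 200;
        // SpaceCurveSortingHelper is the Hudi class shown above (import omitted).
        Dataset<Row> ordered = SpaceCurveSortingHelper.orderDataFrameByMappingValues(
            df, HoodieClusteringConfig.LayoutOptimizationStrategy.ZORDER, orderByCols, targetPartitionCount);
        ordered.write().parquet("/tmp/output");
        spark.stop();
    }
}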

Example 98 with StructType$

use of org.apache.spark.sql.types.StructType$ in project hudi by apache.

the class TestDataSourceUtils, method testAutoModifyParquetWriteLegacyFormatParameter:

@ParameterizedTest
@CsvSource({ "true, false", "true, true", "false, true", "false, false" })
public void testAutoModifyParquetWriteLegacyFormatParameter(boolean smallDecimal, boolean defaultWriteValue) {
    // create test StructType
    List<StructField> structFields = new ArrayList<>();
    if (smallDecimal) {
        structFields.add(StructField.apply("d1", DecimalType$.MODULE$.apply(10, 2), false, Metadata.empty()));
    } else {
        structFields.add(StructField.apply("d1", DecimalType$.MODULE$.apply(38, 10), false, Metadata.empty()));
    }
    StructType structType = StructType$.MODULE$.apply(structFields);
    // create write options
    Map<String, String> options = new HashMap<>();
    options.put("hoodie.parquet.writelegacyformat.enabled", String.valueOf(defaultWriteValue));
    // start test
    mayBeOverwriteParquetWriteLegacyFormatProp(options, structType);
    // check result
    boolean res = Boolean.parseBoolean(options.get("hoodie.parquet.writelegacyformat.enabled"));
    if (smallDecimal) {
        // should auto modify "hoodie.parquet.writelegacyformat.enabled" = "true".
        assertEquals(true, res);
    } else {
        // should not modify the value of "hoodie.parquet.writelegacyformat.enabled".
        assertEquals(defaultWriteValue, res);
    }
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) ArgumentMatchers.anyString(org.mockito.ArgumentMatchers.anyString) CsvSource(org.junit.jupiter.params.provider.CsvSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
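
The property override in the test above is driven by the precision of the decimal columns in the schema. As a companion sketch, the snippet below builds two decimal fields in the same StructType$/DecimalType$ style and simply reports their precision and scale; the actual threshold logic stays inside mayBeOverwriteParquetWriteLegacyFormatProp and is not restated here. The class name and field names are illustrative.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DecimalType;
import org.apache.spark.sql.types.DecimalType$;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.StructType$;

public class DecimalSchemaInspectionSketch {
    public static void main(String[] args) {
        // Same construction style as the test: StructType$ / DecimalType$ singletons.
        List<StructField> fields = new ArrayList<>();
        fields.add(StructField.apply("d1", DecimalType$.MODULE$.apply(10, 2), false, Metadata.empty()));
        fields.add(StructField.apply("d2", DecimalType$.MODULE$.apply(38, 10), false, Metadata.empty()));
        StructType schema = StructType$.MODULE$.apply(fields);
        // Walk the schema and report precision/scale of every decimal column.
        for (StructField field : schema.fields()) {
            DataType type = field.dataType();
            if (type instanceof DecimalType) {
                DecimalType decimal = (DecimalType) type;
                System.out.println(field.name() + ": precision=" + decimal.precision() + ", scale=" + decimal.scale());
            }
        }
    }
}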

Example 99 with StructType$

use of org.apache.spark.sql.types.StructType$ in project hudi by apache.

the class TestHoodieDatasetBulkInsertHelper, method testBulkInsertPreCombine:

@ParameterizedTest
@MethodSource("providePreCombineArgs")
public void testBulkInsertPreCombine(boolean enablePreCombine) {
    HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(getPropsAllSet("_row_key")).combineInput(enablePreCombine, enablePreCombine).withPreCombineField("ts").build();
    List<Row> inserts = DataSourceTestUtils.generateRandomRows(10);
    Dataset<Row> toUpdateDataset = sqlContext.createDataFrame(inserts.subList(0, 5), structType);
    List<Row> updates = DataSourceTestUtils.updateRowsWithHigherTs(toUpdateDataset);
    List<Row> rows = new ArrayList<>();
    rows.addAll(inserts);
    rows.addAll(updates);
    Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
    Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName", "testNamespace", new NonSortPartitionerWithRows(), false, false);
    StructType resultSchema = result.schema();
    assertEquals(result.count(), enablePreCombine ? 10 : 15);
    assertEquals(resultSchema.fieldNames().length, structType.fieldNames().length + HoodieRecord.HOODIE_META_COLUMNS.size());
    for (Map.Entry<String, Integer> entry : HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.entrySet()) {
        assertTrue(resultSchema.fieldIndex(entry.getKey()) == entry.getValue());
    }
    int metadataRecordKeyIndex = resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD);
    int metadataPartitionPathIndex = resultSchema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD);
    int metadataCommitTimeIndex = resultSchema.fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
    int metadataCommitSeqNoIndex = resultSchema.fieldIndex(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD);
    int metadataFilenameIndex = resultSchema.fieldIndex(HoodieRecord.FILENAME_METADATA_FIELD);
    result.toJavaRDD().foreach(entry -> {
        assertTrue(entry.get(metadataRecordKeyIndex).equals(entry.getAs("_row_key")));
        assertTrue(entry.get(metadataPartitionPathIndex).equals(entry.getAs("partition")));
        assertTrue(entry.get(metadataCommitSeqNoIndex).equals(""));
        assertTrue(entry.get(metadataCommitTimeIndex).equals(""));
        assertTrue(entry.get(metadataFilenameIndex).equals(""));
    });
    Dataset<Row> trimmedOutput = result.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
    // find resolved input snapshot
    ExpressionEncoder encoder = getEncoder(dataset.schema());
    if (enablePreCombine) {
        Dataset<Row> inputSnapshotDf = dataset.groupByKey((MapFunction<Row, String>) value -> value.getAs("partition") + "+" + value.getAs("_row_key"), Encoders.STRING()).reduceGroups((ReduceFunction<Row>) (v1, v2) -> {
            long ts1 = v1.getAs("ts");
            long ts2 = v2.getAs("ts");
            if (ts1 >= ts2) {
                return v1;
            } else {
                return v2;
            }
        }).map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, encoder);
        assertTrue(inputSnapshotDf.except(trimmedOutput).count() == 0);
    } else {
        assertTrue(dataset.except(trimmedOutput).count() == 0);
    }
}
Also used : HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Attribute(org.apache.spark.sql.catalyst.expressions.Attribute) IntStream(java.util.stream.IntStream) Assertions.fail(org.junit.jupiter.api.Assertions.fail) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) Dataset(org.apache.spark.sql.Dataset) AvroConversionUtils(org.apache.hudi.AvroConversionUtils) HashMap(java.util.HashMap) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) ReduceFunction(org.apache.spark.api.java.function.ReduceFunction) ArrayList(java.util.ArrayList) SimpleAnalyzer$(org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$) DataSourceTestUtils(org.apache.hudi.testutils.DataSourceTestUtils) Map(java.util.Map) Tag(org.junit.jupiter.api.Tag) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) MapFunction(org.apache.spark.api.java.function.MapFunction) MethodSource(org.junit.jupiter.params.provider.MethodSource) StructType(org.apache.spark.sql.types.StructType) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) JavaConversions(scala.collection.JavaConversions) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieDatasetBulkInsertHelper(org.apache.hudi.HoodieDatasetBulkInsertHelper) RowEncoder(org.apache.spark.sql.catalyst.encoders.RowEncoder) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) Arguments(org.junit.jupiter.params.provider.Arguments) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) Encoders(org.apache.spark.sql.Encoders) NonSortPartitionerWithRows(org.apache.hudi.execution.bulkinsert.NonSortPartitionerWithRows) Test(org.junit.jupiter.api.Test) ExpressionEncoder(org.apache.spark.sql.catalyst.encoders.ExpressionEncoder) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) JavaConverters(scala.collection.JavaConverters) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) ReduceFunction(org.apache.spark.api.java.function.ReduceFunction) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) NonSortPartitionerWithRows(org.apache.hudi.execution.bulkinsert.NonSortPartitionerWithRows) Tuple2(scala.Tuple2) ExpressionEncoder(org.apache.spark.sql.catalyst.encoders.ExpressionEncoder) Row(org.apache.spark.sql.Row) HashMap(java.util.HashMap) Map(java.util.Map) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
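
To isolate the pre-combine semantics the assertions above rely on, here is a small self-contained sketch of the same groupByKey/reduceGroups pattern: rows sharing a record key are collapsed to the one with the highest ts. The schema, the sample rows, and the class name PreCombineSketch are assumptions for illustration; only the reducer logic mirrors the test.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

public class PreCombineSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[1]").appName("PreCombineSketch").getOrCreate();
        // Hypothetical schema: a record key plus a "ts" pre-combine field.
        StructType schema = DataTypes.createStructType(Arrays.asList(
            DataTypes.createStructField("_row_key", DataTypes.StringType, false),
            DataTypes.createStructField("ts", DataTypes.LongType, false)));
        List<Row> rows = Arrays.asList(
            RowFactory.create("k1", 1L),
            // Newer version of k1; expected to survive the pre-combine.
            RowFactory.create("k1", 5L),
            RowFactory.create("k2", 3L));
        Dataset<Row> dataset = spark.createDataFrame(rows, schema);
        // Group by record key and keep only the row with the highest ts.
        Encoder<Row> encoder = RowEncoder.apply(schema);
        Dataset<Row> deduped = dataset
            .groupByKey((MapFunction<Row, String>) row -> row.getAs("_row_key"), Encoders.STRING())
            .reduceGroups((ReduceFunction<Row>) (v1, v2) -> {
                long ts1 = v1.getAs("ts");
                long ts2 = v2.getAs("ts");
                return ts1 >= ts2 ? v1 : v2;
            })
            .map((MapFunction<Tuple2<String, Row>, Row>) tuple -> tuple._2, encoder);
        deduped.show(); // expect two rows: ("k1", 5) and ("k2", 3)
        spark.stop();
    }
}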

Example 100 with StructType$

use of org.apache.spark.sql.types.StructType$ in project kubernetes by Naresh240.

the class App, method main:

public static void main(String[] args) {
    // System.out.println( "Hello World!" );
    SparkSession spark = SparkSession.builder().appName("JSubmittingApplications").getOrCreate();
    System.out.println("You are using Spark " + spark.version());
    spark.sparkContext().setLogLevel("ERROR");
    List<Row> list = new ArrayList<Row>();
    list.add(RowFactory.create("one"));
    list.add(RowFactory.create("two"));
    list.add(RowFactory.create("three"));
    list.add(RowFactory.create("four"));
    List<org.apache.spark.sql.types.StructField> listOfStructField = new ArrayList<org.apache.spark.sql.types.StructField>();
    listOfStructField.add(DataTypes.createStructField("test", DataTypes.StringType, true));
    StructType structType = DataTypes.createStructType(listOfStructField);
    Dataset<Row> data = spark.createDataFrame(list, structType);
    data.show();
    spark.stop();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) Row(org.apache.spark.sql.Row)

Aggregations

StructType (org.apache.spark.sql.types.StructType): 418
StructField (org.apache.spark.sql.types.StructField): 228
Row (org.apache.spark.sql.Row): 200
ArrayList (java.util.ArrayList): 152
Test (org.junit.Test): 131
Script (org.apache.sysml.api.mlcontext.Script): 68
SparkSession (org.apache.spark.sql.SparkSession): 61
List (java.util.List): 41
DataType (org.apache.spark.sql.types.DataType): 40
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 36
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 34
DenseVector (org.apache.spark.ml.linalg.DenseVector): 33
Map (java.util.Map): 31
ArrayType (org.apache.spark.sql.types.ArrayType): 30
Dataset (org.apache.spark.sql.Dataset): 28
Tuple2 (scala.Tuple2): 28
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 27
Vector (org.apache.spark.ml.linalg.Vector): 27
IOException (java.io.IOException): 26
InternalRow (org.apache.spark.sql.catalyst.InternalRow): 25