Example 6 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

From class SaveDataSetStage, method getDataSet.

/**
 * Gets the data set for the specified transformation result.
 */
private DataSet getDataSet(@Nonnull final TransformResult transform) {
    DataSet dataset = transform.getDataSet();
    if (request.getFormat() != null && request.getFormat().equals("orc")) {
        // Ensure that column names comply with ORC standards
        final StructType schema = dataset.schema();
        final Column[] columns = new Column[schema.size()];
        final DefaultQueryResultColumn[] queryColumns = new QueryResultRowTransform(schema, "orc", converterService).columns();
        for (int i = 0; i < schema.size(); ++i) {
            if (!queryColumns[i].getField().equals(schema.apply(i).name())) {
                columns[i] = new Column(schema.apply(i).name()).as(queryColumns[i].getField());
            } else {
                columns[i] = new Column(schema.apply(i).name());
            }
        }
        dataset = dataset.select(columns);
    }
    return dataset;
}
Also used : StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet) DefaultQueryResultColumn(com.thinkbiganalytics.discovery.model.DefaultQueryResultColumn) Column(org.apache.spark.sql.Column)
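
As a standalone, hedged sketch of the same idea against the Spark 2.x Dataset API (the Kylo snippet above targets the older DataFrame API): renaming is just a select over aliased Columns. The sanitization rule below is hypothetical; in Kylo the real name mapping comes from QueryResultRowTransform.

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;

public class OrcColumnSanitizer {

    /** Returns a data set whose column names contain only word characters (hypothetical ORC rule). */
    static Dataset<Row> sanitizeForOrc(Dataset<Row> dataset) {
        StructType schema = dataset.schema();
        Column[] columns = new Column[schema.size()];
        for (int i = 0; i < schema.size(); ++i) {
            String name = schema.apply(i).name();
            String safe = name.replaceAll("\\W", "_");
            // Alias only when the name actually changes, mirroring the branch in getDataSet above
            columns[i] = safe.equals(name) ? new Column(name) : new Column(name).as(safe);
        }
        return dataset.select(columns);
    }
}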

Example 7 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

From class AbstractSparkDataSetProviderTest, method readDeleteSourceFile.

/**
 * Verify reading a data set and deleting the source file.
 */
@Test
@SuppressWarnings("unchecked")
public void readDeleteSourceFile() {
    isFileFormat = true;
    // Mock data set
    dataSet = Mockito.mock(DataFrame.class);
    Mockito.when(dataSet.col("value")).thenReturn(new Column("value"));
    final StructType schema = DataTypes.createStructType(Collections.singletonList(DataTypes.createStructField("value", DataTypes.StringType, true)));
    Mockito.when(dataSet.schema()).thenReturn(schema);
    final DataFrame mapDataSet = Mockito.mock(DataFrame.class);
    Mockito.when(dataSet.withColumn(Mockito.eq("value"), Mockito.any(Column.class))).thenReturn(mapDataSet);
    // Mock options
    final DataSetOptions options = new DataSetOptions();
    options.setFormat("text");
    options.setOption(KyloCatalogConstants.PATH_OPTION, "/mock/path/file.txt");
    options.setOption("keepSourceFile", "FALSE");
    // Test reading
    final MockSparkDataSetProvider provider = new MockSparkDataSetProvider();
    final DataFrame df = provider.read(Mockito.mock(KyloCatalogClient.class), options);
    Assert.assertEquals(mapDataSet, df);
    final ArgumentCaptor<Column> newColumn = ArgumentCaptor.forClass(Column.class);
    Mockito.verify(dataSet).withColumn(Mockito.eq("value"), newColumn.capture());
    Assert.assertTrue("Expected new column to be a UDF", newColumn.getValue().expr() instanceof ScalaUDF);
}
Also used : StructType(org.apache.spark.sql.types.StructType) Column(org.apache.spark.sql.Column) KyloCatalogClient(com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClient) DataSetOptions(com.thinkbiganalytics.kylo.catalog.spi.DataSetOptions) DataFrame(org.apache.spark.sql.DataFrame) ScalaUDF(org.apache.spark.sql.catalyst.expressions.ScalaUDF) Test(org.junit.Test)
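
The final assertion passes because columns built from registered Java UDFs are planned as ScalaUDF expressions. A minimal sketch of that pattern, written against the Spark 2.x API (the test above targets the 1.x DataFrame API); the UDF name and pass-through body are illustrative only:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;

public class UdfColumnSketch {

    /** Rewrites the "value" column through a UDF; the new column's expr() is a ScalaUDF. */
    static Dataset<Row> wrapValueColumn(SparkSession spark, Dataset<Row> df) {
        spark.udf().register("passThrough", (UDF1<String, String>) value -> value, DataTypes.StringType);
        return df.withColumn("value", functions.callUDF("passThrough", df.col("value")));
    }
}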

Example 8 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

From class ProfilerTestEmptyData, method setUp.

@Before
@SuppressWarnings("unchecked")
public void setUp() {
    if (columnStatsMap == null) {
        StructField[] schemaFields = new StructField[3];
        schemaFields[0] = DataTypes.createStructField("id", DataTypes.IntegerType, true);
        schemaFields[1] = DataTypes.createStructField("first_name", DataTypes.StringType, true);
        schemaFields[2] = DataTypes.createStructField("last_name", DataTypes.StringType, true);
        StructType schema = DataTypes.createStructType(schemaFields);
        final JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sqlContext.sparkContext());
        JavaRDD<Row> dataRDD = javaSparkContext.emptyRDD();
        DataSet dataDF = scs.toDataSet(sqlContext.createDataFrame(dataRDD, schema));
        ProfilerConfiguration configuration = new ProfilerConfiguration();
        configuration.setNumberOfTopNValues(3);
        StatisticsModel statsModel = profiler.profile(dataDF, configuration);
        columnStatsMap = (statsModel != null) ? (Map) statsModel.getColumnStatisticsMap() : (Map<Integer, StandardColumnStatistics>) Collections.EMPTY_MAP;
    }
}
Also used : StatisticsModel(com.thinkbiganalytics.spark.dataprofiler.StatisticsModel) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet) ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Map(java.util.Map) JavaRDD(org.apache.spark.api.java.JavaRDD) Before(org.junit.Before)
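
The property being exercised is that an empty data set still carries its full StructType, which is what lets the profiler emit per-column statistics over zero rows. A minimal sketch of the same construction, assuming the Spark 2.x SparkSession API:

import java.util.Collections;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class EmptyDataFrameSketch {

    static Dataset<Row> emptyWithSchema(SparkSession spark) {
        StructType schema = DataTypes.createStructType(Collections.singletonList(
                DataTypes.createStructField("id", DataTypes.IntegerType, true)));
        // No rows, but schema() still reports the declared field
        return spark.createDataFrame(Collections.<Row>emptyList(), schema);
    }
}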

Example 9 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project kylo by Teradata.

From class ProfilerTest, method setUp.

@Before
@SuppressWarnings("unchecked")
public void setUp() {
    if (columnStatsMap == null) {
        StructField[] schemaFields = new StructField[15];
        schemaFields[0] = DataTypes.createStructField("id", DataTypes.IntegerType, true);
        schemaFields[1] = DataTypes.createStructField("firstname", DataTypes.StringType, true);
        schemaFields[2] = DataTypes.createStructField("lastname", DataTypes.StringType, true);
        schemaFields[3] = DataTypes.createStructField("age", DataTypes.IntegerType, true);
        schemaFields[4] = DataTypes.createStructField("description", DataTypes.StringType, true);
        schemaFields[5] = DataTypes.createStructField("height", DataTypes.DoubleType, true);
        schemaFields[6] = DataTypes.createStructField("joindate", DataTypes.DateType, true);
        schemaFields[7] = DataTypes.createStructField("lifemember", DataTypes.BooleanType, true);
        schemaFields[8] = DataTypes.createStructField("lastlogin", DataTypes.TimestampType, true);
        schemaFields[9] = DataTypes.createStructField("phash", DataTypes.LongType, true);
        schemaFields[10] = DataTypes.createStructField("weight", DataTypes.FloatType, true);
        schemaFields[11] = DataTypes.createStructField("credits", DataTypes.ShortType, true);
        schemaFields[12] = DataTypes.createStructField("ccode", DataTypes.ByteType, true);
        schemaFields[13] = DataTypes.createStructField("score", DataTypes.createDecimalType(7, 5), true);
        schemaFields[14] = DataTypes.createStructField("favoritepet", DataTypes.StringType, true);
        StructType schema = DataTypes.createStructType(schemaFields);
        List<Row> rows = new ArrayList<>();
        rows.add(RowFactory.create(1, "Jon", "Wright", 14, "Jon::Wright", 5.85d, Date.valueOf("2010-05-04"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 1456890911L, 40.2f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Cat"));
        rows.add(RowFactory.create(2, "Jon", "Hudson", null, "Jon::Hudson", 5.85d, Date.valueOf("1990-10-25"), null, Timestamp.valueOf("2011-01-08 11:25:45"), 7638962135L, 110.5f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(8.223)), "alligator"));
        rows.add(RowFactory.create(3, "Rachael", "Hu", 40, "Rachael::Hu", 6.22d, Date.valueOf("1990-10-25"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 160.7f, (short) 1400, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Alpaca"));
        rows.add(RowFactory.create(4, EMPTY_STRING, EMPTY_STRING, 40, null, null, Date.valueOf("1956-11-12"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 2988626110L, null, null, (byte) 99, null, "Cat"));
        rows.add(RowFactory.create(5, "Rachael", EMPTY_STRING, 22, "Rachael::", 5.85d, Date.valueOf("2005-12-24"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8260467621L, 160.7f, (short) 100, null, new BigDecimal(String.valueOf(4.343)), "Zebra"));
        rows.add(RowFactory.create(6, "Elizabeth", "Taylor", 40, "Elizabeth::Taylor", 5.85d, Date.valueOf("2011-08-08"), null, Timestamp.valueOf("2016-01-14 14:20:20"), 8732866249L, null, (short) 1400, null, new BigDecimal(String.valueOf(4.343)), "ZEBRA"));
        rows.add(RowFactory.create(7, "Jon", "Taylor", 18, "Jon::Taylor", null, Date.valueOf("2011-08-08"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 110.5f, (short) 500, (byte) 40, new BigDecimal(String.valueOf(4.343)), null));
        rows.add(RowFactory.create(8, "Rachael", EMPTY_STRING, 22, "Rachael::", 4.37d, Date.valueOf("2011-08-08"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8782348100L, null, null, null, null, "albatross"));
        rows.add(RowFactory.create(9, EMPTY_STRING, "Edmundson Jr", 11, "::Edmundson Jr", 4.88d, Date.valueOf("2007-06-07"), Boolean.FALSE, Timestamp.valueOf("2007-03-16 08:24:37"), null, 155.3f, (short) 0, (byte) 99, new BigDecimal(String.valueOf(1.567)), EMPTY_STRING));
        rows.add(RowFactory.create(10, "Jon", EMPTY_STRING, 65, "Jon::", null, Date.valueOf("1975-04-04"), Boolean.TRUE, Timestamp.valueOf("2007-03-16 08:24:31"), null, 180.6f, (short) 5000, (byte) 2, new BigDecimal(String.valueOf(4.343)), "Cat"));
        final JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sqlContext.sparkContext());
        JavaRDD<Row> dataRDD = javaSparkContext.parallelize(rows);
        DataSet dataDF = scs.toDataSet(sqlContext.createDataFrame(dataRDD, schema));
        /* Enable to debug contents of test data */
        /*
        for (Row r : dataRDD.collect()) {
            System.out.println(r.toString());
        }
        */
        ProfilerConfiguration configuration = new ProfilerConfiguration();
        configuration.setNumberOfTopNValues(3);
        StatisticsModel statsModel = profiler.profile(dataDF, configuration);
        columnStatsMap = (statsModel != null) ? (Map) statsModel.getColumnStatisticsMap() : (Map<Integer, StandardColumnStatistics>) Collections.EMPTY_MAP;
    }
}
Also used : StatisticsModel(com.thinkbiganalytics.spark.dataprofiler.StatisticsModel) StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet) ArrayList(java.util.ArrayList) BigDecimal(java.math.BigDecimal) StructField(org.apache.spark.sql.types.StructField) ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Map(java.util.Map) Before(org.junit.Before)
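
A small aside on the score field: DataTypes.createDecimalType(7, 5) declares a DecimalType with precision 7 and scale 5, i.e. at most 7 significant digits of which 5 sit after the decimal point, so a value like 1.567 is stored as 1.56700. A quick check:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.DecimalType;

public class DecimalTypeCheck {

    public static void main(String[] args) {
        DecimalType score = DataTypes.createDecimalType(7, 5);
        // precision = total digits, scale = digits after the decimal point
        System.out.println(score.precision() + ", " + score.scale()); // prints: 7, 5
    }
}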

Example 10 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From class TestSparkSchema, method testFailIfSparkReadSchemaIsOff.

@Test
public void testFailIfSparkReadSchemaIsOff() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    // project a field name ("idd") that does not exist in the source schema
    StructType sparkReadSchema = new StructType(new StructField[] {
        new StructField("idd", DataTypes.IntegerType, true, Metadata.empty())
    });
    AssertHelpers.assertThrows("Iceberg should not allow a projection that contains unknown fields",
        java.lang.IllegalArgumentException.class, "Field idd not found in source schema",
        () -> spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation));
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Row(org.apache.spark.sql.Row) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
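
For contrast, a hedged sketch of the passing case: a read schema whose field names do exist in the source schema is accepted, and projecting a subset of the columns is allowed. The fragment below assumes the spark session and the tableLocation set up in the test above.

// Valid projection: "id" exists in the source schema (id int, data string)
StructType validReadSchema = new StructType(new StructField[] {
    new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
});
Dataset<Row> projected = spark.read().schema(validReadSchema).format("iceberg").load(tableLocation);
Assert.assertEquals(1, projected.count());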

Aggregations

StructType (org.apache.spark.sql.types.StructType): 418
StructField (org.apache.spark.sql.types.StructField): 228
Row (org.apache.spark.sql.Row): 200
ArrayList (java.util.ArrayList): 152
Test (org.junit.Test): 131
Script (org.apache.sysml.api.mlcontext.Script): 68
SparkSession (org.apache.spark.sql.SparkSession): 61
List (java.util.List): 41
DataType (org.apache.spark.sql.types.DataType): 40
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 36
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 34
DenseVector (org.apache.spark.ml.linalg.DenseVector): 33
Map (java.util.Map): 31
ArrayType (org.apache.spark.sql.types.ArrayType): 30
Dataset (org.apache.spark.sql.Dataset): 28
Tuple2 (scala.Tuple2): 28
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 27
Vector (org.apache.spark.ml.linalg.Vector): 27
IOException (java.io.IOException): 26
InternalRow (org.apache.spark.sql.catalyst.InternalRow): 25