Example 1 with Row$

Use of org.apache.spark.sql.Row$ in project kylo by Teradata.

From the class ProfilerTest, method setUp.

@Before
@SuppressWarnings("unchecked")
public void setUp() {
    if (columnStatsMap == null) {
        StructField[] schemaFields = new StructField[15];
        schemaFields[0] = DataTypes.createStructField("id", DataTypes.IntegerType, true);
        schemaFields[1] = DataTypes.createStructField("firstname", DataTypes.StringType, true);
        schemaFields[2] = DataTypes.createStructField("lastname", DataTypes.StringType, true);
        schemaFields[3] = DataTypes.createStructField("age", DataTypes.IntegerType, true);
        schemaFields[4] = DataTypes.createStructField("description", DataTypes.StringType, true);
        schemaFields[5] = DataTypes.createStructField("height", DataTypes.DoubleType, true);
        schemaFields[6] = DataTypes.createStructField("joindate", DataTypes.DateType, true);
        schemaFields[7] = DataTypes.createStructField("lifemember", DataTypes.BooleanType, true);
        schemaFields[8] = DataTypes.createStructField("lastlogin", DataTypes.TimestampType, true);
        schemaFields[9] = DataTypes.createStructField("phash", DataTypes.LongType, true);
        schemaFields[10] = DataTypes.createStructField("weight", DataTypes.FloatType, true);
        schemaFields[11] = DataTypes.createStructField("credits", DataTypes.ShortType, true);
        schemaFields[12] = DataTypes.createStructField("ccode", DataTypes.ByteType, true);
        schemaFields[13] = DataTypes.createStructField("score", DataTypes.createDecimalType(7, 5), true);
        schemaFields[14] = DataTypes.createStructField("favoritepet", DataTypes.StringType, true);
        StructType schema = DataTypes.createStructType(schemaFields);
        List<Row> rows = new ArrayList<>();
        rows.add(RowFactory.create(1, "Jon", "Wright", 14, "Jon::Wright", 5.85d, Date.valueOf("2010-05-04"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 1456890911L, 40.2f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Cat"));
        rows.add(RowFactory.create(2, "Jon", "Hudson", null, "Jon::Hudson", 5.85d, Date.valueOf("1990-10-25"), null, Timestamp.valueOf("2011-01-08 11:25:45"), 7638962135L, 110.5f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(8.223)), "alligator"));
        rows.add(RowFactory.create(3, "Rachael", "Hu", 40, "Rachael::Hu", 6.22d, Date.valueOf("1990-10-25"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 160.7f, (short) 1400, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Alpaca"));
        rows.add(RowFactory.create(4, EMPTY_STRING, EMPTY_STRING, 40, null, null, Date.valueOf("1956-11-12"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 2988626110L, null, null, (byte) 99, null, "Cat"));
        rows.add(RowFactory.create(5, "Rachael", EMPTY_STRING, 22, "Rachael::", 5.85d, Date.valueOf("2005-12-24"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8260467621L, 160.7f, (short) 100, null, new BigDecimal(String.valueOf(4.343)), "Zebra"));
        rows.add(RowFactory.create(6, "Elizabeth", "Taylor", 40, "Elizabeth::Taylor", 5.85d, Date.valueOf("2011-08-08"), null, Timestamp.valueOf("2016-01-14 14:20:20"), 8732866249L, null, (short) 1400, null, new BigDecimal(String.valueOf(4.343)), "ZEBRA"));
        rows.add(RowFactory.create(7, "Jon", "Taylor", 18, "Jon::Taylor", null, Date.valueOf("2011-08-08"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 110.5f, (short) 500, (byte) 40, new BigDecimal(String.valueOf(4.343)), null));
        rows.add(RowFactory.create(8, "Rachael", EMPTY_STRING, 22, "Rachael::", 4.37d, Date.valueOf("2011-08-08"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8782348100L, null, null, null, null, "albatross"));
        rows.add(RowFactory.create(9, EMPTY_STRING, "Edmundson Jr", 11, "::Edmundson Jr", 4.88d, Date.valueOf("2007-06-07"), Boolean.FALSE, Timestamp.valueOf("2007-03-16 08:24:37"), null, 155.3f, (short) 0, (byte) 99, new BigDecimal(String.valueOf(1.567)), EMPTY_STRING));
        rows.add(RowFactory.create(10, "Jon", EMPTY_STRING, 65, "Jon::", null, Date.valueOf("1975-04-04"), Boolean.TRUE, Timestamp.valueOf("2007-03-16 08:24:31"), null, 180.6f, (short) 5000, (byte) 2, new BigDecimal(String.valueOf(4.343)), "Cat"));
        final JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sqlContext.sparkContext());
        JavaRDD<Row> dataRDD = javaSparkContext.parallelize(rows);
        DataSet dataDF = scs.toDataSet(sqlContext.createDataFrame(dataRDD, schema));
        /* Enable to debug contents of test data */
        /*
        for (Row r : dataRDD.collect()) {
            System.out.println(r.toString());
        }
        */
        ProfilerConfiguration configuration = new ProfilerConfiguration();
        configuration.setNumberOfTopNValues(3);
        StatisticsModel statsModel = profiler.profile(dataDF, configuration);
        columnStatsMap = (statsModel != null) ? (Map) statsModel.getColumnStatisticsMap() : (Map<Integer, StandardColumnStatistics>) Collections.EMPTY_MAP;
    }
}
Also used : StatisticsModel(com.thinkbiganalytics.spark.dataprofiler.StatisticsModel) StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet) ArrayList(java.util.ArrayList) BigDecimal(java.math.BigDecimal) StructField(org.apache.spark.sql.types.StructField) ProfilerConfiguration(com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Map(java.util.Map) Before(org.junit.Before)
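
The fixture above targets the older SQLContext/JavaRDD API. For reference, a minimal sketch of the same Row-and-schema construction against the Spark 2+ SparkSession API might look like the following; the class name and the trimmed three-column schema are illustrative only and not part of the kylo test.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowDataFrameSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("row-dataframe-sketch")
            .master("local[2]")
            .getOrCreate();
        // Trimmed-down version of the profiler schema: three nullable columns.
        StructType schema = DataTypes.createStructType(new StructField[] {
            DataTypes.createStructField("id", DataTypes.IntegerType, true),
            DataTypes.createStructField("firstname", DataTypes.StringType, true),
            DataTypes.createStructField("lastname", DataTypes.StringType, true)
        });
        List<Row> rows = Arrays.asList(
            RowFactory.create(1, "Jon", "Wright"),
            RowFactory.create(2, "Rachael", "Hu"));
        // SparkSession.createDataFrame accepts the row list directly,
        // so no explicit JavaRDD detour is needed.
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.show();
        spark.stop();
    }
}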

Example 2 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testRewriteDataFilesUnpartitionedTable.

@Test
public void testRewriteDataFilesUnpartitionedTable() {
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
    writeRecords(records2);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
    Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 1 data files before rewrite", 1, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used : Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
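
The writeRecords(...) helper is not shown in the snippet above. A plausible sketch is given below, assuming the test's SparkSession and table location are passed in explicitly; the repartition(2) call is an assumption made here so that each record lands in its own data file, which is what the "4 data files before rewrite" assertion requires.

import java.util.List;

import org.apache.iceberg.spark.source.ThreeColumnRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

final class WriteRecordsSketch {

    private WriteRecordsSketch() {
    }

    // Hypothetical stand-in for the writeRecords(...) helper used by the test.
    // repartition(2) is an assumption so that each record is written to its own
    // data file, matching the file counts asserted before the rewrite.
    static void writeRecords(SparkSession spark, String tableLocation, List<ThreeColumnRecord> records) {
        Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
        df.select("c1", "c2", "c3")
            .repartition(2)
            .write()
            .format("iceberg")
            .mode("append")
            .save(tableLocation);
    }
}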

Example 3 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testRewriteDataFilesPartitionedTable.

@Test
public void testRewriteDataFilesPartitionedTable() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
    writeRecords(records2);
    List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
    writeRecords(records3);
    List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
    writeRecords(records4);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
    Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    expectedRecords.addAll(records3);
    expectedRecords.addAll(records4);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used : Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
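
The partition spec above (identity on c1 plus truncate(c2, 2)) places the eight records into four partitions, which is why the rewrite replaces eight files with four. A small helper, sketched here with hypothetical class and method names that are not part of the Iceberg test, could be used to confirm the per-partition file counts after the rewrite.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

final class PartitionFileCounts {

    private PartitionFileCounts() {
    }

    // Counts live data files per partition key; with the identity("c1") plus
    // truncate("c2", 2) spec above, the rewritten table should report exactly
    // one file for each of the four partitions.
    static Map<String, Integer> fileCountsByPartition(Table table) throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
            for (FileScanTask task : tasks) {
                counts.merge(task.file().partition().toString(), 1, Integer::sum);
            }
        }
        return counts;
    }
}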

Example 4 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testRewriteDataFilesWithFilter.

@Test
public void testRewriteDataFilesWithFilter() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
    writeRecords(records2);
    List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
    writeRecords(records3);
    List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
    writeRecords(records4);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c1", 1)).filter(Expressions.startsWith("c2", "AA")).execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 7 data files before rewrite", 7, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    expectedRecords.addAll(records3);
    expectedRecords.addAll(records4);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used : Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
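
As the assertions indicate, the two chained filter(...) calls on the rewrite action are combined conjunctively, so only the two files in the partition with c1 = 1 and c2 starting with "AA" are rewritten. An equivalent single predicate, sketched below with the public Expressions API (the wrapper class is hypothetical), could be passed to a single filter(...) call instead.

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

final class RewriteFilterSketch {

    private RewriteFilterSketch() {
    }

    // Builds the same predicate as the two chained filter(...) calls above,
    // expressed as a single AND of the two conditions.
    static Expression c1IsOneAndC2StartsWithAA() {
        return Expressions.and(
            Expressions.equal("c1", 1),
            Expressions.startsWith("c2", "AA"));
    }
}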

Example 5 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

From the class ReadAndWriteTablesTest, method createPartitionedTable.

@Test
public void createPartitionedTable() {
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("id").build();
    table = tables.create(schema, spec, pathToTable.toString());
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").mode("append").save(pathToTable.toString());
    table.refresh();
}
Also used : Row(org.apache.spark.sql.Row) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
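
The test stops after the append and only refreshes the table. A minimal read-back in the style of the earlier examples is sketched below; the helper class and the sort column are illustrative, assuming the SimpleRecord schema of id and data used above.

import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

final class ReadBackSketch {

    private ReadBackSketch() {
    }

    // Loads the partitioned table written above through the Iceberg source and
    // returns the rows ordered by id, ready to compare against the expected list.
    static List<Row> readRows(SparkSession spark, String pathToTable) {
        Dataset<Row> df = spark.read().format("iceberg").load(pathToTable);
        return df.sort("id").collectAsList();
    }
}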

Aggregations

Row (org.apache.spark.sql.Row): 1045
Test (org.junit.Test): 344
ArrayList (java.util.ArrayList): 244
SparkSession (org.apache.spark.sql.SparkSession): 243
StructType (org.apache.spark.sql.types.StructType): 215
Test (org.junit.jupiter.api.Test): 157
StructField (org.apache.spark.sql.types.StructField): 138
Table (org.apache.iceberg.Table): 127
Dataset (org.apache.spark.sql.Dataset): 123
List (java.util.List): 115
Script (org.apache.sysml.api.mlcontext.Script): 104
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 101
IOException (java.io.IOException): 78
Column (org.apache.spark.sql.Column): 78
File (java.io.File): 76
Collectors (java.util.stream.Collectors): 73
PartitionSpec (org.apache.iceberg.PartitionSpec): 70
DatasetBuilder (au.csiro.pathling.test.builders.DatasetBuilder): 66
Map (java.util.Map): 66
HadoopTables (org.apache.iceberg.hadoop.HadoopTables): 61