Use of org.apache.spark.sql.Row$ in project kylo by Teradata.
The class ProfilerTest, method setUp:
@Before
@SuppressWarnings("unchecked")
public void setUp() {
    if (columnStatsMap == null) {
        StructField[] schemaFields = new StructField[15];
        schemaFields[0] = DataTypes.createStructField("id", DataTypes.IntegerType, true);
        schemaFields[1] = DataTypes.createStructField("firstname", DataTypes.StringType, true);
        schemaFields[2] = DataTypes.createStructField("lastname", DataTypes.StringType, true);
        schemaFields[3] = DataTypes.createStructField("age", DataTypes.IntegerType, true);
        schemaFields[4] = DataTypes.createStructField("description", DataTypes.StringType, true);
        schemaFields[5] = DataTypes.createStructField("height", DataTypes.DoubleType, true);
        schemaFields[6] = DataTypes.createStructField("joindate", DataTypes.DateType, true);
        schemaFields[7] = DataTypes.createStructField("lifemember", DataTypes.BooleanType, true);
        schemaFields[8] = DataTypes.createStructField("lastlogin", DataTypes.TimestampType, true);
        schemaFields[9] = DataTypes.createStructField("phash", DataTypes.LongType, true);
        schemaFields[10] = DataTypes.createStructField("weight", DataTypes.FloatType, true);
        schemaFields[11] = DataTypes.createStructField("credits", DataTypes.ShortType, true);
        schemaFields[12] = DataTypes.createStructField("ccode", DataTypes.ByteType, true);
        schemaFields[13] = DataTypes.createStructField("score", DataTypes.createDecimalType(7, 5), true);
        schemaFields[14] = DataTypes.createStructField("favoritepet", DataTypes.StringType, true);
        StructType schema = DataTypes.createStructType(schemaFields);
        List<Row> rows = new ArrayList<>();
        rows.add(RowFactory.create(1, "Jon", "Wright", 14, "Jon::Wright", 5.85d, Date.valueOf("2010-05-04"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 1456890911L, 40.2f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Cat"));
        rows.add(RowFactory.create(2, "Jon", "Hudson", null, "Jon::Hudson", 5.85d, Date.valueOf("1990-10-25"), null, Timestamp.valueOf("2011-01-08 11:25:45"), 7638962135L, 110.5f, (short) 100, (byte) 99, new BigDecimal(String.valueOf(8.223)), "alligator"));
        rows.add(RowFactory.create(3, "Rachael", "Hu", 40, "Rachael::Hu", 6.22d, Date.valueOf("1990-10-25"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 160.7f, (short) 1400, (byte) 99, new BigDecimal(String.valueOf(1.567)), "Alpaca"));
        rows.add(RowFactory.create(4, EMPTY_STRING, EMPTY_STRING, 40, null, null, Date.valueOf("1956-11-12"), Boolean.TRUE, Timestamp.valueOf("2008-05-06 23:10:10"), 2988626110L, null, null, (byte) 99, null, "Cat"));
        rows.add(RowFactory.create(5, "Rachael", EMPTY_STRING, 22, "Rachael::", 5.85d, Date.valueOf("2005-12-24"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8260467621L, 160.7f, (short) 100, null, new BigDecimal(String.valueOf(4.343)), "Zebra"));
        rows.add(RowFactory.create(6, "Elizabeth", "Taylor", 40, "Elizabeth::Taylor", 5.85d, Date.valueOf("2011-08-08"), null, Timestamp.valueOf("2016-01-14 14:20:20"), 8732866249L, null, (short) 1400, null, new BigDecimal(String.valueOf(4.343)), "ZEBRA"));
        rows.add(RowFactory.create(7, "Jon", "Taylor", 18, "Jon::Taylor", null, Date.valueOf("2011-08-08"), Boolean.TRUE, Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 110.5f, (short) 500, (byte) 40, new BigDecimal(String.valueOf(4.343)), null));
        rows.add(RowFactory.create(8, "Rachael", EMPTY_STRING, 22, "Rachael::", 4.37d, Date.valueOf("2011-08-08"), Boolean.FALSE, Timestamp.valueOf("2008-05-06 23:10:10"), 8782348100L, null, null, null, null, "albatross"));
        rows.add(RowFactory.create(9, EMPTY_STRING, "Edmundson Jr", 11, "::Edmundson Jr", 4.88d, Date.valueOf("2007-06-07"), Boolean.FALSE, Timestamp.valueOf("2007-03-16 08:24:37"), null, 155.3f, (short) 0, (byte) 99, new BigDecimal(String.valueOf(1.567)), EMPTY_STRING));
        rows.add(RowFactory.create(10, "Jon", EMPTY_STRING, 65, "Jon::", null, Date.valueOf("1975-04-04"), Boolean.TRUE, Timestamp.valueOf("2007-03-16 08:24:31"), null, 180.6f, (short) 5000, (byte) 2, new BigDecimal(String.valueOf(4.343)), "Cat"));
        final JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sqlContext.sparkContext());
        JavaRDD<Row> dataRDD = javaSparkContext.parallelize(rows);
        DataSet dataDF = scs.toDataSet(sqlContext.createDataFrame(dataRDD, schema));
        /* Enable to debug contents of test data */
        /*
        for (Row r : dataRDD.collect()) {
            System.out.println(r.toString());
        }
        */
        ProfilerConfiguration configuration = new ProfilerConfiguration();
        configuration.setNumberOfTopNValues(3);
        StatisticsModel statsModel = profiler.profile(dataDF, configuration);
        columnStatsMap = (statsModel != null) ? (Map) statsModel.getColumnStatisticsMap() : (Map<Integer, StandardColumnStatistics>) Collections.EMPTY_MAP;
    }
}
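This setUp only populates columnStatsMap; the assertions live in the individual test methods. As a minimal, hypothetical sketch (not taken from the kylo source), a follow-up test could check that the profiler produced one statistics entry per field of the 15-column schema built above:

@Test
public void profilerCoversAllColumns() {
    // Hypothetical expectation: one entry in columnStatsMap per profiled column.
    Assert.assertNotNull(columnStatsMap);
    Assert.assertEquals(15, columnStatsMap.size());
}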
Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class TestRewriteDataFilesAction, method testRewriteDataFilesUnpartitionedTable:
@Test
public void testRewriteDataFilesUnpartitionedTable() {
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
    writeRecords(records2);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
    Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 1 data file after rewrite", 1, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
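The writeRecords helper used by these tests is not shown in the snippet. A plausible sketch, assuming it simply converts the bean list into a DataFrame and appends it to the table at tableLocation (column names follow the ThreeColumnRecord bean):

private void writeRecords(List<ThreeColumnRecord> records) {
    // Assumed helper: build a DataFrame from the beans and append it as new Iceberg data files.
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    df.select("c1", "c2", "c3")
        .write()
        .format("iceberg")
        .mode("append")
        .save(tableLocation);
}

Each append commits its own snapshot with new data files, so repeated small writes accumulate the small files that rewriteDataFiles() later compacts.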
Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class TestRewriteDataFilesAction, method testRewriteDataFilesPartitionedTable:
@Test
public void testRewriteDataFilesPartitionedTable() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
    writeRecords(records2);
    List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
    writeRecords(records3);
    List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
    writeRecords(records4);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
    Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 4 data files", 4, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 4 data files after rewrite", 4, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    expectedRecords.addAll(records3);
    expectedRecords.addAll(records4);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
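These tests share a three-column SCHEMA (c1: int, c2: string, c3: string) defined elsewhere in the class. A likely shape, sketched here as an assumption based on the column names used above (optional being the static import of Types.NestedField.optional):

// Assumed table schema behind the c1/c2/c3 columns referenced above.
private static final Schema SCHEMA = new Schema(
    optional(1, "c1", Types.IntegerType.get()),
    optional(2, "c2", Types.StringType.get()),
    optional(3, "c3", Types.StringType.get()));

With identity("c1") and truncate("c2", 2), the records above fall into four partitions: (1, "AA"), (1, "BB"), (2, "AA") and (2, "BB"), which is why the rewrite compacts the eight small files down to four, one per partition.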
Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class TestRewriteDataFilesAction, method testRewriteDataFilesWithFilter:
@Test
public void testRewriteDataFilesWithFilter() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
    writeRecords(records2);
    List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
    writeRecords(records3);
    List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
    writeRecords(records4);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c1", 1)).filter(Expressions.startsWith("c2", "AA")).execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 7 data files after rewrite", 7, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    expectedRecords.addAll(records3);
    expectedRecords.addAll(records4);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
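The two chained filter() calls narrow the rewrite to files from records1 (c1 = 1 and a c2 partition value starting with "AA"), so only two files are compacted and 8 - 2 + 1 = 7 files remain. A plausible equivalent, assuming the builder combines chained filters with AND, is to pass a single combined expression:

// Assumed equivalent of the two chained filter() calls above.
Expression combined = Expressions.and(
    Expressions.equal("c1", 1),
    Expressions.startsWith("c2", "AA"));
RewriteDataFilesActionResult filtered = actions.rewriteDataFiles().filter(combined).execute();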
Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class ReadAndWriteTablesTest, method createPartitionedTable:
@Test
public void createPartitionedTable() {
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("id").build();
    table = tables.create(schema, spec, pathToTable.toString());
    List<SimpleRecord> expected = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
    df.select("id", "data").write().format("iceberg").mode("append").save(pathToTable.toString());
    table.refresh();
}
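The test above only verifies that the partitioned write and the subsequent refresh succeed; it does not read the data back. A hedged follow-up sketch (reusing the same spark session, pathToTable, and the expected list, with hypothetical variable names) could round-trip the rows:

// Hypothetical read-back check for the partitioned table written above.
Dataset<Row> readBack = spark.read().format("iceberg").load(pathToTable.toString());
List<SimpleRecord> actual = readBack.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
Assert.assertEquals(expected, actual);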