Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
From the class TestSparkPositionDeltaWriters, method toSet.
@Override
protected StructLikeSet toSet(Iterable<InternalRow> rows) {
  StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
  StructType sparkType = SparkSchemaUtil.convert(table.schema());
  for (InternalRow row : rows) {
    // use a fresh wrapper per row: wrap() returns the wrapper bound to the given row,
    // so reusing a single instance would add the same mutable object repeatedly
    InternalRowWrapper wrapper = new InternalRowWrapper(sparkType);
    set.add(wrapper.wrap(row));
  }
  return set;
}
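For context, a minimal sketch of how such a conversion might be exercised from a test, assuming a two-column schema (id int, data string); the literal rows and the Arrays.asList call are illustrative, not taken from the Iceberg test:

InternalRow first = new GenericInternalRow(new Object[] {1, UTF8String.fromString("a")});
InternalRow second = new GenericInternalRow(new Object[] {2, UTF8String.fromString("b")});

// toSet wraps each Spark row as an Iceberg StructLike, so equality and
// containment checks follow the Iceberg schema rather than Spark internals
StructLikeSet actual = toSet(Arrays.asList(first, second));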
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
From the class TestSparkRollingFileWriters, method toRow.
@Override
protected InternalRow toRow(Integer id, String data) {
  InternalRow row = new GenericInternalRow(2);
  row.update(0, id);
  row.update(1, UTF8String.fromString(data));
  return row;
}
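The same row can also be built in one step with GenericInternalRow's array-based constructor; a minimal sketch, assuming the same two-field layout as above (ordinal 0 = id, ordinal 1 = data):

InternalRow row = new GenericInternalRow(new Object[] {
    id,                            // ordinal 0: the Integer id
    UTF8String.fromString(data)    // ordinal 1: the data string as Spark's UTF8String
});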
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
From the class TestPartitionPruning, method createTestDataset.
private Dataset<Row> createTestDataset() {
  List<InternalRow> rows = LOGS.stream().map(logMessage -> {
    Object[] underlying = new Object[] {
        logMessage.getId(),
        UTF8String.fromString(logMessage.getDate()),
        UTF8String.fromString(logMessage.getLevel()),
        UTF8String.fromString(logMessage.getMessage()),
        // discard the nanoseconds part to simplify
        TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli())
    };
    return new GenericInternalRow(underlying);
  }).collect(Collectors.toList());
  JavaRDD<InternalRow> rdd = sparkContext.parallelize(rows);
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false);
  return df
      .selectExpr("id", "date", "level", "message", "timestamp")
      .selectExpr("id", "date", "level", "message", "timestamp",
          "bucket3(id) AS bucket_id", "truncate5(message) AS truncated_message", "hour(timestamp) AS ts_hour");
}
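The conversion through toEpochMilli() and TimeUnit.MILLISECONDS.toMicros reflects that Spark represents timestamps internally as microseconds since the epoch; a small arithmetic sketch (the Instant literal is illustrative):

Instant ts = Instant.parse("2020-02-02T00:00:00.123456789Z");
long micros = TimeUnit.MILLISECONDS.toMicros(ts.toEpochMilli());
// micros == 1580601600123000L; the sub-millisecond part (456789 ns) is dropped
// because the value is routed through milliseconds first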
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
From the class TestSparkAppenderFactory, method expectedRowSet.
@Override
protected StructLikeSet expectedRowSet(Iterable<InternalRow> rows) {
  StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
  for (InternalRow row : rows) {
    InternalRowWrapper wrapper = new InternalRowWrapper(sparkType);
    set.add(wrapper.wrap(row));
  }
  return set;
}
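In the appender factory tests this set is typically compared against rows read back from the files produced by the appender; a hedged sketch, where writeAndReadBack is a hypothetical stand-in for the actual write/read plumbing in the test:

List<InternalRow> input = Arrays.asList(
    new GenericInternalRow(new Object[] {1, UTF8String.fromString("a")}));

// hypothetical helper: write the rows with the appender under test, then read them back
List<InternalRow> roundTripped = writeAndReadBack(input);

// order-insensitive comparison via Iceberg's StructLikeSet
Assert.assertEquals("Round-tripped rows should match the input",
    expectedRowSet(input), expectedRowSet(roundTripped));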
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
From the class TestSparkDataFile, method checkSparkDataFile.
private void checkSparkDataFile(Table table) throws IOException {
  Iterable<InternalRow> rows = RandomData.generateSpark(table.schema(), 200, 0);
  JavaRDD<InternalRow> rdd = sparkContext.parallelize(Lists.newArrayList(rows));
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false);
  df.write().format("iceberg").mode("append").save(tableLocation);
  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifest", 1, manifests.size());

  List<DataFile> dataFiles = Lists.newArrayList();
  try (ManifestReader<DataFile> reader = ManifestFiles.read(manifests.get(0), table.io())) {
    for (DataFile dataFile : reader) {
      checkDataFile(dataFile.copy(), DataFiles.builder(table.spec()).copy(dataFile).build());
      dataFiles.add(dataFile.copy());
    }
  }

  // read the same files back through the "#files" metadata table
  Dataset<Row> dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files");

  // reorder columns to test arbitrary projections
  List<Column> columns = Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList());
  Collections.shuffle(columns);

  List<Row> sparkDataFiles = dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList();
  Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size());

  Types.StructType dataFileType = DataFile.getType(table.spec().partitionType());
  StructType sparkDataFileType = sparkDataFiles.get(0).schema();
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkDataFileType);
  for (int i = 0; i < dataFiles.size(); i++) {
    checkDataFile(dataFiles.get(i), wrapper.wrap(sparkDataFiles.get(i)));
  }
}
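checkDataFile itself is not shown in this snippet; a hedged sketch of the kind of field-by-field comparison it performs, using only accessors that exist on Iceberg's DataFile interface (the real method in TestSparkDataFile also checks partition values and column-level metrics):

private void checkDataFile(DataFile expected, DataFile actual) {
  Assert.assertEquals("Path must match", expected.path(), actual.path());
  Assert.assertEquals("Format must match", expected.format(), actual.format());
  Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount());
  Assert.assertEquals("File size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes());
}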