Use of org.apache.spark.sql.catalyst.InternalRow in project RemoteShuffleService by alibaba.
The class RssShuffleWriterSuiteJ, method getUnsafeRowIterator:
private Iterator<Product2<Integer, UnsafeRow>> getUnsafeRowIterator(final int size, final AtomicInteger total, final boolean mix) {
  int current = 0;
  ListBuffer<Product2<Integer, UnsafeRow>> list = new ListBuffer<>();
  while (current < size) {
    int key = total.getAndIncrement();
    String value = key + ": " + (mix && rand.nextBoolean() ? GIANT_RECORD : NORMAL_RECORD);
    current += value.length();
    ListBuffer<Object> values = new ListBuffer<>();
    values.$plus$eq(key);
    values.$plus$eq(UTF8String.fromString(value));
    InternalRow row = InternalRow.apply(values.toSeq());
    DataType[] types = new DataType[2];
    types[0] = IntegerType$.MODULE$;
    types[1] = StringType$.MODULE$;
    UnsafeRow unsafeRow = UnsafeProjection.create(types).apply(row);
    list.$plus$eq(new Tuple2<>(key % numPartitions, unsafeRow));
  }
  return list.toIterator();
}
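The heart of this helper is the InternalRow-to-UnsafeRow conversion. Below is a minimal, self-contained sketch of that step, assuming Spark SQL is on the classpath; the class name and literal values are illustrative and not taken from the test:

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.unsafe.types.UTF8String;

public class UnsafeRowConversionSketch {
  public static void main(String[] args) {
    // Build a generic InternalRow with an int column and a string column.
    // String values must be stored as UTF8String, not java.lang.String.
    InternalRow row = new GenericInternalRow(new Object[] {42, UTF8String.fromString("42: record")});

    // Project the row into Spark's compact binary UnsafeRow layout,
    // much as getUnsafeRowIterator does for each generated record.
    DataType[] types = new DataType[] {DataTypes.IntegerType, DataTypes.StringType};
    UnsafeRow unsafeRow = UnsafeProjection.create(types).apply(row);

    System.out.println(unsafeRow.getInt(0) + ", " + unsafeRow.getUTF8String(1));
  }
}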
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkParquetReadMetadataColumns, method readAndValidate:
private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List<InternalRow> expected) throws IOException {
  Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA);
  if (vectorized) {
    builder.createBatchedReaderFunc(fileSchema ->
        VectorizedSparkParquetReaders.buildReader(PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED));
    builder.recordsPerBatch(RECORDS_PER_BATCH);
  } else {
    builder = builder.createReaderFunc(msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType));
  }
  if (filter != null) {
    builder = builder.filter(filter);
  }
  if (splitStart != null && splitLength != null) {
    builder = builder.split(splitStart, splitLength);
  }
  try (CloseableIterable<InternalRow> reader = vectorized ? batchesToRows(builder.build()) : builder.build()) {
    final Iterator<InternalRow> actualRows = reader.iterator();
    for (InternalRow internalRow : expected) {
      Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
      TestHelpers.assertEquals(PROJECTION_SCHEMA, internalRow, actualRows.next());
    }
    Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
  }
}
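Once the reader is built, the returned InternalRows are accessed positionally. A hedged sketch of draining such a reader follows, assuming (hypothetically) that the first two projected columns are a long and a non-null string; this helper is not part of the Iceberg test:

import java.io.IOException;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.spark.sql.catalyst.InternalRow;

public class ReaderDrainSketch {
  // Illustrative only: the column ordinals and types are assumptions, not PROJECTION_SCHEMA.
  static void printRows(CloseableIterable<InternalRow> reader) throws IOException {
    try (CloseableIterable<InternalRow> rows = reader) {
      for (InternalRow row : rows) {
        long id = row.getLong(0);                      // assumed bigint column at ordinal 0
        String data = row.getUTF8String(1).toString(); // strings come back as UTF8String
        System.out.println(id + " -> " + data);
      }
    }
  }
}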
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class IcebergSourceDeleteBenchmark, method writePosDeletes:
protected void writePosDeletes(CharSequence path, List<Long> deletedPos, int numNoise) throws IOException {
  OutputFileFactory fileFactory = newFileFactory();
  SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table())
      .dataFileFormat(fileFormat())
      .build();
  ClusteredPositionDeleteWriter<InternalRow> writer = new ClusteredPositionDeleteWriter<>(
      writerFactory, fileFactory, table().io(), fileFormat(), TARGET_FILE_SIZE_IN_BYTES);
  PartitionSpec unpartitionedSpec = table().specs().get(0);
  PositionDelete<InternalRow> positionDelete = PositionDelete.create();
  try (ClusteredPositionDeleteWriter<InternalRow> closeableWriter = writer) {
    for (Long pos : deletedPos) {
      positionDelete.set(path, pos, null);
      closeableWriter.write(positionDelete, unpartitionedSpec, null);
      for (int i = 0; i < numNoise; i++) {
        positionDelete.set(noisePath(path), pos, null);
        closeableWriter.write(positionDelete, unpartitionedSpec, null);
      }
    }
  }
  RowDelta rowDelta = table().newRowDelta();
  writer.result().deleteFiles().forEach(rowDelta::addDeletes);
  rowDelta.validateDeletedFiles().commit();
}
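The newFileFactory() helper is not shown in this excerpt. As a hedged sketch only, an OutputFileFactory is typically built with Iceberg's builder API along the following lines; the partition and task ids are placeholder values, and this is not the benchmark's actual helper:

import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.OutputFileFactory;

public class FileFactorySketch {
  // Placeholder partitionId/taskId values (1, 1); real code would thread through task context.
  static OutputFileFactory newFileFactory(Table table, FileFormat format) {
    return OutputFileFactory.builderFor(table, 1, 1)
        .format(format)
        .build();
  }
}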
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class WritersBenchmark, method writePartitionedClusteredDataWriter:
@Benchmark
@Threads(1)
public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOException {
  FileIO io = table().io();
  OutputFileFactory fileFactory = newFileFactory();
  SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table())
      .dataFileFormat(fileFormat())
      .dataSchema(table().schema())
      .build();
  ClusteredDataWriter<InternalRow> writer = new ClusteredDataWriter<>(
      writerFactory, fileFactory, io, fileFormat(), TARGET_FILE_SIZE_IN_BYTES);
  PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema());
  StructType dataSparkType = SparkSchemaUtil.convert(table().schema());
  InternalRowWrapper internalRowWrapper = new InternalRowWrapper(dataSparkType);
  try (ClusteredDataWriter<InternalRow> closeableWriter = writer) {
    for (InternalRow row : rows) {
      partitionKey.partition(internalRowWrapper.wrap(row));
      closeableWriter.write(row, partitionedSpec, partitionKey);
    }
  }
  blackhole.consume(writer);
}
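The benchmark only feeds the closed writer to the Blackhole. Outside a benchmark, the produced data files would normally be committed to the table; a hedged sketch of that follow-up step, assuming the writer has already been closed (this commit is not in the source):

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.ClusteredDataWriter;
import org.apache.spark.sql.catalyst.InternalRow;

public class ClusteredWriterCommitSketch {
  // Illustrative only: append every data file produced by a closed ClusteredDataWriter.
  static void commitDataFiles(Table table, ClusteredDataWriter<InternalRow> closedWriter) {
    AppendFiles append = table.newAppend();
    closedWriter.result().dataFiles().forEach(append::appendFile);
    append.commit();
  }
}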
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class WritersBenchmark, method writeUnpartitionedLegacyDataWriter:
@Benchmark
@Threads(1)
public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOException {
  FileIO io = table().io();
  OutputFileFactory fileFactory = newFileFactory();
  Schema writeSchema = table().schema();
  StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema);
  SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType)
      .spec(unpartitionedSpec)
      .build();
  TaskWriter<InternalRow> writer = new UnpartitionedWriter<>(
      unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES);
  try (TaskWriter<InternalRow> closableWriter = writer) {
    for (InternalRow row : rows) {
      closableWriter.write(row);
    }
  }
  blackhole.consume(writer.complete());
}
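In contrast to the clustered writers above, the legacy TaskWriter path reports its output through complete()/dataFiles() rather than result(). A hedged sketch of committing those files once the writer is closed, not part of the benchmark, which only consumes the result via the Blackhole:

import java.io.IOException;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.TaskWriter;
import org.apache.spark.sql.catalyst.InternalRow;

public class TaskWriterCommitSketch {
  // Illustrative only: append the data files reported by a closed TaskWriter.
  static void commitTaskWriterOutput(Table table, TaskWriter<InternalRow> closedWriter) throws IOException {
    AppendFiles append = table.newAppend();
    for (DataFile file : closedWriter.dataFiles()) {
      append.appendFile(file);
    }
    append.commit();
  }
}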