Use of org.apache.spark.sql.vectorized.ColumnarBatch in project iceberg by apache.
Class ColumnarBatchReader, method read.
@Override
public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) {
  Preconditions.checkArgument(
      numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead);
  ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length];

  if (reuse == null) {
    closeVectors();
  }

  for (int i = 0; i < readers.length; i += 1) {
    vectorHolders[i] = readers[i].read(vectorHolders[i], numRowsToRead);
    int numRowsInVector = vectorHolders[i].numValues();
    Preconditions.checkState(
        numRowsInVector == numRowsToRead,
        "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, numRowsToRead);
    arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector);
  }

  ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors);
  batch.setNumRows(numRowsToRead);
  return batch;
}
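The construction pattern above recurs throughout these examples: wrap each column in a ColumnVector implementation, hand the array to the ColumnarBatch constructor, and set the row count. As a minimal, self-contained sketch of how such a batch can be built and consumed, the following uses Spark's built-in OnHeapColumnVector rather than Iceberg's Arrow-backed vectors; the class name, schema, and values are invented for illustration.

import java.util.Iterator;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;

public class ColumnarBatchSketch {
  public static void main(String[] args) {
    int numRows = 3;

    // Fill a single int column with example values.
    OnHeapColumnVector idCol = new OnHeapColumnVector(numRows, DataTypes.IntegerType);
    for (int i = 0; i < numRows; i++) {
      idCol.putInt(i, i * 10);
    }

    // Same construction pattern as the readers in these examples: vectors in, row count set.
    ColumnarBatch batch = new ColumnarBatch(new ColumnVector[] {idCol});
    batch.setNumRows(numRows);

    // Consumers can iterate row-wise ...
    for (Iterator<InternalRow> rows = batch.rowIterator(); rows.hasNext(); ) {
      System.out.println(rows.next().getInt(0));
    }

    // ... or read the columns directly.
    System.out.println(batch.column(0).getInt(1));

    batch.close(); // closes the underlying column vectors
  }
}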
Use of org.apache.spark.sql.vectorized.ColumnarBatch in project iceberg by apache.
Class TestSparkOrcReader, method writeAndValidateRecords.
private void writeAndValidateRecords(Schema schema, Iterable<InternalRow> expected)
    throws IOException {
  final File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<InternalRow> writer = ORC.write(Files.localOutput(testFile))
      .createWriterFunc(SparkOrcWriter::new)
      .schema(schema)
      .build()) {
    writer.addAll(expected);
  }

  try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(testFile))
      .project(schema)
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
      .build()) {
    final Iterator<InternalRow> actualRows = reader.iterator();
    final Iterator<InternalRow> expectedRows = expected.iterator();
    while (expectedRows.hasNext()) {
      Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
      assertEquals(schema, expectedRows.next(), actualRows.next());
    }
    Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
  }

  try (CloseableIterable<ColumnarBatch> reader = ORC.read(Files.localInput(testFile))
      .project(schema)
      .createBatchedReaderFunc(readOrcSchema ->
          VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of()))
      .build()) {
    final Iterator<InternalRow> actualRows = batchesToRows(reader.iterator());
    final Iterator<InternalRow> expectedRows = expected.iterator();
    while (expectedRows.hasNext()) {
      Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
      assertEquals(schema, expectedRows.next(), actualRows.next());
    }
    Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
  }
}
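The batchesToRows helper used above is not shown. One straightforward way to implement it (a sketch, not necessarily the project's exact helper) is to flatten each batch's rowIterator(), for example with Guava:

// Assumes Guava's Iterators on the classpath, plus java.util.Iterator,
// org.apache.spark.sql.catalyst.InternalRow and org.apache.spark.sql.vectorized.ColumnarBatch.
private Iterator<InternalRow> batchesToRows(Iterator<ColumnarBatch> batches) {
  // Turn each ColumnarBatch into its row iterator, then concatenate them in order.
  return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator));
}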
Use of org.apache.spark.sql.vectorized.ColumnarBatch in project tispark by pingcap.
Class TiColumnarBatchHelper, method createColumnarBatch.
public static ColumnarBatch createColumnarBatch(TiChunk chunk) {
  int colLen = chunk.numOfCols();
  TiColumnVectorAdapter[] columns = new TiColumnVectorAdapter[colLen];
  for (int i = 0; i < colLen; i++) {
    columns[i] = new TiColumnVectorAdapter(chunk.column(i));
  }
  ColumnarBatch batch = new ColumnarBatch(columns);
  batch.setNumRows(chunk.numOfRows());
  return batch;
}
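TiColumnVectorAdapter (like ArrowSchemaConverter in the next examples) is an adapter: it extends Spark's abstract org.apache.spark.sql.vectorized.ColumnVector and translates each typed accessor to the engine's native column representation, so Spark can read the data in place. A minimal sketch of such an adapter over a plain int[] (a hypothetical backing store, not TiChunk's) might look like this; accessors for types the column cannot hold simply throw.

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarArray;
import org.apache.spark.sql.vectorized.ColumnarMap;
import org.apache.spark.unsafe.types.UTF8String;

// Hypothetical adapter: exposes an int[] to Spark through the ColumnVector API.
public class IntArrayColumnVector extends ColumnVector {
  private final int[] values;

  public IntArrayColumnVector(int[] values) {
    super(DataTypes.IntegerType);
    this.values = values;
  }

  @Override public void close() {}                      // nothing to release for an on-heap array
  @Override public boolean hasNull() { return false; }  // this backing store holds no nulls
  @Override public int numNulls() { return 0; }
  @Override public boolean isNullAt(int rowId) { return false; }
  @Override public int getInt(int rowId) { return values[rowId]; }

  // Accessors for types this column cannot hold.
  @Override public boolean getBoolean(int rowId) { throw new UnsupportedOperationException(); }
  @Override public byte getByte(int rowId) { throw new UnsupportedOperationException(); }
  @Override public short getShort(int rowId) { throw new UnsupportedOperationException(); }
  @Override public long getLong(int rowId) { throw new UnsupportedOperationException(); }
  @Override public float getFloat(int rowId) { throw new UnsupportedOperationException(); }
  @Override public double getDouble(int rowId) { throw new UnsupportedOperationException(); }
  @Override public Decimal getDecimal(int rowId, int precision, int scale) { throw new UnsupportedOperationException(); }
  @Override public UTF8String getUTF8String(int rowId) { throw new UnsupportedOperationException(); }
  @Override public byte[] getBinary(int rowId) { throw new UnsupportedOperationException(); }
  @Override public ColumnarArray getArray(int rowId) { throw new UnsupportedOperationException(); }
  @Override public ColumnarMap getMap(int ordinal) { throw new UnsupportedOperationException(); }
  @Override public ColumnVector getChild(int ordinal) { throw new UnsupportedOperationException(); }
}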
Use of org.apache.spark.sql.vectorized.ColumnarBatch in project spark-bigquery-connector by GoogleCloudDataproc.
Class ArrowReaderIterator, method toArrowRows.
private Iterator<InternalRow> toArrowRows(VectorSchemaRoot root, List<String> namesInOrder) {
  ColumnVector[] columns = namesInOrder.stream()
      .map(name -> root.getVector(name))
      .map(vector -> new ArrowSchemaConverter(vector, userProvidedFieldMap.get(vector.getName())))
      .collect(Collectors.toList())
      .toArray(new ColumnVector[0]);
  ColumnarBatch batch = new ColumnarBatch(columns);
  batch.setNumRows(root.getRowCount());
  return batch.rowIterator();
}
Use of org.apache.spark.sql.vectorized.ColumnarBatch in project spark-bigquery-connector by GoogleCloudDataproc.
Class ArrowColumnBatchPartitionReaderContext, method next.
public boolean next() throws IOException {
  tracer.nextBatchNeeded();
  if (closed) {
    return false;
  }
  tracer.rowsParseStarted();

  closed = !reader.loadNextBatch();
  if (closed) {
    return false;
  }

  VectorSchemaRoot root = reader.root();
  if (currentBatch == null) {
    // trying to verify from dev@spark but this object
    // should only need to get created once. The underlying
    // vectors should stay the same.
    ColumnVector[] columns = namesInOrder.stream()
        .map(root::getVector)
        .map(vector -> new ArrowSchemaConverter(vector, userProvidedFieldMap.get(vector.getName())))
        .toArray(ColumnVector[]::new);
    currentBatch = new ColumnarBatch(columns);
  }
  currentBatch.setNumRows(root.getRowCount());
  tracer.rowsParseFinished(currentBatch.numRows());
  return true;
}
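Note the reuse here: the ColumnarBatch and its ArrowSchemaConverter wrappers are created only on the first call, and later calls just refresh the row count. As the inline comment suggests, this relies on Arrow's reader loading each new batch into the same VectorSchemaRoot vectors across loadNextBatch() calls, so re-wrapping the vectors on every batch would be unnecessary.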