use of org.apache.spark.sql.vectorized.ColumnVector in project iceberg by apache.
the class ColumnarBatchReader method read.
@Override
public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) {
  Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead);
  ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length];
  if (reuse == null) {
    closeVectors();
  }
  for (int i = 0; i < readers.length; i += 1) {
    vectorHolders[i] = readers[i].read(vectorHolders[i], numRowsToRead);
    int numRowsInVector = vectorHolders[i].numValues();
    Preconditions.checkState(numRowsInVector == numRowsToRead,
        "Number of rows in the vector %s didn't match expected %s", numRowsInVector, numRowsToRead);
    arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector);
  }
  ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors);
  batch.setNumRows(numRowsToRead);
  return batch;
}
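The pattern here is the standard one: fill one ColumnVector per column, wrap the array in a ColumnarBatch, then declare the row count with setNumRows. A minimal, self-contained sketch of that wrap-and-consume pattern using Spark's internal OnHeapColumnVector rather than Iceberg's Arrow-backed vectors (the column name and values are made up for illustration):
import java.util.Iterator;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;

public class ColumnarBatchSketch {
  public static void main(String[] args) {
    int numRows = 3;

    // One vector per column; here a single INT column filled by hand.
    OnHeapColumnVector ids = new OnHeapColumnVector(numRows, DataTypes.IntegerType);
    for (int row = 0; row < numRows; row++) {
      ids.putInt(row, row * 10);
    }

    // Wrap the vectors in a batch and declare how many rows are valid,
    // mirroring what ColumnarBatchReader.read does with its Arrow-backed vectors.
    ColumnVector[] columns = new ColumnVector[] { ids };
    ColumnarBatch batch = new ColumnarBatch(columns);
    batch.setNumRows(numRows);

    // Consumers can read the batch column-wise or row-wise.
    Iterator<InternalRow> rows = batch.rowIterator();
    while (rows.hasNext()) {
      System.out.println(rows.next().getInt(0));
    }

    batch.close(); // frees the column vectors
  }
}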
use of org.apache.spark.sql.vectorized.ColumnVector in project iceberg by apache.
the class TestHelpers method assertEqualsBatch.
public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch,
    boolean checkArrowValidityVector) {
  for (int rowId = 0; rowId < batch.numRows(); rowId++) {
    List<Types.NestedField> fields = struct.fields();
    InternalRow row = batch.getRow(rowId);
    Record rec = expected.next();
    for (int i = 0; i < fields.size(); i += 1) {
      Type fieldType = fields.get(i).type();
      Object expectedValue = rec.get(i);
      Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
      assertEqualsUnsafe(fieldType, expectedValue, actualValue);
      if (checkArrowValidityVector) {
        ColumnVector columnVector = batch.column(i);
        ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector();
        Assert.assertFalse("Nullability doesn't match for " + columnVector.dataType(),
            expectedValue == null ^ arrowVector.isNull(rowId));
      }
    }
  }
}
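The validity check relies on Java operator precedence: == binds tighter than ^, so the expression reads (expectedValue == null) ^ arrowVector.isNull(rowId) and trips exactly when one side is null and the other is not. A standalone sketch of the same XOR-style check against a plain Arrow IntVector, outside Iceberg's test helpers (column name and expected values are hypothetical):
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;

public class ValidityCheckSketch {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator();
         IntVector vector = new IntVector("id", allocator)) {

      // Expected values; null marks a row that should be null in the vector.
      Integer[] expected = { 1, null, 3 };

      vector.allocateNew(expected.length);
      for (int row = 0; row < expected.length; row++) {
        if (expected[row] == null) {
          vector.setNull(row);
        } else {
          vector.setSafe(row, expected[row]);
        }
      }
      vector.setValueCount(expected.length);

      // Same XOR check as assertEqualsBatch: a mismatch means exactly one of
      // "expected is null" / "vector says null" is true.
      for (int row = 0; row < expected.length; row++) {
        if ((expected[row] == null) ^ vector.isNull(row)) {
          throw new AssertionError("Nullability doesn't match at row " + row);
        }
      }
    }
  }
}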
use of org.apache.spark.sql.vectorized.ColumnVector in project spark-bigquery-connector by GoogleCloudDataproc.
the class ArrowReaderIterator method toArrowRows.
private Iterator<InternalRow> toArrowRows(VectorSchemaRoot root, List<String> namesInOrder) {
  ColumnVector[] columns =
      namesInOrder.stream()
          .map(root::getVector)
          .map(vector -> new ArrowSchemaConverter(vector, userProvidedFieldMap.get(vector.getName())))
          .toArray(ColumnVector[]::new);
  ColumnarBatch batch = new ColumnarBatch(columns);
  batch.setNumRows(root.getRowCount());
  return batch.rowIterator();
}
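ArrowSchemaConverter is the connector's own ColumnVector adapter; per the call above it carries the user-provided field metadata for each column. When no connector-specific type mapping is needed, Spark ships a generic adapter, org.apache.spark.sql.vectorized.ArrowColumnVector, and the method body reduces to a plain Arrow-to-Spark bridge. A sketch under that assumption (it would not apply the connector's field overrides):
import java.util.Iterator;
import java.util.List;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ArrowColumnVector;
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;

public class ArrowToSparkRows {

  // Generic version of toArrowRows: wrap each Arrow vector in Spark's ArrowColumnVector.
  static Iterator<InternalRow> toRows(VectorSchemaRoot root, List<String> namesInOrder) {
    ColumnVector[] columns =
        namesInOrder.stream()
            .map(root::getVector)
            .map(ArrowColumnVector::new)
            .toArray(ColumnVector[]::new);

    ColumnarBatch batch = new ColumnarBatch(columns);
    batch.setNumRows(root.getRowCount());

    // The rows are backed by the Arrow buffers in `root`; they stay valid only
    // until the next batch is loaded into the same root.
    return batch.rowIterator();
  }
}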
use of org.apache.spark.sql.vectorized.ColumnVector in project spark-bigquery-connector by GoogleCloudDataproc.
the class ArrowColumnBatchPartitionReaderContext method next.
public boolean next() throws IOException {
  tracer.nextBatchNeeded();
  if (closed) {
    return false;
  }
  tracer.rowsParseStarted();
  closed = !reader.loadNextBatch();
  if (closed) {
    return false;
  }
  VectorSchemaRoot root = reader.root();
  if (currentBatch == null) {
    // Still being verified with dev@spark, but this object should only need to be
    // created once: the underlying vectors stay the same across batches.
    ColumnVector[] columns = namesInOrder.stream()
        .map(root::getVector)
        .map(vector -> new ArrowSchemaConverter(vector, userProvidedFieldMap.get(vector.getName())))
        .toArray(ColumnVector[]::new);
    currentBatch = new ColumnarBatch(columns);
  }
  currentBatch.setNumRows(root.getRowCount());
  tracer.rowsParseFinished(currentBatch.numRows());
  return true;
}
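This next() implementation follows the shape of Spark's DataSource V2 PartitionReader contract: next() loads data and reports whether more is available, get() returns the current batch (which may reuse the same ColumnarBatch and vectors across calls), and close() releases resources. A minimal skeleton of that contract backed by an ArrowStreamReader; this is a hypothetical reader, not the connector's actual class, and it uses Arrow's getVectorSchemaRoot() rather than the connector's own reader.root():
import java.io.IOException;
import java.io.InputStream;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.apache.spark.sql.vectorized.ArrowColumnVector;
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;

// Skeleton PartitionReader: next() loads an Arrow batch, get() hands out a reused ColumnarBatch.
public class ArrowBatchPartitionReader implements PartitionReader<ColumnarBatch> {

  private final ArrowStreamReader reader;
  private ColumnarBatch currentBatch; // created once, reused for every batch

  public ArrowBatchPartitionReader(InputStream in, BufferAllocator allocator) {
    this.reader = new ArrowStreamReader(in, allocator);
  }

  @Override
  public boolean next() throws IOException {
    if (!reader.loadNextBatch()) {
      return false; // stream exhausted
    }
    VectorSchemaRoot root = reader.getVectorSchemaRoot();
    if (currentBatch == null) {
      // The root's vectors are stable across loadNextBatch() calls,
      // so the wrapping ColumnarBatch only needs to be built once.
      ColumnVector[] columns = root.getFieldVectors().stream()
          .map(ArrowColumnVector::new)
          .toArray(ColumnVector[]::new);
      currentBatch = new ColumnarBatch(columns);
    }
    currentBatch.setNumRows(root.getRowCount());
    return true;
  }

  @Override
  public ColumnarBatch get() {
    return currentBatch;
  }

  @Override
  public void close() throws IOException {
    reader.close(); // the reader owns the root and its vectors
  }
}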
use of org.apache.spark.sql.vectorized.ColumnVector in project TileDB-Spark by TileDB-Inc.
the class TileDBDataReaderPartitionScan method get.
@Override
public ColumnarBatch get() {
  metricsUpdater.startTimer(queryGetTimerName);
  try {
    int nRows = (int) currentNumRecords;
    if (resultBatch == null) {
      ColumnVector[] colVecs = new ColumnVector[valueVectors.size()];
      for (int i = 0; i < valueVectors.size(); i++) {
        String name = fieldNames.get(i);
        TypeInfo typeInfo = getTypeInfo(name);
        boolean isDateType = typeInfo.multiplier != 1 || typeInfo.moreThanDay;
        if (typeInfo.isNullable) {
          // If the attribute is nullable, the validity buffer of the main value vector
          // has to be set in bitmap fashion. TileDB handles the bitmap as a bytemap,
          // hence the following conversion.
          ArrowBuf arrowBufValidity = valueVectors.get(i).getValidityBuffer();
          ArrowBuf validityByteBuffer = validityValueVectors.get(i).getDataBuffer();
          for (int j = 0; j < nRows; j++) {
            if (validityByteBuffer.getByte(j) == (byte) 0) {
              BitVectorHelper.setValidityBit(arrowBufValidity, j, 0);
            }
          }
        }
        if (isDateType) {
          if (typeInfo.isVarLen) {
            throw new TileDBError(
                "Var length attributes/dimensions of type TILEDB_DATETIME_* are not currently supported: " + name);
          }
          // The datatype is a date, so the values need filtering to accommodate the
          // fewer date types that Spark provides compared to TileDB.
          filterDataBufferForDateTypes(valueVectors.get(i).getDataBuffer(), currentNumRecords, typeInfo);
        }
        colVecs[i] = new ArrowColumnVector(valueVectors.get(i));
      }
      resultBatch = new ColumnarBatch(colVecs);
    }
    resultBatch.setNumRows(nRows);
    // Note that calculateNativeArrayByteSizes() might not be
    this.metricsUpdater.updateTaskMetrics(nRows, calculateResultByteSize());
  } catch (TileDBError err) {
    throw new RuntimeException(err.getMessage());
  }
  metricsUpdater.finish(queryGetTimerName);
  return resultBatch;
}
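The interesting step here is the validity conversion: TileDB reports validity as one byte per value, while Arrow stores it as a bitmap, so each zero byte has to clear the corresponding bit via BitVectorHelper. A self-contained sketch of that bytemap-to-bitmap conversion using a plain byte[] and an Arrow IntVector; the column name and data are hypothetical, not TileDB buffers:
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.BitVectorHelper;
import org.apache.arrow.vector.IntVector;

public class BytemapToBitmapSketch {
  public static void main(String[] args) {
    // One byte per value: 0 = null, 1 = valid (TileDB-style bytemap).
    byte[] validityBytemap = { 1, 0, 1, 1, 0 };

    try (BufferAllocator allocator = new RootAllocator();
         IntVector values = new IntVector("attr", allocator)) {

      values.allocateNew(validityBytemap.length);
      for (int row = 0; row < validityBytemap.length; row++) {
        values.set(row, row * 100); // every row starts out valid
      }
      values.setValueCount(validityBytemap.length);

      // Clear the Arrow validity bit wherever the bytemap says null.
      for (int row = 0; row < validityBytemap.length; row++) {
        if (validityBytemap[row] == (byte) 0) {
          BitVectorHelper.setValidityBit(values.getValidityBuffer(), row, 0);
        }
      }

      for (int row = 0; row < validityBytemap.length; row++) {
        if (values.isNull(row)) {
          System.out.println("null");
        } else {
          System.out.println(values.get(row));
        }
      }
    }
  }
}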