Use of org.apache.drill.exec.physical.resultSet.RowSetLoader in project drill by apache.
The class TestResultSetLoaderUnions, method testSimpleListDynamic.
/**
* Test a simple list created dynamically at load time.
* The list must include a single type member.
*/
@Test
public void testSimpleListDynamic() {
final ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator());
final RowSetLoader writer = rsLoader.writer();
// Can write a batch as if this were a repeated Varchar, except
// that any value can also be null.
rsLoader.startBatch();
writer.addColumn(MaterializedField.create("id", Types.required(MinorType.INT)));
final ColumnMetadata colSchema = MetadataUtils.newVariant("list", DataMode.REPEATED);
colSchema.variantSchema().addType(MinorType.VARCHAR);
colSchema.variantSchema().becomeSimple();
writer.addColumn(colSchema);
// Sanity check: should be an array of Varchar because we said the
// set of types within the list is not expandable.
final ArrayWriter arrWriter = writer.array("list");
assertEquals(ObjectType.SCALAR, arrWriter.entryType());
final ScalarWriter strWriter = arrWriter.scalar();
assertEquals(ValueType.STRING, strWriter.valueType());
writer
  .addRow(1, strArray("fred", "barney"))
  .addRow(2, null)
  .addRow(3, strArray("wilma", "betty", "pebbles"));
// Verify
final TupleMetadata schema = new SchemaBuilder()
  .add("id", MinorType.INT)
  .addList("list")
    .addType(MinorType.VARCHAR)
    .resumeSchema()
  .buildSchema();
final SingleRowSet expected = fixture.rowSetBuilder(schema)
  .addRow(1, strArray("fred", "barney"))
  .addRow(2, null)
  .addRow(3, strArray("wilma", "betty", "pebbles"))
  .build();
RowSetUtilities.verify(expected, fixture.wrap(rsLoader.harvest()));
rsLoader.close();
}
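For orientation, the snippets on this page all follow the same write lifecycle: create the loader, declare columns (up front or mid-batch), start a batch, write rows, harvest the batch, and close the loader. The sketch below is a hypothetical distillation of that lifecycle, not code from the Drill test suite; it reuses only calls that appear in the snippets on this page, assumes the same test fixture (fixture) and static imports as the test above, and the column names "x" and "y" are illustrative.

@Test
public void sketchLoaderLifecycle() {
  // Hypothetical sketch: the core ResultSetLoader / RowSetLoader lifecycle,
  // distilled from the tests on this page.
  final ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator());
  final RowSetLoader writer = rsLoader.writer();
  // Columns may be added before the batch starts or while it is in progress.
  writer.addColumn(SchemaBuilder.columnSchema("x", MinorType.INT, DataMode.REQUIRED));
  rsLoader.startBatch();
  writer.addColumn(SchemaBuilder.columnSchema("y", MinorType.VARCHAR, DataMode.OPTIONAL));
  // Whole-row form...
  writer.addRow(1, "a");
  // ...or the column-at-a-time form used in the schema-change tests below.
  writer.start();
  writer.scalar(0).setInt(2);
  writer.scalar(1).setString("b");
  writer.save();
  // Harvest hands the batch to the caller; close releases loader resources.
  final RowSet batch = fixture.wrap(rsLoader.harvest());
  assertEquals(2, batch.rowCount());
  batch.clear();
  rsLoader.close();
}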
Use of org.apache.drill.exec.physical.resultSet.RowSetLoader in project drill by apache.
The class TestResultSetSchemaChange, method testSchemaChangeFirstBatch.
/**
* Test the case where the schema changes in the first batch.
* Schema changes before the first record are trivial and tested
* elsewhere. Here we write some records, then add new columns, as a
* JSON reader might do.
*/
@Test
public void testSchemaChangeFirstBatch() {
ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator());
RowSetLoader rootWriter = rsLoader.writer();
rootWriter.addColumn(SchemaBuilder.columnSchema("a", MinorType.VARCHAR, DataMode.REQUIRED));
// Create initial rows
rsLoader.startBatch();
int rowCount = 0;
for (int i = 0; i < 2; i++) {
rootWriter.start();
rowCount++;
rootWriter.scalar(0).setString("a_" + rowCount);
rootWriter.save();
}
// Add a second column: nullable.
rootWriter.addColumn(SchemaBuilder.columnSchema("b", MinorType.INT, DataMode.OPTIONAL));
for (int i = 0; i < 2; i++) {
rootWriter.start();
rowCount++;
rootWriter.scalar(0).setString("a_" + rowCount);
rootWriter.scalar(1).setInt(rowCount);
rootWriter.save();
}
// Add a third column. Use variable-width so that offset
// vectors must be back-filled.
rootWriter.addColumn(SchemaBuilder.columnSchema("c", MinorType.VARCHAR, DataMode.OPTIONAL));
for (int i = 0; i < 2; i++) {
rootWriter.start();
rowCount++;
rootWriter.scalar(0).setString("a_" + rowCount);
rootWriter.scalar(1).setInt(rowCount);
rootWriter.scalar(2).setString("c_" + rowCount);
rootWriter.save();
}
// Fourth: add a required Varchar; previous rows are back-filled with empty strings.
// Also add a required int, back-filled with zeros.
// This may occasionally be useful, but it does have to work to prevent
// vector corruption if some reader decides to go this route.
rootWriter.addColumn(SchemaBuilder.columnSchema("d", MinorType.VARCHAR, DataMode.REQUIRED));
rootWriter.addColumn(SchemaBuilder.columnSchema("e", MinorType.INT, DataMode.REQUIRED));
for (int i = 0; i < 2; i++) {
rootWriter.start();
rowCount++;
rootWriter.scalar(0).setString("a_" + rowCount);
rootWriter.scalar(1).setInt(rowCount);
rootWriter.scalar(2).setString("c_" + rowCount);
rootWriter.scalar(3).setString("d_" + rowCount);
rootWriter.scalar(4).setInt(rowCount * 10);
rootWriter.save();
}
// Add an array. Now two offset vectors must be back-filled.
rootWriter.addColumn(SchemaBuilder.columnSchema("f", MinorType.VARCHAR, DataMode.REPEATED));
for (int i = 0; i < 2; i++) {
rootWriter.start();
rowCount++;
rootWriter.scalar(0).setString("a_" + rowCount);
rootWriter.scalar(1).setInt(rowCount);
rootWriter.scalar(2).setString("c_" + rowCount);
rootWriter.scalar(3).setString("d_" + rowCount);
rootWriter.scalar(4).setInt(rowCount * 10);
ScalarWriter arrayWriter = rootWriter.column(5).array().scalar();
arrayWriter.setString("f_" + rowCount + "-1");
arrayWriter.setString("f_" + rowCount + "-2");
rootWriter.save();
}
// Harvest the batch and verify.
RowSet actual = fixture.wrap(rsLoader.harvest());
TupleMetadata expectedSchema = new SchemaBuilder()
  .add("a", MinorType.VARCHAR)
  .addNullable("b", MinorType.INT)
  .addNullable("c", MinorType.VARCHAR)
  .add("d", MinorType.VARCHAR)
  .add("e", MinorType.INT)
  .addArray("f", MinorType.VARCHAR)
  .buildSchema();
SingleRowSet expected = fixture.rowSetBuilder(expectedSchema)
  .addRow("a_1", null, null, "", 0, strArray())
  .addRow("a_2", null, null, "", 0, strArray())
  .addRow("a_3", 3, null, "", 0, strArray())
  .addRow("a_4", 4, null, "", 0, strArray())
  .addRow("a_5", 5, "c_5", "", 0, strArray())
  .addRow("a_6", 6, "c_6", "", 0, strArray())
  .addRow("a_7", 7, "c_7", "d_7", 70, strArray())
  .addRow("a_8", 8, "c_8", "d_8", 80, strArray())
  .addRow("a_9", 9, "c_9", "d_9", 90, strArray("f_9-1", "f_9-2"))
  .addRow("a_10", 10, "c_10", "d_10", 100, strArray("f_10-1", "f_10-2"))
  .build();
RowSetUtilities.verify(expected, actual);
rsLoader.close();
}
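The expected row set above encodes the back-fill rules: when a column is added mid-batch, rows already written receive null if the column is nullable, an empty string or zero if it is a required Varchar or int, and an empty array if it is repeated. The sketch below is a hypothetical condensation of that behavior, not code from the Drill test suite; it assumes the same fixture and imports as the test above, and the data values are illustrative.

@Test
public void sketchMidBatchBackFill() {
  // Hypothetical sketch of mid-batch column addition and back-filling,
  // condensed from testSchemaChangeFirstBatch above.
  ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator());
  RowSetLoader rootWriter = rsLoader.writer();
  rootWriter.addColumn(SchemaBuilder.columnSchema("a", MinorType.VARCHAR, DataMode.REQUIRED));
  rsLoader.startBatch();
  // Written while only column "a" exists.
  rootWriter.addRow("a_1");
  // Add a nullable and a required column; the row above must be back-filled.
  rootWriter.addColumn(SchemaBuilder.columnSchema("b", MinorType.INT, DataMode.OPTIONAL));
  rootWriter.addColumn(SchemaBuilder.columnSchema("c", MinorType.VARCHAR, DataMode.REQUIRED));
  rootWriter.addRow("a_2", 2, "c_2");
  TupleMetadata expectedSchema = new SchemaBuilder()
    .add("a", MinorType.VARCHAR)
    .addNullable("b", MinorType.INT)
    .add("c", MinorType.VARCHAR)
    .buildSchema();
  SingleRowSet expected = fixture.rowSetBuilder(expectedSchema)
    .addRow("a_1", null, "")   // "b" back-filled with null, "c" with an empty string
    .addRow("a_2", 2, "c_2")
    .build();
  RowSetUtilities.verify(expected, fixture.wrap(rsLoader.harvest()));
  rsLoader.close();
}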
Use of org.apache.drill.exec.physical.resultSet.RowSetLoader in project drill by apache.
The class TestResultSetSchemaChange, method testSchemaChangeWithOverflow.
/**
* Test a schema change on the row that overflows. If the
* new column is added after overflow, it will appear as
* a schema change in the following batch. This is fine, as
* we are essentially time-shifting: pretending that the
* overflow row was written in the next batch (which, in
* fact, it is: that's what overflow means).
*/
@Test
public void testSchemaChangeWithOverflow() {
ResultSetOptions options = new ResultSetOptionBuilder().rowCountLimit(ValueVector.MAX_ROW_COUNT).build();
ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator(), options);
RowSetLoader rootWriter = rsLoader.writer();
rootWriter.addColumn(SchemaBuilder.columnSchema("a", MinorType.VARCHAR, DataMode.REQUIRED));
rsLoader.startBatch();
byte[] value = new byte[512];
Arrays.fill(value, (byte) 'X');
int count = 0;
while (!rootWriter.isFull()) {
rootWriter.start();
rootWriter.scalar(0).setBytes(value, value.length);
if (rootWriter.isFull()) {
rootWriter.addColumn(SchemaBuilder.columnSchema("b", MinorType.INT, DataMode.OPTIONAL));
rootWriter.scalar(1).setInt(count);
// Add a Varchar to ensure its offset fiddling is done properly
rootWriter.addColumn(SchemaBuilder.columnSchema("c", MinorType.VARCHAR, DataMode.OPTIONAL));
rootWriter.scalar(2).setString("c-" + count);
// Allow adding a required column at this point.
// (Not intuitively obvious that this should work; we back-fill
// with zeros.)
rootWriter.addColumn(SchemaBuilder.columnSchema("d", MinorType.INT, DataMode.REQUIRED));
}
rootWriter.save();
count++;
}
// Result should include only the first column.
SchemaBuilder schemaBuilder = new SchemaBuilder().add("a", MinorType.VARCHAR);
BatchSchema expectedSchema = new BatchSchemaBuilder().withSchemaBuilder(schemaBuilder).build();
RowSet result = fixture.wrap(rsLoader.harvest());
assertTrue(result.batchSchema().isEquivalent(expectedSchema));
assertEquals(count - 1, result.rowCount());
result.clear();
assertEquals(1, rsLoader.schemaVersion());
// Double check: we can still add a required column after
// starting the next batch. (No longer in overflow state.)
rsLoader.startBatch();
rootWriter.addColumn(SchemaBuilder.columnSchema("e", MinorType.INT, DataMode.REQUIRED));
// Next batch should start with the overflow row, including
// the column added at the end of the previous batch, after
// overflow.
result = fixture.wrap(rsLoader.harvest());
assertEquals(5, rsLoader.schemaVersion());
assertEquals(1, result.rowCount());
BatchSchemaBuilder batchSchemaBuilder = new BatchSchemaBuilder(expectedSchema);
batchSchemaBuilder.schemaBuilder().addNullable("b", MinorType.INT).addNullable("c", MinorType.VARCHAR).add("d", MinorType.INT).add("e", MinorType.INT);
expectedSchema = batchSchemaBuilder.build();
assertTrue(result.batchSchema().isEquivalent(expectedSchema));
RowSetReader reader = result.reader();
reader.next();
assertEquals(count - 1, reader.scalar(1).getInt());
assertEquals("c-" + (count - 1), reader.scalar(2).getString());
assertEquals(0, reader.scalar("d").getInt());
assertEquals(0, reader.scalar("e").getInt());
result.clear();
rsLoader.close();
}
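Setting the schema change aside, the overflow mechanics themselves can be seen in isolation: the row that triggers overflow is withheld from the harvested batch and becomes the first row of the next one, which is the time-shifting the javadoc above describes. The sketch below is a hypothetical condensation, not code from the Drill test suite; it assumes the same fixture, options, and imports as the test above.

@Test
public void sketchOverflowCarryOver() {
  // Hypothetical sketch of vector overflow without any schema change,
  // condensed from testSchemaChangeWithOverflow above.
  ResultSetOptions options = new ResultSetOptionBuilder().rowCountLimit(ValueVector.MAX_ROW_COUNT).build();
  ResultSetLoader rsLoader = new ResultSetLoaderImpl(fixture.allocator(), options);
  RowSetLoader rootWriter = rsLoader.writer();
  rootWriter.addColumn(SchemaBuilder.columnSchema("a", MinorType.VARCHAR, DataMode.REQUIRED));
  rsLoader.startBatch();
  byte[] value = new byte[512];
  Arrays.fill(value, (byte) 'X');
  int count = 0;
  // Write 512-byte values until the loader reports the batch is full.
  while (!rootWriter.isFull()) {
    rootWriter.start();
    rootWriter.scalar(0).setBytes(value, value.length);
    rootWriter.save();
    count++;
  }
  // The row that triggered overflow is withheld from this batch...
  RowSet first = fixture.wrap(rsLoader.harvest());
  assertEquals(count - 1, first.rowCount());
  first.clear();
  // ...and becomes the first row of the next batch.
  rsLoader.startBatch();
  RowSet second = fixture.wrap(rsLoader.harvest());
  assertEquals(1, second.rowCount());
  second.clear();
  rsLoader.close();
}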
Use of org.apache.drill.exec.physical.resultSet.RowSetLoader in project drill by apache.
The class TestLoad, method testLoadValueVectorEmptyVarCharArray.
@Test
public void testLoadValueVectorEmptyVarCharArray() throws Exception {
try (BufferAllocator allocator = RootAllocatorFactory.newRoot(drillConfig)) {
TupleMetadata schema = new SchemaBuilder().addArray("chars", MinorType.VARCHAR).build();
ResultSetLoaderImpl.ResultSetOptions options = new ResultSetOptionBuilder().readerSchema(schema).build();
ResultSetLoader resultSetLoader = new ResultSetLoaderImpl(allocator, options);
resultSetLoader.startBatch();
RowSetLoader rowWriter = resultSetLoader.writer();
rowWriter.addRow(new Object[] { null });
VectorContainer harvest = resultSetLoader.harvest();
// Collect the value vectors from the harvested container
List<ValueVector> vectors = StreamSupport.stream(harvest.spliterator(), false)
  .map(VectorWrapper::getValueVector)
  .collect(Collectors.toList());
// Writable batch now owns the vector buffers
WritableBatch writableBatch = WritableBatch.getBatchNoHV(1, vectors, false);
// Serialize the vectors
DrillBuf byteBuf = serializeBatch(allocator, writableBatch);
// Batch loader does NOT take ownership of the serialized buffer
RecordBatchLoader batchLoader = new RecordBatchLoader(allocator);
batchLoader.load(writableBatch.getDef(), byteBuf);
// Release the serialized buffer.
byteBuf.release();
assertEquals(1, batchLoader.getRecordCount());
// Free the original vectors
writableBatch.clear();
// Free the deserialized vectors
batchLoader.clear();
}
}
Use of org.apache.drill.exec.physical.resultSet.RowSetLoader in project drill by apache.
The class MetadataHandlerBatch, method writeMetadata.
private <T extends BaseMetadata & LocationProvider> VectorContainer writeMetadata(List<T> metadataList) {
BaseMetadata firstElement = metadataList.iterator().next();
ResultSetLoader resultSetLoader = getResultSetLoaderForMetadata(firstElement);
resultSetLoader.startBatch();
RowSetLoader rowWriter = resultSetLoader.writer();
Iterator<T> segmentsIterator = metadataList.iterator();
while (!rowWriter.isFull() && segmentsIterator.hasNext()) {
T metadata = segmentsIterator.next();
metadataToHandle.remove(metadata.getMetadataInfo().identifier());
List<Object> arguments = new ArrayList<>();
// adds required segment names to the arguments
arguments.add(metadata.getPath().toUri().getPath());
Collections.addAll(arguments,
  Arrays.copyOf(
    MetadataIdentifierUtils.getValuesFromMetadataIdentifier(metadata.getMetadataInfo().identifier()),
    popConfig.getContext().segmentColumns().size()));
// adds column statistics values assuming that they are sorted in alphabetic order
// (see getResultSetLoaderForMetadata() method)
metadata.getColumnsStatistics().entrySet().stream()
  .sorted(Comparator.comparing(e -> e.getKey().toExpr()))
  .map(Map.Entry::getValue)
  .flatMap(columnStatistics -> AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet().stream()
    .map(columnStatistics::get))
  .forEach(arguments::add);
AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet().stream().map(metadata::getStatistic).forEach(arguments::add);
// collectedMap field value
arguments.add(new Object[] {});
if (metadataType == MetadataType.SEGMENT) {
arguments.add(((SegmentMetadata) metadata).getLocations().stream().map(path -> path.toUri().getPath()).toArray(String[]::new));
}
if (metadataType == MetadataType.ROW_GROUP) {
arguments.add(String.valueOf(((RowGroupMetadata) metadata).getRowGroupIndex()));
arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.START)));
arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
}
arguments.add(metadata.getSchema().jsonString());
arguments.add(String.valueOf(metadata.getLastModifiedTime()));
arguments.add(metadataType.name());
rowWriter.addRow(arguments.toArray());
}
return resultSetLoader.harvest();
}