use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class HashPartition method completeABatch.
/**
 * Finishes processing of the partition's current batch, either because the batch is
 * full or because no more rows are incoming.
 * <p>
 * A batch that holds at least one row is appended to the partition's temporary batch
 * list and added to the in-memory statistics; otherwise the current batch and HV vector
 * are freed. Afterwards the partition is spilled if requested, and the current batch is
 * either re-allocated or dropped.
 *
 * @param toInitialize when {@code true} a new current batch and HV vector are allocated;
 *                     when {@code false} both references are set to {@code null}
 * @param needsSpill   when {@code true} the partition is spilled once the batch has been
 *                     processed
 */
private void completeABatch(boolean toInitialize, boolean needsSpill) {
  final boolean holdsRows = currentBatch.hasRecordCount() && currentBatch.getRecordCount() > 0;

  if (!holdsRows) {
    // Nothing worth keeping in this batch
    freeCurrentBatchAndHVVector();
  } else {
    // Add the HV vector to the batch, then hand the batch over to the temporary list
    currentBatch.add(currHVVector);
    currentBatch.buildSchema(BatchSchema.SelectionVectorMode.NONE);
    tmpBatchesList.add(currentBatch);
    partitionBatchesCount++;

    // Account for the batch in the partition's in-memory bookkeeping
    final long bytesInBatch = new RecordBatchSizer(currentBatch).getActualSize();
    final int rowsInBatch = currentBatch.getRecordCount();
    inMemoryBatchStats.add(new HashJoinMemoryCalculator.BatchStat(rowsInBatch, bytesInBatch));
    partitionInMemorySize += bytesInBatch;
    numInMemoryRecords += rowsInBatch;
  }

  if (needsSpill) {
    // spill this batch/partition and free its memory
    spillThisPartition();
  }

  if (!toInitialize) {
    // No more rows expected: drop the references
    currentBatch = null;
    currHVVector = null;
    return;
  }

  // allocate a new batch and HV vector
  allocateNewCurrentBatchAndHV();
}
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class ProjectMemoryManager method update.
/**
 * Recomputes the row width from the incoming batch and derives the output row count.
 * <p>
 * A new {@link RecordBatchSizer} is built for the incoming batch, every variable-width
 * output column's width expression is reduced to a fixed-length expression, and the
 * resulting widths (plus metadata widths) are added to the fixed-width and complex
 * column totals. When the row width is zero the incoming record count is used as-is.
 */
@Override
public void update() {
  final long startMillis = System.currentTimeMillis();
  final RecordBatchSizer sizer = new RecordBatchSizer(incomingBatch);
  final long sizerDoneMillis = System.currentTimeMillis();
  setRecordBatchSizer(sizer);

  rowWidth = 0;
  int totalVariableColumnWidth = 0;
  for (VariableWidthColumnInfo columnInfo : varWidthColumnSizes.values()) {
    // Walk the tree of OutputWidthExpressions to get a FixedLenExpr.
    // As the tree is walked, the RecordBatchSizer and function annotations
    // are looked-up to come up with the final FixedLenExpr
    final OutputWidthExpression widthExpr = columnInfo.getOutputExpression();
    final OutputWidthVisitorState visitorState = new OutputWidthVisitorState(this);
    final OutputWidthExpression reduced = widthExpr.accept(new OutputWidthVisitor(), visitorState);

    final int dataWidth = ((FixedLenExpr) reduced).getDataWidth();
    Preconditions.checkState(dataWidth >= 0);

    final int metadataWidth = getMetadataWidth(columnInfo.outputVV);
    logger.trace("update(): fieldName {} width: {} metadataWidth: {}", columnInfo.outputVV.getField().getName(), dataWidth, metadataWidth);
    totalVariableColumnWidth += dataWidth + metadataWidth;
  }

  rowWidth += totalFixedWidthColumnWidth + totalComplexColumnWidth + totalVariableColumnWidth;

  final int outPutRowCount;
  if (rowWidth == 0) {
    // With rowWidth == 0 the memory manager does not have sufficient
    // information to size the batch, so let the entire batch pass through.
    // If incoming rc == 0, all RB Sizer look-ups will have
    // 0 width and so total width can be 0
    outPutRowCount = incomingBatch.getRecordCount();
  } else {
    // rowWidth is not zero: set the output row count in the sizer
    setOutputRowCount(getOutputBatchSize(), rowWidth);
    // if more rows can be allowed than the incoming row count, then set the
    // output row count to the incoming row count.
    outPutRowCount = Math.min(getOutputRowCount(), sizer.rowCount());
  }
  setOutputRowCount(outPutRowCount);

  final long endMillis = System.currentTimeMillis();
  logger.trace("update() : Output RC {}, BatchSizer RC {}, incoming RC {}, width {}, total fixed width {}" + ", total variable width {}, total complex width {}, batchSizer time {} ms, update time {} ms" + ", manager {}, incoming {}",
      outPutRowCount, sizer.rowCount(), incomingBatch.getRecordCount(), rowWidth,
      totalFixedWidthColumnWidth, totalVariableColumnWidth, totalComplexColumnWidth,
      (sizerDoneMillis - startMillis), (endMillis - startMillis), this, incomingBatch);
  RecordBatchStats.logRecordBatchStats(RecordBatchIOType.INPUT, getRecordBatchSizer(), outgoingBatch.getRecordBatchStatsContext());
  updateIncomingStats();
}
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class TestOutputBatchSize method testSizerRepeatedRepeatedList.
/**
 * Checks the sizes {@link RecordBatchSizer} reports for a column holding a repeated
 * list of repeated lists of integers, and the capacities of the nested offset and data
 * vectors after {@code ColumnSize.allocateVector} is called with several row counts
 * (a non power of two, a power of two minus one, the maximum and zero).
 */
@Test
public void testSizerRepeatedRepeatedList() throws Exception {
  List<String> inputJsonBatches = Lists.newArrayList();
  StringBuilder batchString = new StringBuilder();
  StringBuilder newString = new StringBuilder();
  newString.append("[ [[1,2,3,4], [5,6,7,8]], [[1,2,3,4], [5,6,7,8]] ]");
  numRows = 9;

  // Build a JSON array with numRows + 1 identical records
  batchString.append("[");
  for (int i = 0; i < numRows; i++) {
    batchString.append("{\"c\" : ").append(newString);
    batchString.append("},");
  }
  batchString.append("{\"c\" : ").append(newString);
  batchString.append("}");
  batchString.append("]");
  inputJsonBatches.add(batchString.toString());

  // Create a dummy scanBatch to figure out the size.
  RecordBatch scanBatch = new ScanBatch(new MockPhysicalOperator(), fragContext, getReaderListForJsonBatches(inputJsonBatches, fragContext));
  VectorAccessible va = new BatchIterator(scanBatch).iterator().next();
  RecordBatchSizer sizer = new RecordBatchSizer(va);
  assertEquals(1, sizer.columns().size());
  RecordBatchSizer.ColumnSize column = sizer.columns().get("c");
  assertNotNull(column);

  // Expected sizes, as asserted below:
  //   stdDataSizePerEntry: 8*5*5*5 = 1000
  //   stdNetSizePerEntry:  8*5*5*5 + 8*5*5 + 8*5 + 4 = 1244
  //   dataSizePerEntry:    16*8 = 128
  //   netSizePerEntry:     156
  //   totalDataSize:       16*8*10 = 1280
  //   totalNetSize:        netSizePerEntry*10 = 1560
  //   valueCount: 10, elementCount: 20, cardinality: 2, isVariableWidth: false
  assertEquals(1000, column.getStdDataSizePerEntry());
  assertEquals(1244, column.getStdNetSizePerEntry());
  assertEquals(128, column.getDataSizePerEntry());
  assertEquals(156, column.getNetSizePerEntry());
  assertEquals(1280, column.getTotalDataSize());
  assertEquals(1560, column.getTotalNetSize());
  assertEquals(10, column.getValueCount());
  assertEquals(20, column.getElementCount());
  assertEquals(2, column.getCardinality(), 0.01);
  assertEquals(false, column.isVariableWidth());

  final int testRowCount = 1000;
  final int testRowCountPowerTwo = 2048;

  for (VectorWrapper<?> vw : va) {
    ValueVector v = vw.getValueVector();
    v.clear();
    RecordBatchSizer.ColumnSize colSize = sizer.getColumn(v.getField().getName());

    // ---- Row count that is not a power of two ----
    // Allocates to nearest power of two
    colSize.allocateVector(v, testRowCount);

    // offset vector of delegate vector i.e. outer array should have row count number of values.
    UInt4Vector offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount) << 1), offsetVector.getValueCapacity());

    // Get data vector of delegate vector. This is repeated list again
    ValueVector dataVector = ((RepeatedListVector) v).getDataVector();

    // offset vector of delegate vector of the inner repeated list
    // This should have row count * 2 number of values.
    offsetVector = ((RepeatedListVector) dataVector).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount * 2) << 1), offsetVector.getValueCapacity());

    // The inner repeated list should have row count * 2 number of values - 1
    // (for offset vector adjustment).
    ValueVector innerDataVector = ((RepeatedValueVector) dataVector).getDataVector();
    assertEquals((Integer.highestOneBit((testRowCount * 2) << 1) - 1), dataVector.getValueCapacity());

    // offset vector of inner vector should have
    // 2 (outer array cardinality) * 2 (inner array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) innerDataVector).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount * 4) << 1), offsetVector.getValueCapacity());

    // Data vector of inner vector: capacity asserted as 16 * the power of two at or below row count * 2.
    dataVector = ((RepeatedValueVector) innerDataVector).getDataVector();
    assertEquals(Integer.highestOneBit(testRowCount << 1) * 16, dataVector.getValueCapacity());
    v.clear();

    // ---- Row count that is a power of two ----
    // Allocates the same as value passed since it is already power of two.
    // -1 is done for adjustment needed for offset vector.
    colSize.allocateVector(v, testRowCountPowerTwo - 1);

    // offset vector of delegate vector i.e. outer array should have row count number of values.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(testRowCountPowerTwo, offsetVector.getValueCapacity());

    // Get data vector of delegate vector. This is repeated list again
    dataVector = ((RepeatedListVector) v).getDataVector();

    // offset vector of delegate vector of the inner repeated list
    // This should have row count * 2 number of values.
    offsetVector = ((RepeatedListVector) dataVector).getOffsetVector();
    assertEquals(testRowCountPowerTwo * 2, offsetVector.getValueCapacity());

    // The inner repeated list should have row count * 2 number of values - 1
    // (for offset vector adjustment).
    innerDataVector = ((RepeatedValueVector) dataVector).getDataVector();
    assertEquals(testRowCountPowerTwo * 2 - 1, dataVector.getValueCapacity());

    // offset vector of inner vector should have
    // 2 (outer array cardinality) * 2 (inner array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) innerDataVector).getOffsetVector();
    assertEquals(testRowCountPowerTwo * 4, offsetVector.getValueCapacity());

    // Data vector of inner vector: capacity asserted as row count * 16.
    dataVector = ((RepeatedValueVector) innerDataVector).getDataVector();
    assertEquals(testRowCountPowerTwo * 16, dataVector.getValueCapacity());
    v.clear();

    // ---- MAX ROW COUNT ----
    colSize.allocateVector(v, ValueVector.MAX_ROW_COUNT - 1);

    // offset vector of delegate vector i.e. outer array should have row count number of values.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT, offsetVector.getValueCapacity());

    // Get data vector of delegate vector. This is repeated list again
    dataVector = ((RepeatedListVector) v).getDataVector();

    // offset vector of delegate vector of the inner repeated list
    // This should have row count * 2 number of values.
    offsetVector = ((RepeatedListVector) dataVector).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 2, offsetVector.getValueCapacity());

    // The inner repeated list should have row count * 2 number of values - 1
    // (for offset vector adjustment).
    innerDataVector = ((RepeatedValueVector) dataVector).getDataVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 2 - 1, dataVector.getValueCapacity());

    // offset vector of inner vector should have
    // 2 (outer array cardinality) * 2 (inner array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) innerDataVector).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 4, offsetVector.getValueCapacity());

    // Data vector of inner vector: capacity asserted as row count * 16.
    dataVector = ((RepeatedValueVector) innerDataVector).getDataVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 16, dataVector.getValueCapacity());
    v.clear();

    // ---- MIN ROW COUNT ----
    colSize.allocateVector(v, 0);

    // offset vector of delegate vector i.e. outer array should have 1 value.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, offsetVector.getValueCapacity());

    // Get data vector of delegate vector. This is repeated list again
    dataVector = ((RepeatedListVector) v).getDataVector();

    // offset vector of delegate vector of the inner repeated list
    offsetVector = ((RepeatedListVector) dataVector).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, offsetVector.getValueCapacity());

    // Re-derive the innermost vector from this allocation, as the sections above do,
    // rather than relying on the reference obtained before the last clear().
    innerDataVector = ((RepeatedValueVector) dataVector).getDataVector();

    // offset vector of inner vector should have
    // 2 (outer array cardinality) * 1.
    offsetVector = ((RepeatedValueVector) innerDataVector).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT * 2, offsetVector.getValueCapacity());

    // Data vector of inner vector should have 1 value.
    dataVector = ((RepeatedValueVector) innerDataVector).getDataVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, dataVector.getValueCapacity());
    v.clear();
  }
}
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class TestShortArrays method testSizer.
/**
 * Verifies that {@link RecordBatchSizer} reports a fractional cardinality for an array
 * column that averages less than one element per row, and that a vector initializer
 * built from the sizer carries that estimate into a newly allocated batch.
 */
@Test
public void testSizer() {
  // Create a row set with less than one item, on
  // average, per array: one element spread over ten rows.
  TupleMetadata schema = new SchemaBuilder().add("a", MinorType.INT).addArray("b", MinorType.INT).buildSchema();
  RowSetBuilder builder = fixture.rowSetBuilder(schema).addRow(1, intArray(10));
  for (int i = 2; i <= 10; i++) {
    builder.addRow(i, intArray());
  }
  RowSet rows = builder.build();

  // try/finally so the row sets are cleared even when an assertion fails
  try {
    // Run the record batch sizer on the resulting batch.
    RecordBatchSizer sizer = new RecordBatchSizer(rows.container());
    assertEquals(2, sizer.columns().size());
    ColumnSize bCol = sizer.columns().get("b");
    assertEquals(0.1, bCol.getCardinality(), 0.01);
    assertEquals(1, bCol.getElementCount());

    // Create a vector initializer using the sizer info.
    VectorInitializer vi = sizer.buildVectorInitializer();
    AllocationHint bHint = vi.hint("b");
    assertNotNull(bHint);
    // Expected value first: the hint must carry the cardinality the sizer measured
    assertEquals(bCol.getCardinality(), bHint.elementCount, 0.001);

    // Create a new batch, and new vector, using the sizer and
    // initializer inferred from the previous batch.
    SingleRowSet empty = fixture.rowSet(schema);
    try {
      vi.allocateBatch(empty.container(), 100);
      assertEquals(2, empty.container().getNumberOfColumns());
      ValueVector bVector = empty.container().getValueVector(1).getValueVector();
      assertTrue(bVector instanceof RepeatedIntVector);
      assertEquals(16, ((RepeatedIntVector) bVector).getDataVector().getValueCapacity());
    } finally {
      empty.clear();
    }
  } finally {
    rows.clear();
  }
}
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class OperatorTestBuilder method go.
/**
 * Runs the configured operator test: creates the operator over the upstream batches,
 * drains it batch by batch, and compares what it produced against the expected results.
 * <p>
 * When {@code combineOutputBatches} is set, all produced batches are copied into a single
 * batch which is compared against the first expected batch; otherwise the batches are
 * compared one by one. With no expected results, only the total row count is checked.
 * The operator, the collected and expected row sets, and the upstream batches are
 * released in a {@code finally} block.
 *
 * @throws Exception anything thrown by validation, operator creation, iteration or
 *                   verification
 */
@SuppressWarnings("unchecked")
public void go() throws Exception {
  final List<RowSet> actualResults = new ArrayList<>();
  CloseableRecordBatch testOperator = null;
  try {
    validate();
    int expectedNumBatches = expectedNumBatchesOpt.orElse(expectedResults.size());
    physicalOpUnitTestBase.mockOpContext(physicalOperator, initReservation, maxAllocation);
    final BatchCreator<PhysicalOperator> opCreator = (BatchCreator<PhysicalOperator>) physicalOpUnitTestBase.opCreatorReg.getOperatorCreator(physicalOperator.getClass());
    testOperator = opCreator.getBatch(physicalOpUnitTestBase.fragContext, physicalOperator, (List) upstreamBatches);

    batchIterator: for (int batchIndex = 0; ; batchIndex++) {
      final RecordBatch.IterOutcome outcome = testOperator.next();
      switch (outcome) {
        case NONE:
          if (!combineOutputBatches) {
            Assert.assertEquals(expectedNumBatches, batchIndex);
          }
          // We are done iterating over batches. Now we need to compare them.
          break batchIterator;
        case OK_NEW_SCHEMA:
          boolean skip = true;
          try {
            skip = testOperator.getContainer().getRecordCount() == 0;
          } catch (IllegalStateException e) {
            // We should skip this batch in this case. It means no data was included with the okay schema
          }
          // Decide after the try/catch, not inside a finally: a break inside finally
          // would silently discard any other exception thrown above.
          if (skip) {
            // Undo the loop increment so empty schema batches are not counted
            batchIndex--;
            break;
          }
          // Intentional fall-through: a schema batch that carries rows is handled like OK
        case OK:
          if (!combineOutputBatches && batchIndex >= expectedNumBatches) {
            testOperator.getContainer().clear();
            Assert.fail("More batches received than expected.");
          } else {
            final boolean hasSelectionVector = testOperator.getSchema().getSelectionVectorMode().hasSelectionVector;
            final VectorContainer container = testOperator.getContainer();
            if (hasSelectionVector) {
              throw new UnsupportedOperationException("Implement DRILL-6698");
            } else {
              actualResults.add(DirectRowSet.fromContainer(container));
            }
            break;
          }
        default:
          throw new UnsupportedOperationException("Can't handle this yet");
      }
    }

    int actualTotalRows = actualResults.stream().mapToInt(RowSet::rowCount).sum();

    if (expectedResults.isEmpty()) {
      Assert.assertEquals((int) expectedTotalRowsOpt.orElse(0), actualTotalRows);
      // We are done, we don't have any expected result to compare
      return;
    }

    if (combineOutputBatches) {
      final RowSet expectedBatch = expectedResults.get(0);
      final RowSet actualBatch = DirectRowSet.fromSchema(physicalOpUnitTestBase.operatorFixture.allocator, actualResults.get(0).container().getSchema());
      final VectorContainer actualBatchContainer = actualBatch.container();
      actualBatchContainer.setRecordCount(0);
      final int numColumns = expectedBatch.schema().size();

      // One running byte total per column, used to size variable-width vectors below
      List<MutableInt> totalBytesPerColumn = new ArrayList<>();
      for (int columnIndex = 0; columnIndex < numColumns; columnIndex++) {
        totalBytesPerColumn.add(new MutableInt());
      }

      // Get column sizes for each result batch
      final List<List<RecordBatchSizer.ColumnSize>> columnSizesPerBatch = actualResults.stream().map(rowSet -> {
        switch (rowSet.indirectionType()) {
          case NONE:
            return new RecordBatchSizer(rowSet.container()).columnsList();
          default:
            throw new UnsupportedOperationException("Implement DRILL-6698");
        }
      }).collect(Collectors.toList());

      // Sum the data size of every column across all result batches
      for (List<RecordBatchSizer.ColumnSize> columnSizes : columnSizesPerBatch) {
        for (int columnIndex = 0; columnIndex < numColumns; columnIndex++) {
          final MutableInt totalBytes = totalBytesPerColumn.get(columnIndex);
          final RecordBatchSizer.ColumnSize columnSize = columnSizes.get(columnIndex);
          totalBytes.add(columnSize.getTotalDataSize());
        }
      }

      // Allocate the combined batch's vectors for the total number of rows
      for (int columnIndex = 0; columnIndex < numColumns; columnIndex++) {
        final ValueVector valueVector = actualBatchContainer.getValueVector(columnIndex).getValueVector();
        if (valueVector instanceof FixedWidthVector) {
          ((FixedWidthVector) valueVector).allocateNew(actualTotalRows);
        } else if (valueVector instanceof VariableWidthVector) {
          final MutableInt totalBytes = totalBytesPerColumn.get(columnIndex);
          ((VariableWidthVector) valueVector).allocateNew(totalBytes.getValue(), actualTotalRows);
        } else {
          throw new UnsupportedOperationException();
        }
      }

      try {
        int currentIndex = 0;
        for (RowSet actualRowSet : actualResults) {
          final Copier copier;
          final VectorContainer rowSetContainer = actualRowSet.container();
          rowSetContainer.setRecordCount(actualRowSet.rowCount());
          switch (actualRowSet.indirectionType()) {
            case NONE:
              copier = new GenericCopier();
              break;
            default:
              throw new UnsupportedOperationException("Implement DRILL-6698");
          }
          copier.setup(rowSetContainer, actualBatchContainer);
          copier.appendRecords(currentIndex, actualRowSet.rowCount());
          currentIndex += actualRowSet.rowCount();
          // NOTE(review): verification runs after every appended row set, i.e. also
          // against a partially combined batch - confirm this is intended.
          verify(expectedBatch, actualBatch);
        }
      } finally {
        actualBatch.clear();
      }
    } else {
      // Compare expected and actual results
      for (int batchIndex = 0; batchIndex < expectedNumBatches; batchIndex++) {
        final RowSet expectedBatch = expectedResults.get(batchIndex);
        final RowSet actualBatch = actualResults.get(batchIndex);
        verify(expectedBatch, actualBatch);
      }
    }
  } finally {
    // Release everything regardless of the test outcome
    if (testOperator != null) {
      testOperator.close();
    }
    actualResults.forEach(RowSet::clear);
    if (expectedResults != null) {
      expectedResults.forEach(rowSet -> rowSet.clear());
    }
    upstreamBatches.forEach(rowSetBatch -> {
      try {
        rowSetBatch.close();
      } catch (Exception e) {
        logger.error("Error while closing RowSetBatch", e);
      }
    });
  }
}
Aggregations