Use of org.apache.drill.exec.record.RecordBatchSizer in project drill by axbaretto.
The class SortImpl, method analyzeIncomingBatch.
/**
 * Scan the vectors in the incoming batch to determine the batch size.
 * The analysis is stored in the {@code sizer} field; the first incoming
 * batch is also logged at debug level.
 */
private void analyzeIncomingBatch(VectorAccessible incoming) {
  sizer = new RecordBatchSizer(incoming);
  sizer.applySv2();
  if (metrics.getInputBatchCount() == 0) {
    logger.debug("{}", sizer.toString());
  }
}
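A minimal sketch of how the resulting sizer might be consumed, using only methods that appear elsewhere in this listing; the incomingBatch and memoryBudget names are hypothetical, not from the Drill source.

// Hypothetical illustration: derive a row budget from the sizer's metrics.
RecordBatchSizer sizer = new RecordBatchSizer(incomingBatch);
long rowWidth = sizer.netRowWidthCap50();  // row width, varchars capped at 50
long batchBytes = sizer.netSize();         // net payload size of this batch
long rowBudget = rowWidth == 0 ? 0 : memoryBudget / rowWidth;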
Use of org.apache.drill.exec.record.RecordBatchSizer in project drill by axbaretto.
The class TestOutputBatchSize, method getExpectedSize.
/**
 * Figures out the total size of the batches produced for the given JSON
 * input batches.
 */
private long getExpectedSize(List<String> expectedJsonBatches) throws ExecutionSetupException {
  // Create a dummy scan batch to figure out the size.
  RecordBatch scanBatch = new ScanBatch(new MockPhysicalOperator(), fragContext,
      getReaderListForJsonBatches(expectedJsonBatches, fragContext));
  Iterable<VectorAccessible> batches = new BatchIterator(scanBatch);
  long totalSize = 0;
  for (VectorAccessible batch : batches) {
    RecordBatchSizer sizer = new RecordBatchSizer(batch);
    totalSize += sizer.netSize();
  }
  return totalSize;
}
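A hedged usage sketch: in a test, the size computed here can be compared against what the operator under test actually produces. The JSON content below is illustrative only.

// Hypothetical usage inside a test; the JSON batch content is made up.
List<String> jsonBatches = Lists.newArrayList("[{\"a\": 1}, {\"a\": 2}]");
long expectedSize = getExpectedSize(jsonBatches);
// The operator under test should then emit batches whose net size, as
// measured by RecordBatchSizer, sums to approximately expectedSize.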
Use of org.apache.drill.exec.record.RecordBatchSizer in project drill by axbaretto.
The class TestOutputBatchSize, method testSizerRepeatedList.
@Test
public void testSizerRepeatedList() throws Exception {
  List<String> inputJsonBatches = Lists.newArrayList();
  StringBuilder batchString = new StringBuilder();
  StringBuilder newString = new StringBuilder();
  newString.append("[ [1,2,3,4], [5,6,7,8] ]");
  numRows = 9;
  batchString.append("[");
  for (int i = 0; i < numRows; i++) {
    batchString.append("{\"c\" : " + newString);
    batchString.append("},");
  }
  // The loop plus the trailing record below yield numRows + 1 = 10 records.
  batchString.append("{\"c\" : " + newString);
  batchString.append("}");
  batchString.append("]");
  inputJsonBatches.add(batchString.toString());
  // Create a dummy scan batch to figure out the size.
  RecordBatch scanBatch = new ScanBatch(new MockPhysicalOperator(), fragContext,
      getReaderListForJsonBatches(inputJsonBatches, fragContext));
  VectorAccessible va = new BatchIterator(scanBatch).iterator().next();
  RecordBatchSizer sizer = new RecordBatchSizer(va);
  assertEquals(1, sizer.columns().size());
  RecordBatchSizer.ColumnSize column = sizer.columns().get("c");
  assertNotNull(column);
  /*
   * stdDataSize: 8*10*10, stdNetSize: 8*10*10 + 4*10 + 4*10 + 4,
   * dataSizePerEntry: 8*8, netSizePerEntry: 8*8 + 4*2 + 4,
   * totalDataSize: 8*8*10, totalNetSize: netSizePerEntry*10, valueCount: 10,
   * elementCount: 20, estElementCountPerArray: 2, isVariableWidth: false
   */
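  // Worked out: stdNetSize = 800 + 40 + 40 + 4 = 884 bytes per entry;
  // netSizePerEntry = 64 + 8 + 4 = 76 bytes; totalNetSize = 76 * 10 = 760 bytes.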
  assertEquals(800, column.getStdDataSizePerEntry());
  assertEquals(884, column.getStdNetSizePerEntry());
  assertEquals(64, column.getDataSizePerEntry());
  assertEquals(76, column.getNetSizePerEntry());
  assertEquals(640, column.getTotalDataSize());
  assertEquals(760, column.getTotalNetSize());
  assertEquals(10, column.getValueCount());
  assertEquals(20, column.getElementCount());
  assertEquals(2, column.getCardinality(), 0.01);
  assertEquals(false, column.isVariableWidth());
  final int testRowCount = 1000;
  final int testRowCountPowerTwo = 2048;
  for (VectorWrapper<?> vw : va) {
    ValueVector v = vw.getValueVector();
    v.clear();
    RecordBatchSizer.ColumnSize colSize = sizer.getColumn(v.getField().getName());
    // Allocates to the nearest power of two.
    colSize.allocateVector(v, testRowCount);
    // The offset vector of the delegate vector, i.e. the outer array,
    // should have row-count values.
    UInt4Vector offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount) << 1), offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    ValueVector vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have
    // 2 (outer array cardinality) * 4 (inner array cardinality) * row count values.
    ValueVector dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(Integer.highestOneBit((testRowCount * 8) << 1), dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * row count values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount * 2) << 1), offsetVector.getValueCapacity());
    v.clear();
    // Allocates exactly the value passed, since it is already a power of two.
    // The -1 adjusts for the extra slot needed by the offset vector.
    colSize.allocateVector(v, testRowCountPowerTwo - 1);
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(testRowCountPowerTwo, offsetVector.getValueCapacity());
    vector = ((RepeatedValueVector) v).getDataVector();
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(testRowCountPowerTwo * 8, dataVector.getValueCapacity());
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(testRowCountPowerTwo * 2, offsetVector.getValueCapacity());
    v.clear();
    // MAX row count.
    colSize.allocateVector(v, ValueVector.MAX_ROW_COUNT - 1);
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT, offsetVector.getValueCapacity());
    vector = ((RepeatedValueVector) v).getDataVector();
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 8, dataVector.getValueCapacity());
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 2, offsetVector.getValueCapacity());
    v.clear();
    // MIN row count: allocating for zero rows still yields the minimum capacities.
    colSize.allocateVector(v, 0);
    // The outer offset vector should have one value.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, offsetVector.getValueCapacity());
    vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have one value.
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * 1 values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT * 2, offsetVector.getValueCapacity());
    v.clear();
  }
}
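All of the capacity checks above follow one rounding rule: Integer.highestOneBit(n) << 1 rounds n up to the next power of two when n is not already one (and doubles it when it is). A standalone plain-Java sketch of the three values asserted for testRowCount = 1000:

// Illustration of the power-of-two rounding used by the assertions above.
int rows = 1000;
int outerOffsets = Integer.highestOneBit(rows) << 1;       // 512 << 1 = 1024
int innerOffsets = Integer.highestOneBit(rows * 2) << 1;   // 1024 << 1 = 2048
int dataValues = Integer.highestOneBit((rows * 8) << 1);   // highestOneBit(16000) = 8192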
Use of org.apache.drill.exec.record.RecordBatchSizer in project drill by axbaretto.
The class TestShortArrays, method testSizer.
@Test
public void testSizer() {
  // Create a row set with less than one item, on average, per array.
  BatchSchema schema = new SchemaBuilder()
      .add("a", MinorType.INT)
      .addArray("b", MinorType.INT)
      .build();
  RowSetBuilder builder = fixture.rowSetBuilder(schema).addRow(1, intArray(10));
  for (int i = 2; i <= 10; i++) {
    builder.addRow(i, intArray());
  }
  RowSet rows = builder.build();
  // Run the record batch sizer on the resulting batch.
  RecordBatchSizer sizer = new RecordBatchSizer(rows.container());
  assertEquals(2, sizer.columns().size());
  ColumnSize bCol = sizer.columns().get("b");
  assertEquals(0.1, bCol.getCardinality(), 0.01);
  assertEquals(1, bCol.getElementCount());
  // Create a vector initializer using the sizer info.
  VectorInitializer vi = sizer.buildVectorInitializer();
  AllocationHint bHint = vi.hint("b");
  assertNotNull(bHint);
  assertEquals(bHint.elementCount, bCol.getCardinality(), 0.001);
  // Create a new batch, and a new vector, using the sizer and
  // initializer inferred from the previous batch.
  SingleRowSet empty = fixture.rowSet(schema);
  vi.allocateBatch(empty.container(), 100);
  assertEquals(2, empty.container().getNumberOfColumns());
  @SuppressWarnings("resource")
  ValueVector bVector = empty.container().getValueVector(1).getValueVector();
  assertTrue(bVector instanceof RepeatedIntVector);
  assertEquals(16, ((RepeatedIntVector) bVector).getDataVector().getValueCapacity());
  rows.clear();
  empty.clear();
}
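Why a capacity of 16: one array element across ten rows gives a cardinality of 0.1, so allocating for 100 rows requests roughly 100 * 0.1 = 10 elements, which vector allocation rounds up to the next power of two. A sketch of that arithmetic; the rounding expression mirrors the behavior asserted above and is illustrative, not a Drill API.

// Illustrative arithmetic behind the capacity assertion above.
double cardinality = 1 / 10.0;                         // 1 element over 10 rows = 0.1
int requested = (int) Math.round(100 * cardinality);   // ~10 elements for 100 rows
int capacity = Integer.highestOneBit(requested) << 1;  // rounded up to a power of two: 16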
Use of org.apache.drill.exec.record.RecordBatchSizer in project drill by axbaretto.
The class HashAggTemplate, method updateEstMaxBatchSize.
/**
 * Update the estimated max batch size to be used in the hash aggregate
 * operator, using the record batch sizer to get the row width.
 *
 * @param incoming the incoming record batch to size
 */
private void updateEstMaxBatchSize(RecordBatch incoming) {
  // No handling of a schema (or varchar) change.
  if (estMaxBatchSize > 0) {
    return;
  }
  // Use the sizer to get the input row width and the length of the longest varchar column.
  RecordBatchSizer sizer = new RecordBatchSizer(incoming);
  logger.trace("Incoming sizer: {}", sizer);
  // An empty batch only has the schema, so the actual varchar lengths are unknown;
  // otherwise use the actual varchar lengths, each capped at 50 (to match the space allocation).
  long estInputRowWidth = sizer.rowCount() == 0 ? sizer.stdRowWidth() : sizer.netRowWidthCap50();
  // Get the approximate max (varchar) column width for better memory allocation.
  maxColumnWidth = Math.max(sizer.maxAvgColumnSize(), VARIABLE_MIN_WIDTH_VALUE_SIZE);
  maxColumnWidth = Math.min(maxColumnWidth, VARIABLE_MAX_WIDTH_VALUE_SIZE);
  //
  // Calculate the estimated max (internal) batch size (i.e. keys batch + values batch),
  // which is used to decide when to spill.
  // Also calculate the values batch size (used as a reserve to overcome an OOM).
  //
  Iterator<VectorWrapper<?>> outgoingIter = outContainer.iterator();
  int fieldId = 0;
  while (outgoingIter.hasNext()) {
    ValueVector vv = outgoingIter.next().getValueVector();
    MaterializedField mr = vv.getField();
    int fieldSize = vv instanceof VariableWidthVector ? maxColumnWidth : TypeHelper.getSize(mr.getType());
    estRowWidth += fieldSize;
    estOutputRowWidth += fieldSize;
    if (fieldId < numGroupByOutFields) {
      fieldId++;
    } else {
      estValuesRowWidth += fieldSize;
    }
  }
  // Multiply by the max number of rows in a batch to get the final estimated max size.
  estMaxBatchSize = Math.max(estRowWidth, estInputRowWidth) * MAX_BATCH_SIZE;
  // (When there are no aggregate functions, use 1, as later code relies on this size being non-zero.)
  estValuesBatchSize = Math.max(estValuesRowWidth, 1) * MAX_BATCH_SIZE;
  // Initially assume the same size.
  estOutgoingAllocSize = estValuesBatchSize;
  logger.trace("{} phase. Estimated internal row width: {} Values row width: {} batch size: {} memory limit: {} max column width: {}",
      isTwoPhase ? (is2ndPhase ? "2nd" : "1st") : "Single",
      estRowWidth, estValuesRowWidth, estMaxBatchSize, allocator.getLimit(), maxColumnWidth);
  if (estMaxBatchSize > allocator.getLimit()) {
    logger.warn("HashAggregate: Estimated max batch size {} is larger than the memory limit {}",
        estMaxBatchSize, allocator.getLimit());
  }
}
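The estimate reduces to max(internal row width, measured input row width) times the batch row cap. A worked sketch with made-up numbers; the column mix, widths, and the 64K row cap are assumptions for illustration, standing in for the operator's own fields and MAX_BATCH_SIZE constant.

// Hypothetical illustration of the estimate, not the operator's code path:
// two 8-byte group-by keys plus one varchar value capped at 50 bytes.
int estRowWidth = 8 + 8 + 50;        // internal row width: 66 bytes
int estValuesRowWidth = 50;          // values (non-key) columns only
long estInputRowWidth = 40;          // as measured by the sizer
int maxBatchRows = 65536;            // assumed stand-in for MAX_BATCH_SIZE
long estMaxBatchSize = Math.max(estRowWidth, estInputRowWidth) * maxBatchRows;       // 66 * 64K
long estValuesBatchSize = (long) Math.max(estValuesRowWidth, 1) * maxBatchRows;      // 50 * 64K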