use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class DrillTestWrapper method addToCombinedVectorResults.
/**
 * Adds to the result vectors and compares the batch schema against the expected schema
 * while iterating batches.
 * @param batches the batches to iterate
 * @param expectedSchema the schema the batches are expected to have; a
 *        SchemaChangeException is thrown if a batch has a different schema
 * @param expectedBatchSize if non-null, the net size each batch must not exceed
 * @param expectedNumBatches if non-null, the expected minimum number of batches
 * @param combinedVectors the map that accumulates the values while iterating the batches
 * @param expectedTotalRecords if non-null, the expected total record count
 *
 * @return number of batches
 * @throws SchemaChangeException
 */
public static int addToCombinedVectorResults(Iterable<VectorAccessible> batches, BatchSchema expectedSchema,
    Long expectedBatchSize, Integer expectedNumBatches, Map<String, List<Object>> combinedVectors,
    Integer expectedTotalRecords) throws SchemaChangeException {
  // TODO - this does not handle schema changes
  int numBatch = 0;
  long totalRecords = 0;
  BatchSchema schema = null;
  for (VectorAccessible loader : batches) {
    numBatch++;
    if (expectedSchema != null) {
      if (!expectedSchema.isEquivalent(loader.getSchema())) {
        throw new SchemaChangeException(String.format("Batch schema does not match expected schema\n"
            + "Actual schema: %s. Expected schema : %s", loader.getSchema(), expectedSchema));
      }
    }
    if (expectedBatchSize != null) {
      RecordBatchSizer sizer = new RecordBatchSizer(loader);
      // Not checking actualSize as accounting is not correct when we do
      // split and transfer ownership across operators.
      Assert.assertTrue(sizer.getNetBatchSize() <= expectedBatchSize);
    }
    if (schema == null) {
      schema = loader.getSchema();
      for (MaterializedField mf : schema) {
        combinedVectors.put(SchemaPath.getSimplePath(mf.getName()).toExpr(), new ArrayList<>());
      }
    } else {
      // TODO - actually handle schema changes; this is just to get access to the SelectionVectorMode
      // of the current batch, and the check for a null schema is used to only mutate the schema once.
      // Need to add new vectors and null-fill for previous batches? Is the distinction between null
      // and non-existence important?
      schema = loader.getSchema();
    }
    logger.debug("reading batch with {} rows, total read so far {}", loader.getRecordCount(), totalRecords);
    totalRecords += loader.getRecordCount();
    for (VectorWrapper<?> w : loader) {
      String field = SchemaPath.getSimplePath(w.getField().getName()).toExpr();
      ValueVector[] vectors;
      if (w.isHyper()) {
        vectors = w.getValueVectors();
      } else {
        vectors = new ValueVector[] { w.getValueVector() };
      }
      SelectionVector2 sv2 = null;
      SelectionVector4 sv4 = null;
      switch (schema.getSelectionVectorMode()) {
        case TWO_BYTE:
          sv2 = loader.getSelectionVector2();
          break;
        case FOUR_BYTE:
          sv4 = loader.getSelectionVector4();
          break;
        default:
      }
      if (sv4 != null) {
        for (int j = 0; j < sv4.getCount(); j++) {
          // An SV4 entry packs the batch index into the upper 16 bits and the
          // record index within that batch into the lower 16 bits.
          int complexIndex = sv4.get(j);
          int batchIndex = complexIndex >> 16;
          int recordIndexInBatch = complexIndex & 65535;
          Object obj = vectors[batchIndex].getAccessor().getObject(recordIndexInBatch);
          if (obj instanceof Text) {
            obj = obj.toString();
          }
          combinedVectors.get(field).add(obj);
        }
      } else {
        for (ValueVector vv : vectors) {
          for (int j = 0; j < loader.getRecordCount(); j++) {
            // With an SV2, j indexes into the selection vector; otherwise it is a direct index.
            int index = (sv2 != null) ? sv2.getIndex(j) : j;
            Object obj = vv.getAccessor().getObject(index);
            if (obj instanceof Text) {
              obj = obj.toString();
            }
            combinedVectors.get(field).add(obj);
          }
        }
      }
    }
  }
  if (expectedNumBatches != null) {
    // Based on how much memory is actually taken by the value vectors (because of the doubling
    // allocation), we would have to do complex math to predict the exact number of batches.
    // Instead, check that the number of batches is at least the minimum expected
    // and no more than twice that.
    Assert.assertTrue(numBatch >= expectedNumBatches);
    Assert.assertTrue(numBatch <= (2 * expectedNumBatches));
  }
  if (expectedTotalRecords != null) {
    Assert.assertEquals(expectedTotalRecords.longValue(), totalRecords);
  }
  return numBatch;
}
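For context, here is a minimal sketch of how a test might drive this method when no schema, size, or count expectations are needed. The helper below is hypothetical (not part of DrillTestWrapper); it assumes java.util imports and a batches iterable obtained from a test query run:

// Hypothetical helper: collect all batch values per column and summarize them.
static void summarizeBatches(Iterable<VectorAccessible> batches) throws SchemaChangeException {
  Map<String, List<Object>> combined = new HashMap<>();
  // Pass null for the optional schema/size/batch-count/record-count checks to skip them.
  int numBatches = DrillTestWrapper.addToCombinedVectorResults(batches, null, null, null, combined, null);
  for (Map.Entry<String, List<Object>> e : combined.entrySet()) {
    System.out.println(e.getKey() + ": " + e.getValue().size() + " values over " + numBatches + " batches");
  }
}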
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class TestOutputBatchSize method getExpectedSize.
/**
 * Figures out the total size of the batches produced for a given JSON input batch.
 */
private long getExpectedSize(List<String> expectedJsonBatches) throws ExecutionSetupException {
  // Create a dummy scan batch to figure out the size.
  RecordBatch scanBatch = new ScanBatch(new MockPhysicalOperator(), fragContext,
      getReaderListForJsonBatches(expectedJsonBatches, fragContext));
  Iterable<VectorAccessible> batches = new BatchIterator(scanBatch);
  long totalSize = 0;
  for (VectorAccessible batch : batches) {
    RecordBatchSizer sizer = new RecordBatchSizer(batch);
    totalSize += sizer.getNetBatchSize();
  }
  return totalSize;
}
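One plausible way a test consumes this helper, sketched under the assumption that the surrounding fixture (fragContext, BatchIterator, and an operator under test producing actualBatch) matches the setup shown above; the variable names here are illustrative:

// Hypothetical test fragment: bound each actual output batch by the expected total size.
long expectedSize = getExpectedSize(expectedJsonBatches);
for (VectorAccessible batch : new BatchIterator(actualBatch)) {
  RecordBatchSizer sizer = new RecordBatchSizer(batch);
  // No single output batch should exceed the net size of all the expected output.
  Assert.assertTrue(sizer.getNetBatchSize() <= expectedSize);
}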
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class TestOutputBatchSize method testSizerRepeatedList.
@Test
public void testSizerRepeatedList() throws Exception {
  List<String> inputJsonBatches = Lists.newArrayList();
  StringBuilder batchString = new StringBuilder();
  StringBuilder newString = new StringBuilder();
  newString.append("[ [1,2,3,4], [5,6,7,8] ]");
  numRows = 9;
  batchString.append("[");
  // 9 rows in the loop plus the final row appended below = 10 rows total.
  for (int i = 0; i < numRows; i++) {
    batchString.append("{\"c\" : " + newString);
    batchString.append("},");
  }
  batchString.append("{\"c\" : " + newString);
  batchString.append("}");
  batchString.append("]");
  inputJsonBatches.add(batchString.toString());
  // Create a dummy scan batch to figure out the size.
  RecordBatch scanBatch = new ScanBatch(new MockPhysicalOperator(), fragContext,
      getReaderListForJsonBatches(inputJsonBatches, fragContext));
  VectorAccessible va = new BatchIterator(scanBatch).iterator().next();
  RecordBatchSizer sizer = new RecordBatchSizer(va);
  assertEquals(1, sizer.columns().size());
  RecordBatchSizer.ColumnSize column = sizer.columns().get("c");
  assertNotNull(column);
  /**
   * stdDataSize: 8*5*5, stdNetSize: 8*5*5 + 4*5 + 4*5 + 4,
   * dataSizePerEntry: 8*8, netSizePerEntry: 8*8 + 4*2 + 4,
   * totalDataSize: 8*8*10, totalNetSize: netSizePerEntry*10, valueCount: 10,
   * elementCount: 20, estElementCountPerArray: 2, isVariableWidth: false
   */
  assertEquals(200, column.getStdDataSizePerEntry());
  assertEquals(244, column.getStdNetSizePerEntry());
  assertEquals(64, column.getDataSizePerEntry());
  assertEquals(76, column.getNetSizePerEntry());
  assertEquals(640, column.getTotalDataSize());
  assertEquals(760, column.getTotalNetSize());
  assertEquals(10, column.getValueCount());
  assertEquals(20, column.getElementCount());
  assertEquals(2, column.getCardinality(), 0.01);
  assertEquals(false, column.isVariableWidth());
  final int testRowCount = 1000;
  final int testRowCountPowerTwo = 2048;
  for (VectorWrapper<?> vw : va) {
    ValueVector v = vw.getValueVector();
    v.clear();
    RecordBatchSizer.ColumnSize colSize = sizer.getColumn(v.getField().getName());
    // Allocates to the nearest power of two.
    colSize.allocateVector(v, testRowCount);
    // The offset vector of the delegate vector, i.e. the outer array, should have
    // row count number of values.
    UInt4Vector offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount) << 1), offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    ValueVector vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have
    // 2 (outer array cardinality) * 4 (inner array cardinality) * row count number of values.
    ValueVector dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(Integer.highestOneBit((testRowCount * 8) << 1), dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount * 2) << 1), offsetVector.getValueCapacity());
    v.clear();
    // Allocates the same as the value passed since it is already a power of two.
    // -1 is the adjustment needed for the offset vector.
    colSize.allocateVector(v, testRowCountPowerTwo - 1);
    // The offset vector of the delegate vector, i.e. the outer array, should have
    // row count number of values.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(testRowCountPowerTwo, offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have
    // 2 (outer array cardinality) * 4 (inner array cardinality) * row count number of values.
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(testRowCountPowerTwo * 8, dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(testRowCountPowerTwo * 2, offsetVector.getValueCapacity());
    v.clear();
    // MAX ROW COUNT
    colSize.allocateVector(v, ValueVector.MAX_ROW_COUNT - 1);
    // The offset vector of the delegate vector, i.e. the outer array, should have
    // row count number of values.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT, offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have
    // 2 (outer array cardinality) * 4 (inner array cardinality) * row count number of values.
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 8, dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 2, offsetVector.getValueCapacity());
    v.clear();
    // MIN ROW COUNT
    colSize.allocateVector(v, 0);
    // The offset vector of the delegate vector, i.e. the outer array, should have 1 value.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have 1 value.
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * 1 values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT * 2, offsetVector.getValueCapacity());
    v.clear();
  }
}
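The capacity assertions above hinge on allocation rounding up to the next power of two. A standalone sketch of that rounding rule, in plain Java with no Drill dependencies:

// Round a requested capacity up to the next power of two, mirroring the
// behavior the assertions above expect.
static int roundUpToPowerOfTwo(int requested) {
  int highest = Integer.highestOneBit(requested);
  return (highest == requested) ? requested : highest << 1;
}
// roundUpToPowerOfTwo(1000) == 1024 (matches Integer.highestOneBit(1000) << 1);
// roundUpToPowerOfTwo(2048) == 2048 (already a power of two, unchanged).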
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class LateralJoinBatch method crossJoinAndOutputRecords.
/**
 * Main entry point for producing the output records. This method populates
 * the output batch after a cross join of the record in a given left batch at
 * the left index and all the corresponding rows in right batches produced by
 * Unnest for the current left batch. For each call to this function, the number
 * of records copied into the output batch is limited to the maximum rows the
 * output batch can hold or the number of rows in the right incoming batch.
 */
private void crossJoinAndOutputRecords() {
  final int rightRecordCount = right.getRecordCount();
  // If there is no record in the right batch, just return the current index in the output batch.
  if (rightRecordCount <= 0) {
    return;
  }
  // Check that the right batch has a valid index since we have to handle the left join case.
  Preconditions.checkState(rightJoinIndex != -1, "Right batch record count is >0 but index is -1");
  int currentOutIndex = outputIndex;
  // Number of rows that can be copied into the output batch.
  int maxAvailableRowSlot = maxOutputRowCount - currentOutIndex;
  if (logger.isDebugEnabled()) {
    logger.debug("Producing output for leftIndex: {}, rightIndex: {}, rightRecordCount: {}, outputIndex: {} and "
        + "availableSlotInOutput: {}", leftJoinIndex, rightJoinIndex, rightRecordCount, outputIndex,
        maxAvailableRowSlot);
    logger.debug("Output Batch stats before copying new data: {}", new RecordBatchSizer(this));
  }
  // Assuming that the first vector in the right batch is for the implicit column,
  // get a mapping of the number of rows for each rowId present in the current right side batch.
  // final Map<Integer, Integer> indexToFreq = getRowIdToRowCountMapping();
  final IntVector rowIdVector = (IntVector) implicitVector;
  final int leftRecordCount = left.getRecordCount();
  // Loop until the output batch is full or the right batch is exhausted, or vice-versa.
  while (maxAvailableRowSlot > 0 && rightJoinIndex < rightRecordCount) {
    // Get the rowId from the current right row.
    int currentRowId = rowIdVector.getAccessor().get(rightJoinIndex);
    int leftRowId = leftJoinIndex + 1;
    int numRowsCopied = 0;
    if (currentRowId > leftRecordCount || leftJoinIndex > leftRecordCount) {
      // Fail fast: the rowId on the right side must never run ahead of the left batch.
      throw new IllegalStateException(String.format("Either RowId in right batch is greater than total records in "
          + "left batch or all rows in left batch is processed but there are still rows in right batch. "
          + "Details[RightRowId: %s, LeftRecordCount: %s, LeftJoinIndex: %s, RightJoinIndex: %s]",
          currentRowId, leftRecordCount, leftJoinIndex, rightJoinIndex));
    }
    if (logger.isTraceEnabled()) {
      // Inside the if condition to eliminate parameter boxing cost.
      logger.trace("leftRowId and currentRowId are: {}, {}", leftRowId, currentRowId);
    }
    // If the left row matches the current right rowId, copy the rows out and update the indexes
    // and numRowsCopied. Also set matchedRecordFound to true to indicate when to increase leftJoinIndex.
    if (leftRowId == currentRowId) {
      // There is a match.
      matchedRecordFound = true;
      numRowsCopied = 1;
      // numRowsCopied = Math.min(indexToFreq.get(currentRowId), maxAvailableRowSlot);
      emitRight(rightJoinIndex, outputIndex, numRowsCopied);
      emitLeft(leftJoinIndex, outputIndex, numRowsCopied);
      outputIndex += numRowsCopied;
      rightJoinIndex += numRowsCopied;
    } else if (leftRowId < currentRowId) {
      // The right side has moved past this left row: advance to the next left row
      // and reset the matchedRecordFound flag.
      if (matchedRecordFound) {
        matchedRecordFound = false;
        ++leftJoinIndex;
        continue;
      } else {
        // No match was found for this left row. For a LEFT join, emit it anyway
        // and increase the indexes properly to reflect that.
        if (JoinRelType.LEFT == popConfig.getJoinType()) {
          numRowsCopied = 1;
          emitLeft(leftJoinIndex, outputIndex, numRowsCopied);
          ++outputIndex;
        }
        ++leftJoinIndex;
      }
    } else {
      Preconditions.checkState(leftRowId <= currentRowId, "Unexpected case where rowId "
          + "%s in right batch of lateral is smaller than rowId %s in left batch being processed",
          currentRowId, leftRowId);
    }
    // Update the max available row slots in the output batch.
    maxAvailableRowSlot -= numRowsCopied;
  }
}
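Stripped of vector copying, join-type handling, and batch boundaries, the control flow above is a merge between an implicit ascending left row counter and the ascending rowIds on the right. A simplified, self-contained sketch of just that matching logic (hypothetical, for illustration only):

// Left rows are implicitly numbered 1..leftCount; rightRowIds is ascending.
static void matchRowIds(int leftCount, int[] rightRowIds) {
  int leftIndex = 0;
  int rightIndex = 0;
  while (rightIndex < rightRowIds.length && leftIndex < leftCount) {
    int leftRowId = leftIndex + 1;
    int currentRowId = rightRowIds[rightIndex];
    if (leftRowId == currentRowId) {
      // Match: emit the joined pair and consume one right row.
      System.out.println("emit left row " + leftRowId + " with right row " + rightIndex);
      rightIndex++;
    } else {
      // The right side has moved past this left row: advance the left counter.
      leftIndex++;
    }
  }
}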
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class HashAggTemplate method updateEstMaxBatchSize.
/**
 * Updates the estimated max batch size to be used in the Hash Aggr Op,
 * using the record batch sizer to get the row width.
 * @param incoming
 */
private void updateEstMaxBatchSize(RecordBatch incoming) {
  // No handling of a schema (or varchar) change.
  if (estMaxBatchSize > 0) {
    return;
  }
  // Use the sizer to get the input row width and the length of the longest varchar column.
  RecordBatchSizer sizer = outgoing.getRecordBatchMemoryManager().getRecordBatchSizer();
  logger.trace("Incoming sizer: {}", sizer);
  // An empty batch only has the schema, so the actual length of the varchars cannot be told;
  // otherwise use the actual varchar lengths, each capped at 50 (to match the space allocation).
  long estInputRowWidth = sizer.rowCount() == 0 ? sizer.getStdRowWidth() : sizer.getNetRowWidthCap50();
  // Get the approximate max (varchar) column width to get a better memory allocation.
  maxColumnWidth = Math.max(sizer.getMaxAvgColumnSize(), VARIABLE_MIN_WIDTH_VALUE_SIZE);
  maxColumnWidth = Math.min(maxColumnWidth, VARIABLE_MAX_WIDTH_VALUE_SIZE);
  //
  // Calculate the estimated max (internal) batch (i.e. Keys batch + Values batch) size
  // (which is used to decide when to spill).
  // Also calculate the values batch size (used as a reserve to overcome an OOM).
  //
  Iterator<VectorWrapper<?>> outgoingIter = outContainer.iterator();
  int fieldId = 0;
  while (outgoingIter.hasNext()) {
    ValueVector vv = outgoingIter.next().getValueVector();
    MaterializedField mr = vv.getField();
    int fieldSize = vv instanceof VariableWidthVector ? maxColumnWidth : TypeHelper.getSize(mr.getType());
    estRowWidth += fieldSize;
    estOutputRowWidth += fieldSize;
    if (fieldId < numGroupByOutFields) {
      fieldId++;
    } else {
      estValuesRowWidth += fieldSize;
    }
  }
  // Multiply by the max number of rows in a batch to get the final estimated max size.
  long estimatedMaxWidth = Math.max(estRowWidth, estInputRowWidth);
  estMaxBatchSize = estimatedMaxWidth * MAX_BATCH_ROW_COUNT;
  // The estimated batch size should not exceed the configured size.
  int configuredBatchSize = outgoing.getRecordBatchMemoryManager().getOutputBatchSize();
  estMaxBatchSize = Math.min(estMaxBatchSize, configuredBatchSize);
  // Work back the number of rows (may have been reduced from MAX_BATCH_ROW_COUNT).
  long rowsInBatch = estMaxBatchSize / estimatedMaxWidth;
  // (When there are no aggr functions, use '1' as later code relies on this size being non-zero.)
  estValuesBatchSize = Math.max(estValuesRowWidth, 1) * rowsInBatch;
  // Initially assume the same size.
  estOutgoingAllocSize = estValuesBatchSize;
  logger.trace("{} phase. Estimated internal row width: {} Values row width: {} batch size: {} "
      + "memory limit: {} max column width: {}", phase.getName(), estRowWidth, estValuesRowWidth,
      estMaxBatchSize, allocator.getLimit(), maxColumnWidth);
  if (estMaxBatchSize > allocator.getLimit()) {
    logger.warn("HashAggregate: Estimated max batch size {} is larger than the memory limit {}",
        estMaxBatchSize, allocator.getLimit());
  }
}
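A worked instance of the final sizing arithmetic, with assumed numbers (the constants here are illustrative, not Drill's configured defaults):

// Assumed: estimated row width 200 bytes, 65536 max rows per batch, 16 MB configured cap.
long estimatedMaxWidth = 200;
long estMaxBatchSize = Math.min(estimatedMaxWidth * 65536, 16L * 1024 * 1024);
// 200 * 65536 = 13,107,200 bytes, below the 16,777,216-byte cap, so it stands.
long rowsInBatch = estMaxBatchSize / estimatedMaxWidth;  // 65536 rows survive the cap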