use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class DrillTestWrapper method addToCombinedVectorResults.
/**
 * Adds to the result vectors and compares the batch schema against the expected schema
 * while iterating batches.
 * @param batches the batches to iterate
 * @param expectedSchema the schema the batches are expected to have; a
 *        SchemaChangeException is thrown if a batch has a different schema
 * @param expectedBatchSize if non-null, the net size each batch must not exceed
 * @param expectedNumBatches if non-null, the expected minimum number of batches
 * @param combinedVectors the map that accumulates the values while iterating the batches
 * @param expectedTotalRecords if non-null, the expected total record count
 *
 * @return number of batches
 * @throws SchemaChangeException
 */
public static int addToCombinedVectorResults(Iterable<VectorAccessible> batches, BatchSchema expectedSchema,
    Long expectedBatchSize, Integer expectedNumBatches, Map<String, List<Object>> combinedVectors,
    Integer expectedTotalRecords) throws SchemaChangeException {
  // TODO - this does not handle schema changes
  int numBatch = 0;
  long totalRecords = 0;
  BatchSchema schema = null;
  for (VectorAccessible loader : batches) {
    numBatch++;
    if (expectedSchema != null) {
      if (!expectedSchema.isEquivalent(loader.getSchema())) {
        throw new SchemaChangeException(String.format("Batch schema does not match expected schema\n"
            + "Actual schema: %s. Expected schema : %s", loader.getSchema(), expectedSchema));
      }
    }
    if (expectedBatchSize != null) {
      RecordBatchSizer sizer = new RecordBatchSizer(loader);
      // Not checking actualSize as accounting is not correct when we do
      // split and transfer ownership across operators.
      Assert.assertTrue(sizer.getNetBatchSize() <= expectedBatchSize);
    }
    if (schema == null) {
      schema = loader.getSchema();
      for (MaterializedField mf : schema) {
        combinedVectors.put(SchemaPath.getSimplePath(mf.getName()).toExpr(), new ArrayList<>());
      }
    } else {
      // TODO - actually handle schema changes; this is just to get access to the SelectionVectorMode
      // of the current batch, and the check for a null schema is used to only mutate the schema once.
      // Need to add new vectors and null-fill for previous batches? Is the distinction between null
      // and non-existence important?
      schema = loader.getSchema();
    }
    logger.debug("reading batch with {} rows, total read so far {}", loader.getRecordCount(), totalRecords);
    totalRecords += loader.getRecordCount();
    for (VectorWrapper<?> w : loader) {
      String field = SchemaPath.getSimplePath(w.getField().getName()).toExpr();
      ValueVector[] vectors;
      if (w.isHyper()) {
        vectors = w.getValueVectors();
      } else {
        vectors = new ValueVector[] { w.getValueVector() };
      }
      SelectionVector2 sv2 = null;
      SelectionVector4 sv4 = null;
      switch (schema.getSelectionVectorMode()) {
        case TWO_BYTE:
          sv2 = loader.getSelectionVector2();
          break;
        case FOUR_BYTE:
          sv4 = loader.getSelectionVector4();
          break;
        default:
      }
      if (sv4 != null) {
        for (int j = 0; j < sv4.getCount(); j++) {
          // An SV4 entry packs the batch index into the upper 16 bits and the
          // record index within that batch into the lower 16 bits.
          int complexIndex = sv4.get(j);
          int batchIndex = complexIndex >> 16;
          int recordIndexInBatch = complexIndex & 65535;
          Object obj = vectors[batchIndex].getAccessor().getObject(recordIndexInBatch);
          if (obj instanceof Text) {
            obj = obj.toString();
          }
          combinedVectors.get(field).add(obj);
        }
      } else {
        for (ValueVector vv : vectors) {
          for (int j = 0; j < loader.getRecordCount(); j++) {
            // With an SV2, j indexes into the selection vector; otherwise it is a direct index.
            int index = (sv2 != null) ? sv2.getIndex(j) : j;
            Object obj = vv.getAccessor().getObject(index);
            if (obj instanceof Text) {
              obj = obj.toString();
            }
            combinedVectors.get(field).add(obj);
          }
        }
      }
    }
  }
  if (expectedNumBatches != null) {
    // Based on how much memory is actually taken by the value vectors (because of the doubling
    // allocation), we would have to do complex math to predict the exact number of batches.
    // Instead, check that the number of batches is at least the minimum expected
    // and no more than twice that.
    Assert.assertTrue(numBatch >= expectedNumBatches);
    Assert.assertTrue(numBatch <= (2 * expectedNumBatches));
  }
  if (expectedTotalRecords != null) {
    Assert.assertEquals(expectedTotalRecords.longValue(), totalRecords);
  }
  return numBatch;
}
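For context, here is a minimal sketch of how a test might drive this method when no schema, size, or count expectations are needed. The helper below is hypothetical (not part of DrillTestWrapper); it assumes java.util imports and a batches iterable obtained from a test query run:

// Hypothetical helper: collect all batch values per column and summarize them.
static void summarizeBatches(Iterable<VectorAccessible> batches) throws SchemaChangeException {
  Map<String, List<Object>> combined = new HashMap<>();
  // Pass null for the optional schema/size/batch-count/record-count checks to skip them.
  int numBatches = DrillTestWrapper.addToCombinedVectorResults(batches, null, null, null, combined, null);
  for (Map.Entry<String, List<Object>> e : combined.entrySet()) {
    System.out.println(e.getKey() + ": " + e.getValue().size() + " values over " + numBatches + " batches");
  }
}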
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class TestOutputBatchSize method getExpectedSize.
/**
 * Figures out the total size of the batches produced for a given JSON input batch.
 */
private long getExpectedSize(List<String> expectedJsonBatches) throws ExecutionSetupException {
  // Create a dummy scan batch to figure out the size.
  RecordBatch scanBatch = new ScanBatch(new MockPhysicalOperator(), fragContext,
      getReaderListForJsonBatches(expectedJsonBatches, fragContext));
  Iterable<VectorAccessible> batches = new BatchIterator(scanBatch);
  long totalSize = 0;
  for (VectorAccessible batch : batches) {
    RecordBatchSizer sizer = new RecordBatchSizer(batch);
    totalSize += sizer.getNetBatchSize();
  }
  return totalSize;
}
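One plausible way a test consumes this helper, sketched under the assumption that the surrounding fixture (fragContext, BatchIterator, and an operator under test producing actualBatch) matches the setup shown above; the variable names here are illustrative:

// Hypothetical test fragment: bound each actual output batch by the expected total size.
long expectedSize = getExpectedSize(expectedJsonBatches);
for (VectorAccessible batch : new BatchIterator(actualBatch)) {
  RecordBatchSizer sizer = new RecordBatchSizer(batch);
  // No single output batch should exceed the net size of all the expected output.
  Assert.assertTrue(sizer.getNetBatchSize() <= expectedSize);
}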
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class TestOutputBatchSize method testSizerRepeatedList.
@Test
public void testSizerRepeatedList() throws Exception {
  List<String> inputJsonBatches = Lists.newArrayList();
  StringBuilder batchString = new StringBuilder();
  StringBuilder newString = new StringBuilder();
  newString.append("[ [1,2,3,4], [5,6,7,8] ]");
  numRows = 9;
  batchString.append("[");
  // 9 rows in the loop plus the final row appended below = 10 rows total.
  for (int i = 0; i < numRows; i++) {
    batchString.append("{\"c\" : " + newString);
    batchString.append("},");
  }
  batchString.append("{\"c\" : " + newString);
  batchString.append("}");
  batchString.append("]");
  inputJsonBatches.add(batchString.toString());
  // Create a dummy scan batch to figure out the size.
  RecordBatch scanBatch = new ScanBatch(new MockPhysicalOperator(), fragContext,
      getReaderListForJsonBatches(inputJsonBatches, fragContext));
  VectorAccessible va = new BatchIterator(scanBatch).iterator().next();
  RecordBatchSizer sizer = new RecordBatchSizer(va);
  assertEquals(1, sizer.columns().size());
  RecordBatchSizer.ColumnSize column = sizer.columns().get("c");
  assertNotNull(column);
  /**
   * stdDataSize: 8*5*5, stdNetSize: 8*5*5 + 4*5 + 4*5 + 4,
   * dataSizePerEntry: 8*8, netSizePerEntry: 8*8 + 4*2 + 4,
   * totalDataSize: 8*8*10, totalNetSize: netSizePerEntry*10, valueCount: 10,
   * elementCount: 20, estElementCountPerArray: 2, isVariableWidth: false
   */
  assertEquals(200, column.getStdDataSizePerEntry());
  assertEquals(244, column.getStdNetSizePerEntry());
  assertEquals(64, column.getDataSizePerEntry());
  assertEquals(76, column.getNetSizePerEntry());
  assertEquals(640, column.getTotalDataSize());
  assertEquals(760, column.getTotalNetSize());
  assertEquals(10, column.getValueCount());
  assertEquals(20, column.getElementCount());
  assertEquals(2, column.getCardinality(), 0.01);
  assertEquals(false, column.isVariableWidth());
  final int testRowCount = 1000;
  final int testRowCountPowerTwo = 2048;
  for (VectorWrapper<?> vw : va) {
    ValueVector v = vw.getValueVector();
    v.clear();
    RecordBatchSizer.ColumnSize colSize = sizer.getColumn(v.getField().getName());
    // Allocates to the nearest power of two.
    colSize.allocateVector(v, testRowCount);
    // The offset vector of the delegate vector, i.e. the outer array, should have
    // row count number of values.
    UInt4Vector offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount) << 1), offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    ValueVector vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have
    // 2 (outer array cardinality) * 4 (inner array cardinality) * row count number of values.
    ValueVector dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(Integer.highestOneBit((testRowCount * 8) << 1), dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals((Integer.highestOneBit(testRowCount * 2) << 1), offsetVector.getValueCapacity());
    v.clear();
    // Allocates the same as the value passed since it is already a power of two.
    // -1 is the adjustment needed for the offset vector.
    colSize.allocateVector(v, testRowCountPowerTwo - 1);
    // The offset vector of the delegate vector, i.e. the outer array, should have
    // row count number of values.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(testRowCountPowerTwo, offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have
    // 2 (outer array cardinality) * 4 (inner array cardinality) * row count number of values.
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(testRowCountPowerTwo * 8, dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(testRowCountPowerTwo * 2, offsetVector.getValueCapacity());
    v.clear();
    // MAX ROW COUNT
    colSize.allocateVector(v, ValueVector.MAX_ROW_COUNT - 1);
    // The offset vector of the delegate vector, i.e. the outer array, should have
    // row count number of values.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT, offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have
    // 2 (outer array cardinality) * 4 (inner array cardinality) * row count number of values.
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 8, dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * row count number of values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(ValueVector.MAX_ROW_COUNT * 2, offsetVector.getValueCapacity());
    v.clear();
    // MIN ROW COUNT
    colSize.allocateVector(v, 0);
    // The offset vector of the delegate vector, i.e. the outer array, should have 1 value.
    offsetVector = ((RepeatedListVector) v).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, offsetVector.getValueCapacity());
    // Get the inner vector of the delegate vector.
    vector = ((RepeatedValueVector) v).getDataVector();
    // The data vector of the inner vector should have 1 value.
    dataVector = ((RepeatedValueVector) vector).getDataVector();
    assertEquals(ValueVector.MIN_ROW_COUNT, dataVector.getValueCapacity());
    // The offset vector of the inner vector should have
    // 2 (outer array cardinality) * 1 values.
    offsetVector = ((RepeatedValueVector) vector).getOffsetVector();
    assertEquals(ValueVector.MIN_ROW_COUNT * 2, offsetVector.getValueCapacity());
    v.clear();
  }
}
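The capacity assertions above hinge on allocation rounding up to the next power of two. A standalone sketch of that rounding rule, in plain Java with no Drill dependencies:

// Round a requested capacity up to the next power of two, mirroring the
// behavior the assertions above expect.
static int roundUpToPowerOfTwo(int requested) {
  int highest = Integer.highestOneBit(requested);
  return (highest == requested) ? requested : highest << 1;
}
// roundUpToPowerOfTwo(1000) == 1024 (matches Integer.highestOneBit(1000) << 1);
// roundUpToPowerOfTwo(2048) == 2048 (already a power of two, unchanged).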
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class LateralJoinBatch method crossJoinAndOutputRecords.
/**
 * Main entry point for producing the output records. This method populates
 * the output batch after a cross join of the record in a given left batch at
 * the left index and all the corresponding rows in right batches produced by
 * Unnest for the current left batch. For each call to this function, the number
 * of records copied into the output batch is limited to the maximum rows the
 * output batch can hold or the number of rows in the right incoming batch.
 */
private void crossJoinAndOutputRecords() {
  final int rightRecordCount = right.getRecordCount();
  // If there is no record in the right batch, just return the current index in the output batch.
  if (rightRecordCount <= 0) {
    return;
  }
  // Check that the right batch has a valid index since we have to handle the left join case.
  Preconditions.checkState(rightJoinIndex != -1, "Right batch record count is >0 but index is -1");
  int currentOutIndex = outputIndex;
  // Number of rows that can be copied into the output batch.
  int maxAvailableRowSlot = maxOutputRowCount - currentOutIndex;
  if (logger.isDebugEnabled()) {
    logger.debug("Producing output for leftIndex: {}, rightIndex: {}, rightRecordCount: {}, outputIndex: {} and "
        + "availableSlotInOutput: {}", leftJoinIndex, rightJoinIndex, rightRecordCount, outputIndex,
        maxAvailableRowSlot);
    logger.debug("Output Batch stats before copying new data: {}", new RecordBatchSizer(this));
  }
  // Assuming that the first vector in the right batch is for the implicit column,
  // get a mapping of the number of rows for each rowId present in the current right side batch.
  // final Map<Integer, Integer> indexToFreq = getRowIdToRowCountMapping();
  final IntVector rowIdVector = (IntVector) implicitVector;
  final int leftRecordCount = left.getRecordCount();
  // Loop until the output batch is full or the right batch is exhausted, or vice-versa.
  while (maxAvailableRowSlot > 0 && rightJoinIndex < rightRecordCount) {
    // Get the rowId from the current right row.
    int currentRowId = rowIdVector.getAccessor().get(rightJoinIndex);
    int leftRowId = leftJoinIndex + 1;
    int numRowsCopied = 0;
    if (currentRowId > leftRecordCount || leftJoinIndex > leftRecordCount) {
      // Fail fast: the rowId on the right side must never run ahead of the left batch.
      throw new IllegalStateException(String.format("Either RowId in right batch is greater than total records in "
          + "left batch or all rows in left batch is processed but there are still rows in right batch. "
          + "Details[RightRowId: %s, LeftRecordCount: %s, LeftJoinIndex: %s, RightJoinIndex: %s]",
          currentRowId, leftRecordCount, leftJoinIndex, rightJoinIndex));
    }
    if (logger.isTraceEnabled()) {
      // Inside the if condition to eliminate parameter boxing cost.
      logger.trace("leftRowId and currentRowId are: {}, {}", leftRowId, currentRowId);
    }
    // If the left row matches the current right rowId, copy the rows out and update the indexes
    // and numRowsCopied. Also set matchedRecordFound to true to indicate when to increase leftJoinIndex.
    if (leftRowId == currentRowId) {
      // There is a match.
      matchedRecordFound = true;
      numRowsCopied = 1;
      // numRowsCopied = Math.min(indexToFreq.get(currentRowId), maxAvailableRowSlot);
      emitRight(rightJoinIndex, outputIndex, numRowsCopied);
      emitLeft(leftJoinIndex, outputIndex, numRowsCopied);
      outputIndex += numRowsCopied;
      rightJoinIndex += numRowsCopied;
    } else if (leftRowId < currentRowId) {
      // The right side has moved past this left row: advance to the next left row
      // and reset the matchedRecordFound flag.
      if (matchedRecordFound) {
        matchedRecordFound = false;
        ++leftJoinIndex;
        continue;
      } else {
        // No match was found for this left row. For a LEFT join, emit it anyway
        // and increase the indexes properly to reflect that.
        if (JoinRelType.LEFT == popConfig.getJoinType()) {
          numRowsCopied = 1;
          emitLeft(leftJoinIndex, outputIndex, numRowsCopied);
          ++outputIndex;
        }
        ++leftJoinIndex;
      }
    } else {
      Preconditions.checkState(leftRowId <= currentRowId, "Unexpected case where rowId "
          + "%s in right batch of lateral is smaller than rowId %s in left batch being processed",
          currentRowId, leftRowId);
    }
    // Update the max available row slots in the output batch.
    maxAvailableRowSlot -= numRowsCopied;
  }
}
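Stripped of vector copying, join-type handling, and batch boundaries, the control flow above is a merge between an implicit ascending left row counter and the ascending rowIds on the right. A simplified, self-contained sketch of just that matching logic (hypothetical, for illustration only):

// Left rows are implicitly numbered 1..leftCount; rightRowIds is ascending.
static void matchRowIds(int leftCount, int[] rightRowIds) {
  int leftIndex = 0;
  int rightIndex = 0;
  while (rightIndex < rightRowIds.length && leftIndex < leftCount) {
    int leftRowId = leftIndex + 1;
    int currentRowId = rightRowIds[rightIndex];
    if (leftRowId == currentRowId) {
      // Match: emit the joined pair and consume one right row.
      System.out.println("emit left row " + leftRowId + " with right row " + rightIndex);
      rightIndex++;
    } else {
      // The right side has moved past this left row: advance the left counter.
      leftIndex++;
    }
  }
}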
use of org.apache.drill.exec.record.RecordBatchSizer in project drill by apache.
the class HashAggTemplate method updateEstMaxBatchSize.
/**
 * Updates the estimated max batch size to be used in the Hash Aggr Op,
 * using the record batch sizer to get the row width.
 * @param incoming
 */
private void updateEstMaxBatchSize(RecordBatch incoming) {
  // No handling of a schema (or varchar) change.
  if (estMaxBatchSize > 0) {
    return;
  }
  // Use the sizer to get the input row width and the length of the longest varchar column.
  RecordBatchSizer sizer = outgoing.getRecordBatchMemoryManager().getRecordBatchSizer();
  logger.trace("Incoming sizer: {}", sizer);
  // An empty batch only has the schema, so the actual length of the varchars cannot be told;
  // otherwise use the actual varchar lengths, each capped at 50 (to match the space allocation).
  long estInputRowWidth = sizer.rowCount() == 0 ? sizer.getStdRowWidth() : sizer.getNetRowWidthCap50();
  // Get the approximate max (varchar) column width to get a better memory allocation.
  maxColumnWidth = Math.max(sizer.getMaxAvgColumnSize(), VARIABLE_MIN_WIDTH_VALUE_SIZE);
  maxColumnWidth = Math.min(maxColumnWidth, VARIABLE_MAX_WIDTH_VALUE_SIZE);
  //
  // Calculate the estimated max (internal) batch (i.e. Keys batch + Values batch) size
  // (which is used to decide when to spill).
  // Also calculate the values batch size (used as a reserve to overcome an OOM).
  //
  Iterator<VectorWrapper<?>> outgoingIter = outContainer.iterator();
  int fieldId = 0;
  while (outgoingIter.hasNext()) {
    ValueVector vv = outgoingIter.next().getValueVector();
    MaterializedField mr = vv.getField();
    int fieldSize = vv instanceof VariableWidthVector ? maxColumnWidth : TypeHelper.getSize(mr.getType());
    estRowWidth += fieldSize;
    estOutputRowWidth += fieldSize;
    if (fieldId < numGroupByOutFields) {
      fieldId++;
    } else {
      estValuesRowWidth += fieldSize;
    }
  }
  // Multiply by the max number of rows in a batch to get the final estimated max size.
  long estimatedMaxWidth = Math.max(estRowWidth, estInputRowWidth);
  estMaxBatchSize = estimatedMaxWidth * MAX_BATCH_ROW_COUNT;
  // The estimated batch size should not exceed the configured size.
  int configuredBatchSize = outgoing.getRecordBatchMemoryManager().getOutputBatchSize();
  estMaxBatchSize = Math.min(estMaxBatchSize, configuredBatchSize);
  // Work back the number of rows (may have been reduced from MAX_BATCH_ROW_COUNT).
  long rowsInBatch = estMaxBatchSize / estimatedMaxWidth;
  // (When there are no aggr functions, use '1' as later code relies on this size being non-zero.)
  estValuesBatchSize = Math.max(estValuesRowWidth, 1) * rowsInBatch;
  // Initially assume the same size.
  estOutgoingAllocSize = estValuesBatchSize;
  logger.trace("{} phase. Estimated internal row width: {} Values row width: {} batch size: {} "
      + "memory limit: {} max column width: {}", phase.getName(), estRowWidth, estValuesRowWidth,
      estMaxBatchSize, allocator.getLimit(), maxColumnWidth);
  if (estMaxBatchSize > allocator.getLimit()) {
    logger.warn("HashAggregate: Estimated max batch size {} is larger than the memory limit {}",
        estMaxBatchSize, allocator.getLimit());
  }
}
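A worked instance of the final sizing arithmetic, with assumed numbers (the constants here are illustrative, not Drill's configured defaults):

// Assumed: estimated row width 200 bytes, 65536 max rows per batch, 16 MB configured cap.
long estimatedMaxWidth = 200;
long estMaxBatchSize = Math.min(estimatedMaxWidth * 65536, 16L * 1024 * 1024);
// 200 * 65536 = 13,107,200 bytes, below the 16,777,216-byte cap, so it stands.
long rowsInBatch = estMaxBatchSize / estimatedMaxWidth;  // 65536 rows survive the cap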