Use of org.apache.hadoop.hive.ql.exec.vector.rowbytescontainer.VectorRowBytesContainer in project hive by apache.
The class VectorPTFGroupBatches, method getSpillRowBytesContainer.
private VectorRowBytesContainer getSpillRowBytesContainer() throws HiveException {
  if (spillRowBytesContainer == null) {
    spillRowBytesContainer = new VectorRowBytesContainer(spillLocalDirs);
    if (bufferedBatchVectorSerializeRow == null) {
      bufferedBatchVectorSerializeRow =
          new VectorSerializeRow<LazyBinarySerializeWrite>(
              new LazyBinarySerializeWrite(bufferedColumnMap.length));

      // Serialize just the columns a buffered batch has, which are only the non-key inputs and
      // streamed column outputs.
      bufferedBatchVectorSerializeRow.init(bufferedTypeInfos);

      bufferedBatchVectorDeserializeRow =
          new VectorDeserializeRow<LazyBinaryDeserializeRead>(
              new LazyBinaryDeserializeRead(bufferedTypeInfos, /* useExternalBuffer */ true));

      // Deserialize the fields into the *overflow* batch using the buffered batch column map.
      bufferedBatchVectorDeserializeRow.init(bufferedColumnMap);
    }
  }
  return spillRowBytesContainer;
}
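For orientation, the serializer/deserializer pair initialized above is used against a single VectorRowBytesContainer in a write-then-read cycle. The following is a minimal sketch of that cycle, not Hive source; it reuses the field names from the method above, while batch, batchIndex, and overflowBatch stand in for whatever batches the caller is spilling and refilling:
// Sketch only: write rows into the container, then switch it to read mode and replay them.
VectorRowBytesContainer container = new VectorRowBytesContainer(spillLocalDirs);

// Write phase: one getOuputForRowBytes()/finishRow() pair per spilled row.
Output output = container.getOuputForRowBytes();
bufferedBatchVectorSerializeRow.setOutputAppend(output);
bufferedBatchVectorSerializeRow.serializeWrite(batch, batchIndex);
container.finishRow();

// Read phase: prepareForReading() switches the container over; readNext() walks the rows.
container.prepareForReading();
while (container.readNext()) {
  bufferedBatchVectorDeserializeRow.setBytes(
      container.currentBytes(), container.currentOffset(), container.currentLength());
  bufferedBatchVectorDeserializeRow.deserialize(overflowBatch, overflowBatch.size);
  overflowBatch.size++;
}
container.clear();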
Use of org.apache.hadoop.hive.ql.exec.vector.rowbytescontainer.VectorRowBytesContainer in project hive by apache.
The class VectorMapJoinGenerateResultOperator, method reProcessBigTable.
@Override
protected void reProcessBigTable(int partitionId) throws HiveException {
  if (LOG.isDebugEnabled()) {
    LOG.debug(CLASS_NAME + " reProcessBigTable enter...");
  }
  if (spillReplayBatch == null) {
    // The process method was not called -- no big table rows.
    return;
  }
  HashPartition partition = firstSmallTable.getHashPartitions()[partitionId];
  int rowCount = 0;
  int batchCount = 0;
  try {
    VectorRowBytesContainer bigTable = partition.getMatchfileRowBytesContainer();
    bigTable.prepareForReading();
    while (bigTable.readNext()) {
      rowCount++;
      byte[] bytes = bigTable.currentBytes();
      int offset = bigTable.currentOffset();
      int length = bigTable.currentLength();
      bigTableVectorDeserializeRow.setBytes(bytes, offset, length);
      try {
        bigTableVectorDeserializeRow.deserialize(spillReplayBatch, spillReplayBatch.size);
      } catch (Exception e) {
        throw new HiveException(
            "\nDeserializeRead detail: " +
                bigTableVectorDeserializeRow.getDetailedReadPositionString(), e);
      }
      spillReplayBatch.size++;
      if (spillReplayBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
        // Call process once we have a full batch.
        process(spillReplayBatch, posBigTable);
        spillReplayBatch.reset();
        batchCount++;
      }
    }
    // Process the final batch that has fewer than DEFAULT_SIZE rows.
    if (spillReplayBatch.size > 0) {
      process(spillReplayBatch, posBigTable);
      spillReplayBatch.reset();
      batchCount++;
    }
    bigTable.clear();
  } catch (Exception e) {
    LOG.info(CLASS_NAME + " reProcessBigTable exception! " + e);
    throw new HiveException(e);
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug(CLASS_NAME + " reProcessBigTable exit! " + rowCount +
        " rows processed and " + batchCount + " batches processed");
  }
}
Use of org.apache.hadoop.hive.ql.exec.vector.rowbytescontainer.VectorRowBytesContainer in project hive by apache.
The class VectorPTFGroupBatches, method bufferGroupBatch.
public void bufferGroupBatch(VectorizedRowBatch batch) throws HiveException {
  try {
    // When we've buffered the max allowed, spill the oldest one to make space.
    if (currentBufferedBatchCount >= spillLimitBufferedBatchCount) {
      VectorRowBytesContainer rowBytesContainer = getSpillRowBytesContainer();
      if (!didSpillToDisk) {
        // UNDONE: Don't reuse for now.
        // rowBytesContainer.resetWrite();
        didSpillToDisk = true;
        spillRowCount = 0;
      }
      // Grab the oldest in-memory buffered batch and dump it to disk.
      VectorizedRowBatch oldestBufferedBatch = bufferedBatches.remove(0);
      final boolean selectedInUse = oldestBufferedBatch.selectedInUse;
      int[] selected = oldestBufferedBatch.selected;
      final int size = oldestBufferedBatch.size;
      for (int logicalIndex = 0; logicalIndex < size; logicalIndex++) {
        final int batchIndex = (selectedInUse ? selected[logicalIndex] : logicalIndex);
        Output output = rowBytesContainer.getOuputForRowBytes();
        bufferedBatchVectorSerializeRow.setOutputAppend(output);
        bufferedBatchVectorSerializeRow.serializeWrite(oldestBufferedBatch, batchIndex);
        rowBytesContainer.finishRow();
        spillRowCount++;
      }
      // Put the now-available buffered batch at the end.
      oldestBufferedBatch.reset();
      bufferedBatches.add(oldestBufferedBatch);
      currentBufferedBatchCount--;
    }
    final int bufferedColumnCount = bufferedColumnMap.length;
    if (allocatedBufferedBatchCount <= currentBufferedBatchCount) {
      VectorizedRowBatch newBatch = newBufferedBatch(batch);
      bufferedBatches.add(newBatch);
      allocatedBufferedBatchCount++;
    }
    VectorizedRowBatch bufferedBatch = bufferedBatches.get(currentBufferedBatchCount++);
    // Copy critical columns.
    final int size = batch.size;
    for (int i = 0; i < bufferedColumnCount; i++) {
      VectorizedBatchUtil.copyNonSelectedColumnVector(batch, bufferedColumnMap[i], bufferedBatch, i, size);
    }
    bufferedBatch.size = size;
  } catch (IOException e) {
    throw new HiveException(e);
  }
}
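The loop that dumps the oldest buffered batch relies on the standard VectorizedRowBatch selection idiom: when selectedInUse is true, the selected array maps logical row positions to physical row indexes; otherwise rows simply occupy positions 0 through size - 1. A stripped-down sketch of that idiom, for illustration only:
// Logical-to-physical row mapping when iterating a VectorizedRowBatch.
for (int logicalIndex = 0; logicalIndex < batch.size; logicalIndex++) {
  final int batchIndex = batch.selectedInUse ? batch.selected[logicalIndex] : logicalIndex;
  // Row data for this logical position lives at 'batchIndex' in every ColumnVector of the batch.
}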
Use of org.apache.hadoop.hive.ql.exec.vector.rowbytescontainer.VectorRowBytesContainer in project hive by apache.
The class VectorPTFGroupBatches, method forwardSpilledBatches.
private void forwardSpilledBatches(VectorPTFOperator vecPTFOperator, VectorizedRowBatch lastBatch) throws HiveException {
  overflowBatch.reset();
  copyPartitionAndOrderColumnsToOverflow(lastBatch);
  long spillRowsRead = 0;
  try {
    VectorRowBytesContainer rowBytesContainer = getSpillRowBytesContainer();
    rowBytesContainer.prepareForReading();
    while (rowBytesContainer.readNext()) {
      byte[] bytes = rowBytesContainer.currentBytes();
      int offset = rowBytesContainer.currentOffset();
      int length = rowBytesContainer.currentLength();
      bufferedBatchVectorDeserializeRow.setBytes(bytes, offset, length);
      try {
        bufferedBatchVectorDeserializeRow.deserialize(overflowBatch, overflowBatch.size);
      } catch (Exception e) {
        throw new HiveException(
            "\nDeserializeRead detail: " +
                bufferedBatchVectorDeserializeRow.getDetailedReadPositionString(), e);
      }
      overflowBatch.size++;
      spillRowsRead++;
      if (overflowBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
        fillGroupResults(overflowBatch);
        vecPTFOperator.forward(overflowBatch, null);
        overflowBatch.reset();
        copyPartitionAndOrderColumnsToOverflow(lastBatch);
      }
    }
    // Process the final batch that has fewer than DEFAULT_SIZE rows.
    if (overflowBatch.size > 0) {
      fillGroupResults(overflowBatch);
      vecPTFOperator.forward(overflowBatch, null);
      overflowBatch.reset();
      copyPartitionAndOrderColumnsToOverflow(lastBatch);
    }
    Preconditions.checkState(spillRowsRead == spillRowCount);
    // For now, throw away the file.
    releaseSpillRowBytesContainer();
  } catch (Exception e) {
    throw new HiveException(e);
  }
}
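releaseSpillRowBytesContainer() is called here but not shown on this page. Based on the lazy-creation logic in getSpillRowBytesContainer() above, a plausible minimal shape is sketched below; it is an assumption for illustration, not the actual Hive implementation:
// Assumed shape only: drop the spill data and force getSpillRowBytesContainer() to recreate it.
private void releaseSpillRowBytesContainer() {
  if (spillRowBytesContainer != null) {
    spillRowBytesContainer.clear();   // clear() is also how reProcessBigTable releases its container
    spillRowBytesContainer = null;
  }
}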
Use of org.apache.hadoop.hive.ql.exec.vector.rowbytescontainer.VectorRowBytesContainer in project hive by apache.
The class VectorMapJoinGenerateResultOperator, method spillSerializeRow.
private void spillSerializeRow(VectorizedRowBatch batch, int batchIndex,
    VectorMapJoinHashTableResult hashTableResult) throws IOException {
  int partitionId = hashTableResult.spillPartitionId();
  HybridHashTableContainer ht = (HybridHashTableContainer) mapJoinTables[posSingleVectorMapJoinSmallTable];
  HashPartition hp = ht.getHashPartitions()[partitionId];
  VectorRowBytesContainer rowBytesContainer = hp.getMatchfileRowBytesContainer();
  Output output = rowBytesContainer.getOuputForRowBytes();
  // int offset = output.getLength();
  bigTableVectorSerializeRow.setOutputAppend(output);
  bigTableVectorSerializeRow.serializeWrite(batch, batchIndex);
  // int length = output.getLength() - offset;
  rowBytesContainer.finishRow();
  // LOG.debug("spillSerializeRow spilled batchIndex " + batchIndex + ", length " + length);
}
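Read together with reProcessBigTable earlier on this page, this method is the write half of the big-table spill path: each spilled row is appended to the partition's match-file container here and replayed from it later. A condensed pairing of the two halves, using only calls that appear in those methods (hashPartition is a placeholder for the HashPartition, and batch, batchIndex, and the row serializer/deserializer fields are assumed to be set up by the operator):
// Write side (sketch): during process(), a row destined for a spilled partition is appended.
VectorRowBytesContainer container = hashPartition.getMatchfileRowBytesContainer();
Output out = container.getOuputForRowBytes();
bigTableVectorSerializeRow.setOutputAppend(out);
bigTableVectorSerializeRow.serializeWrite(batch, batchIndex);
container.finishRow();

// Read side (sketch): reProcessBigTable() replays the same container into spillReplayBatch.
container.prepareForReading();
while (container.readNext()) {
  bigTableVectorDeserializeRow.setBytes(
      container.currentBytes(), container.currentOffset(), container.currentLength());
  bigTableVectorDeserializeRow.deserialize(spillReplayBatch, spillReplayBatch.size++);
  // ... call process(spillReplayBatch, posBigTable) once it reaches VectorizedRowBatch.DEFAULT_SIZE ...
}
container.clear();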