Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
Class VectorDeserializeOrcWriter, method flushBatch().
private void flushBatch() throws IOException {
  addBatchToWriter();
  if (!isAsync) {
    for (int c = 0; c < sourceBatch.cols.length; ++c) {
      // This resets vectors in both batches.
      ColumnVector colVector = sourceBatch.cols[c];
      if (colVector != null) {
        colVector.reset();
        colVector.init();
      }
    }
    sourceBatch.selectedInUse = false;
    sourceBatch.size = 0;
    sourceBatch.endOfFile = false;
    propagateSourceBatchFieldsToDest();
  } else {
    // In addBatchToWriter, we have passed the batch to both the ORC writer and the operator
    // pipeline (neither ever changes the vectors), so we need a fresh set of vectors to write to.
    // TODO: for now, create this from scratch. Ideally we should return the vectors from ops.
    //       We could also have the ORC thread create it for us in its spare time...
    this.sourceBatch = vrbCtx.createVectorizedRowBatch();
    if (usesSourceIncludes) {
      this.destinationBatch = new VectorizedRowBatch(sourceIncludes.size());
      int inclBatchIx = 0;
      for (Integer columnId : sourceIncludes) {
        destinationBatch.cols[inclBatchIx++] = sourceBatch.cols[columnId];
      }
      destinationBatch.setPartitionInfo(sourceIncludes.size(), 0);
    } else {
      this.destinationBatch = sourceBatch;
    }
  }
}
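The synchronous branch above reuses a single batch by resetting each ColumnVector in place, while the asynchronous branch allocates a fresh batch because the previous one may still be in flight. A minimal sketch of the reset-and-reuse pattern, using only the public vector classes (the writer hand-off is elided and the class name is made up for illustration):

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Sketch only: reuse one batch across "flushes" by resetting its vectors,
// mirroring the synchronous branch of flushBatch() above.
public class BatchReuseSketch {
  public static void main(String[] args) {
    VectorizedRowBatch batch = new VectorizedRowBatch(1);   // one column
    batch.cols[0] = new LongColumnVector();
    for (int flush = 0; flush < 3; flush++) {
      LongColumnVector col = (LongColumnVector) batch.cols[0];
      for (int r = 0; r < 10; r++) {
        col.vector[r] = flush * 100L + r;                    // fill some rows
      }
      batch.size = 10;
      // ... hand the batch to a writer here (elided) ...
      // Reset for the next round, as flushBatch() does for each ColumnVector.
      col.reset();
      col.init();
      batch.selectedInUse = false;
      batch.size = 0;
    }
  }
}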
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project h2o-3 by h2oai.
Class OrcTestUtils, method compareFrameContents().
static int compareFrameContents(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader,
                                String[] colTypes, String[] colNames, boolean[] toInclude) {
  // get all stripe info
  List<StripeInformation> stripesInfo = orcReader.getStripes();
  int wrongTests = 0;
  if (stripesInfo.size() == 0) {
    // Orc file contains no data
    assertEquals("Orc file is empty. H2O frame row number should be zero: ", 0, h2oFrame.numRows());
  } else {
    // row index into H2O frame
    Long startRowIndex = 0L;
    for (StripeInformation oneStripe : stripesInfo) {
      try {
        RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(),
            toInclude, null, colNames);
        // read ORC file stripes into a VectorizedRowBatch
        VectorizedRowBatch batch = perStripe.nextBatch(null);
        boolean done = false;
        Long rowCounts = 0L;
        // row count of the current stripe
        Long rowNumber = oneStripe.getNumberOfRows();
        while (!done) {
          // row count of the current batch
          long currentBatchRow = batch.count();
          ColumnVector[] dataVectors = batch.cols;
          int colIndex = 0;
          for (int cIdx = 0; cIdx < batch.numCols; cIdx++) {
            // read one column at a time
            if (toInclude[cIdx + 1]) {
              compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex,
                  currentBatchRow, h2oFrame.vec(colNames[colIndex]), startRowIndex);
              colIndex++;
            }
          }
          // record the number of rows actually read
          rowCounts = rowCounts + currentBatchRow;
          startRowIndex = startRowIndex + currentBatchRow;
          if (rowCounts >= rowNumber) {
            // all rows of the stripe have been read
            done = true;
          }
          if (!done) {
            // not done yet, get the next batch
            batch = perStripe.nextBatch(batch);
          }
        }
        perStripe.close();
      } catch (Throwable e) {
        failedFiles.add(fileName);
        e.printStackTrace();
        wrongTests += 1;
      }
    }
  }
  return wrongTests;
}
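The test above drives the older Hive ORC RecordReader stripe by stripe, where nextBatch(batch) returns the batch and reuses it when a non-null one is passed in. For comparison, a minimal read loop with the newer standalone org.apache.orc reader API looks roughly like the sketch below; the file path and the assumption that the first column is a long are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

// Sketch: iterate an ORC file batch by batch with the standalone ORC reader.
public class OrcReadSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),   // placeholder path
        OrcFile.readerOptions(conf));
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    long rowCount = 0;
    while (rows.nextBatch(batch)) {            // reuses the same batch object
      rowCount += batch.size;
      LongColumnVector col0 = (LongColumnVector) batch.cols[0];  // assumes first column is a long
      // ... compare column vectors against the expected data here ...
    }
    rows.close();
    System.out.println("rows read: " + rowCount);
  }
}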
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
Class EncodedTreeReaderFactory, method createEncodedTreeReader().
private static TreeReader createEncodedTreeReader(TypeDescription schema, List<OrcProto.ColumnEncoding> encodings,
    OrcEncodedColumnBatch batch, CompressionCodec codec, TreeReaderFactory.Context context) throws IOException {
  int columnIndex = schema.getId();
  ColumnStreamData[] streamBuffers = null;
  List<ColumnVector> vectors = null;
  if (batch.hasData(columnIndex)) {
    streamBuffers = batch.getColumnData(columnIndex);
  } else if (batch.hasVectors(columnIndex)) {
    vectors = batch.getColumnVectors(columnIndex);
  } else {
    throw new AssertionError("Batch has no data for " + columnIndex + ": " + batch);
  }
  // EncodedColumnBatch is already decompressed, we don't really need to pass codec.
  // But we need to know if the original data is compressed or not. This is used to skip
  // positions in row index properly. If the file is originally compressed,
  // then 1st position (compressed offset) in row index should be skipped to get
  // uncompressed offset, else 1st position should not be skipped.
  // TODO: there should be a better way to do this, code just needs to be modified
  OrcProto.ColumnEncoding columnEncoding = encodings.get(columnIndex);
  // stream buffers are arranged in enum order of stream kind
  ColumnStreamData present = null, data = null, dictionary = null, lengths = null, secondary = null;
  if (streamBuffers != null) {
    present = streamBuffers[OrcProto.Stream.Kind.PRESENT_VALUE];
    data = streamBuffers[OrcProto.Stream.Kind.DATA_VALUE];
    dictionary = streamBuffers[OrcProto.Stream.Kind.DICTIONARY_DATA_VALUE];
    lengths = streamBuffers[OrcProto.Stream.Kind.LENGTH_VALUE];
    secondary = streamBuffers[OrcProto.Stream.Kind.SECONDARY_VALUE];
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("columnIndex: {} columnType: {} streamBuffers.length: {} vectors: {} columnEncoding: {}"
        + " present: {} data: {} dictionary: {} lengths: {} secondary: {} tz: {}",
        columnIndex, schema, streamBuffers == null ? 0 : streamBuffers.length,
        vectors == null ? 0 : vectors.size(), columnEncoding, present != null, data,
        dictionary != null, lengths != null, secondary != null, context.getWriterTimezone());
  }
  // TODO: get rid of the builders - they serve no purpose... just call ctors directly.
  switch (schema.getCategory()) {
    case BINARY:
    case BOOLEAN:
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
    case FLOAT:
    case DOUBLE:
    case CHAR:
    case VARCHAR:
    case STRING:
    case DECIMAL:
    case TIMESTAMP:
    case DATE:
      return getPrimitiveTreeReader(columnIndex, schema, codec, columnEncoding, present, data, dictionary,
          lengths, secondary, context, vectors);
    case LIST:
      // Not currently supported.
      assert vectors == null;
      TypeDescription elementType = schema.getChildren().get(0);
      TreeReader elementReader = createEncodedTreeReader(elementType, encodings, batch, codec, context);
      return ListStreamReader.builder().setColumnIndex(columnIndex).setColumnEncoding(columnEncoding)
          .setCompressionCodec(codec).setPresentStream(present).setLengthStream(lengths)
          .setElementReader(elementReader).setContext(context).build();
    case MAP:
      // Not currently supported.
      assert vectors == null;
      TypeDescription keyType = schema.getChildren().get(0);
      TypeDescription valueType = schema.getChildren().get(1);
      TreeReader keyReader = createEncodedTreeReader(keyType, encodings, batch, codec, context);
      TreeReader valueReader = createEncodedTreeReader(valueType, encodings, batch, codec, context);
      return MapStreamReader.builder().setColumnIndex(columnIndex).setColumnEncoding(columnEncoding)
          .setCompressionCodec(codec).setPresentStream(present).setLengthStream(lengths)
          .setKeyReader(keyReader).setValueReader(valueReader).setContext(context).build();
    case STRUCT: {
      // Not currently supported.
      assert vectors == null;
      int childCount = schema.getChildren().size();
      TreeReader[] childReaders = new TreeReader[childCount];
      for (int i = 0; i < childCount; i++) {
        TypeDescription childType = schema.getChildren().get(i);
        childReaders[i] = createEncodedTreeReader(childType, encodings, batch, codec, context);
      }
      return StructStreamReader.builder().setColumnIndex(columnIndex).setCompressionCodec(codec)
          .setColumnEncoding(columnEncoding).setPresentStream(present).setChildReaders(childReaders)
          .setContext(context).build();
    }
    case UNION: {
      // Not currently supported.
      assert vectors == null;
      int childCount = schema.getChildren().size();
      TreeReader[] childReaders = new TreeReader[childCount];
      for (int i = 0; i < childCount; i++) {
        TypeDescription childType = schema.getChildren().get(i);
        childReaders[i] = createEncodedTreeReader(childType, encodings, batch, codec, context);
      }
      return UnionStreamReader.builder().setColumnIndex(columnIndex).setCompressionCodec(codec)
          .setColumnEncoding(columnEncoding).setPresentStream(present).setDataStream(data)
          .setChildReaders(childReaders).setContext(context).build();
    }
    default:
      throw new UnsupportedOperationException("Data type not supported: " + schema);
  }
}
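The factory above recurses through the ORC schema, creating one tree reader per column and one child reader per nested child. The recursion pattern itself can be shown with the public org.apache.orc.TypeDescription API alone; the class name and example schema below are made up for illustration:

import org.apache.orc.TypeDescription;

// Sketch of the schema recursion used by createEncodedTreeReader: visit a column,
// dispatch on its category, and recurse into children for nested types.
public class SchemaWalkSketch {
  static void walk(TypeDescription schema, int depth) {
    StringBuilder indent = new StringBuilder();
    for (int d = 0; d < depth; d++) {
      indent.append("  ");
    }
    System.out.println(indent + "" + schema.getId() + ": " + schema.getCategory());
    switch (schema.getCategory()) {
      case LIST:
      case MAP:
      case STRUCT:
      case UNION:
        for (TypeDescription child : schema.getChildren()) {
          walk(child, depth + 1);   // recurse into children, as the tree-reader factory does
        }
        break;
      default:
        break;                      // primitive leaf, no children
    }
  }

  public static void main(String[] args) {
    TypeDescription schema =
        TypeDescription.fromString("struct<id:bigint,tags:array<string>,props:map<string,double>>");
    walk(schema, 0);
  }
}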
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
Class VectorUDFDateAddColCol, method evaluate().
@Override
public void evaluate(VectorizedRowBatch batch) {
  if (childExpressions != null) {
    super.evaluateChildren(batch);
  }
  ColumnVector inputColVector1 = batch.cols[colNum1];
  LongColumnVector inputColVector2 = (LongColumnVector) batch.cols[colNum2];
  int[] sel = batch.selected;
  int n = batch.size;
  long[] vector2 = inputColVector2.vector;
  LongColumnVector outV = (LongColumnVector) batch.cols[outputColumn];
  long[] outputVector = outV.vector;
  if (n <= 0) {
    // Nothing to do
    return;
  }
  // Handle null
  NullUtil.propagateNullsColCol(inputColVector1, inputColVector2, outV, batch.selected, batch.size,
      batch.selectedInUse);
  switch (inputTypes[0]) {
    case DATE:
      // Now disregard null in second pass.
      if (inputColVector1.isRepeating && inputColVector2.isRepeating) {
        // All must be selected otherwise size would be zero
        // Repeating property will not change.
        outV.isRepeating = true;
        outputVector[0] = evaluateDate(inputColVector1, 0, vector2[0]);
      } else if (batch.selectedInUse) {
        for (int j = 0; j != n; j++) {
          int i = sel[j];
          outputVector[i] = evaluateDate(inputColVector1, i, vector2[i]);
        }
      } else {
        for (int i = 0; i != n; i++) {
          outputVector[i] = evaluateDate(inputColVector1, i, vector2[i]);
        }
      }
      break;
    case TIMESTAMP:
      // Now disregard null in second pass.
      if (inputColVector1.isRepeating && inputColVector2.isRepeating) {
        // All must be selected otherwise size would be zero
        // Repeating property will not change.
        outV.isRepeating = true;
        outputVector[0] = evaluateTimestamp(inputColVector1, 0, vector2[0]);
      } else if (batch.selectedInUse) {
        for (int j = 0; j != n; j++) {
          int i = sel[j];
          outputVector[i] = evaluateTimestamp(inputColVector1, i, vector2[i]);
        }
      } else {
        for (int i = 0; i != n; i++) {
          outputVector[i] = evaluateTimestamp(inputColVector1, i, vector2[i]);
        }
      }
      break;
    case STRING:
    case CHAR:
    case VARCHAR:
      // Now disregard null in second pass.
      if (inputColVector1.isRepeating && inputColVector2.isRepeating) {
        // All must be selected otherwise size would be zero
        // Repeating property will not change.
        outV.isRepeating = true;
        evaluateString((BytesColumnVector) inputColVector1, outV, 0, vector2[0]);
      } else if (batch.selectedInUse) {
        for (int j = 0; j != n; j++) {
          int i = sel[j];
          evaluateString((BytesColumnVector) inputColVector1, outV, i, vector2[i]);
        }
      } else {
        for (int i = 0; i != n; i++) {
          evaluateString((BytesColumnVector) inputColVector1, outV, i, vector2[i]);
        }
      }
      break;
    default:
      throw new Error("Unsupported input type " + inputTypes[0].name());
  }
}
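The per-type branches above all follow the same traversal: a fast path when both inputs repeat, an indirect loop over batch.selected when a filter is in effect, and a dense loop otherwise. A toy column-column kernel showing just that skeleton; nulls are ignored here for brevity (the real expression delegates them to NullUtil.propagateNullsColCol first), and the class name is made up for illustration:

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Sketch of the col-col traversal skeleton; not a real Hive VectorExpression.
public class AddLongColColSketch {
  static void add(VectorizedRowBatch batch, int in1, int in2, int out) {
    LongColumnVector a = (LongColumnVector) batch.cols[in1];
    LongColumnVector b = (LongColumnVector) batch.cols[in2];
    LongColumnVector o = (LongColumnVector) batch.cols[out];
    int n = batch.size;
    if (n == 0) {
      return;                                    // nothing to do
    }
    if (a.isRepeating && b.isRepeating) {
      o.isRepeating = true;                      // one value stands for the whole batch
      o.vector[0] = a.vector[0] + b.vector[0];
    } else if (batch.selectedInUse) {
      for (int j = 0; j < n; j++) {
        int i = batch.selected[j];               // only selected rows are live
        o.vector[i] = (a.isRepeating ? a.vector[0] : a.vector[i])
            + (b.isRepeating ? b.vector[0] : b.vector[i]);
      }
    } else {
      for (int i = 0; i < n; i++) {
        o.vector[i] = (a.isRepeating ? a.vector[0] : a.vector[i])
            + (b.isRepeating ? b.vector[0] : b.vector[i]);
      }
    }
  }
}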
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
Class VectorUDFDateAddColScalar, method evaluate().
@Override
public void evaluate(VectorizedRowBatch batch) {
  if (childExpressions != null) {
    super.evaluateChildren(batch);
  }
  LongColumnVector outV = (LongColumnVector) batch.cols[outputColumn];
  ColumnVector inputCol = batch.cols[this.colNum];
  /* every line below this is identical for evaluateLong & evaluateString */
  final int n = inputCol.isRepeating ? 1 : batch.size;
  int[] sel = batch.selected;
  final boolean selectedInUse = !inputCol.isRepeating && batch.selectedInUse;
  if (batch.size == 0) {
    /* n != batch.size when isRepeating */
    return;
  }
  /* true for all algebraic UDFs with no state */
  outV.isRepeating = inputCol.isRepeating;
  switch (inputTypes[0]) {
    case DATE:
      if (inputCol.noNulls) {
        outV.noNulls = true;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outV.vector[i] = evaluateDate(inputCol, i);
          }
        } else {
          for (int i = 0; i < n; i++) {
            outV.vector[i] = evaluateDate(inputCol, i);
          }
        }
      } else {
        // Handle case with nulls. Don't do function if the value is null, to save time,
        // because calling the function can be expensive.
        outV.noNulls = false;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outV.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              outV.vector[i] = evaluateDate(inputCol, i);
            }
          }
        } else {
          for (int i = 0; i < n; i++) {
            outV.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              outV.vector[i] = evaluateDate(inputCol, i);
            }
          }
        }
      }
      break;
    case TIMESTAMP:
      if (inputCol.noNulls) {
        outV.noNulls = true;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outV.vector[i] = evaluateTimestamp(inputCol, i);
          }
        } else {
          for (int i = 0; i < n; i++) {
            outV.vector[i] = evaluateTimestamp(inputCol, i);
          }
        }
      } else {
        // Handle case with nulls. Don't do function if the value is null, to save time,
        // because calling the function can be expensive.
        outV.noNulls = false;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outV.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              outV.vector[i] = evaluateTimestamp(inputCol, i);
            }
          }
        } else {
          for (int i = 0; i < n; i++) {
            outV.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              outV.vector[i] = evaluateTimestamp(inputCol, i);
            }
          }
        }
      }
      break;
    case STRING:
    case CHAR:
    case VARCHAR:
      if (inputCol.noNulls) {
        outV.noNulls = true;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            evaluateString(inputCol, outV, i);
          }
        } else {
          for (int i = 0; i < n; i++) {
            evaluateString(inputCol, outV, i);
          }
        }
      } else {
        // Handle case with nulls. Don't do function if the value is null, to save time,
        // because calling the function can be expensive.
        outV.noNulls = false;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outV.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              evaluateString(inputCol, outV, i);
            }
          }
        } else {
          for (int i = 0; i < n; i++) {
            outV.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              evaluateString(inputCol, outV, i);
            }
          }
        }
      }
      break;
    default:
      throw new Error("Unsupported input type " + inputTypes[0].name());
  }
}
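The scalar variant differs from the column-column version mainly in its null handling: when the input has nulls, the isNull flag is copied per row and the function is skipped for null entries, and noNulls is set on the output accordingly. A toy column-scalar kernel showing that pattern; this is a sketch with a made-up class name, not Hive's generated expression code:

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Sketch of the col-scalar null-handling pattern used above.
public class AddLongColScalarSketch {
  static void add(VectorizedRowBatch batch, int in, long scalar, int out) {
    LongColumnVector col = (LongColumnVector) batch.cols[in];
    LongColumnVector o = (LongColumnVector) batch.cols[out];
    if (batch.size == 0) {
      return;
    }
    // For a stateless function, the output repeats exactly when the input does.
    o.isRepeating = col.isRepeating;
    int n = col.isRepeating ? 1 : batch.size;
    boolean selectedInUse = !col.isRepeating && batch.selectedInUse;
    if (col.noNulls) {
      o.noNulls = true;
      for (int j = 0; j < n; j++) {
        int i = selectedInUse ? batch.selected[j] : j;
        o.vector[i] = col.vector[i] + scalar;
      }
    } else {
      o.noNulls = false;
      for (int j = 0; j < n; j++) {
        int i = selectedInUse ? batch.selected[j] : j;
        o.isNull[i] = col.isNull[i];
        if (!col.isNull[i]) {
          o.vector[i] = col.vector[i] + scalar;   // skip the work for null rows
        }
      }
    }
  }
}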