Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache:
the class TestVectorizedORCReader, method checkVectorizedReader.
/**
 * Cross-checks the vectorized ORC reader against the row-by-row ORC reader:
 * both read the same test file and every cell of every batch must agree,
 * including null flags and repeating-column handling. Also verifies the
 * expected isRepeating/noNulls flags per column for the test data.
 *
 * @throws Exception on any reader failure or assertion mismatch
 */
private void checkVectorizedReader() throws Exception {
  Reader vreader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf));
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf));
  RecordReaderImpl vrr = (RecordReaderImpl) vreader.rows();
  RecordReaderImpl rr = (RecordReaderImpl) reader.rows();
  VectorizedRowBatch batch = reader.getSchema().createRowBatch();
  OrcStruct row = null;
  // Check Vectorized ORC reader against ORC row reader
  while (vrr.nextBatch(batch)) {
    for (int i = 0; i < batch.size; i++) {
      row = (OrcStruct) rr.next(row);
      for (int j = 0; j < batch.cols.length; j++) {
        Object a = (row.getFieldValue(j));
        ColumnVector cv = batch.cols[j];
        // if the value is repeating, use row 0
        int rowId = cv.isRepeating ? 0 : i;
        // make sure the null flag agrees
        if (a == null) {
          Assert.assertEquals(true, !cv.noNulls && cv.isNull[rowId]);
        } else if (a instanceof BooleanWritable) {
          // Boolean values are stored as 1's and 0's, so convert and compare
          Long temp = (long) (((BooleanWritable) a).get() ? 1 : 0);
          long b = ((LongColumnVector) cv).vector[rowId];
          Assert.assertEquals(temp.toString(), Long.toString(b));
        } else if (a instanceof TimestampWritable) {
          // Timestamps are stored in a TimestampColumnVector, so compare via the scratch value
          TimestampWritable t = ((TimestampWritable) a);
          TimestampColumnVector tcv = ((TimestampColumnVector) cv);
          Assert.assertEquals(t.getTimestamp(), tcv.asScratchTimestamp(rowId));
        } else if (a instanceof DateWritable) {
          // Dates are stored as long (days), so convert and compare
          DateWritable adt = (DateWritable) a;
          long b = ((LongColumnVector) cv).vector[rowId];
          Assert.assertEquals(adt.get().getTime(), DateWritable.daysToMillis((int) b));
        } else if (a instanceof HiveDecimalWritable) {
          // Decimals are stored as HiveDecimalWritable, so compare directly.
          HiveDecimalWritable dec = (HiveDecimalWritable) a;
          // BUG FIX: index by rowId (not i) so repeating decimal columns are
          // read from row 0, consistent with every other type branch above.
          HiveDecimalWritable b = ((DecimalColumnVector) cv).vector[rowId];
          Assert.assertEquals(dec, b);
        } else if (a instanceof DoubleWritable) {
          double b = ((DoubleColumnVector) cv).vector[rowId];
          Assert.assertEquals(a.toString(), Double.toString(b));
        } else if (a instanceof Text) {
          // Strings are stored as byte slices in a BytesColumnVector
          BytesColumnVector bcv = (BytesColumnVector) cv;
          Text b = new Text();
          b.set(bcv.vector[rowId], bcv.start[rowId], bcv.length[rowId]);
          Assert.assertEquals(a, b);
        } else if (a instanceof IntWritable || a instanceof LongWritable || a instanceof ByteWritable || a instanceof ShortWritable) {
          // All integral types are widened to long in a LongColumnVector
          Assert.assertEquals(a.toString(), Long.toString(((LongColumnVector) cv).vector[rowId]));
        } else {
          // Unexpected writable type — fail with the class name for diagnosis
          Assert.assertEquals("huh", a.getClass().getName());
        }
      }
    }
    // Check repeating: only column 3 of the test data is expected to repeat
    Assert.assertEquals(false, batch.cols[0].isRepeating);
    Assert.assertEquals(false, batch.cols[1].isRepeating);
    Assert.assertEquals(false, batch.cols[2].isRepeating);
    Assert.assertEquals(true, batch.cols[3].isRepeating);
    Assert.assertEquals(false, batch.cols[4].isRepeating);
    Assert.assertEquals(false, batch.cols[5].isRepeating);
    Assert.assertEquals(false, batch.cols[6].isRepeating);
    Assert.assertEquals(false, batch.cols[7].isRepeating);
    Assert.assertEquals(false, batch.cols[8].isRepeating);
    Assert.assertEquals(false, batch.cols[9].isRepeating);
    // Check non null: only columns 2 and 3 are expected to be null-free
    Assert.assertEquals(false, batch.cols[0].noNulls);
    Assert.assertEquals(false, batch.cols[1].noNulls);
    Assert.assertEquals(true, batch.cols[2].noNulls);
    Assert.assertEquals(true, batch.cols[3].noNulls);
    Assert.assertEquals(false, batch.cols[4].noNulls);
    Assert.assertEquals(false, batch.cols[5].noNulls);
    Assert.assertEquals(false, batch.cols[6].noNulls);
    Assert.assertEquals(false, batch.cols[7].noNulls);
    Assert.assertEquals(false, batch.cols[8].noNulls);
    Assert.assertEquals(false, batch.cols[9].noNulls);
  }
  // Both readers must be exhausted at the same point
  Assert.assertEquals(false, rr.nextBatch(batch));
}
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache:
the class FakeVectorRowBatchFromObjectIterables, method produceNextBatch.
/**
 * Fills and returns the next VectorizedRowBatch from the per-column object
 * iterators. Stops when the batch reaches batchSize or any column iterator
 * is exhausted (setting eof); a row only counts toward batch.size when every
 * column produced a value for it.
 *
 * @return the (possibly partially filled) batch; batch.size is 0 at EOF
 */
@Override
public VectorizedRowBatch produceNextBatch() {
  batch.size = 0;
  batch.selectedInUse = false;
  // Reset per-column flags that may be left over from the previous batch.
  for (int i = 0; i < types.length; ++i) {
    ColumnVector col = batch.cols[i];
    col.noNulls = true;
    col.isRepeating = false;
  }
  while (!eof && batch.size < this.batchSize) {
    // Removed unused local `int r = batch.size;` — the row index is
    // batch.size itself until the row is committed below.
    for (int i = 0; i < types.length; ++i) {
      Iterator<Object> it = iterators.get(i);
      if (!it.hasNext()) {
        // Any exhausted column ends the stream; the partial row is discarded.
        eof = true;
        break;
      }
      Object value = it.next();
      if (null == value) {
        batch.cols[i].isNull[batch.size] = true;
        batch.cols[i].noNulls = false;
      } else {
        // Must reset the isNull, could be set from prev batch use
        batch.cols[i].isNull[batch.size] = false;
        columnAssign[i].assign(batch.cols[i], batch.size, value);
      }
    }
    if (!eof) {
      // Commit the row only when all columns were assigned.
      batch.size += 1;
    }
  }
  return batch;
}
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache:
the class VectorizedListColumnReader, method setIsRepeating.
/**
 * Marks the list vector as repeating when every list entry's child data is
 * identical to the first entry's; otherwise clears the repeating flag.
 *
 * @param lcv the list column vector to inspect and flag
 */
private void setIsRepeating(ListColumnVector lcv) {
  final ColumnVector reference = getChildData(lcv, 0);
  boolean allEqual = true;
  // Compare each subsequent entry against entry 0; stop at first difference.
  for (int idx = 1; idx < lcv.offsets.length; idx++) {
    if (!compareColumnVector(reference, getChildData(lcv, idx))) {
      allEqual = false;
      break;
    }
  }
  lcv.isRepeating = allEqual;
}
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache:
the class VectorCoalesce, method evaluate.
/**
 * Vectorized COALESCE: for each row in the batch, writes the first non-NULL
 * value found across the input columns into the output column. Rows for
 * which every input column is NULL are marked NULL in the output.
 *
 * Tracks the not-yet-assigned rows in unassignedBatchIndices (similar to a
 * selected array) so later input columns only fill the remaining gaps.
 *
 * @param batch the row batch to evaluate; output written to outputColumnNum
 */
@Override
public void evaluate(VectorizedRowBatch batch) {
  // Populate input columns by evaluating any child expressions first.
  if (childExpressions != null) {
    super.evaluateChildren(batch);
  }
  int[] sel = batch.selected;
  int n = batch.size;
  ColumnVector outputColVector = batch.cols[outputColumnNum];
  boolean[] outputIsNull = outputColVector.isNull;
  if (n <= 0) {
    // Nothing to do
    return;
  }
  if (unassignedBatchIndices == null || n > unassignedBatchIndices.length) {
    // (Re)allocate larger to be a multiple of 1024 (DEFAULT_SIZE).
    final int roundUpSize = ((n + VectorizedRowBatch.DEFAULT_SIZE - 1) / VectorizedRowBatch.DEFAULT_SIZE) * VectorizedRowBatch.DEFAULT_SIZE;
    unassignedBatchIndices = new int[roundUpSize];
  }
  // We do not need to do a column reset since we are carefully changing the output.
  outputColVector.isRepeating = false;
  // CONSIDER: Should we do this for all vector expressions that can
  // work on BytesColumnVector output columns???
  outputColVector.init();
  final int columnCount = inputColumns.length;
  /*
   * Process the input columns to find a non-NULL value for each row.
   *
   * We track the unassigned batchIndex of the rows that have not received
   * a non-NULL value yet. Similar to a selected array.
   */
  boolean isAllUnassigned = true;
  int unassignedColumnCount = 0;
  for (int k = 0; k < inputColumns.length; k++) {
    ColumnVector cv = batch.cols[inputColumns[k]];
    if (cv.isRepeating) {
      if (cv.noNulls || !cv.isNull[0]) {
        /*
         * With a repeating value we can finish all remaining rows.
         */
        if (isAllUnassigned) {
          // No other columns provided non-NULL values. We can return repeated output.
          outputIsNull[0] = false;
          outputColVector.setElement(0, 0, cv);
          outputColVector.isRepeating = true;
          return;
        } else {
          // We cannot use copySelected method here.
          for (int i = 0; i < unassignedColumnCount; i++) {
            final int batchIndex = unassignedBatchIndices[i];
            outputIsNull[batchIndex] = false;
            // Our input is repeating (i.e. inputColNumber = 0).
            outputColVector.setElement(batchIndex, 0, cv);
          }
          return;
        }
      } else {
        // Repeated NULLs -- skip this input column.
      }
    } else {
      /*
       * Non-repeating input column. Use any non-NULL values for unassigned rows.
       */
      if (isAllUnassigned) {
        /*
         * No other columns provided non-NULL values. We *may* be able to finish all rows
         * with this input column...
         */
        if (cv.noNulls) {
          // Since no NULLs, we can provide values for all rows.
          if (batch.selectedInUse) {
            for (int i = 0; i < n; i++) {
              final int batchIndex = sel[i];
              outputIsNull[batchIndex] = false;
              outputColVector.setElement(batchIndex, batchIndex, cv);
            }
          } else {
            Arrays.fill(outputIsNull, 0, n, false);
            for (int batchIndex = 0; batchIndex < n; batchIndex++) {
              outputColVector.setElement(batchIndex, batchIndex, cv);
            }
          }
          return;
        } else {
          // We might not be able to assign all rows because of input NULLs. Start tracking any
          // unassigned rows.
          boolean[] inputIsNull = cv.isNull;
          if (batch.selectedInUse) {
            for (int i = 0; i < n; i++) {
              final int batchIndex = sel[i];
              if (!inputIsNull[batchIndex]) {
                outputIsNull[batchIndex] = false;
                outputColVector.setElement(batchIndex, batchIndex, cv);
              } else {
                unassignedBatchIndices[unassignedColumnCount++] = batchIndex;
              }
            }
          } else {
            for (int batchIndex = 0; batchIndex < n; batchIndex++) {
              if (!inputIsNull[batchIndex]) {
                outputIsNull[batchIndex] = false;
                outputColVector.setElement(batchIndex, batchIndex, cv);
              } else {
                unassignedBatchIndices[unassignedColumnCount++] = batchIndex;
              }
            }
          }
          if (unassignedColumnCount == 0) {
            // Every row got a value from this column; done.
            return;
          }
          isAllUnassigned = false;
        }
      } else {
        /*
         * We previously assigned *some* rows with non-NULL values. The batch indices of
         * the unassigned row were tracked.
         */
        if (cv.noNulls) {
          // Assign all remaining rows.
          for (int i = 0; i < unassignedColumnCount; i++) {
            final int batchIndex = unassignedBatchIndices[i];
            outputIsNull[batchIndex] = false;
            outputColVector.setElement(batchIndex, batchIndex, cv);
          }
          return;
        } else {
          // Use any non-NULL values found; remember the remaining unassigned.
          boolean[] inputIsNull = cv.isNull;
          int newUnassignedColumnCount = 0;
          for (int i = 0; i < unassignedColumnCount; i++) {
            final int batchIndex = unassignedBatchIndices[i];
            if (!inputIsNull[batchIndex]) {
              outputIsNull[batchIndex] = false;
              outputColVector.setElement(batchIndex, batchIndex, cv);
            } else {
              // Compact the tracking array in place: still unassigned.
              unassignedBatchIndices[newUnassignedColumnCount++] = batchIndex;
            }
          }
          if (newUnassignedColumnCount == 0) {
            return;
          }
          unassignedColumnCount = newUnassignedColumnCount;
        }
      }
    }
  }
  // NULL out the remaining columns.
  outputColVector.noNulls = false;
  if (isAllUnassigned) {
    // No input ever produced a value: the whole output is a repeated NULL.
    outputIsNull[0] = true;
    outputColVector.isRepeating = true;
  } else {
    for (int i = 0; i < unassignedColumnCount; i++) {
      final int batchIndex = unassignedBatchIndices[i];
      outputIsNull[batchIndex] = true;
    }
  }
}
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache:
the class VectorUDFDateDiffColScalar, method evaluate.
/**
 * Vectorized DATEDIFF(column, scalar): computes, per row, the day difference
 * between the input column value and the scalar operand. The scalar (input
 * type #1) is first resolved to baseDate; then the column (input type #0) is
 * evaluated per its primitive category (DATE, TIMESTAMP, or string types).
 *
 * The per-category branches follow the standard vectorized pattern:
 * repeating input, no-null input (selected/unselected), and nulls-present.
 *
 * @param batch the row batch; result written to the long output column
 */
@Override
public void evaluate(VectorizedRowBatch batch) {
  // Populate input columns by evaluating any child expressions first.
  if (childExpressions != null) {
    super.evaluateChildren(batch);
  }
  LongColumnVector outputColVector = (LongColumnVector) batch.cols[outputColumnNum];
  ColumnVector inputCol = batch.cols[this.colNum];
  /* every line below this is identical for evaluateLong & evaluateString */
  final int n = inputCol.isRepeating ? 1 : batch.size;
  int[] sel = batch.selected;
  final boolean selectedInUse = (inputCol.isRepeating == false) && batch.selectedInUse;
  boolean[] outputIsNull = outputColVector.isNull;
  if (batch.size == 0) {
    /* n != batch.size when isRepeating */
    return;
  }
  // We do not need to do a column reset since we are carefully changing the output.
  outputColVector.isRepeating = false;
  // Resolve the scalar operand (input #1) into baseDate (days since epoch).
  PrimitiveCategory primitiveCategory1 = ((PrimitiveTypeInfo) inputTypeInfos[1]).getPrimitiveCategory();
  switch(primitiveCategory1) {
    case DATE:
      baseDate = (int) longValue;
      break;
    case TIMESTAMP:
      date.setTime(timestampValue.getTime());
      baseDate = DateWritable.dateToDays(date);
      break;
    case STRING:
    case CHAR:
    case VARCHAR:
      try {
        date.setTime(formatter.parse(new String(bytesValue, "UTF-8")).getTime());
        baseDate = DateWritable.dateToDays(date);
        break;
      } catch (Exception e) {
        // Unparseable scalar date: the whole output column becomes NULL.
        outputColVector.noNulls = false;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outputColVector.isNull[i] = true;
          }
        } else {
          for (int i = 0; i < n; i++) {
            outputColVector.isNull[i] = true;
          }
        }
        return;
      }
    default:
      throw new Error("Invalid input type #1: " + primitiveCategory1.name());
  }
  // Evaluate the input column (input #0) per its primitive category.
  PrimitiveCategory primitiveCategory0 = ((PrimitiveTypeInfo) inputTypeInfos[0]).getPrimitiveCategory();
  switch(primitiveCategory0) {
    case DATE:
      if (inputCol.isRepeating) {
        if (inputCol.noNulls || !inputCol.isNull[0]) {
          outputColVector.isNull[0] = false;
          outputColVector.vector[0] = evaluateDate(inputCol, 0);
        } else {
          outputColVector.isNull[0] = true;
          outputColVector.noNulls = false;
        }
        outputColVector.isRepeating = true;
      } else if (inputCol.noNulls) {
        if (batch.selectedInUse) {
          if (!outputColVector.noNulls) {
            for (int j = 0; j != n; j++) {
              final int i = sel[j];
              // Set isNull before call in case it changes its mind.
              outputIsNull[i] = false;
              outputColVector.vector[i] = evaluateDate(inputCol, i);
            }
          } else {
            for (int j = 0; j != n; j++) {
              final int i = sel[j];
              outputColVector.vector[i] = evaluateDate(inputCol, i);
            }
          }
        } else {
          if (!outputColVector.noNulls) {
            // Assume it is almost always a performance win to fill all of isNull so we can
            // safely reset noNulls.
            Arrays.fill(outputIsNull, false);
            outputColVector.noNulls = true;
          }
          for (int i = 0; i != n; i++) {
            outputColVector.vector[i] = evaluateDate(inputCol, i);
          }
        }
      } else /* there are nulls in the inputColVector */
      {
        // Carefully handle NULLs..
        // Handle case with nulls. Don't do function if the value is null, to save time,
        // because calling the function can be expensive.
        outputColVector.noNulls = false;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outputColVector.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              outputColVector.vector[i] = evaluateDate(inputCol, i);
            }
          }
        } else {
          for (int i = 0; i < n; i++) {
            outputColVector.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              outputColVector.vector[i] = evaluateDate(inputCol, i);
            }
          }
        }
      }
      break;
    case TIMESTAMP:
      if (inputCol.isRepeating) {
        if (inputCol.noNulls || !inputCol.isNull[0]) {
          outputColVector.isNull[0] = false;
          outputColVector.vector[0] = evaluateTimestamp(inputCol, 0);
        } else {
          outputColVector.isNull[0] = true;
          outputColVector.noNulls = false;
        }
        outputColVector.isRepeating = true;
      } else if (inputCol.noNulls) {
        if (batch.selectedInUse) {
          if (!outputColVector.noNulls) {
            for (int j = 0; j != n; j++) {
              final int i = sel[j];
              // Set isNull before call in case it changes its mind.
              outputIsNull[i] = false;
              outputColVector.vector[i] = evaluateTimestamp(inputCol, i);
            }
          } else {
            for (int j = 0; j != n; j++) {
              final int i = sel[j];
              outputColVector.vector[i] = evaluateTimestamp(inputCol, i);
            }
          }
        } else {
          if (!outputColVector.noNulls) {
            // Assume it is almost always a performance win to fill all of isNull so we can
            // safely reset noNulls.
            Arrays.fill(outputIsNull, false);
            outputColVector.noNulls = true;
          }
          for (int i = 0; i != n; i++) {
            outputColVector.vector[i] = evaluateTimestamp(inputCol, i);
          }
        }
      } else /* there are nulls in the inputColVector */
      {
        // Carefully handle NULLs..
        // Handle case with nulls. Don't do function if the value is null, to save time,
        // because calling the function can be expensive.
        outputColVector.noNulls = false;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outputColVector.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              outputColVector.vector[i] = evaluateTimestamp(inputCol, i);
            }
          }
        } else {
          for (int i = 0; i < n; i++) {
            outputColVector.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              outputColVector.vector[i] = evaluateTimestamp(inputCol, i);
            }
          }
        }
      }
      break;
    case STRING:
    case CHAR:
    case VARCHAR:
      if (inputCol.isRepeating) {
        if (inputCol.noNulls || !inputCol.isNull[0]) {
          outputColVector.isNull[0] = false;
          evaluateString(inputCol, outputColVector, 0);
        } else {
          outputColVector.isNull[0] = true;
          outputColVector.noNulls = false;
        }
        outputColVector.isRepeating = true;
      } else if (inputCol.noNulls) {
        if (batch.selectedInUse) {
          if (!outputColVector.noNulls) {
            for (int j = 0; j != n; j++) {
              final int i = sel[j];
              // Set isNull before call in case it changes its mind.
              outputIsNull[i] = false;
              evaluateString(inputCol, outputColVector, i);
            }
          } else {
            for (int j = 0; j != n; j++) {
              final int i = sel[j];
              evaluateString(inputCol, outputColVector, i);
            }
          }
        } else {
          if (!outputColVector.noNulls) {
            // Assume it is almost always a performance win to fill all of isNull so we can
            // safely reset noNulls.
            Arrays.fill(outputIsNull, false);
            outputColVector.noNulls = true;
          }
          for (int i = 0; i != n; i++) {
            evaluateString(inputCol, outputColVector, i);
          }
        }
      } else /* there are nulls in the inputColVector */
      {
        // Carefully handle NULLs..
        // Handle case with nulls. Don't do function if the value is null, to save time,
        // because calling the function can be expensive.
        outputColVector.noNulls = false;
        if (selectedInUse) {
          for (int j = 0; j < n; j++) {
            int i = sel[j];
            outputColVector.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              evaluateString(inputCol, outputColVector, i);
            }
          }
        } else {
          for (int i = 0; i < n; i++) {
            outputColVector.isNull[i] = inputCol.isNull[i];
            if (!inputCol.isNull[i]) {
              evaluateString(inputCol, outputColVector, i);
            }
          }
        }
      }
      break;
    default:
      throw new Error("Invalid input type #0: " + primitiveCategory0.name());
  }
}
Aggregations