Use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.
The class TestInputOutputFormat, method testColumnProjectionWithAcid.
/**
 * Test column projection when using ACID.
 */
@Test
public void testColumnProjectionWithAcid() throws Exception {
  Path baseDir = new Path(workDir, "base_00100");
  testFilePath = new Path(baseDir, "bucket_00000");
  fs.mkdirs(baseDir);
  fs.delete(testFilePath, true);
  TypeDescription fileSchema =
      TypeDescription.fromString("struct<operation:int," +
          "originalTransaction:bigint,bucket:int,rowId:bigint," +
          "currentTransaction:bigint," +
          "row:struct<a:int,b:struct<c:int>,d:string>>");
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .fileSystem(fs)
          .setSchema(fileSchema)
          .compress(org.apache.orc.CompressionKind.NONE));
  VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
  batch.size = 1000;
  StructColumnVector scv = (StructColumnVector) batch.cols[5];
  // operation
  batch.cols[0].isRepeating = true;
  ((LongColumnVector) batch.cols[0]).vector[0] = 0;
  // original transaction
  batch.cols[1].isRepeating = true;
  ((LongColumnVector) batch.cols[1]).vector[0] = 1;
  // bucket
  batch.cols[2].isRepeating = true;
  ((LongColumnVector) batch.cols[2]).vector[0] = 0;
  // current transaction
  batch.cols[4].isRepeating = true;
  ((LongColumnVector) batch.cols[4]).vector[0] = 1;
  LongColumnVector lcv = (LongColumnVector)
      ((StructColumnVector) scv.fields[1]).fields[0];
  for (int r = 0; r < 1000; r++) {
    // row id
    ((LongColumnVector) batch.cols[3]).vector[r] = r;
    // a
    ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
    // b.c
    lcv.vector[r] = r * 10001;
    // d
    ((BytesColumnVector) scv.fields[2]).setVal(r,
        Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
  }
  writer.addRowBatch(batch);
  writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME,
      ByteBuffer.wrap("0,0,999".getBytes(StandardCharsets.UTF_8)));
  writer.close();
  long fileLength = fs.getFileStatus(testFilePath).getLen();
  // test with same schema with include
  conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
  conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
  OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength,
      new String[0], null, false, true,
      new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength);
  OrcInputFormat inputFormat = new OrcInputFormat();
  AcidInputFormat.RowReader<OrcStruct> reader =
      inputFormat.getReader(split, new AcidInputFormat.Options(conf));
  int record = 0;
  RecordIdentifier id = reader.createKey();
  OrcStruct struct = reader.createValue();
  while (reader.next(id, struct)) {
    assertEquals("id " + record, record, id.getRowId());
    assertEquals("bucket " + record, 0, id.getBucketId());
    assertEquals("trans " + record, 1, id.getTransactionId());
    assertEquals("a " + record, 42 * record,
        ((IntWritable) struct.getFieldValue(0)).get());
    assertEquals(null, struct.getFieldValue(1));
    assertEquals("d " + record, Integer.toHexString(record),
        struct.getFieldValue(2).toString());
    record += 1;
  }
  assertEquals(1000, record);
  reader.close();
  // test with schema evolution and include
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d,f");
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int,e:string>,string,int");
  conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
  split = new OrcSplit(testFilePath, null, 0, fileLength,
      new String[0], null, false, true,
      new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength);
  inputFormat = new OrcInputFormat();
  reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
  record = 0;
  id = reader.createKey();
  struct = reader.createValue();
  while (reader.next(id, struct)) {
    assertEquals("id " + record, record, id.getRowId());
    assertEquals("bucket " + record, 0, id.getBucketId());
    assertEquals("trans " + record, 1, id.getTransactionId());
    assertEquals("a " + record, 42 * record,
        ((IntWritable) struct.getFieldValue(0)).get());
    assertEquals(null, struct.getFieldValue(1));
    assertEquals("d " + record, Integer.toHexString(record),
        struct.getFieldValue(2).toString());
    assertEquals("f " + record, null, struct.getFieldValue(3));
    record += 1;
  }
  assertEquals(1000, record);
  reader.close();
}
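For readers who only want the StructColumnVector mechanics without the ORC writer and ACID plumbing, here is a minimal, self-contained sketch of how the nested row:struct<a:int,b:struct<c:int>,d:string> payload above is populated. The class name and the standalone main() are illustrative, not part of the Hive test.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;

public class StructVectorFillSketch {
  public static void main(String[] args) {
    int size = 1000;
    // Children for row:struct<a:int,b:struct<c:int>,d:string>
    LongColumnVector a = new LongColumnVector(size);
    LongColumnVector c = new LongColumnVector(size);
    StructColumnVector b = new StructColumnVector(size, c);
    BytesColumnVector d = new BytesColumnVector(size);
    d.initBuffer();                      // allocate the shared byte buffer before setVal()
    StructColumnVector row = new StructColumnVector(size, a, b, d);

    for (int r = 0; r < size; r++) {
      a.vector[r] = r * 42;              // a
      c.vector[r] = r * 10001;           // b.c
      d.setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8)); // d
    }
    System.out.println("filled " + size + " struct rows into " + row.fields.length + " child vectors");
  }
}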
Use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.
The class RecordReaderImpl, method copyStructColumn.
void copyStructColumn(ColumnVector destination, ColumnVector source,
                      int sourceOffset, int length) {
  StructColumnVector castedSource = (StructColumnVector) source;
  StructColumnVector castedDestination = (StructColumnVector) destination;
  castedDestination.isRepeating = castedSource.isRepeating;
  castedDestination.noNulls = castedSource.noNulls;
  if (source.isRepeating) {
    castedDestination.isNull[0] = castedSource.isNull[0];
    for (int c = 0; c < castedSource.fields.length; ++c) {
      copyColumn(castedDestination.fields[c], castedSource.fields[c], 0, 1);
    }
  } else {
    if (!castedSource.noNulls) {
      for (int r = 0; r < length; ++r) {
        castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
      }
    } else {
      for (int c = 0; c < castedSource.fields.length; ++c) {
        copyColumn(castedDestination.fields[c], castedSource.fields[c],
            sourceOffset, length);
      }
    }
  }
}
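copyColumn dispatches to per-type copiers such as the struct copier above. The leaf-level helper below is a hypothetical sketch (copyLongColumn is not a Hive method) showing the same repeating/null handling applied to a LongColumnVector child.

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

public class CopyColumnSketch {
  // Hypothetical leaf-level copy: a repeating source contributes a single value,
  // otherwise nulls and values are copied element by element from sourceOffset.
  static void copyLongColumn(ColumnVector destination, ColumnVector source,
                             int sourceOffset, int length) {
    LongColumnVector src = (LongColumnVector) source;
    LongColumnVector dst = (LongColumnVector) destination;
    dst.isRepeating = src.isRepeating;
    dst.noNulls = src.noNulls;
    if (src.isRepeating) {
      dst.isNull[0] = src.isNull[0];
      dst.vector[0] = src.vector[0];
    } else {
      for (int r = 0; r < length; ++r) {
        dst.isNull[r] = src.isNull[sourceOffset + r];
        dst.vector[r] = src.vector[sourceOffset + r];
      }
    }
  }
}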
Use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.
The class VectorizedOrcAcidRowBatchReader, method next.
@Override
public boolean next(NullWritable key, VectorizedRowBatch value) throws IOException {
  try {
    // required in next()
    if (addPartitionCols) {
      if (partitionValues != null) {
        rbCtx.addPartitionColsToBatch(value, partitionValues);
      }
      addPartitionCols = false;
    }
    if (!baseReader.nextBatch(vectorizedRowBatchBase)) {
      return false;
    }
  } catch (Exception e) {
    throw new IOException("error iterating", e);
  }
  // Once we have read the VectorizedRowBatchBase from the file, there are two kinds of cases
  // for which we might have to discard rows from the batch:
  // Case 1- when the row is created by a transaction that is not valid, or
  // Case 2- when the row has been deleted.
  // We will go through the batch to discover rows which match any of the cases and specifically
  // remove them from the selected vector. Of course, selectedInUse should also be set.
  BitSet selectedBitSet = new BitSet(vectorizedRowBatchBase.size);
  if (vectorizedRowBatchBase.selectedInUse) {
    // When selectedInUse is true, start with every bit set to false and selectively set
    // certain bits to true based on the selected[] vector.
    selectedBitSet.set(0, vectorizedRowBatchBase.size, false);
    for (int j = 0; j < vectorizedRowBatchBase.size; ++j) {
      int i = vectorizedRowBatchBase.selected[j];
      selectedBitSet.set(i);
    }
  } else {
    // When selectedInUse is set to false, everything in the batch is selected.
    selectedBitSet.set(0, vectorizedRowBatchBase.size, true);
  }
  // Case 1- find rows which belong to transactions that are not valid.
  findRecordsWithInvalidTransactionIds(vectorizedRowBatchBase, selectedBitSet);
  // Case 2- find rows which have been deleted.
  this.deleteEventRegistry.findDeletedRecords(vectorizedRowBatchBase, selectedBitSet);
  if (selectedBitSet.cardinality() == vectorizedRowBatchBase.size) {
    // None of the cases above matched and everything is selected. Hence, we will use the
    // same values for the selected and selectedInUse.
    value.size = vectorizedRowBatchBase.size;
    value.selected = vectorizedRowBatchBase.selected;
    value.selectedInUse = vectorizedRowBatchBase.selectedInUse;
  } else {
    value.size = selectedBitSet.cardinality();
    value.selectedInUse = true;
    value.selected = new int[selectedBitSet.cardinality()];
    // This loop fills up the selected[] vector with all the index positions that are selected.
    for (int setBitIndex = selectedBitSet.nextSetBit(0), selectedItr = 0;
         setBitIndex >= 0;
         setBitIndex = selectedBitSet.nextSetBit(setBitIndex + 1), ++selectedItr) {
      value.selected[selectedItr] = setBitIndex;
    }
  }
  // Finally, link up the columnVector from the base VectorizedRowBatch to outgoing batch.
  // NOTE: We only link up the user columns and not the ACID metadata columns because this
  // vectorized code path is not being used in cases of update/delete, when the metadata columns
  // would be expected to be passed up the operator pipeline. This is because
  // currently the update/delete specifically disable vectorized code paths.
  // This happens at ql/exec/Utilities.java::3293 when it checks for mapWork.getVectorMode()
  StructColumnVector payloadStruct =
      (StructColumnVector) vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW];
  // Transfer columnVector objects from base batch to outgoing batch.
  System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount());
  progress = baseReader.getProgress();
  return true;
}
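The selected[]/BitSet bookkeeping in the branch above is plain JDK code. The small sketch below (class and method names are illustrative, not Hive APIs) shows the same translation from a BitSet of surviving rows back into a selected[] array.

import java.util.Arrays;
import java.util.BitSet;

public class SelectedVectorSketch {
  // Convert a BitSet of surviving row positions into a selected[] array,
  // as the reader does when some rows were filtered out of the batch.
  static int[] toSelected(BitSet selectedBitSet) {
    int[] selected = new int[selectedBitSet.cardinality()];
    int i = 0;
    for (int bit = selectedBitSet.nextSetBit(0); bit >= 0;
         bit = selectedBitSet.nextSetBit(bit + 1)) {
      selected[i++] = bit;
    }
    return selected;
  }

  public static void main(String[] args) {
    BitSet bits = new BitSet(8);
    bits.set(0, 8);   // start with every row in the batch selected
    bits.clear(3);    // drop a deleted row
    bits.clear(5);    // drop a row from an invalid transaction
    System.out.println(Arrays.toString(toSelected(bits))); // [0, 1, 2, 4, 6, 7]
  }
}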
Use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.
The class VectorizedColumnReaderTestBase, method structRead.
protected void structRead(boolean isDictionaryEncoding) throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "struct_field");
  conf.set(IOConstants.COLUMNS_TYPES, "struct<a:int,b:double>");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  String schema = "message hive_schema {\n"
      + "group struct_field {\n"
      + " optional int32 a;\n"
      + " optional double b;\n"
      + "}\n"
      + "}\n";
  VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
  VectorizedRowBatch previous = reader.createValue();
  int c = 0;
  try {
    while (reader.next(NullWritable.get(), previous)) {
      StructColumnVector vector = (StructColumnVector) previous.cols[0];
      LongColumnVector cv = (LongColumnVector) vector.fields[0];
      DoubleColumnVector dv = (DoubleColumnVector) vector.fields[1];
      for (int i = 0; i < cv.vector.length; i++) {
        if (c == nElements) {
          break;
        }
        assertEquals(getIntValue(isDictionaryEncoding, c), cv.vector[i]);
        assertEquals(getDoubleValue(isDictionaryEncoding, c), dv.vector[i], 0);
        assertFalse(vector.isNull[i]);
        assertFalse(vector.isRepeating);
        c++;
      }
    }
    assertEquals("It doesn't exit at expected position", nElements, c);
  } finally {
    reader.close();
  }
}
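The assertions above rely on the usual ColumnVector contract: read index 0 when isRepeating is set, and consult isNull[] only when noNulls is false. The sketch below (names are illustrative, not Hive test code) reads one logical row from a struct<a:int,b:double> vector under that contract.

import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;

public class StructReadSketch {
  // Format one logical row of a struct<a:int,b:double> vector, honoring
  // the isRepeating/isNull flags that the test's assertions check.
  static String readRow(StructColumnVector sv, int row) {
    int idx = sv.isRepeating ? 0 : row;
    if (!sv.noNulls && sv.isNull[idx]) {
      return "null";
    }
    LongColumnVector a = (LongColumnVector) sv.fields[0];
    DoubleColumnVector b = (DoubleColumnVector) sv.fields[1];
    long aVal = a.vector[a.isRepeating ? 0 : row];
    double bVal = b.vector[b.isRepeating ? 0 : row];
    return "{a=" + aVal + ", b=" + bVal + "}";
  }
}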
Use of org.apache.hadoop.hive.ql.exec.vector.StructColumnVector in project hive by apache.
The class VectorizedColumnReaderTestBase, method structReadSomeNull.
protected void structReadSomeNull(boolean isDictionaryEncoding) throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "struct_field_some_null");
  conf.set(IOConstants.COLUMNS_TYPES, "struct<f:int,g:double>");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  String schema = "message hive_schema {\n"
      + "group struct_field_some_null {\n"
      + " optional int32 f;\n"
      + " optional double g;\n"
      + "}\n"
      + "}\n";
  VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
  VectorizedRowBatch previous = reader.createValue();
  int c = 0;
  try {
    while (reader.next(NullWritable.get(), previous)) {
      StructColumnVector sv = (StructColumnVector) previous.cols[0];
      LongColumnVector fv = (LongColumnVector) sv.fields[0];
      DoubleColumnVector gv = (DoubleColumnVector) sv.fields[1];
      for (int i = 0; i < fv.vector.length; i++) {
        if (c == nElements) {
          break;
        }
        assertEquals(c % 2 == 0, fv.isNull[i]);
        assertEquals(c % 3 == 0, gv.isNull[i]);
        assertEquals(c % /* 2*3 = */ 6 == 0, sv.isNull[i]);
        if (!sv.isNull[i]) {
          if (!fv.isNull[i]) {
            assertEquals(getIntValue(isDictionaryEncoding, c), fv.vector[i]);
          }
          if (!gv.isNull[i]) {
            assertEquals(getDoubleValue(isDictionaryEncoding, c), gv.vector[i], 0);
          }
        }
        assertFalse(fv.isRepeating);
        c++;
      }
    }
    assertEquals("It doesn't exit at expected position", nElements, c);
  } finally {
    reader.close();
  }
}
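To make the expected null pattern concrete, here is a small sketch (not Hive code; class name is illustrative) that builds a struct<f:int,g:double> vector with the same every-2nd/every-3rd/every-6th null layout the test asserts.

import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;

public class StructNullsSketch {
  public static void main(String[] args) {
    int size = 12;
    LongColumnVector f = new LongColumnVector(size);
    DoubleColumnVector g = new DoubleColumnVector(size);
    StructColumnVector sv = new StructColumnVector(size, f, g);
    f.noNulls = false;
    g.noNulls = false;
    sv.noNulls = false;
    for (int r = 0; r < size; r++) {
      f.isNull[r] = (r % 2 == 0);
      g.isNull[r] = (r % 3 == 0);
      // The struct itself is null only on every sixth row, which is exactly
      // where both children are null (2 * 3 = 6), matching the assertions above.
      sv.isNull[r] = (r % 6 == 0);
      if (!f.isNull[r]) {
        f.vector[r] = r;
      }
      if (!g.isNull[r]) {
        g.vector[r] = r * 1.5;
      }
    }
    System.out.println("struct nulls at r % 6 == 0; child nulls follow r % 2 and r % 3");
  }
}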