Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
Class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventOriginalFiltering2.
private void testDeleteEventOriginalFiltering2() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
  // Need to use a bigger row than DummyRow for the writer to flush the stripes
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  Properties properties = new Properties();
  OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(properties, conf);
  writerOptions.inspector(bigOriginalRowInspector).stripeSize(1).batchSize(1);
  String originalFile = "000000_0";
  Path originalFilePath = new Path(root, originalFile);
  byte[] data = new byte[1000];
  Writer writer = OrcFile.createWriter(originalFilePath, writerOptions);
  writer.addRow(new BigOriginalRow(data));
  writer.addRow(new BigOriginalRow(data));
  writer.addRow(new BigOriginalRow(data));
  writer.close();
  Reader reader = OrcFile.createReader(originalFilePath, OrcFile.readerOptions(conf));
  List<StripeInformation> stripes = reader.getStripes();
  // Make sure 3 stripes are created
  assertEquals(3, stripes.size());
  FileStatus fileStatus = fs.getFileStatus(originalFilePath);
  long fileLength = fileStatus.getLen();
  // Set vector mode to true in the map work so that we can generate the syntheticProps
  MapWork mapWork = new MapWork();
  mapWork.setVectorMode(true);
  VectorizedRowBatchCtx vrbContext = new VectorizedRowBatchCtx();
  mapWork.setVectorizedRowBatchCtx(vrbContext);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
  Utilities.setMapWork(conf, mapWork);
  OrcSplit.OffsetAndBucketProperty syntheticProps =
      VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(fileStatus, root, true, true, conf);
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).bucket(0);
  int bucketProperty = BucketCodec.V1.encode(options);

  // 1. Splits within a stripe
  // A split that's completely within the 2nd stripe
  StripeInformation stripe = stripes.get(1);
  OrcSplit split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50,
      stripe.getLength() - 100, new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2),
      new RecordIdentifier(0, bucketProperty, 1), filterOn);

  // A split that's completely within the last stripe
  stripe = stripes.get(2);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50,
      stripe.getLength() - 100, new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 3),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);

  // 2. Splits starting at a stripe boundary
  // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
  stripe = stripes.get(0);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset(),
      stripe.getLength() - 50, new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  // The key interval for the 1st stripe
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0),
      new RecordIdentifier(0, bucketProperty, 0), filterOn);

  // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
  stripe = stripes.get(1);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset(),
      stripe.getLength() + 50, new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);

  // 3. Splits ending at a stripe boundary
  // A split that starts before the last stripe starts and ends at the last stripe boundary
  stripe = stripes.get(2);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset() - 50,
      stripe.getLength() + 50, new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  // The key interval for the last stripe
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);

  // A split that starts after the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset() + 50,
      reader.getContentLength() - 50, new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);

  // A split that starts where the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset(),
      reader.getContentLength(), new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  // The key interval for all 3 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
}
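The test above needs only a small amount of setup before computeOffsetAndBucket can derive synthetic ROW__ID properties for an "original" (non-ACID) file: a vectorized MapWork registered in the configuration. The following is a minimal sketch of just that setup, distilled from the snippet; it assumes it lives in test code in the org.apache.hadoop.hive.ql.io.orc package (computeOffsetAndBucket may not be visible elsewhere), and the two boolean arguments and the "//tmp" plan path are copied verbatim from the test rather than taken from any documented API contract.

package org.apache.hadoop.hive.ql.io.orc;  // assumed: same package as the test

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.JobConf;

class SyntheticPropsSketch {

  // Register a vectorized MapWork in the JobConf, then compute the synthetic
  // offset/bucket properties for an original file, exactly as the test does.
  static OrcSplit.OffsetAndBucketProperty computeSyntheticProps(
      JobConf conf, FileStatus fileStatus, Path root) throws Exception {
    MapWork mapWork = new MapWork();
    // Vector mode plus a VectorizedRowBatchCtx marks the plan as vectorized
    // when the ACID reader looks the MapWork up from the configuration.
    mapWork.setVectorMode(true);
    mapWork.setVectorizedRowBatchCtx(new VectorizedRowBatchCtx());
    // Dummy plan path, copied from the test above.
    HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
    Utilities.setMapWork(conf, mapWork);
    // The two boolean flags are passed exactly as in the test above.
    return VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(
        fileStatus, root, true, true, conf);
  }
}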
Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
Class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering2.
private void testDeleteEventFiltering2() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  boolean skipKeyIdx = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVETESTMODEACIDKEYIDXSKIP);
  int bucket = 1;
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
      .filesystem(fs)
      .bucket(bucket)
      .writingBase(true)
      .minimumWriteId(10000002)
      .maximumWriteId(10000002)
      .inspector(inspector)
      .reporter(Reporter.NULL)
      .recordIdColumn(1)
      .finalDestination(root);
  int bucketProperty = BucketCodec.V1.encode(options);
  // Create data that looks like a compacted base that includes some data
  // from 'original' files and some from a native Acid write
  RecordUpdater updater = new OrcRecordUpdater(root, options);
  updater.insert(0, new DummyRow(1, 0, 0, bucket));
  updater.insert(0, new DummyRow(1, 1, 0, bucket));
  updater.insert(0, new DummyRow(2, 2, 0, bucket));
  updater.insert(10000001, new DummyRow(3, 0, 10000001, bucket));
  updater.close(false);
  // Delete the 3rd row
  options.writingBase(false).minimumWriteId(10000004).maximumWriteId(10000004);
  updater = new OrcRecordUpdater(root, options);
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 0, 0, bucket));
  // Hypothetically this matches something in (nonexistent here)
  // delta_10000003_10000003
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 5, 10000003, bucket));
  updater.close(false);
  conf.set(ValidTxnList.VALID_TXNS_KEY,
      new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
  // HWM is not important - just make sure the deltas created above are read as
  // if committed
  conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:10000005:" + Long.MAX_VALUE + "::");
  List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
  assertEquals(1, splitStrategies.size());
  List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(1, splits.size());
  assertEquals(root.toUri().toString() + File.separator + "base_10000002/bucket_00001",
      splits.get(0).getPath().toUri().toString());
  assertFalse(splits.get(0).isOriginal());
  VectorizedOrcAcidRowBatchReader vectorizedReader =
      new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx());
  ColumnizedDeleteEventRegistry deleteEventRegistry =
      (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
  assertEquals("number of delete events for stripe 1", filterOn ? 1 : 2, deleteEventRegistry.size());
  OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
  SearchArgument sarg = vectorizedReader.getDeleteEventSarg();
  if (filterOn) {
    if (skipKeyIdx) {
      // If the key index is not present, the min/max key interval is derived from stripe stats instead
      assertEquals(new OrcRawRecordMerger.KeyInterval(
          new RecordIdentifier(0, bucketProperty, 0),
          new RecordIdentifier(10000001, bucketProperty, 2)), keyInterval);
    } else {
      assertEquals(new OrcRawRecordMerger.KeyInterval(
          new RecordIdentifier(0, bucketProperty, 0),
          new RecordIdentifier(10000001, bucketProperty, 0)), keyInterval);
    }
    // The key point is that leaf-5 is (rowId <= 2) even though maxKey has rowId 0;
    // see VectorizedOrcAcidRowBatchReader.findMinMaxKeys for more
    assertEquals("leaf-0 = (LESS_THAN originalTransaction 0),"
        + " leaf-1 = (LESS_THAN bucket 536936448),"
        + " leaf-2 = (LESS_THAN rowId 0),"
        + " leaf-3 = (LESS_THAN_EQUALS originalTransaction 10000001),"
        + " leaf-4 = (LESS_THAN_EQUALS bucket 536936448),"
        + " leaf-5 = (LESS_THAN_EQUALS rowId 2),"
        + " expr = (and (not leaf-0) (not leaf-1) (not leaf-2) leaf-3 leaf-4 leaf-5)",
        ((SearchArgumentImpl) sarg).toOldString());
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
    assertNull(sarg);
  }
}
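For reference, the branch in the assertions above is driven by a single configuration flag. Below is a minimal sketch of how a caller in the same test package might flip that flag and inspect the resulting key interval and delete-event SearchArgument; it uses only constructors and getters that already appear in these snippets, and it assumes the four-argument reader constructor and the getters are accessible from test code in that package.

package org.apache.hadoop.hive.ql.io.orc;  // assumed: same package as the test

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;

class DeleteEventFilterSketch {

  // Build the reader with delete-event filtering enabled and print the key
  // interval it derived for the split; the SARG is null when filtering is off.
  static void inspectFiltering(OrcSplit split, JobConf conf) throws Exception {
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS, true);
    VectorizedOrcAcidRowBatchReader reader =
        new VectorizedOrcAcidRowBatchReader(split, conf, Reporter.NULL, new VectorizedRowBatchCtx());
    OrcRawRecordMerger.KeyInterval keyInterval = reader.getKeyInterval();
    SearchArgument deleteSarg = reader.getDeleteEventSarg();
    System.out.println("key interval: " + keyInterval + ", delete SARG: " + deleteSarg);
  }
}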
Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
Class TestVectorizedOrcAcidRowBatchReader, method validateKeyInterval.
private void validateKeyInterval(OrcSplit split, RecordIdentifier lowKey, RecordIdentifier highKey,
    boolean filterOn) throws Exception {
  VectorizedOrcAcidRowBatchReader vectorizedReader =
      new VectorizedOrcAcidRowBatchReader(split, conf, Reporter.NULL, new VectorizedRowBatchCtx());
  OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
  SearchArgument sarg = vectorizedReader.getDeleteEventSarg();
  if (filterOn) {
    assertEquals(new OrcRawRecordMerger.KeyInterval(lowKey, highKey), keyInterval);
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
    assertNull(sarg);
  }
}
Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
Class TestVectorIndex, method doVectorCastTest.
private boolean doVectorCastTest(TypeInfo typeInfo, List<String> columns, String[] columnNames,
    TypeInfo[] typeInfos, DataTypePhysicalVariation[] dataTypePhysicalVariations,
    List<ExprNodeDesc> children, GenericUDF udf, ExprNodeGenericFuncDesc exprDesc,
    IndexTestMode indexTestMode, VectorRandomBatchSource batchSource,
    ObjectInspector objectInspector, TypeInfo outputTypeInfo, Object[] resultObjects)
    throws Exception {
  HiveConf hiveConf = new HiveConf();
  if (indexTestMode == IndexTestMode.ADAPTOR) {
    hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_TEST_VECTOR_ADAPTOR_OVERRIDE, true);
  }
  VectorizationContext vectorizationContext = new VectorizationContext("name", columns,
      Arrays.asList(typeInfos), Arrays.asList(dataTypePhysicalVariations), hiveConf);
  VectorExpression vectorExpression =
      vectorizationContext.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.PROJECTION);
  vectorExpression.transientInit(hiveConf);
  if (indexTestMode == IndexTestMode.VECTOR_EXPRESSION && vectorExpression instanceof VectorUDFAdaptor) {
    System.out.println("*NO NATIVE VECTOR EXPRESSION* typeInfo " + typeInfo.toString()
        + " indexTestMode " + indexTestMode
        + " vectorExpression " + vectorExpression.toString());
  }
  System.out.println("*VECTOR EXPRESSION* " + vectorExpression.getClass().getSimpleName());
  /*
  System.out.println(
      "*DEBUG* typeInfo " + typeInfo.toString() +
      " indexTestMode " + indexTestMode +
      " vectorExpression " + vectorExpression.toString());
  */
  VectorRandomRowSource rowSource = batchSource.getRowSource();
  VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(
      columnNames,
      rowSource.typeInfos(),
      rowSource.dataTypePhysicalVariations(),
      /* dataColumnNums */ null,
      /* partitionColumnCount */ 0,
      /* virtualColumnCount */ 0,
      /* neededVirtualColumns */ null,
      vectorizationContext.getScratchColumnTypeNames(),
      vectorizationContext.getScratchDataTypePhysicalVariations());
  VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();
  VectorExtractRow resultVectorExtractRow = new VectorExtractRow();
  resultVectorExtractRow.init(
      new TypeInfo[] { outputTypeInfo }, new int[] { vectorExpression.getOutputColumnNum() });
  Object[] scrqtchRow = new Object[1];
  /*
  System.out.println(
      "*DEBUG* typeInfo1 " + typeInfo1.toString() +
      " typeInfo2 " + typeInfo2.toString() +
      " arithmeticTestMode " + arithmeticTestMode +
      " columnScalarMode " + columnScalarMode +
      " vectorExpression " + vectorExpression.toString());
  */
  batchSource.resetBatchIteration();
  int rowIndex = 0;
  while (true) {
    if (!batchSource.fillNextBatch(batch)) {
      break;
    }
    vectorExpression.evaluate(batch);
    extractResultObjects(batch, rowIndex, resultVectorExtractRow, scrqtchRow, objectInspector, resultObjects);
    rowIndex += batch.size;
  }
  return true;
}
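Both cast tests build their VectorizedRowBatchCtx from a VectorRandomRowSource. Stripped of that machinery, the same nine-argument constructor can be fed an explicit schema directly; the sketch below does that for a hypothetical two-column row with no partition, virtual, or scratch columns and then creates an empty batch from it. The column names and types are invented for illustration, and the sketch assumes the nine-argument constructor used above is accessible outside the test packages.

import org.apache.hadoop.hive.common.type.DataTypePhysicalVariation;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class BatchContextSketch {

  // Build a batch context for a hypothetical (id bigint, name string) row
  // and create an empty VectorizedRowBatch sized for that schema.
  public static VectorizedRowBatch emptyBatch() {
    String[] columnNames = { "id", "name" };
    TypeInfo[] typeInfos = { TypeInfoFactory.longTypeInfo, TypeInfoFactory.stringTypeInfo };
    DataTypePhysicalVariation[] variations =
        { DataTypePhysicalVariation.NONE, DataTypePhysicalVariation.NONE };

    VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(
        columnNames,
        typeInfos,
        variations,
        /* dataColumnNums */ null,
        /* partitionColumnCount */ 0,
        /* virtualColumnCount */ 0,
        /* neededVirtualColumns */ null,
        /* scratchColumnTypeNames */ new String[0],
        /* scratchDataTypePhysicalVariations */ new DataTypePhysicalVariation[0]);

    return batchContext.createVectorizedRowBatch();
  }
}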
Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.
Class TestVectorCastStatement, method doVectorCastTest.
private boolean doVectorCastTest(TypeInfo typeInfo, TypeInfo targetTypeInfo, List<String> columns,
    String[] columnNames, TypeInfo[] typeInfos, DataTypePhysicalVariation[] dataTypePhysicalVariations,
    List<ExprNodeDesc> children, CastStmtTestMode castStmtTestMode,
    VectorRandomBatchSource batchSource, Object[] resultObjects) throws Exception {
  GenericUDF udf;
  try {
    udf = VectorizationContext.getGenericUDFForCast(targetTypeInfo);
  } catch (HiveException e) {
    return false;
  }
  ExprNodeGenericFuncDesc exprDesc = new ExprNodeGenericFuncDesc(targetTypeInfo, udf, children);
  HiveConf hiveConf = new HiveConf();
  if (castStmtTestMode == CastStmtTestMode.ADAPTOR) {
    hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_TEST_VECTOR_ADAPTOR_OVERRIDE, true);
  }
  VectorizationContext vectorizationContext = new VectorizationContext("name", columns,
      Arrays.asList(typeInfos), Arrays.asList(dataTypePhysicalVariations), hiveConf);
  VectorExpression vectorExpression = vectorizationContext.getVectorExpression(exprDesc);
  vectorExpression.transientInit(hiveConf);
  if (castStmtTestMode == CastStmtTestMode.VECTOR_EXPRESSION && vectorExpression instanceof VectorUDFAdaptor) {
    System.out.println("*NO NATIVE VECTOR EXPRESSION* typeInfo " + typeInfo.toString()
        + " castStmtTestMode " + castStmtTestMode
        + " vectorExpression " + vectorExpression.toString());
  }
  // System.out.println("*VECTOR EXPRESSION* " + vectorExpression.getClass().getSimpleName());
  /*
  System.out.println(
      "*DEBUG* typeInfo " + typeInfo.toString() +
      " targetTypeInfo " + targetTypeInfo +
      " castStmtTestMode " + castStmtTestMode +
      " vectorExpression " + vectorExpression.toString());
  */
  VectorRandomRowSource rowSource = batchSource.getRowSource();
  VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(
      columnNames,
      rowSource.typeInfos(),
      rowSource.dataTypePhysicalVariations(),
      /* dataColumnNums */ null,
      /* partitionColumnCount */ 0,
      /* virtualColumnCount */ 0,
      /* neededVirtualColumns */ null,
      vectorizationContext.getScratchColumnTypeNames(),
      vectorizationContext.getScratchDataTypePhysicalVariations());
  VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();
  VectorExtractRow resultVectorExtractRow = new VectorExtractRow();
  resultVectorExtractRow.init(
      new TypeInfo[] { targetTypeInfo }, new int[] { vectorExpression.getOutputColumnNum() });
  Object[] scrqtchRow = new Object[1];
  batchSource.resetBatchIteration();
  int rowIndex = 0;
  while (true) {
    if (!batchSource.fillNextBatch(batch)) {
      break;
    }
    vectorExpression.evaluate(batch);
    extractResultObjects(batch, rowIndex, resultVectorExtractRow, scrqtchRow, resultObjects);
    rowIndex += batch.size;
  }
  return true;
}