Use of org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader.ColumnizedDeleteEventRegistry in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testVectorizedOrcAcidRowBatchReader.
private void testVectorizedOrcAcidRowBatchReader(String deleteEventRegistry) throws Exception {
List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
assertEquals(1, splitStrategies.size());
List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
assertEquals(1, splits.size());
assertEquals(root.toUri().toString() + File.separator + "delta_0000001_0000010_0000/bucket_00000", splits.get(0).getPath().toUri().toString());
assertFalse(splits.get(0).isOriginal());
// Mark one of the transactions as an exception to test that invalid transactions
// are being handled properly.
// Exclude transaction 5
conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:14:1:1:5");
VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx());
if (deleteEventRegistry.equals(ColumnizedDeleteEventRegistry.class.getName())) {
assertTrue(vectorizedReader.getDeleteEventRegistry() instanceof ColumnizedDeleteEventRegistry);
}
if (deleteEventRegistry.equals(SortMergedDeleteEventRegistry.class.getName())) {
assertTrue(vectorizedReader.getDeleteEventRegistry() instanceof SortMergedDeleteEventRegistry);
}
TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);
VectorizedRowBatch vectorizedRowBatch = schema.createRowBatchV2();
// Set data column count to 1.
vectorizedRowBatch.setPartitionInfo(1, 0);
long previousPayload = Long.MIN_VALUE;
while (vectorizedReader.next(null, vectorizedRowBatch)) {
assertTrue(vectorizedRowBatch.selectedInUse);
LongColumnVector col = (LongColumnVector) vectorizedRowBatch.cols[0];
for (int i = 0; i < vectorizedRowBatch.size; ++i) {
int idx = vectorizedRowBatch.selected[i];
long payload = col.vector[idx];
long owid = (payload / NUM_ROWID_PER_OWID) + 1;
long rowId = payload % NUM_ROWID_PER_OWID;
assertFalse(rowId % 2 == 0 || rowId % 3 == 0);
// Check that writeid#5 has been excluded.
assertTrue(owid != 5);
// Check that the data is in sorted order.
assertTrue(payload > previousPayload);
previousPayload = payload;
}
}
}
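The helper above takes the delete-event registry class name as a parameter and only asserts the matching instanceof branch. A minimal caller sketch (the @Test wrapper name below is hypothetical, not taken from this listing):
@Test
public void testVectorizedOrcAcidRowBatchReaderColumnized() throws Exception {
    // Expect the ColumnizedDeleteEventRegistry branch of the instanceof checks above.
    testVectorizedOrcAcidRowBatchReader(ColumnizedDeleteEventRegistry.class.getName());
}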
Use of org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader.ColumnizedDeleteEventRegistry in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventOriginalFiltering.
public void testDeleteEventOriginalFiltering() throws Exception {
boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
// Create 3 original files with 3 rows each
Properties properties = new Properties();
properties.setProperty("columns", DummyOriginalRow.getColumnNamesProperty());
properties.setProperty("columns.types", DummyOriginalRow.getColumnTypesProperty());
OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(properties, conf);
writerOptions.inspector(originalInspector);
Path testFilePath = new Path(root, "000000_0");
Writer writer = OrcFile.createWriter(testFilePath, writerOptions);
writer.addRow(new DummyOriginalRow(0));
writer.addRow(new DummyOriginalRow(0));
writer.addRow(new DummyOriginalRow(0));
writer.close();
testFilePath = new Path(root, "000000_0_copy_1");
writer = OrcFile.createWriter(testFilePath, writerOptions);
writer.addRow(new DummyOriginalRow(0));
writer.addRow(new DummyOriginalRow(0));
writer.addRow(new DummyOriginalRow(0));
writer.close();
testFilePath = new Path(root, "000000_0_copy_2");
writer = OrcFile.createWriter(testFilePath, writerOptions);
writer.addRow(new DummyOriginalRow(0));
writer.addRow(new DummyOriginalRow(0));
writer.addRow(new DummyOriginalRow(0));
writer.close();
conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
conf.set(ValidTxnList.VALID_TXNS_KEY, new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
int bucket = 0;
AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket).writingBase(false).minimumWriteId(1).maximumWriteId(1).inspector(inspector).reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
int bucketProperty = BucketCodec.V1.encode(options);
RecordUpdater updater = new OrcRecordUpdater(root, options);
// delete 1 row from each of the original files
// Delete the last record in this split to test boundary conditions. It should not be present in the delete event
// registry for the next split
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 2, 0, bucket));
// Delete the first record in this split to test boundary conditions. It should not be present in the delete event
// registry for the previous split
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 3, 0, bucket));
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 7, 0, bucket));
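// With 3 rows per original file, the synthetic ROW__IDs are 0-2 for 000000_0,
// 3-5 for 000000_0_copy_1 and 6-8 for 000000_0_copy_2, so the deletes above hit
// the last row of the first file, the first row of the second file and the middle
// row of the third (consistent with the key intervals asserted further down).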
updater.close(false);
// HWM is not important - just make sure deltas created above are read as if committed
conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:2:" + Long.MAX_VALUE + "::");
// Set vector mode to true in the map work so that we recognize this as a vector mode execution during the split
// generation. Without this we will not compute the offset for the synthetic row ids.
MapWork mapWork = new MapWork();
mapWork.setVectorMode(true);
VectorizedRowBatchCtx vrbContext = new VectorizedRowBatchCtx();
mapWork.setVectorizedRowBatchCtx(vrbContext);
HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
Utilities.setMapWork(conf, mapWork);
// now we have 3 delete events total, but for each split we should only
// load 1 into DeleteRegistry (if filtering is on)
List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
assertEquals(1, splitStrategies.size());
List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
assertEquals(3, splits.size());
assertEquals(root.toUri().toString() + File.separator + "000000_0", splits.get(0).getPath().toUri().toString());
assertTrue(splits.get(0).isOriginal());
assertEquals(root.toUri().toString() + File.separator + "000000_0_copy_1", splits.get(1).getPath().toUri().toString());
assertTrue(splits.get(1).isOriginal());
assertEquals(root.toUri().toString() + File.separator + "000000_0_copy_2", splits.get(2).getPath().toUri().toString());
assertTrue(splits.get(2).isOriginal());
VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, vrbContext);
ColumnizedDeleteEventRegistry deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 1", filterOn ? 1 : 3, deleteEventRegistry.size());
OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(0, bucketProperty, 2)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(1), conf, Reporter.NULL, vrbContext);
deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 2", filterOn ? 1 : 3, deleteEventRegistry.size());
keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 3), new RecordIdentifier(0, bucketProperty, 5)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(2), conf, Reporter.NULL, vrbContext);
deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 3", filterOn ? 1 : 3, deleteEventRegistry.size());
keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 6), new RecordIdentifier(0, bucketProperty, 8)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
}
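The filterOn flag read at the top of this method comes from the configuration, so a caller is expected to set HiveConf.ConfVars.FILTER_DELETE_EVENTS before invoking it. A minimal sketch, assuming JUnit-style wrapper methods (the method names are assumptions):
@Test
public void testDeleteEventOriginalFilteringOn() throws Exception {
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS, true);
    testDeleteEventOriginalFiltering();
}

@Test
public void testDeleteEventOriginalFilteringOff() throws Exception {
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS, false);
    testDeleteEventOriginalFiltering();
}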
Use of org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader.ColumnizedDeleteEventRegistry in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering.
/**
* Tests that we can figure out min/max ROW__ID for each split and then use
* that to only load delete events between min/max.
* This test doesn't actually check what is read - that is done by more E2E
* unit tests.
* @throws Exception
*/
private void testDeleteEventFiltering() throws Exception {
boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
int bucket = 0;
AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket).writingBase(false).minimumWriteId(1).maximumWriteId(1).inspector(inspector).reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
int bucketProperty = BucketCodec.V1.encode(options);
// create 3 insert deltas so that we have 3 splits
RecordUpdater updater = new OrcRecordUpdater(root, options);
// In the first delta, add 2000 records to simulate records in multiple stripes.
int numRows = 2000;
for (int i = 1; i <= numRows; i++) {
updater.insert(options.getMinimumWriteId(), new DummyRow(i, i - 1, options.getMinimumWriteId(), bucket));
}
updater.close(false);
options.minimumWriteId(2).maximumWriteId(2);
updater = new OrcRecordUpdater(root, options);
updater.insert(options.getMinimumWriteId(), new DummyRow(4, 0, options.getMinimumWriteId(), bucket));
updater.insert(options.getMinimumWriteId(), new DummyRow(5, 1, options.getMinimumWriteId(), bucket));
updater.insert(options.getMinimumWriteId(), new DummyRow(6, 2, options.getMinimumWriteId(), bucket));
updater.close(false);
options.minimumWriteId(3).maximumWriteId(3);
updater = new OrcRecordUpdater(root, options);
updater.insert(options.getMinimumWriteId(), new DummyRow(7, 0, options.getMinimumWriteId(), bucket));
updater.insert(options.getMinimumWriteId(), new DummyRow(8, 1, options.getMinimumWriteId(), bucket));
updater.insert(options.getMinimumWriteId(), new DummyRow(9, 2, options.getMinimumWriteId(), bucket));
updater.close(false);
// delete 1 row from each of the insert deltas
options.minimumWriteId(4).maximumWriteId(4);
updater = new OrcRecordUpdater(root, options);
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 0, 1, bucket));
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 1, 2, bucket));
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 2, 3, bucket));
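// Assuming DummyRow's arguments are (field, rowId, originalWriteId, bucket), as the
// inserts above suggest, these three deletes target (writeId 1, row 0),
// (writeId 2, row 1) and (writeId 3, row 2) - exactly one row in each insert delta.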
updater.close(false);
conf.set(ValidTxnList.VALID_TXNS_KEY, new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
// HWM is not important - just make sure deltas created above are read as
// if committed
conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:5:" + Long.MAX_VALUE + "::");
// now we have 3 delete events total, but for each split we should only
// load 1 into DeleteRegistry (if filtering is on)
List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
assertEquals(1, splitStrategies.size());
List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
assertEquals(3, splits.size());
assertEquals(root.toUri().toString() + File.separator + "delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString());
assertFalse(splits.get(0).isOriginal());
assertEquals(root.toUri().toString() + File.separator + "delta_0000002_0000002_0000/bucket_00000", splits.get(1).getPath().toUri().toString());
assertFalse(splits.get(1).isOriginal());
assertEquals(root.toUri().toString() + File.separator + "delta_0000003_0000003_0000/bucket_00000", splits.get(2).getPath().toUri().toString());
assertFalse(splits.get(2).isOriginal());
VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx());
ColumnizedDeleteEventRegistry deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 1", filterOn ? 1 : 3, deleteEventRegistry.size());
OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(1, bucketProperty, 0), new RecordIdentifier(1, bucketProperty, numRows - 1)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(1), conf, Reporter.NULL, new VectorizedRowBatchCtx());
deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 2", filterOn ? 1 : 3, deleteEventRegistry.size());
keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(2, bucketProperty, 0), new RecordIdentifier(2, bucketProperty, 2)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(2), conf, Reporter.NULL, new VectorizedRowBatchCtx());
deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 3", filterOn ? 1 : 3, deleteEventRegistry.size());
keyInterval = vectorizedReader.getKeyInterval();
if (filterOn) {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(3, bucketProperty, 0), new RecordIdentifier(3, bucketProperty, 2)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
}
}
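With filtering on, a delete event is only loaded if its key falls inside the split's min/max ROW__ID interval. An illustrative sketch of that containment test (not the reader's actual code path), using the second split's interval from the assertions above:
RecordIdentifier minKey = new RecordIdentifier(2, bucketProperty, 0);
RecordIdentifier maxKey = new RecordIdentifier(2, bucketProperty, 2);
// the delete written with DummyRow(-1, 1, 2, bucket)
RecordIdentifier deleteKey = new RecordIdentifier(2, bucketProperty, 1);
// true, so this split keeps exactly one delete event when filtering is on
boolean loaded = minKey.compareTo(deleteKey) <= 0 && deleteKey.compareTo(maxKey) <= 0;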
Use of org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader.ColumnizedDeleteEventRegistry in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering2.
private void testDeleteEventFiltering2() throws Exception {
boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
boolean skipKeyIdx = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVETESTMODEACIDKEYIDXSKIP);
int bucket = 1;
AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket).writingBase(true).minimumWriteId(10000002).maximumWriteId(10000002).inspector(inspector).reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
int bucketProperty = BucketCodec.V1.encode(options);
// create data that looks like a compacted base that includes some data
// from 'original' files and some from a native Acid write
RecordUpdater updater = new OrcRecordUpdater(root, options);
updater.insert(0, new DummyRow(1, 0, 0, bucket));
updater.insert(0, new DummyRow(1, 1, 0, bucket));
updater.insert(0, new DummyRow(2, 2, 0, bucket));
updater.insert(10000001, new DummyRow(3, 0, 10000001, bucket));
updater.close(false);
// delete 3rd row
options.writingBase(false).minimumWriteId(10000004).maximumWriteId(10000004);
updater = new OrcRecordUpdater(root, options);
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 0, 0, bucket));
// hypothetically this matches something in (nonexistent here)
// delta_10000003_10000003
updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 5, 10000003, bucket));
updater.close(false);
conf.set(ValidTxnList.VALID_TXNS_KEY, new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
// HWM is not important - just make sure deltas created above are read as
// if committed
conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:10000005:" + Long.MAX_VALUE + "::");
List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
assertEquals(1, splitStrategies.size());
List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
assertEquals(1, splits.size());
assertEquals(root.toUri().toString() + File.separator + "base_10000002/bucket_00001", splits.get(0).getPath().toUri().toString());
assertFalse(splits.get(0).isOriginal());
VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx());
ColumnizedDeleteEventRegistry deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
assertEquals("number of delete events for stripe 1", filterOn ? 1 : 2, deleteEventRegistry.size());
OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
SearchArgument sarg = vectorizedReader.getDeleteEventSarg();
if (filterOn) {
if (skipKeyIdx) {
// If the key index is not present, the min/max key interval uses stripe stats instead
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(10000001, bucketProperty, 2)), keyInterval);
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(10000001, bucketProperty, 0)), keyInterval);
}
// The key point is that leaf-5 is (rowId <= 2) even though maxKey has
// rowId 0. See VectorizedOrcAcidRowBatchReader.findMinMaxKeys for details.
assertEquals("leaf-0 = (LESS_THAN originalTransaction 0)," + " leaf-1 = (LESS_THAN bucket 536936448)," + " leaf-2 = (LESS_THAN rowId 0)," + " leaf-3 = (LESS_THAN_EQUALS originalTransaction 10000001)," + " leaf-4 = (LESS_THAN_EQUALS bucket 536936448)," + " leaf-5 = (LESS_THAN_EQUALS rowId 2)," + " expr = (and (not leaf-0) (not leaf-1) " + "(not leaf-2) leaf-3 leaf-4 leaf-5)", ((SearchArgumentImpl) sarg).toOldString());
} else {
assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
assertNull(sarg);
}
}
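The bucket value 536936448 in the SearchArgument string above is the encoded bucketProperty for bucket 1. A rough sketch of where that number comes from, assuming BucketCodec.V1 stores the codec version in the top bits and the bucket id starting at bit 16 (the exact layout is defined by BucketCodec, not reproduced here):
int version = 1;                                   // BucketCodec.V1
int bucketId = 1;                                  // bucket used in this test
int encoded = (version << 29) | (bucketId << 16);  // 536870912 + 65536 = 536936448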