Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestOrcRecordUpdater, method testConcurrentParseKeyIndex.
/*
 * CharsetDecoder instances are not thread safe, so a shared decoder can end up in an
 * inconsistent state when multiple buffers are read in parallel.
 * E.g.:
 * java.lang.IllegalStateException: Current state = FLUSHED, new state = CODING_END
 */
@Test
public void testConcurrentParseKeyIndex() throws Exception {
  // Given
  Reader mockReader = mock(Reader.class);
  when(mockReader.hasMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME)).thenReturn(true);
  // Create a large buffer
  final StringBuilder sb = new StringBuilder();
  for (int i = 0; i < 3000; i++) {
    sb.append("100000,200000,300000;");
  }
  when(mockReader.getMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME))
      .thenReturn(ByteBuffer.wrap(sb.toString().getBytes()));
  // When
  // Hit OrcRecordUpdater.parseKeyIndex with large parallelism
  final int parallelism = 4000;
  Callable<RecordIdentifier[]>[] r = new Callable[parallelism];
  for (int i = 0; i < parallelism; i++) {
    r[i] = () -> OrcRecordUpdater.parseKeyIndex(mockReader);
  }
  ExecutorService executorService = Executors.newFixedThreadPool(parallelism);
  List<Future<RecordIdentifier[]>> res = executorService.invokeAll(Arrays.asList(r));
  // Check for exceptions
  for (Future<RecordIdentifier[]> ri : res) {
    ri.get();
  }
}
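The test above only asserts that concurrent parseKeyIndex calls complete without throwing. Below is a minimal, self-contained sketch of the underlying hazard, assuming nothing about OrcRecordUpdater's internals: the class and method names (KeyIndexDecoding, decodeShared, decodePerCall) are hypothetical, and the per-call decoder is one standard way to avoid the shared-state IllegalStateException.

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;

public class KeyIndexDecoding {

  // Unsafe pattern: a decoder shared by all threads keeps internal coding state,
  // so concurrent decode() calls can collide and throw IllegalStateException
  // (e.g. "Current state = FLUSHED, new state = CODING_END").
  private static final CharsetDecoder SHARED = StandardCharsets.UTF_8.newDecoder();

  static String decodeShared(ByteBuffer buf) throws CharacterCodingException {
    return SHARED.decode(buf.duplicate()).toString();   // may fail when called concurrently
  }

  // Safe pattern: create a fresh decoder per call (Charset.decode is likewise safe to call concurrently).
  static String decodePerCall(ByteBuffer buf) throws CharacterCodingException {
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
    CharBuffer chars = decoder.decode(buf.duplicate()); // duplicate() leaves the caller's position untouched
    return chars.toString();
  }
}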
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering.
/**
 * Tests that we can figure out min/max ROW__ID for each split and then use
 * that to only load delete events between min/max.
 * This test doesn't actually check what is read - that is done in more
 * end-to-end unit tests.
 * @throws Exception
 */
private void testDeleteEventFiltering() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  int bucket = 0;
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
      .filesystem(fs)
      .bucket(bucket)
      .writingBase(false)
      .minimumWriteId(1)
      .maximumWriteId(1)
      .inspector(inspector)
      .reporter(Reporter.NULL)
      .recordIdColumn(1)
      .finalDestination(root);
  int bucketProperty = BucketCodec.V1.encode(options);
  // create 3 insert deltas so that we have 3 splits
  RecordUpdater updater = new OrcRecordUpdater(root, options);
  // In the first delta add 2000 recs to simulate recs in multiple stripes.
  int numRows = 2000;
  for (int i = 1; i <= numRows; i++) {
    updater.insert(options.getMinimumWriteId(),
        new DummyRow(i, i - 1, options.getMinimumWriteId(), bucket));
  }
  updater.close(false);
  options.minimumWriteId(2).maximumWriteId(2);
  updater = new OrcRecordUpdater(root, options);
  updater.insert(options.getMinimumWriteId(), new DummyRow(4, 0, options.getMinimumWriteId(), bucket));
  updater.insert(options.getMinimumWriteId(), new DummyRow(5, 1, options.getMinimumWriteId(), bucket));
  updater.insert(options.getMinimumWriteId(), new DummyRow(6, 2, options.getMinimumWriteId(), bucket));
  updater.close(false);
  options.minimumWriteId(3).maximumWriteId(3);
  updater = new OrcRecordUpdater(root, options);
  updater.insert(options.getMinimumWriteId(), new DummyRow(7, 0, options.getMinimumWriteId(), bucket));
  updater.insert(options.getMinimumWriteId(), new DummyRow(8, 1, options.getMinimumWriteId(), bucket));
  updater.insert(options.getMinimumWriteId(), new DummyRow(9, 2, options.getMinimumWriteId(), bucket));
  updater.close(false);
  // delete 1 row from each of the insert deltas
  options.minimumWriteId(4).maximumWriteId(4);
  updater = new OrcRecordUpdater(root, options);
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 0, 1, bucket));
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 1, 2, bucket));
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 2, 3, bucket));
  updater.close(false);
  conf.set(ValidTxnList.VALID_TXNS_KEY,
      new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
  // HWM is not important - just make sure deltas created above are read as
  // if committed
  conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:5:" + Long.MAX_VALUE + "::");
  // now we have 3 delete events total, but for each split we should only
  // load 1 into DeleteRegistry (if filtering is on)
  List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
  assertEquals(1, splitStrategies.size());
  List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(3, splits.size());
  assertEquals(root.toUri().toString() + File.separator + "delta_0000001_0000001_0000/bucket_00000",
      splits.get(0).getPath().toUri().toString());
  assertFalse(splits.get(0).isOriginal());
  assertEquals(root.toUri().toString() + File.separator + "delta_0000002_0000002_0000/bucket_00000",
      splits.get(1).getPath().toUri().toString());
  assertFalse(splits.get(1).isOriginal());
  assertEquals(root.toUri().toString() + File.separator + "delta_0000003_0000003_0000/bucket_00000",
      splits.get(2).getPath().toUri().toString());
  assertFalse(splits.get(2).isOriginal());
  VectorizedOrcAcidRowBatchReader vectorizedReader =
      new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx());
  ColumnizedDeleteEventRegistry deleteEventRegistry =
      (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
  assertEquals("number of delete events for stripe 1", filterOn ? 1 : 3, deleteEventRegistry.size());
  OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
  if (filterOn) {
    assertEquals(new OrcRawRecordMerger.KeyInterval(
        new RecordIdentifier(1, bucketProperty, 0),
        new RecordIdentifier(1, bucketProperty, numRows - 1)), keyInterval);
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
  }
  vectorizedReader =
      new VectorizedOrcAcidRowBatchReader(splits.get(1), conf, Reporter.NULL, new VectorizedRowBatchCtx());
  deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
  assertEquals("number of delete events for stripe 2", filterOn ? 1 : 3, deleteEventRegistry.size());
  keyInterval = vectorizedReader.getKeyInterval();
  if (filterOn) {
    assertEquals(new OrcRawRecordMerger.KeyInterval(
        new RecordIdentifier(2, bucketProperty, 0),
        new RecordIdentifier(2, bucketProperty, 2)), keyInterval);
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
  }
  vectorizedReader =
      new VectorizedOrcAcidRowBatchReader(splits.get(2), conf, Reporter.NULL, new VectorizedRowBatchCtx());
  deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
  assertEquals("number of delete events for stripe 3", filterOn ? 1 : 3, deleteEventRegistry.size());
  keyInterval = vectorizedReader.getKeyInterval();
  if (filterOn) {
    assertEquals(new OrcRawRecordMerger.KeyInterval(
        new RecordIdentifier(3, bucketProperty, 0),
        new RecordIdentifier(3, bucketProperty, 2)), keyInterval);
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
  }
}
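The filterOn assertions above depend on RecordIdentifier ordering: a delete event is loaded only when its (originalWriteId, bucketProperty, rowId) key falls inside the split's min/max interval, which is why each split picks up exactly one of the three deletes. Below is a small sketch of that containment check, relying only on RecordIdentifier being Comparable; the helper class and the encoded bucket value are illustrative, not taken from the test.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class KeyIntervalCheck {

  /**
   * Returns true if the delete event key lies within [min, max].
   * A null bound means "unbounded", which is what KeyInterval(null, null)
   * represents in the test when filtering is off.
   */
  static boolean isInInterval(RecordIdentifier min, RecordIdentifier max, RecordIdentifier deleteKey) {
    boolean aboveMin = min == null || min.compareTo(deleteKey) <= 0;
    boolean belowMax = max == null || deleteKey.compareTo(max) <= 0;
    return aboveMin && belowMax;
  }

  public static void main(String[] args) {
    int bucketProperty = 536870912; // illustrative encoded bucket value; the real one comes from BucketCodec
    // Split 2's interval from the test: [{2, bucket, 0}, {2, bucket, 2}]
    RecordIdentifier min = new RecordIdentifier(2, bucketProperty, 0);
    RecordIdentifier max = new RecordIdentifier(2, bucketProperty, 2);
    // The delete written against write id 2 / rowId 1 falls inside the interval...
    System.out.println(isInInterval(min, max, new RecordIdentifier(2, bucketProperty, 1))); // true
    // ...while the deletes targeting write ids 1 and 3 do not, so only 1 of 3 events is loaded.
    System.out.println(isInInterval(min, max, new RecordIdentifier(1, bucketProperty, 0))); // false
    System.out.println(isInInterval(min, max, new RecordIdentifier(3, bucketProperty, 2))); // false
  }
}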
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventOriginalFiltering2.
private void testDeleteEventOriginalFiltering2() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
  // Need to use a bigger row than DummyRow for the writer to flush the stripes
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  Properties properties = new Properties();
  OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(properties, conf);
  writerOptions.inspector(bigOriginalRowInspector).stripeSize(1).batchSize(1);
  String originalFile = "000000_0";
  Path originalFilePath = new Path(root, originalFile);
  byte[] data = new byte[1000];
  Writer writer = OrcFile.createWriter(originalFilePath, writerOptions);
  writer.addRow(new BigOriginalRow(data));
  writer.addRow(new BigOriginalRow(data));
  writer.addRow(new BigOriginalRow(data));
  writer.close();
  Reader reader = OrcFile.createReader(originalFilePath, OrcFile.readerOptions(conf));
  List<StripeInformation> stripes = reader.getStripes();
  // Make sure 3 stripes are created
  assertEquals(3, stripes.size());
  FileStatus fileStatus = fs.getFileStatus(originalFilePath);
  long fileLength = fileStatus.getLen();
  // Set vector mode to true in the map work so that we can generate the syntheticProps
  MapWork mapWork = new MapWork();
  mapWork.setVectorMode(true);
  VectorizedRowBatchCtx vrbContext = new VectorizedRowBatchCtx();
  mapWork.setVectorizedRowBatchCtx(vrbContext);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
  Utilities.setMapWork(conf, mapWork);
  OrcSplit.OffsetAndBucketProperty syntheticProps =
      VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(fileStatus, root, true, true, conf);
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).bucket(0);
  int bucketProperty = BucketCodec.V1.encode(options);
  // 1. Splits within a stripe
  // A split that's completely within the 2nd stripe
  StripeInformation stripe = stripes.get(1);
  OrcSplit split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2),
      new RecordIdentifier(0, bucketProperty, 1), filterOn);
  // A split that's completely within the last stripe
  stripe = stripes.get(2);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 3),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
  // 2. Splits starting at a stripe boundary
  // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
  stripe = stripes.get(0);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset(), stripe.getLength() - 50,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for the 1st stripe
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0),
      new RecordIdentifier(0, bucketProperty, 0), filterOn);
  // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
  stripe = stripes.get(1);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset(), stripe.getLength() + 50,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
  // 3. Splits ending at a stripe boundary
  // A split that starts before the last stripe starts and ends at the last stripe boundary
  stripe = stripes.get(2);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset() - 50, stripe.getLength() + 50,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for the last stripe
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
  // A split that starts after the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset() + 50, reader.getContentLength() - 50,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
  // A split that starts where the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset(), reader.getContentLength(),
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for all 3 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
}
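All expected keys above have the form new RecordIdentifier(0, bucketProperty, n) because 'original' (pre-ACID) files carry no stored ROW__ID: the reader synthesizes one with originalTransaction 0, a bucket property derived from the file name, and a rowId equal to the row's position in the file. The sketch below derives the first synthetic rowId of each stripe from stripe row counts, which is how the 0/1/2 offsets asserted above line up with the three one-row stripes; it illustrates the idea and is not Hive's exact implementation (the StripeInformation import path may differ by version).

import java.util.List;
import org.apache.orc.StripeInformation;

public class SyntheticRowIds {

  /**
   * Returns, for each stripe, the synthetic rowId of its first row:
   * a running total of the row counts of all preceding stripes.
   * With the writer above (stripeSize(1), batchSize(1), 3 rows) this is [0, 1, 2].
   */
  static long[] firstRowIdPerStripe(List<StripeInformation> stripes) {
    long[] firstRowIds = new long[stripes.size()];
    long rowsSoFar = 0;
    for (int i = 0; i < stripes.size(); i++) {
      firstRowIds[i] = rowsSoFar;
      rowsSoFar += stripes.get(i).getNumberOfRows();
    }
    return firstRowIds;
  }
}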
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering2.
private void testDeleteEventFiltering2() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  boolean skipKeyIdx = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVETESTMODEACIDKEYIDXSKIP);
  int bucket = 1;
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
      .filesystem(fs)
      .bucket(bucket)
      .writingBase(true)
      .minimumWriteId(10000002)
      .maximumWriteId(10000002)
      .inspector(inspector)
      .reporter(Reporter.NULL)
      .recordIdColumn(1)
      .finalDestination(root);
  int bucketProperty = BucketCodec.V1.encode(options);
  // create data that looks like a compacted base that includes some data
  // from 'original' files and some from native Acid write
  RecordUpdater updater = new OrcRecordUpdater(root, options);
  updater.insert(0, new DummyRow(1, 0, 0, bucket));
  updater.insert(0, new DummyRow(1, 1, 0, bucket));
  updater.insert(0, new DummyRow(2, 2, 0, bucket));
  updater.insert(10000001, new DummyRow(3, 0, 10000001, bucket));
  updater.close(false);
  // delete 3rd row
  options.writingBase(false).minimumWriteId(10000004).maximumWriteId(10000004);
  updater = new OrcRecordUpdater(root, options);
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 0, 0, bucket));
  // hypothetically this matches something in (nonexistent here)
  // delta_10000003_10000003
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 5, 10000003, bucket));
  updater.close(false);
  conf.set(ValidTxnList.VALID_TXNS_KEY,
      new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
  // HWM is not important - just make sure deltas created above are read as
  // if committed
  conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:10000005:" + Long.MAX_VALUE + "::");
  List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
  assertEquals(1, splitStrategies.size());
  List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(1, splits.size());
  assertEquals(root.toUri().toString() + File.separator + "base_10000002/bucket_00001",
      splits.get(0).getPath().toUri().toString());
  assertFalse(splits.get(0).isOriginal());
  VectorizedOrcAcidRowBatchReader vectorizedReader =
      new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx());
  ColumnizedDeleteEventRegistry deleteEventRegistry =
      (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
  assertEquals("number of delete events for stripe 1", filterOn ? 1 : 2, deleteEventRegistry.size());
  OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
  SearchArgument sarg = vectorizedReader.getDeleteEventSarg();
  if (filterOn) {
    if (skipKeyIdx) {
      // If the key index is not present, the min/max key interval is derived from stripe stats instead
      assertEquals(new OrcRawRecordMerger.KeyInterval(
          new RecordIdentifier(0, bucketProperty, 0),
          new RecordIdentifier(10000001, bucketProperty, 2)), keyInterval);
    } else {
      assertEquals(new OrcRawRecordMerger.KeyInterval(
          new RecordIdentifier(0, bucketProperty, 0),
          new RecordIdentifier(10000001, bucketProperty, 0)), keyInterval);
    }
    // The key point is that leaf-5 is (rowId <= 2) even though maxKey has
    // rowId 0; more in VectorizedOrcAcidRowBatchReader.findMinMaxKeys
    assertEquals("leaf-0 = (LESS_THAN originalTransaction 0),"
        + " leaf-1 = (LESS_THAN bucket 536936448),"
        + " leaf-2 = (LESS_THAN rowId 0),"
        + " leaf-3 = (LESS_THAN_EQUALS originalTransaction 10000001),"
        + " leaf-4 = (LESS_THAN_EQUALS bucket 536936448),"
        + " leaf-5 = (LESS_THAN_EQUALS rowId 2),"
        + " expr = (and (not leaf-0) (not leaf-1) "
        + "(not leaf-2) leaf-3 leaf-4 leaf-5)",
        ((SearchArgumentImpl) sarg).toOldString());
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
    assertNull(sarg);
  }
}
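The asserted string is the toOldString() rendering of a SearchArgument that bounds delete events component-wise by the min/max key: NOT(x < min) for each lower-bound component and x <= max for each upper-bound component, AND-ed together. The sketch below builds a predicate of the same shape with the public SearchArgument builder, reusing the literals from the assertion (536936448 is the BucketCodec V1 encoding of bucket 1 as shown in the expected string); it is an illustration, not the internal code path of VectorizedOrcAcidRowBatchReader, and the choice of PredicateLeaf.Type.LONG for every column is an assumption.

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

public class DeleteEventSargSketch {

  /**
   * Builds "minKey <= (originalTransaction, bucket, rowId) <= maxKey" in the same
   * component-wise form as the asserted expression.
   */
  static SearchArgument boundedByKeys() {
    return SearchArgumentFactory.newBuilder()
        .startAnd()
          .startNot().lessThan("originalTransaction", PredicateLeaf.Type.LONG, 0L).end()
          .startNot().lessThan("bucket", PredicateLeaf.Type.LONG, 536936448L).end()
          .startNot().lessThan("rowId", PredicateLeaf.Type.LONG, 0L).end()
          .lessThanEquals("originalTransaction", PredicateLeaf.Type.LONG, 10000001L)
          .lessThanEquals("bucket", PredicateLeaf.Type.LONG, 536936448L)
          .lessThanEquals("rowId", PredicateLeaf.Type.LONG, 2L)
        .end()
        .build();
  }
}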
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestOrcRawRecordMerger, method testEmpty.
@Test
public void testEmpty() throws Exception {
  final int BUCKET = 0;
  Configuration conf = new Configuration();
  OrcOutputFormat of = new OrcOutputFormat();
  FileSystem fs = FileSystem.getLocal(conf);
  Path root = new Path(tmpDir, "testEmpty").makeQualified(fs);
  fs.delete(root, true);
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  // write the empty base
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
      .inspector(inspector)
      .bucket(BUCKET)
      .writingBase(true)
      .maximumWriteId(100)
      .finalDestination(root);
  of.getRecordUpdater(root, options).close(false);
  conf.set(ValidTxnList.VALID_TXNS_KEY,
      new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
  ValidWriteIdList writeIdList = new ValidReaderWriteIdList("testEmpty:200:" + Long.MAX_VALUE);
  AcidDirectory directory = AcidUtils.getAcidState(fs, root, conf, writeIdList, null, false);
  Path basePath = AcidUtils.createBucketFile(directory.getBaseDirectory(), BUCKET);
  Reader baseReader = OrcFile.createReader(basePath, OrcFile.readerOptions(conf));
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
  HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
  OrcRawRecordMerger merger = new OrcRawRecordMerger(conf, true, baseReader, false, BUCKET,
      createMaximalTxnList(), new Reader.Options(),
      AcidUtils.getPaths(directory.getCurrentDirectories()),
      new OrcRawRecordMerger.Options().isCompacting(false));
  RecordIdentifier key = merger.createKey();
  OrcStruct value = merger.createValue();
  assertEquals(false, merger.next(key, value));
}
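testEmpty only verifies that the very first next() returns false. For a non-empty directory the same reused key/value pair is passed to next() in a loop; a typical drain loop over the merger, written as it would appear inside this test class (so the package-private ORC types are accessible), is sketched below. The helper name countEvents is made up for illustration.

import java.io.IOException;

public class MergerReadLoopSketch {

  /** Drains the merger, returning the number of events it produced. */
  static long countEvents(OrcRawRecordMerger merger) throws IOException {
    RecordIdentifier key = merger.createKey();   // reused for every call, as in testEmpty
    OrcStruct value = merger.createValue();
    long count = 0;
    while (merger.next(key, value)) {            // false immediately for the empty base above
      count++;
    }
    merger.close();
    return count;
  }
}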