Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering3.
private void testDeleteEventFiltering3() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  boolean columnStatsPresent = OrcConf.ROW_INDEX_STRIDE.getLong(conf) != 0;
  // To create small stripes
  OrcConf.STRIPE_SIZE.setLong(conf, 1);
  // Need to use a bigger row than DummyRow for the writer to flush the stripes
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  // Use OrcRecordUpdater.OrcOptions to set the batch size.
  OrcRecordUpdater.OrcOptions orcOptions = new OrcRecordUpdater.OrcOptions(conf);
  orcOptions.orcOptions(OrcFile.writerOptions(conf).batchSize(1));
  int bucket = 1;
  AcidOutputFormat.Options options = orcOptions.filesystem(fs).bucket(bucket).writingBase(true)
      .minimumWriteId(10000002).maximumWriteId(10000002).inspector(bigRowInspector)
      .reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
  int bucketProperty = BucketCodec.V1.encode(options);
  // Create 3 stripes with 1 row each
  byte[] data = new byte[1000];
  RecordUpdater updater = new OrcRecordUpdater(root, options);
  updater.insert(10000002, new BigRow(data, 0, 0, bucket));
  updater.insert(10000002, new BigRow(data, 1, 0, bucket));
  updater.insert(10000002, new BigRow(data, 2, 0, bucket));
  updater.close(false);
  String acidFile = "base_10000002/bucket_00001";
  Path acidFilePath = new Path(root, acidFile);
  Reader reader = OrcFile.createReader(acidFilePath, OrcFile.readerOptions(conf));
  List<StripeInformation> stripes = reader.getStripes();
  // Make sure 3 stripes are created
  assertEquals(3, stripes.size());
  long fileLength = fs.getFileStatus(acidFilePath).getLen();
  // 1. Splits within a stripe
  // A split that's completely within the 2nd stripe
  StripeInformation stripe = stripes.get(1);
  OrcSplit split = new OrcSplit(acidFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100,
      new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1),
      fileLength, fileLength, root, null);
  validateKeyInterval(split, new RecordIdentifier(1, 1, 1), new RecordIdentifier(0, 0, 0), filterOn);
  // A split that's completely within the last stripe
  stripe = stripes.get(2);
  split = new OrcSplit(acidFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100,
      new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1),
      fileLength, fileLength, root, null);
  validateKeyInterval(split, new RecordIdentifier(1, 1, 1), new RecordIdentifier(0, 0, 0), filterOn);
  // 2. Splits starting at a stripe boundary
  // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
  stripe = stripes.get(0);
  split = new OrcSplit(acidFilePath, null, stripe.getOffset(), stripe.getLength() - 50,
      new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1),
      fileLength, fileLength, root, null);
  // The key interval for the 1st stripe
  if (columnStatsPresent) {
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 0),
        new RecordIdentifier(10000002, bucketProperty, 0), filterOn);
  } else {
    validateKeyInterval(split, null, new RecordIdentifier(10000002, bucketProperty, 0), filterOn);
  }
  // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
  stripe = stripes.get(1);
  split = new OrcSplit(acidFilePath, null, stripe.getOffset(), stripe.getLength() + 50,
      new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1),
      fileLength, fileLength, root, null);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 1),
      new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  // 3. Splits ending at a stripe boundary
  // A split that starts before the last stripe starts and ends at the last stripe boundary
  stripe = stripes.get(2);
  split = new OrcSplit(acidFilePath, null, stripe.getOffset() - 50, stripe.getLength() + 50,
      new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1),
      fileLength, fileLength, root, null);
  // The key interval for the last stripe
  validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 2),
      new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  // A split that starts after the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(acidFilePath, null, stripes.get(0).getOffset() + 50, reader.getContentLength() - 50,
      new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1),
      fileLength, fileLength, root, null);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 1),
      new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  // A split that starts where the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(acidFilePath, null, stripes.get(0).getOffset(), reader.getContentLength(),
      new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1),
      fileLength, fileLength, root, null);
  // The key interval for all 3 stripes
  if (columnStatsPresent) {
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 0),
        new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  } else {
    validateKeyInterval(split, null, new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  }
}
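
The assertions above boil down to an ordering check on RecordIdentifier keys. Below is a minimal standalone sketch (not part of the Hive test) of that check, assuming RecordIdentifier orders by (write id, bucket property, row id) via its compareTo; the class name and the bucketProperty value are illustrative placeholders, not a real BucketCodec encoding.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class KeyIntervalSketch {
  // Returns true when key lies inside [min, max]; a null min acts as an open
  // lower bound, mirroring the columnStatsPresent == false branch above.
  static boolean inInterval(RecordIdentifier min, RecordIdentifier max, RecordIdentifier key) {
    return (min == null || min.compareTo(key) <= 0) && max.compareTo(key) >= 0;
  }

  public static void main(String[] args) {
    int bucketProperty = 1;  // placeholder; the test derives it via BucketCodec.V1.encode(options)
    RecordIdentifier min = new RecordIdentifier(10000002, bucketProperty, 1);
    RecordIdentifier max = new RecordIdentifier(10000002, bucketProperty, 2);
    // A delete event for row 2 of the same write id and bucket falls inside the interval.
    System.out.println(inInterval(min, max, new RecordIdentifier(10000002, bucketProperty, 2)));
  }
}
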
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventOriginalFiltering.
public void testDeleteEventOriginalFiltering() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
  // Create 3 original files with 3 rows each
  Properties properties = new Properties();
  properties.setProperty("columns", DummyOriginalRow.getColumnNamesProperty());
  properties.setProperty("columns.types", DummyOriginalRow.getColumnTypesProperty());
  OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(properties, conf);
  writerOptions.inspector(originalInspector);
  Path testFilePath = new Path(root, "000000_0");
  Writer writer = OrcFile.createWriter(testFilePath, writerOptions);
  writer.addRow(new DummyOriginalRow(0));
  writer.addRow(new DummyOriginalRow(0));
  writer.addRow(new DummyOriginalRow(0));
  writer.close();
  testFilePath = new Path(root, "000000_0_copy_1");
  writer = OrcFile.createWriter(testFilePath, writerOptions);
  writer.addRow(new DummyOriginalRow(0));
  writer.addRow(new DummyOriginalRow(0));
  writer.addRow(new DummyOriginalRow(0));
  writer.close();
  testFilePath = new Path(root, "000000_0_copy_2");
  writer = OrcFile.createWriter(testFilePath, writerOptions);
  writer.addRow(new DummyOriginalRow(0));
  writer.addRow(new DummyOriginalRow(0));
  writer.addRow(new DummyOriginalRow(0));
  writer.close();
  conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
  conf.set(ValidTxnList.VALID_TXNS_KEY,
      new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
  int bucket = 0;
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket)
      .writingBase(false).minimumWriteId(1).maximumWriteId(1).inspector(inspector)
      .reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
  int bucketProperty = BucketCodec.V1.encode(options);
  RecordUpdater updater = new OrcRecordUpdater(root, options);
  // Delete 1 row from each of the original files.
  // Delete the last record in this split to test boundary conditions. It should not be present in the
  // delete event registry for the next split.
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 2, 0, bucket));
  // Delete the first record in this split to test boundary conditions. It should not be present in the
  // delete event registry for the previous split.
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 3, 0, bucket));
  updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 7, 0, bucket));
  updater.close(false);
  // HWM is not important - just make sure deltas created above are read as if committed
  conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:2:" + Long.MAX_VALUE + "::");
  // Set vector mode to true in the map work so that we recognize this as a vector mode execution during
  // the split generation. Without this we will not compute the offset for the synthetic row ids.
  MapWork mapWork = new MapWork();
  mapWork.setVectorMode(true);
  VectorizedRowBatchCtx vrbContext = new VectorizedRowBatchCtx();
  mapWork.setVectorizedRowBatchCtx(vrbContext);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
  Utilities.setMapWork(conf, mapWork);
  // Now we have 3 delete events in total, but for each split we should only
  // load 1 into the DeleteRegistry (if filtering is on).
  List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
  assertEquals(1, splitStrategies.size());
  List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
  assertEquals(3, splits.size());
  assertEquals(root.toUri().toString() + File.separator + "000000_0",
      splits.get(0).getPath().toUri().toString());
  assertTrue(splits.get(0).isOriginal());
  assertEquals(root.toUri().toString() + File.separator + "000000_0_copy_1",
      splits.get(1).getPath().toUri().toString());
  assertTrue(splits.get(1).isOriginal());
  assertEquals(root.toUri().toString() + File.separator + "000000_0_copy_2",
      splits.get(2).getPath().toUri().toString());
  assertTrue(splits.get(2).isOriginal());
  VectorizedOrcAcidRowBatchReader vectorizedReader =
      new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, vrbContext);
  ColumnizedDeleteEventRegistry deleteEventRegistry =
      (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
  assertEquals("number of delete events for stripe 1", filterOn ? 1 : 3, deleteEventRegistry.size());
  OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
  if (filterOn) {
    assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 0),
        new RecordIdentifier(0, bucketProperty, 2)), keyInterval);
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
  }
  vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(1), conf, Reporter.NULL, vrbContext);
  deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
  assertEquals("number of delete events for stripe 2", filterOn ? 1 : 3, deleteEventRegistry.size());
  keyInterval = vectorizedReader.getKeyInterval();
  if (filterOn) {
    assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 3),
        new RecordIdentifier(0, bucketProperty, 5)), keyInterval);
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
  }
  vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(2), conf, Reporter.NULL, vrbContext);
  deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
  assertEquals("number of delete events for stripe 3", filterOn ? 1 : 3, deleteEventRegistry.size());
  keyInterval = vectorizedReader.getKeyInterval();
  if (filterOn) {
    assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 6),
        new RecordIdentifier(0, bucketProperty, 8)), keyInterval);
  } else {
    assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
  }
}
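
For original (non-ACID) files the reader synthesizes row ids, so each split's expected key interval is plain arithmetic over file order: with 3 files of 3 rows each, split i covers synthetic rows 3*i through 3*i + 2 under write id 0. A minimal sketch of that arithmetic follows (not from the Hive sources); the bucketProperty value is a placeholder for the real BucketCodec encoding.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class SyntheticRowIdSketch {
  public static void main(String[] args) {
    int bucketProperty = 1;  // placeholder; the test derives it via BucketCodec.V1.encode(options)
    int rowsPerFile = 3;
    for (int split = 0; split < 3; split++) {
      long firstRow = (long) split * rowsPerFile;
      // Original files carry write id 0 in their synthetic RecordIdentifiers.
      RecordIdentifier min = new RecordIdentifier(0, bucketProperty, firstRow);
      RecordIdentifier max = new RecordIdentifier(0, bucketProperty, firstRow + rowsPerFile - 1);
      System.out.println("split " + split + ": [" + min + ", " + max + "]");
    }
  }
}
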
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class CompactorTest, method addFile.
private void addFile(Table t, Partition p, long minTxn, long maxTxn, int numRecords, FileType type,
    int numBuckets, boolean allBucketsPresent, long visibilityId) throws Exception {
  String partValue = (p == null) ? null : p.getValues().get(0);
  Path location = new Path(getLocation(t.getTableName(), partValue));
  String filename = null;
  switch (type) {
    case BASE:
      filename = AcidUtils.BASE_PREFIX + maxTxn
          + (visibilityId > 0 ? AcidUtils.VISIBILITY_PREFIX + visibilityId : "");
      break;
    // Fall through to delta
    case LENGTH_FILE:
    case DELTA:
      filename = makeDeltaDirName(minTxn, maxTxn);
      break;
    // handled below
    case LEGACY:
      break;
  }
  FileSystem fs = FileSystem.get(conf);
  for (int bucket = 0; bucket < numBuckets; bucket++) {
    // skip one
    if (bucket == 0 && !allBucketsPresent)
      continue;
    Path partFile = null;
    if (type == FileType.LEGACY) {
      partFile = new Path(location, String.format(AcidUtils.LEGACY_FILE_BUCKET_DIGITS, bucket) + "_0");
    } else {
      Path dir = new Path(location, filename);
      fs.mkdirs(dir);
      partFile = AcidUtils.createBucketFile(dir, bucket);
      if (type == FileType.LENGTH_FILE) {
        partFile = new Path(partFile.toString() + AcidUtils.DELTA_SIDE_FILE_SUFFIX);
      }
    }
    FSDataOutputStream out = fs.create(partFile);
    if (type == FileType.LENGTH_FILE) {
      // hmm - length files should store length in bytes...
      out.writeInt(numRecords);
    } else {
      for (int i = 0; i < numRecords; i++) {
        RecordIdentifier ri = new RecordIdentifier(maxTxn - 1, bucket, i);
        ri.write(out);
        out.writeBytes("mary had a little lamb its fleece was white as snow\n");
      }
    }
    out.close();
  }
}
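
The loop above serializes each RecordIdentifier straight into the bucket file with write(DataOutput). Here is a minimal round-trip sketch (illustrative class name, not from Hive), assuming RecordIdentifier follows the usual Hadoop Writable contract (no-arg constructor plus readFields) and compares by value.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class RecordIdentifierRoundTrip {
  public static void main(String[] args) throws Exception {
    RecordIdentifier written = new RecordIdentifier(41, 0, 7);
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (DataOutputStream out = new DataOutputStream(bytes)) {
      written.write(out);  // same call the test uses before appending the row text
    }
    RecordIdentifier read = new RecordIdentifier();  // assumes the Writable no-arg constructor
    try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      read.readFields(in);  // standard Writable counterpart of write()
    }
    System.out.println(written.equals(read));  // expected: true, assuming value-based equals
  }
}
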
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class OrcRawRecordMerger, method discoverOriginalKeyBounds.
/**
 * Find the key range for original bucket files.
 * @param reader the reader
 * @param bucket the bucket number we are reading
 * @param options the options for reading with
 * @throws IOException
 */
private void discoverOriginalKeyBounds(Reader reader, int bucket,
    Reader.Options options) throws IOException {
  long rowLength = 0;
  long rowOffset = 0;
  long offset = options.getOffset();
  long maxOffset = options.getMaxOffset();
  boolean isTail = true;
  for (StripeInformation stripe : reader.getStripes()) {
    if (offset > stripe.getOffset()) {
      rowOffset += stripe.getNumberOfRows();
    } else if (maxOffset > stripe.getOffset()) {
      rowLength += stripe.getNumberOfRows();
    } else {
      isTail = false;
      break;
    }
  }
  if (rowOffset > 0) {
    minKey = new RecordIdentifier(0, bucket, rowOffset - 1);
  }
  if (!isTail) {
    maxKey = new RecordIdentifier(0, bucket, rowOffset + rowLength - 1);
  }
}
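
The arithmetic behind minKey and maxKey is easier to see with plain numbers. Below is a standalone sketch with hypothetical stripe offsets and row counts (the real method reads both from StripeInformation); it shows why minKey uses rowOffset - 1 and maxKey uses rowOffset + rowLength - 1.

// A sketch of the same bookkeeping with hard-coded values, not Hive code.
public class OriginalKeyBoundsSketch {
  public static void main(String[] args) {
    long[] stripeOffsets = {0, 1000, 2000};  // hypothetical stripe start offsets
    long[] stripeRows = {100, 100, 100};     // hypothetical row counts per stripe
    long splitOffset = 1000, splitMaxOffset = 2000;  // a split covering only the 2nd stripe

    long rowOffset = 0, rowLength = 0;
    boolean isTail = true;
    for (int i = 0; i < stripeOffsets.length; i++) {
      if (splitOffset > stripeOffsets[i]) {
        rowOffset += stripeRows[i];    // rows that belong to earlier splits
      } else if (splitMaxOffset > stripeOffsets[i]) {
        rowLength += stripeRows[i];    // rows covered by this split
      } else {
        isTail = false;                // a later stripe exists, so the max key is bounded
        break;
      }
    }
    if (rowOffset > 0) {
      System.out.println("minKey rowId = " + (rowOffset - 1));  // last row of the previous split
    }
    if (!isTail) {
      System.out.println("maxKey rowId = " + (rowOffset + rowLength - 1));  // last row of this split
    }
    // Prints minKey rowId = 99 and maxKey rowId = 199 for the values above.
  }
}
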
Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.
The class TestRecordInspectorImpl, method testExtractRecordIdentifier.
@Test
public void testExtractRecordIdentifier() {
  RecordIdentifier recordIdentifier = new RecordIdentifier(10L, 4, 20L);
  MutableRecord record = new MutableRecord(1, "hello", recordIdentifier);
  assertThat(inspector.extractRecordIdentifier(record), is(recordIdentifier));
}
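
The is(recordIdentifier) matcher above relies on RecordIdentifier comparing by value. A minimal sketch of that assumption (illustrative class name), using only the constructor shown above plus equals and compareTo:

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class RecordIdentifierEquality {
  public static void main(String[] args) {
    RecordIdentifier a = new RecordIdentifier(10L, 4, 20L);
    RecordIdentifier b = new RecordIdentifier(10L, 4, 20L);
    // Two identifiers built from the same (writeId, bucket, rowId) triple should
    // compare equal, which is what the Hamcrest matcher in the test checks.
    System.out.println(a.equals(b) && a.compareTo(b) == 0);
  }
}
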