Use of org.apache.orc.StripeInformation in project hive by apache.
The class FixAcidKeyIndex, method validate.
public static AcidKeyIndexValidationResult validate(Configuration conf, Path inputPath) throws IOException {
  AcidKeyIndexValidationResult result = new AcidKeyIndexValidationResult();
  FileSystem fs = inputPath.getFileSystem(conf);
  try (Reader reader = OrcFile.createReader(fs, inputPath);
       RecordReader rr = reader.rows()) {
    List<StripeInformation> stripes = reader.getStripes();
    RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
    StructObjectInspector soi = (StructObjectInspector) reader.getObjectInspector();
    // struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint,row:struct<...>>
    List<? extends StructField> structFields = soi.getAllStructFieldRefs();
    StructField transactionField = structFields.get(1);
    LongObjectInspector transactionOI = (LongObjectInspector) transactionField.getFieldObjectInspector();
    StructField bucketField = structFields.get(2);
    IntObjectInspector bucketOI = (IntObjectInspector) bucketField.getFieldObjectInspector();
    StructField rowIdField = structFields.get(3);
    LongObjectInspector rowIdOI = (LongObjectInspector) rowIdField.getFieldObjectInspector();
    long rowsProcessed = 0;
    for (int i = 0; i < stripes.size(); i++) {
      // Read the last row of each stripe and compare it against the stored key index entry.
      rowsProcessed += stripes.get(i).getNumberOfRows();
      rr.seekToRow(rowsProcessed - 1);
      OrcStruct row = (OrcStruct) rr.next(null);
      long lastTransaction = transactionOI.get(soi.getStructFieldData(row, transactionField));
      int lastBucket = bucketOI.get(soi.getStructFieldData(row, bucketField));
      long lastRowId = rowIdOI.get(soi.getStructFieldData(row, rowIdField));
      RecordIdentifier recordIdentifier = new RecordIdentifier(lastTransaction, lastBucket, lastRowId);
      result.recordIdentifiers.add(recordIdentifier);
      if (stripes.size() != keyIndex.length || keyIndex[i] == null
          || recordIdentifier.compareTo(keyIndex[i]) != 0) {
        result.isValid = false;
      }
    }
  }
  return result;
}
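A minimal sketch of how this validator might be invoked; the file path is hypothetical, and the result fields (isValid, recordIdentifiers) are accessed the same way the snippet above accesses them:

Configuration conf = new Configuration();
// Hypothetical path to an ORC ACID bucket file whose hive.acid.key.index should be checked.
Path bucketFile = new Path("/warehouse/t/base_0000010/bucket_00000");
AcidKeyIndexValidationResult res = FixAcidKeyIndex.validate(conf, bucketFile);
if (!res.isValid) {
  System.err.println("acid key index does not match stripe data: " + res.recordIdentifiers);
}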
Use of org.apache.orc.StripeInformation in project hive by apache.
The class OrcFileFormatProxy, method applySargToMetadata.
@Override
public SplitInfos applySargToMetadata(SearchArgument sarg, ByteBuffer fileMetadata, Configuration conf) throws IOException {
  // TODO: ideally we should store a shortened representation of only the necessary fields
  // in HBase; it will probably require custom SARG application code.
  OrcTail orcTail = ReaderImpl.extractFileTail(fileMetadata);
  OrcProto.Footer footer = orcTail.getFooter();
  int stripeCount = footer.getStripesCount();
  // Always convert to PROLEPTIC_GREGORIAN
  List<StripeStatistics> stripeStats;
  try (org.apache.orc.Reader dummyReader = new org.apache.orc.impl.ReaderImpl(null,
      org.apache.orc.OrcFile.readerOptions(conf)
          .useUTCTimestamp(true)
          .convertToProlepticGregorian(true)
          .orcTail(orcTail))) {
    stripeStats = dummyReader.getVariantStripeStatistics(null);
  }
  boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg(
      sarg, orcTail.getWriterVersion(), footer.getTypesList(), stripeStats, stripeCount);
  // For the ORC case, send the boundaries of the stripes so we don't have to send the footer.
  SplitInfos.Builder sb = SplitInfos.newBuilder();
  List<StripeInformation> stripes = orcTail.getStripes();
  boolean isEliminated = true;
  // A null result means the SARG could not eliminate anything, so every stripe is kept.
  for (int i = 0; i < stripeCount; ++i) {
    if (result != null && !result[i]) {
      continue;
    }
    isEliminated = false;
    StripeInformation si = stripes.get(i);
    if (LOG.isDebugEnabled()) {
      LOG.debug("PPD is adding a split " + i + ": " + si.getOffset() + ", " + si.getLength());
    }
    sb.addInfos(SplitInfo.newBuilder().setIndex(i).setOffset(si.getOffset()).setLength(si.getLength()));
  }
  return isEliminated ? null : sb.build();
}
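A rough usage sketch; the SearchArgument builder shown is the standard Hive API, while proxy and serializedTail are assumptions standing in for an OrcFileFormatProxy instance and the cached, serialized ORC file tail:

// Assumed: an OrcFileFormatProxy instance and a ByteBuffer holding a serialized ORC file tail.
SearchArgument sarg = SearchArgumentFactory.newBuilder()
    .startAnd()
    .lessThan("id", PredicateLeaf.Type.LONG, 100L)
    .end()
    .build();
SplitInfos infos = proxy.applySargToMetadata(sarg, serializedTail, conf);
if (infos == null) {
  // Every stripe was eliminated by predicate pushdown; there is nothing to read.
}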
Use of org.apache.orc.StripeInformation in project hive by apache.
The class TestOrcFile, method testMemoryManagementV11.
@Test
public void testMemoryManagementV11() throws Exception {
  OrcConf.ROWS_BETWEEN_CHECKS.setLong(conf, 100);
  final long poolSize = 50_000;
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(InnerStruct.class,
        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  MemoryManager memoryManager = new MemoryManagerImpl(poolSize);
  // Set up 10 files that all request the full pool size (9 dummies plus the writer below).
  MemoryManager.Callback ignore = newScale -> false;
  for (int f = 0; f < 9; ++f) {
    memoryManager.addWriter(new Path("file-" + f), poolSize, ignore);
  }
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(inspector)
          .compress(CompressionKind.NONE)
          .stripeSize(50000)
          .bufferSize(100)
          .rowIndexStride(0)
          .memory(memoryManager)
          .batchSize(100)
          .version(OrcFile.Version.V_0_11));
  // 10 writers sharing a 50,000 byte pool leaves each with a 0.1 allocation scale.
  assertEquals(0.1, ((MemoryManagerImpl) memoryManager).getAllocationScale(), 0.000001);
  for (int i = 0; i < 2500; ++i) {
    writer.addRow(new InnerStruct(i * 300, Integer.toHexString(10 * i)));
  }
  writer.close();
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  int i = 0;
  for (StripeInformation stripe : reader.getStripes()) {
    i += 1;
    assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
        stripe.getDataLength() < 5000);
  }
  assertEquals(25, i);
  assertEquals(2500, reader.getNumberOfRows());
  reader.close();
}
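A minimal sketch (not part of the test) of the scale arithmetic the assertion relies on, assuming MemoryManagerImpl scales each writer by poolSize / totalRequested as the test expects:

MemoryManager mm = new MemoryManagerImpl(50_000);        // pool of 50,000 bytes
mm.addWriter(new Path("a"), 50_000, newScale -> false);  // 1 writer requesting 50,000 -> scale 1.0
mm.addWriter(new Path("b"), 50_000, newScale -> false);  // 2 writers requesting 100,000 -> scale 0.5
// With 10 writers each requesting the full pool: scale = 50,000 / 500,000 = 0.1,
// which is exactly what the assertEquals(0.1, ...) above checks.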
Use of org.apache.orc.StripeInformation in project hive by apache.
The class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering3.
private void testDeleteEventFiltering3() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  boolean columnStatsPresent = OrcConf.ROW_INDEX_STRIDE.getLong(conf) != 0;
  // To create small stripes
  OrcConf.STRIPE_SIZE.setLong(conf, 1);
  // Need to use a bigger row than DummyRow for the writer to flush the stripes
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  // Use OrcRecordUpdater.OrcOptions to set the batch size.
  OrcRecordUpdater.OrcOptions orcOptions = new OrcRecordUpdater.OrcOptions(conf);
  orcOptions.orcOptions(OrcFile.writerOptions(conf).batchSize(1));
  int bucket = 1;
  AcidOutputFormat.Options options = orcOptions.filesystem(fs)
      .bucket(bucket)
      .writingBase(true)
      .minimumWriteId(10000002)
      .maximumWriteId(10000002)
      .inspector(bigRowInspector)
      .reporter(Reporter.NULL)
      .recordIdColumn(1)
      .finalDestination(root);
  int bucketProperty = BucketCodec.V1.encode(options);
  // Create 3 stripes with 1 row each
  byte[] data = new byte[1000];
  RecordUpdater updater = new OrcRecordUpdater(root, options);
  updater.insert(10000002, new BigRow(data, 0, 0, bucket));
  updater.insert(10000002, new BigRow(data, 1, 0, bucket));
  updater.insert(10000002, new BigRow(data, 2, 0, bucket));
  updater.close(false);
  String acidFile = "base_10000002/bucket_00001";
  Path acidFilePath = new Path(root, acidFile);
  Reader reader = OrcFile.createReader(acidFilePath, OrcFile.readerOptions(conf));
  List<StripeInformation> stripes = reader.getStripes();
  // Make sure 3 stripes are created
  assertEquals(3, stripes.size());
  long fileLength = fs.getFileStatus(acidFilePath).getLen();

  // 1. Splits within a stripe
  // A split that's completely within the 2nd stripe
  StripeInformation stripe = stripes.get(1);
  OrcSplit split = new OrcSplit(acidFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100,
      new String[] { "localhost" }, null, false, true,
      getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
  validateKeyInterval(split, new RecordIdentifier(1, 1, 1), new RecordIdentifier(0, 0, 0), filterOn);
  // A split that's completely within the last stripe
  stripe = stripes.get(2);
  split = new OrcSplit(acidFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100,
      new String[] { "localhost" }, null, false, true,
      getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
  validateKeyInterval(split, new RecordIdentifier(1, 1, 1), new RecordIdentifier(0, 0, 0), filterOn);

  // 2. Splits starting at a stripe boundary
  // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
  stripe = stripes.get(0);
  split = new OrcSplit(acidFilePath, null, stripe.getOffset(), stripe.getLength() - 50,
      new String[] { "localhost" }, null, false, true,
      getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
  // The key interval for the 1st stripe
  if (columnStatsPresent) {
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 0),
        new RecordIdentifier(10000002, bucketProperty, 0), filterOn);
  } else {
    validateKeyInterval(split, null, new RecordIdentifier(10000002, bucketProperty, 0), filterOn);
  }
  // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
  stripe = stripes.get(1);
  split = new OrcSplit(acidFilePath, null, stripe.getOffset(), stripe.getLength() + 50,
      new String[] { "localhost" }, null, false, true,
      getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 1),
      new RecordIdentifier(10000002, bucketProperty, 2), filterOn);

  // 3. Splits ending at a stripe boundary
  // A split that starts before the last stripe starts and ends at the last stripe boundary
  stripe = stripes.get(2);
  split = new OrcSplit(acidFilePath, null, stripe.getOffset() - 50, stripe.getLength() + 50,
      new String[] { "localhost" }, null, false, true,
      getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
  // The key interval for the last stripe
  validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 2),
      new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  // A split that starts after the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(acidFilePath, null, stripes.get(0).getOffset() + 50, reader.getContentLength() - 50,
      new String[] { "localhost" }, null, false, true,
      getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 1),
      new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  // A split that starts where the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(acidFilePath, null, stripes.get(0).getOffset(), reader.getContentLength(),
      new String[] { "localhost" }, null, false, true,
      getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
  // The key interval for all 3 stripes
  if (columnStatsPresent) {
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 0),
        new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  } else {
    validateKeyInterval(split, null, new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
  }
}
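The key intervals asserted above follow the usual stripe-to-split assignment rule, namely that a stripe is processed by the split whose byte range contains the stripe's starting offset. The helper below is a hypothetical sketch of that rule, not part of the test:

// Returns the indices of stripes whose first byte falls inside [splitOffset, splitOffset + splitLength).
static List<Integer> coveredStripes(List<StripeInformation> stripes, long splitOffset, long splitLength) {
  List<Integer> covered = new ArrayList<>();
  for (int i = 0; i < stripes.size(); i++) {
    long start = stripes.get(i).getOffset();
    if (start >= splitOffset && start < splitOffset + splitLength) {
      covered.add(i);
    }
  }
  return covered;
}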
Use of org.apache.orc.StripeInformation in project hive by apache.
The class OrcRawRecordMerger, method discoverOriginalKeyBounds.
/**
* Find the key range for original bucket files.
* @param reader the reader
* @param bucket the bucket number we are reading
* @param options the options to read with
* @throws IOException
*/
private void discoverOriginalKeyBounds(Reader reader, int bucket, Reader.Options options) throws IOException {
  long rowLength = 0;
  long rowOffset = 0;
  long offset = options.getOffset();
  long maxOffset = options.getMaxOffset();
  boolean isTail = true;
  for (StripeInformation stripe : reader.getStripes()) {
    if (offset > stripe.getOffset()) {
      // Stripe starts before this split's offset: its rows precede our key range.
      rowOffset += stripe.getNumberOfRows();
    } else if (maxOffset > stripe.getOffset()) {
      // Stripe starts within [offset, maxOffset): its rows belong to our key range.
      rowLength += stripe.getNumberOfRows();
    } else {
      isTail = false;
      break;
    }
  }
  if (rowOffset > 0) {
    minKey = new RecordIdentifier(0, bucket, rowOffset - 1);
  }
  if (!isTail) {
    maxKey = new RecordIdentifier(0, bucket, rowOffset + rowLength - 1);
  }
}
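To make the arithmetic concrete, a small worked example with hypothetical stripe sizes (original files carry an implicit transaction id of 0, hence the 0 in both keys):

// Suppose an original bucket file has 3 stripes of 100, 200 and 300 rows,
// and the split's byte range [offset, maxOffset) starts at the 2nd stripe and ends before the 3rd:
//   stripe 0: offset > stripe.getOffset()    -> rowOffset = 100
//   stripe 1: maxOffset > stripe.getOffset() -> rowLength = 200
//   stripe 2: neither condition holds        -> isTail = false, loop breaks
// minKey = new RecordIdentifier(0, bucket, 99);   // last row before the split
// maxKey = new RecordIdentifier(0, bucket, 299);  // last row covered by the split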