Example 16 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

the class FixAcidKeyIndex method validate.

public static AcidKeyIndexValidationResult validate(Configuration conf, Path inputPath) throws IOException {
    AcidKeyIndexValidationResult result = new AcidKeyIndexValidationResult();
    FileSystem fs = inputPath.getFileSystem(conf);
    try (Reader reader = OrcFile.createReader(fs, inputPath);
        RecordReader rr = reader.rows()) {
        List<StripeInformation> stripes = reader.getStripes();
        RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
        StructObjectInspector soi = (StructObjectInspector) reader.getObjectInspector();
        // struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint>
        List<? extends StructField> structFields = soi.getAllStructFieldRefs();
        StructField transactionField = structFields.get(1);
        LongObjectInspector transactionOI = (LongObjectInspector) transactionField.getFieldObjectInspector();
        StructField bucketField = structFields.get(2);
        IntObjectInspector bucketOI = (IntObjectInspector) bucketField.getFieldObjectInspector();
        StructField rowIdField = structFields.get(3);
        LongObjectInspector rowIdOI = (LongObjectInspector) rowIdField.getFieldObjectInspector();
        long rowsProcessed = 0;
        for (int i = 0; i < stripes.size(); i++) {
            rowsProcessed += stripes.get(i).getNumberOfRows();
            rr.seekToRow(rowsProcessed - 1);
            OrcStruct row = (OrcStruct) rr.next(null);
            long lastTransaction = transactionOI.get(soi.getStructFieldData(row, transactionField));
            int lastBucket = bucketOI.get(soi.getStructFieldData(row, bucketField));
            long lastRowId = rowIdOI.get(soi.getStructFieldData(row, rowIdField));
            RecordIdentifier recordIdentifier = new RecordIdentifier(lastTransaction, lastBucket, lastRowId);
            result.recordIdentifiers.add(recordIdentifier);
            if (stripes.size() != keyIndex.length || keyIndex[i] == null || recordIdentifier.compareTo(keyIndex[i]) != 0) {
                result.isValid = false;
            }
        }
    }
    return result;
}
Also used : LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) FileSystem(org.apache.hadoop.fs.FileSystem) StripeInformation(org.apache.orc.StripeInformation) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
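
For orientation, here is a minimal, hypothetical driver for the validator above. It assumes it lives in the same class (so the result fields are directly accessible) and simply reports what validate() collected; it is a sketch, not the actual FixAcidKeyIndex command-line entry point.

// Hypothetical usage sketch only.
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path acidFile = new Path(args[0]);
    AcidKeyIndexValidationResult result = validate(conf, acidFile);
    // validate() records one RecordIdentifier per stripe while checking the key index.
    System.out.println("stripes inspected: " + result.recordIdentifiers.size());
    System.out.println("acid key index valid: " + result.isValid);
}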

Example 17 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

the class OrcFileFormatProxy method applySargToMetadata.

@Override
public SplitInfos applySargToMetadata(SearchArgument sarg, ByteBuffer fileMetadata, Configuration conf) throws IOException {
    // TODO: ideally we should store shortened representation of only the necessary fields
    // in HBase; it will probably require custom SARG application code.
    OrcTail orcTail = ReaderImpl.extractFileTail(fileMetadata);
    OrcProto.Footer footer = orcTail.getFooter();
    int stripeCount = footer.getStripesCount();
    // Always convert To PROLEPTIC_GREGORIAN
    List<StripeStatistics> stripeStats;
    try (org.apache.orc.Reader dummyReader = new org.apache.orc.impl.ReaderImpl(null, org.apache.orc.OrcFile.readerOptions(org.apache.orc.OrcFile.readerOptions(conf).getConfiguration()).useUTCTimestamp(true).convertToProlepticGregorian(true).orcTail(orcTail))) {
        stripeStats = dummyReader.getVariantStripeStatistics(null);
    }
    boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg(sarg, orcTail.getWriterVersion(), footer.getTypesList(), stripeStats, stripeCount);
    // For ORC case, send the boundaries of the stripes so we don't have to send the footer.
    SplitInfos.Builder sb = SplitInfos.newBuilder();
    List<StripeInformation> stripes = orcTail.getStripes();
    boolean isEliminated = true;
    for (int i = 0; i < stripes.size(); ++i) {
        // result == null means the SARG could not eliminate any stripe, so keep them all.
        if (result != null && !result[i])
            continue;
        isEliminated = false;
        StripeInformation si = stripes.get(i);
        if (LOG.isDebugEnabled()) {
            LOG.debug("PPD is adding a split " + i + ": " + si.getOffset() + ", " + si.getLength());
        }
        sb.addInfos(SplitInfo.newBuilder().setIndex(i).setOffset(si.getOffset()).setLength(si.getLength()));
    }
    return isEliminated ? null : sb.build();
}
Also used : OrcProto(org.apache.orc.OrcProto) StripeStatistics(org.apache.orc.StripeStatistics) SplitInfos(org.apache.hadoop.hive.metastore.Metastore.SplitInfos) StripeInformation(org.apache.orc.StripeInformation) OrcTail(org.apache.orc.impl.OrcTail)
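
A hedged sketch of how a caller might consume the result: a null return means the SARG eliminated every stripe, otherwise each entry carries a stripe index and its byte range. The caller method and variable names below are illustrative; the SplitInfo getters follow from the setIndex/setOffset/setLength builder calls above.

// Illustrative consumer of the SplitInfos produced by applySargToMetadata.
void printSelectedStripes(OrcFileFormatProxy proxy, SearchArgument sarg,
        ByteBuffer fileMetadata, Configuration conf) throws IOException {
    SplitInfos infos = proxy.applySargToMetadata(sarg, fileMetadata, conf);
    if (infos == null) {
        System.out.println("all stripes eliminated by the SARG");
        return;
    }
    for (SplitInfo info : infos.getInfosList()) {
        System.out.println("stripe " + info.getIndex() + " at offset "
            + info.getOffset() + ", length " + info.getLength());
    }
}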

Example 18 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

the class TestOrcFile method testMemoryManagementV11.

@Test
public void testMemoryManagementV11() throws Exception {
    OrcConf.ROWS_BETWEEN_CHECKS.setLong(conf, 100);
    final long poolSize = 50_000;
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    MemoryManager memoryManager = new MemoryManagerImpl(poolSize);
    // set up 10 files that all request the full size.
    MemoryManager.Callback ignore = newScale -> false;
    for (int f = 0; f < 9; ++f) {
        memoryManager.addWriter(new Path("file-" + f), poolSize, ignore);
    }
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE).stripeSize(50000).bufferSize(100).rowIndexStride(0).memory(memoryManager).batchSize(100).version(OrcFile.Version.V_0_11));
    assertEquals(0.1, ((MemoryManagerImpl) memoryManager).getAllocationScale());
    for (int i = 0; i < 2500; ++i) {
        writer.addRow(new InnerStruct(i * 300, Integer.toHexString(10 * i)));
    }
    writer.close();
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    int i = 0;
    for (StripeInformation stripe : reader.getStripes()) {
        i += 1;
        assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(), stripe.getDataLength() < 5000);
    }
    assertEquals(25, i);
    assertEquals(2500, reader.getNumberOfRows());
    reader.close();
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) ObjectInspectorFactory(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory) OrcConf(org.apache.orc.OrcConf) MemoryManager(org.apache.orc.MemoryManager) MemoryManagerImpl(org.apache.orc.impl.MemoryManagerImpl) StripeInformation(org.apache.orc.StripeInformation) Test(org.junit.Test)
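
The assertEquals(0.1, ...) check follows from the pool arithmetic: nine pre-registered writers plus the writer created by the test each request the full 50,000-byte pool, so the manager scales every allocation by 50,000 / (10 * 50,000) = 0.1. A tiny illustrative helper (not part of MemoryManagerImpl's API) makes the reasoning explicit:

// Illustrative arithmetic only: N writers each requesting the whole pool
// are scaled down to poolSize / (N * poolSize) = 1 / N of their request.
static double expectedAllocationScale(long poolSize, int writerCount) {
    return (double) poolSize / ((double) writerCount * poolSize);
}
// expectedAllocationScale(50_000, 10) returns 0.1, matching the assertion above.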

Example 19 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

the class TestVectorizedOrcAcidRowBatchReader method testDeleteEventFiltering3.

private void testDeleteEventFiltering3() throws Exception {
    boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
    boolean columnStatsPresent = OrcConf.ROW_INDEX_STRIDE.getLong(conf) != 0;
    // To create small stripes
    OrcConf.STRIPE_SIZE.setLong(conf, 1);
    // Need to use a bigger row than DummyRow for the writer to flush the stripes
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
    // Use OrcRecordUpdater.OrcOptions to set the batch size.
    OrcRecordUpdater.OrcOptions orcOptions = new OrcRecordUpdater.OrcOptions(conf);
    orcOptions.orcOptions(OrcFile.writerOptions(conf).batchSize(1));
    int bucket = 1;
    AcidOutputFormat.Options options = orcOptions.filesystem(fs).bucket(bucket).writingBase(true).minimumWriteId(10000002).maximumWriteId(10000002).inspector(bigRowInspector).reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
    int bucketProperty = BucketCodec.V1.encode(options);
    // Create 3 stripes with 1 row each
    byte[] data = new byte[1000];
    RecordUpdater updater = new OrcRecordUpdater(root, options);
    updater.insert(10000002, new BigRow(data, 0, 0, bucket));
    updater.insert(10000002, new BigRow(data, 1, 0, bucket));
    updater.insert(10000002, new BigRow(data, 2, 0, bucket));
    updater.close(false);
    String acidFile = "base_10000002/bucket_00001";
    Path acidFilePath = new Path(root, acidFile);
    Reader reader = OrcFile.createReader(acidFilePath, OrcFile.readerOptions(conf));
    List<StripeInformation> stripes = reader.getStripes();
    // Make sure 3 stripes are created
    assertEquals(3, stripes.size());
    long fileLength = fs.getFileStatus(acidFilePath).getLen();
    // 1. Splits within a stripe
    // A split that's completely within the 2nd stripe
    StripeInformation stripe = stripes.get(1);
    OrcSplit split = new OrcSplit(acidFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    validateKeyInterval(split, new RecordIdentifier(1, 1, 1), new RecordIdentifier(0, 0, 0), filterOn);
    // A split that's completely within the last stripe
    stripe = stripes.get(2);
    split = new OrcSplit(acidFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    validateKeyInterval(split, new RecordIdentifier(1, 1, 1), new RecordIdentifier(0, 0, 0), filterOn);
    // 2. Splits starting at a stripe boundary
    // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
    stripe = stripes.get(0);
    split = new OrcSplit(acidFilePath, null, stripe.getOffset(), stripe.getLength() - 50, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for the 1st stripe
    if (columnStatsPresent) {
        validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 0), new RecordIdentifier(10000002, bucketProperty, 0), filterOn);
    } else {
        validateKeyInterval(split, null, new RecordIdentifier(10000002, bucketProperty, 0), filterOn);
    }
    // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
    stripe = stripes.get(1);
    split = new OrcSplit(acidFilePath, null, stripe.getOffset(), stripe.getLength() + 50, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for the last 2 stripes
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 1), new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    // 3. Splits ending at a stripe boundary
    // A split that starts before the last stripe starts and ends at the last stripe boundary
    stripe = stripes.get(2);
    split = new OrcSplit(acidFilePath, null, stripe.getOffset() - 50, stripe.getLength() + 50, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for the last stripe
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 2), new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    // A split that starts after the 1st stripe starts and ends where the last stripe ends
    split = new OrcSplit(acidFilePath, null, stripes.get(0).getOffset() + 50, reader.getContentLength() - 50, new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for the last 2 stripes
    validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 1), new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    // A split that starts where the 1st stripe starts and ends where the last stripe ends
    split = new OrcSplit(acidFilePath, null, stripes.get(0).getOffset(), reader.getContentLength(), new String[] { "localhost" }, null, false, true, getDeltaMetaDataWithBucketFile(1), fileLength, fileLength, root, null);
    // The key interval for all 3 stripes
    if (columnStatsPresent) {
        validateKeyInterval(split, new RecordIdentifier(10000002, bucketProperty, 0), new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    } else {
        validateKeyInterval(split, null, new RecordIdentifier(10000002, bucketProperty, 2), filterOn);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) StripeInformation(org.apache.orc.StripeInformation)
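
The expected key intervals above follow from which stripes each split covers: a stripe is effectively part of a split when its start offset falls inside the split's byte range. The helper below is only a sketch of that selection rule for reasoning about the test cases; it is an assumption about the boundary convention, not the reader's implementation.

// Hypothetical helper: indices of stripes whose start offset lies in
// [start, start + length). Uses java.util.List and java.util.ArrayList.
static List<Integer> stripesCoveredBySplit(List<StripeInformation> stripes,
        long start, long length) {
    List<Integer> covered = new ArrayList<>();
    for (int i = 0; i < stripes.size(); i++) {
        long stripeStart = stripes.get(i).getOffset();
        if (stripeStart >= start && stripeStart < start + length) {
            covered.add(i);
        }
    }
    return covered;
}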

Example 20 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

the class OrcRawRecordMerger method discoverOriginalKeyBounds.

/**
   * Find the key range for original bucket files.
   * @param reader the reader
   * @param bucket the bucket number we are reading
   * @param options the options for reading with
   * @throws IOException
   */
private void discoverOriginalKeyBounds(Reader reader, int bucket, Reader.Options options) throws IOException {
    long rowLength = 0;
    long rowOffset = 0;
    long offset = options.getOffset();
    long maxOffset = options.getMaxOffset();
    boolean isTail = true;
    for (StripeInformation stripe : reader.getStripes()) {
        if (offset > stripe.getOffset()) {
            rowOffset += stripe.getNumberOfRows();
        } else if (maxOffset > stripe.getOffset()) {
            rowLength += stripe.getNumberOfRows();
        } else {
            isTail = false;
            break;
        }
    }
    if (rowOffset > 0) {
        minKey = new RecordIdentifier(0, bucket, rowOffset - 1);
    }
    if (!isTail) {
        maxKey = new RecordIdentifier(0, bucket, rowOffset + rowLength - 1);
    }
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) StripeInformation(org.apache.orc.StripeInformation)
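
A worked example: suppose the file has three stripes of 1,000 rows each and the split covers only the second stripe, so offset lies past the first stripe's start and maxOffset does not reach past the third stripe's start. The loop accumulates rowOffset = 1000 from the first stripe and rowLength = 1000 from the second, then hits the third stripe and breaks with isTail = false. The resulting bounds are minKey = (0, bucket, 999) and maxKey = (0, bucket, 1999), i.e. the original-file row ids of the last row before the split and the last row inside it.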

Aggregations

StripeInformation (org.apache.orc.StripeInformation) 30
Test (org.junit.Test) 10
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier) 9
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 8
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) 8
LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) 8
Path (org.apache.hadoop.fs.Path) 7
OrcProto (org.apache.orc.OrcProto) 7
ArrayList (java.util.ArrayList) 6
Random (java.util.Random) 6
OrcStripeMetadata (org.apache.hadoop.hive.llap.io.metadata.OrcStripeMetadata) 5
ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) 5
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) 5
BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) 5
BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector) 5
ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector) 5
DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) 5
FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector) 5
HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) 5
ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector) 5