Example 26 with StripeInformation

Use of org.apache.orc.StripeInformation in the project hive by apache.

From the class TestOrcFile, method testWithoutIndex.

/**
 * Read and write a randomly generated snappy file.
 * @throws Exception
 */
@Test
public void testWithoutIndex() throws Exception {
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(5000).compress(CompressionKind.SNAPPY).bufferSize(1000).rowIndexStride(0));
    Random rand = new Random(24);
    for (int i = 0; i < 10000; ++i) {
        InnerStruct row = new InnerStruct(rand.nextInt(), Integer.toBinaryString(rand.nextInt()));
        for (int j = 0; j < 5; ++j) {
            writer.addRow(row);
        }
    }
    writer.close();
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    assertEquals(50000, reader.getNumberOfRows());
    assertEquals(0, reader.getRowIndexStride());
    StripeInformation stripe = reader.getStripes().iterator().next();
    assertEquals(true, stripe.getDataLength() != 0);
    assertEquals(0, stripe.getIndexLength());
    RecordReader rows = reader.rows();
    rand = new Random(24);
    OrcStruct row = null;
    for (int i = 0; i < 10000; ++i) {
        int intVal = rand.nextInt();
        String strVal = Integer.toBinaryString(rand.nextInt());
        for (int j = 0; j < 5; ++j) {
            assertEquals(true, rows.hasNext());
            row = (OrcStruct) rows.next(row);
            assertEquals(intVal, ((IntWritable) row.getFieldValue(0)).get());
            assertEquals(strVal, row.getFieldValue(1).toString());
        }
    }
    assertEquals(false, rows.hasNext());
    rows.close();
}
Also used : HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) BooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector) ShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) FloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) ByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector) DoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) TimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector) Random(java.util.Random) StripeInformation(org.apache.orc.StripeInformation) Test(org.junit.Test)
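
The assertions above boil down to one property of StripeInformation: a file written with rowIndexStride(0) has stripes whose index section is empty. Below is a minimal standalone sketch of the same inspection against the Hive ORC reader API used above; the class and method names (StripeLayoutDump, printStripeLayout) are hypothetical, and the result is printed rather than asserted:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.orc.StripeInformation;

public class StripeLayoutDump {

    // Prints the physical layout of every stripe. For a file written with
    // rowIndexStride(0), as in the test above, getIndexLength() should be 0.
    static void printStripeLayout(Configuration conf, Path path) throws IOException {
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        System.out.println("rows=" + reader.getNumberOfRows() + " rowIndexStride=" + reader.getRowIndexStride());
        for (StripeInformation stripe : reader.getStripes()) {
            System.out.println("offset=" + stripe.getOffset() + " index=" + stripe.getIndexLength() + " data=" + stripe.getDataLength() + " footer=" + stripe.getFooterLength() + " rows=" + stripe.getNumberOfRows());
        }
        reader.close();
    }
}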

Example 27 with StripeInformation

Use of org.apache.orc.StripeInformation in the project hive by apache.

From the class TestOrcSerDeStats, method testSerdeStatsOldFormat.

@Test(expected = ClassCastException.class)
public void testSerdeStatsOldFormat() throws Exception {
    Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
    Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    int stripeCount = 0;
    int rowCount = 0;
    long currentOffset = -1;
    for (StripeInformation stripe : reader.getStripes()) {
        stripeCount += 1;
        rowCount += stripe.getNumberOfRows();
        if (currentOffset < 0) {
            currentOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
        } else {
            assertEquals(currentOffset, stripe.getOffset());
            currentOffset += stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
        }
    }
    assertEquals(reader.getNumberOfRows(), rowCount);
    assertEquals(6615000, reader.getRawDataSize());
    assertEquals(2, stripeCount);
    // check the stats
    ColumnStatistics[] stats = reader.getStatistics();
    assertEquals(7500, stats[1].getNumberOfValues());
    assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
    assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
    assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
    assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
    assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
    assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
    assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
    assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000", stats[3].toString());
    assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMaximum());
    assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMinimum());
    assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
    assertEquals("count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807", stats[5].toString());
    assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
    assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
    assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
    assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0", stats[7].toString());
    assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum());
    assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum());
    assertEquals(0, ((StringColumnStatistics) stats[9]).getSum());
    assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
    // old orc format will not have binary statistics. toString() will show only
    // the general column statistics
    assertEquals("count: 7500 hasNull: true", stats[8].toString());
    // since old orc format doesn't support binary statistics,
    // this should throw ClassCastException
    assertEquals(5, ((BinaryColumnStatistics) stats[8]).getSum());
    reader.close();
}
Also used : Path(org.apache.hadoop.fs.Path) DoubleColumnStatistics(org.apache.orc.DoubleColumnStatistics) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics) BooleanColumnStatistics(org.apache.orc.BooleanColumnStatistics) StringColumnStatistics(org.apache.orc.StringColumnStatistics) ColumnStatistics(org.apache.orc.ColumnStatistics) BinaryColumnStatistics(org.apache.orc.BinaryColumnStatistics) StripeInformation(org.apache.orc.StripeInformation) Test(org.junit.Test)
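
The loop at the start of this test verifies that stripes are physically contiguous: each stripe begins exactly where the previous one ended, at offset + index length + data length + footer length. A hypothetical extraction of that check into a reusable helper (not part of the Hive code base) could look like this:

import java.util.List;
import org.apache.orc.StripeInformation;

public class StripeContiguity {

    // Returns true if each stripe starts exactly where the previous one ended,
    // i.e. at previous offset + index length + data length + footer length.
    static boolean stripesAreContiguous(List<StripeInformation> stripes) {
        long expectedOffset = -1;
        for (StripeInformation stripe : stripes) {
            if (expectedOffset >= 0 && stripe.getOffset() != expectedOffset) {
                return false;
            }
            expectedOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
        }
        return true;
    }
}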

Example 28 with StripeInformation

Use of org.apache.orc.StripeInformation in the project hive by apache.

From the class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventOriginalFiltering2.

private void testDeleteEventOriginalFiltering2() throws Exception {
    boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
    // Need to use a bigger row than DummyRow for the writer to flush the stripes
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
    Properties properties = new Properties();
    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(properties, conf);
    writerOptions.inspector(bigOriginalRowInspector).stripeSize(1).batchSize(1);
    String originalFile = "000000_0";
    Path originalFilePath = new Path(root, originalFile);
    byte[] data = new byte[1000];
    Writer writer = OrcFile.createWriter(originalFilePath, writerOptions);
    writer.addRow(new BigOriginalRow(data));
    writer.addRow(new BigOriginalRow(data));
    writer.addRow(new BigOriginalRow(data));
    writer.close();
    Reader reader = OrcFile.createReader(originalFilePath, OrcFile.readerOptions(conf));
    List<StripeInformation> stripes = reader.getStripes();
    // Make sure 3 stripes are created
    assertEquals(3, stripes.size());
    FileStatus fileStatus = fs.getFileStatus(originalFilePath);
    long fileLength = fileStatus.getLen();
    // Set vector mode to true in the map work so that we can generate the syntheticProps
    MapWork mapWork = new MapWork();
    mapWork.setVectorMode(true);
    VectorizedRowBatchCtx vrbContext = new VectorizedRowBatchCtx();
    mapWork.setVectorizedRowBatchCtx(vrbContext);
    HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
    Utilities.setMapWork(conf, mapWork);
    OrcSplit.OffsetAndBucketProperty syntheticProps = VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(fileStatus, root, true, true, conf);
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).bucket(0);
    int bucketProperty = BucketCodec.V1.encode(options);
    // 1. Splits within a stripe
    // A split that's completely within the 2nd stripe
    StripeInformation stripe = stripes.get(1);
    OrcSplit split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2), new RecordIdentifier(0, bucketProperty, 1), filterOn);
    // A split that's completely within the last stripe
    stripe = stripes.get(2);
    split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 3), new RecordIdentifier(0, bucketProperty, 2), filterOn);
    // 2. Splits starting at a stripe boundary
    // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
    stripe = stripes.get(0);
    split = new OrcSplit(originalFilePath, null, stripe.getOffset(), stripe.getLength() - 50, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for the 1st stripe
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(0, bucketProperty, 0), filterOn);
    // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
    stripe = stripes.get(1);
    split = new OrcSplit(originalFilePath, null, stripe.getOffset(), stripe.getLength() + 50, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for the last 2 stripes
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1), new RecordIdentifier(0, bucketProperty, 2), filterOn);
    // 3. Splits ending at a stripe boundary
    // A split that starts before the last stripe starts and ends at the last stripe boundary
    stripe = stripes.get(2);
    split = new OrcSplit(originalFilePath, null, stripe.getOffset() - 50, stripe.getLength() + 50, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for the last stripe
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2), new RecordIdentifier(0, bucketProperty, 2), filterOn);
    // A split that starts after the 1st stripe starts and ends where the last stripe ends
    split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset() + 50, reader.getContentLength() - 50, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for the last 2 stripes
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1), new RecordIdentifier(0, bucketProperty, 2), filterOn);
    // A split that starts where the 1st stripe starts and ends where the last stripe ends
    split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset(), reader.getContentLength(), new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for all 3 stripes
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(0, bucketProperty, 2), filterOn);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Properties(java.util.Properties) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) StripeInformation(org.apache.orc.StripeInformation)
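
Because the writer is configured with stripeSize(1) and batchSize(1), each BigOriginalRow lands in its own stripe, so stripe i contributes synthetic row id i. The expected key intervals then follow from which stripes a split owns; the ORC reader treats a stripe as belonging to the split that contains the stripe's first byte. A small hypothetical helper illustrating that rule (the class and method names are assumptions, not Hive API):

import java.util.ArrayList;
import java.util.List;
import org.apache.orc.StripeInformation;

public class SplitStripeSelector {

    // Selects the stripes a split will read: a stripe belongs to the split
    // whose byte range contains the stripe's first byte (its offset).
    static List<StripeInformation> stripesForSplit(List<StripeInformation> stripes, long splitOffset, long splitLength) {
        List<StripeInformation> covered = new ArrayList<>();
        long splitEnd = splitOffset + splitLength;
        for (StripeInformation stripe : stripes) {
            long stripeStart = stripe.getOffset();
            if (stripeStart >= splitOffset && stripeStart < splitEnd) {
                covered.add(stripe);
            }
        }
        return covered;
    }
}

Under this rule, the split that lies strictly inside the 2nd stripe covers no stripe at all, which is why its expected key interval above is the inverted pair (row id 2, row id 1).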

Example 29 with StripeInformation

Use of org.apache.orc.StripeInformation in the project hive by apache.

From the class FixAcidKeyIndex, method recoverFile.

static void recoverFile(Configuration conf, Path inputPath, String backup) throws IOException {
    FileSystem fs = inputPath.getFileSystem(conf);
    Path recoveredPath = getRecoveryFile(inputPath);
    try (Reader reader = OrcFile.createReader(fs, inputPath)) {
        if (OrcInputFormat.isOriginal(reader)) {
            System.out.println(inputPath + " is not an acid file. No need to recover.");
            return;
        }
        AcidKeyIndexValidationResult validationResult = validate(conf, inputPath);
        if (validationResult.isValid) {
            System.out.println(inputPath + " has a valid acid key index. No need to recover.");
            return;
        }
        System.out.println("Recovering " + inputPath);
        // make sure that file does not exist
        try {
            fs.delete(recoveredPath, false);
        } catch (FileNotFoundException e) {
        // no problem, we're just making sure the file doesn't exist
        }
        // Writer should match the orc configuration from the original file
        OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf).compress(reader.getCompression()).version(reader.getFileVersion()).rowIndexStride(reader.getRowIndexStride()).inspector(reader.getObjectInspector());
        // compression buffer size should only be set if compression is enabled
        if (reader.getCompression() != org.apache.hadoop.hive.ql.io.orc.CompressionKind.NONE) {
            writerOptions.bufferSize(reader.getCompressionSize()).enforceBufferSize();
        }
        try (Writer writer = OrcFile.createWriter(recoveredPath, writerOptions)) {
            List<StripeInformation> stripes = reader.getStripes();
            List<StripeStatistics> stripeStats = reader.getOrcProtoStripeStatistics();
            try (FSDataInputStream inputStream = fs.open(inputPath)) {
                for (int idx = 0; idx < stripes.size(); ++idx) {
                    // initialize buffer to read the entire stripe.
                    StripeInformation stripe = stripes.get(idx);
                    int stripeLength = (int) stripe.getLength();
                    byte[] buffer = new byte[stripeLength];
                    inputStream.readFully(stripe.getOffset(), buffer, 0, stripeLength);
                    // append the stripe buffer to the new ORC file
                    writer.appendStripe(buffer, 0, buffer.length, stripe, stripeStats.get(idx));
                }
            }
            // Add the rest of the metadata keys.
            for (String metadataKey : reader.getMetadataKeys()) {
                if (!metadataKey.equals(OrcRecordUpdater.ACID_KEY_INDEX_NAME)) {
                    writer.addUserMetadata(metadataKey, reader.getMetadataValue(metadataKey));
                }
            }
            StringBuilder sb = new StringBuilder();
            validationResult.recordIdentifiers.stream().forEach(ri -> sb.append(ri.getWriteId()).append(",").append(ri.getBucketProperty()).append(",").append(ri.getRowId()).append(";"));
            // Finally add the fixed acid key index.
            writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME, UTF8.encode(sb.toString()));
        }
    }
    // Confirm the file is really fixed, and replace the old file.
    AcidKeyIndexValidationResult fileFixed = validate(conf, recoveredPath);
    if (fileFixed.isValid) {
        Path backupDataPath;
        String scheme = inputPath.toUri().getScheme();
        String authority = inputPath.toUri().getAuthority();
        String filePath = inputPath.toUri().getPath();
        // use the same filesystem as input file if backup-path is not explicitly specified
        if (backup.equals(DEFAULT_BACKUP_PATH)) {
            backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
        } else {
            backupDataPath = Path.mergePaths(new Path(backup), inputPath);
        }
        // Move data file to backup path
        moveFiles(fs, inputPath, backupDataPath);
        // finally move recovered file to actual file
        moveFiles(fs, recoveredPath, inputPath);
        System.out.println("Fixed acid key index for " + inputPath);
    } else {
        System.out.println("Unable to fix acid key index for " + inputPath);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileNotFoundException(java.io.FileNotFoundException) StripeStatistics(org.apache.orc.OrcProto.StripeStatistics) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) StripeInformation(org.apache.orc.StripeInformation)
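
The core idiom in recoverFile is raw stripe copying: StripeInformation supplies the exact byte range of each stripe, readFully pulls those bytes, and Writer.appendStripe writes them into the new file unchanged, together with the matching protobuf stripe statistics. Below is a stripped-down, hypothetical utility built on the same calls; it omits the compression buffer-size handling shown above and assumes every stripe fits in a single int-sized buffer:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.Writer;
import org.apache.orc.OrcProto;
import org.apache.orc.StripeInformation;

public class StripeCopy {

    // Copies every stripe of an ORC file verbatim into a new file without
    // decoding any rows, carrying the per-stripe statistics along.
    static void copyStripes(Configuration conf, Path src, Path dst) throws Exception {
        FileSystem fs = src.getFileSystem(conf);
        try (Reader reader = OrcFile.createReader(fs, src)) {
            OrcFile.WriterOptions opts = OrcFile.writerOptions(conf).compress(reader.getCompression()).version(reader.getFileVersion()).rowIndexStride(reader.getRowIndexStride()).inspector(reader.getObjectInspector());
            List<StripeInformation> stripes = reader.getStripes();
            List<OrcProto.StripeStatistics> stats = reader.getOrcProtoStripeStatistics();
            try (Writer writer = OrcFile.createWriter(dst, opts);
                 FSDataInputStream in = fs.open(src)) {
                for (int i = 0; i < stripes.size(); i++) {
                    StripeInformation stripe = stripes.get(i);
                    // Assumes the stripe length fits in an int-sized byte[] buffer.
                    byte[] buffer = new byte[(int) stripe.getLength()];
                    in.readFully(stripe.getOffset(), buffer, 0, buffer.length);
                    writer.appendStripe(buffer, 0, buffer.length, stripe, stats.get(i));
                }
            }
        }
    }
}

FixAcidKeyIndex uses the same calls but additionally carries over the user metadata keys and rewrites the ACID key index, as shown in the full method above.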

Example 30 with StripeInformation

Use of org.apache.orc.StripeInformation in the project hive by apache.

From the class TestCrudCompactorOnTez, method checkBloomFilterInAcidFile.

private void checkBloomFilterInAcidFile(FileSystem fs, Path bucketFilePath) throws IOException {
    Reader orcReader = OrcFile.createReader(bucketFilePath, OrcFile.readerOptions(fs.getConf()).filesystem(fs));
    StripeInformation stripe = orcReader.getStripes().get(0);
    try (RecordReaderImpl rows = (RecordReaderImpl) orcReader.rows()) {
        boolean bloomFilter = rows.readStripeFooter(stripe).getStreamsList().stream().anyMatch(s -> s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8 || s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER);
        Assert.assertTrue("Bloom filter is missing", bloomFilter);
    }
}
Also used : Reader(org.apache.orc.Reader) ProtoMessageReader(org.apache.tez.dag.history.logging.proto.ProtoMessageReader) RecordReader(org.apache.orc.RecordReader) StripeInformation(org.apache.orc.StripeInformation) RecordReaderImpl(org.apache.orc.impl.RecordReaderImpl)
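
readStripeFooter exposes the full stream list of a stripe footer, so the same mechanism can enumerate every stream in a file rather than only probing for bloom filters. A hypothetical sketch along those lines (the class and method names, and the choice of printed fields, are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;
import org.apache.orc.impl.RecordReaderImpl;

public class StreamKindDump {

    // Lists the column, kind, and length of every stream in every stripe footer.
    static void dumpStreamKinds(Configuration conf, Path orcFile) throws Exception {
        Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(conf));
        try (RecordReaderImpl rows = (RecordReaderImpl) reader.rows()) {
            for (StripeInformation stripe : reader.getStripes()) {
                OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                for (OrcProto.Stream stream : footer.getStreamsList()) {
                    System.out.println("column " + stream.getColumn() + " kind " + stream.getKind() + " length " + stream.getLength());
                }
            }
        }
    }
}

Note that, like the test above, this leans on RecordReaderImpl from org.apache.orc.impl, an internal class rather than the public Reader/RecordReader interfaces.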

Aggregations

StripeInformation (org.apache.orc.StripeInformation): 30
Test (org.junit.Test): 10
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 9
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 8
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector): 8
LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector): 8
Path (org.apache.hadoop.fs.Path): 7
OrcProto (org.apache.orc.OrcProto): 7
ArrayList (java.util.ArrayList): 6
Random (java.util.Random): 6
OrcStripeMetadata (org.apache.hadoop.hive.llap.io.metadata.OrcStripeMetadata): 5
ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector): 5
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector): 5
BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector): 5
BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector): 5
ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector): 5
DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector): 5
FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector): 5
HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector): 5
ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector): 5