Search in sources :

Example 6 with Reader

use of org.apache.orc.Reader in project flink by apache.

the class OrcBulkWriterTestUtil method validate.

public static void validate(File files, List<Record> expected) throws IOException {
    final File[] buckets = files.listFiles();
    assertNotNull(buckets);
    assertEquals(1, buckets.length);
    final File[] partFiles = buckets[0].listFiles();
    assertNotNull(partFiles);
    for (File partFile : partFiles) {
        assertTrue(partFile.length() > 0);
        OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration());
        Reader reader = OrcFile.createReader(new org.apache.hadoop.fs.Path(partFile.toURI()), readerOptions);
        assertEquals(3, reader.getNumberOfRows());
        assertEquals(2, reader.getSchema().getFieldNames().size());
        assertSame(CompressionKind.LZ4, reader.getCompressionKind());
        assertTrue(reader.hasMetadataValue(USER_METADATA_KEY));
        assertTrue(reader.getMetadataKeys().contains(USER_METADATA_KEY));
        List<Record> results = getResults(reader);
        assertEquals(3, results.size());
        assertEquals(expected, results);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) OrcFile(org.apache.orc.OrcFile) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) Record(org.apache.flink.orc.data.Record) File(java.io.File)
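
The getResults helper used by the test is not shown here. A hypothetical sketch of such a helper, assuming the file schema is struct<name:string,age:int> and that Record has a (String, int) constructor (both assumptions made only for illustration), could read the file batch by batch:

// Hypothetical helper, not part of the Flink test class above.
// Assumes an ORC schema of struct<name:string,age:int> matching a Record(String, int) constructor.
private static List<Record> getResults(Reader reader) throws IOException {
    final List<Record> results = new ArrayList<>();
    final TypeDescription schema = reader.getSchema();
    try (RecordReader rows = reader.rows()) {
        final VectorizedRowBatch batch = schema.createRowBatch();
        while (rows.nextBatch(batch)) {
            final BytesColumnVector names = (BytesColumnVector) batch.cols[0];
            final LongColumnVector ages = (LongColumnVector) batch.cols[1];
            for (int row = 0; row < batch.size; row++) {
                // Copy the string bytes out of the column vector for this row.
                final String name = new String(
                        names.vector[row], names.start[row], names.length[row], StandardCharsets.UTF_8);
                results.add(new Record(name, (int) ages.vector[row]));
            }
        }
    }
    return results;
}

Besides the classes listed above, such a sketch would also need java.nio.charset.StandardCharsets, java.util.ArrayList, java.util.List, and the column vector classes from org.apache.hadoop.hive.ql.exec.vector.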

Example 7 with Reader

use of org.apache.orc.Reader in project druid by druid-io.

the class OrcReader method intermediateRowIterator.

@Override
protected CloseableIterator<OrcStruct> intermediateRowIterator() throws IOException {
    final Closer closer = Closer.create();
    // We fetch here to cache a copy locally. However, this might need to change if we want to split an ORC file
    // into several InputSplits in the future.
    final byte[] buffer = new byte[InputEntity.DEFAULT_FETCH_BUFFER_SIZE];
    final CleanableFile file = closer.register(source.fetch(temporaryDirectory, buffer));
    final Path path = new Path(file.file().toURI());
    final ClassLoader currentClassLoader = Thread.currentThread().getContextClassLoader();
    final Reader reader;
    try {
        Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
        reader = closer.register(OrcFile.createReader(path, OrcFile.readerOptions(conf)));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassLoader);
    }
    // The line below gets the schema so that all columns are read.
    // This could be improved in the future by projecting only the columns users actually want.
    final TypeDescription schema = reader.getSchema();
    final RecordReader batchReader = reader.rows(reader.options());
    final OrcMapredRecordReader<OrcStruct> recordReader = new OrcMapredRecordReader<>(batchReader, schema);
    closer.register(recordReader::close);
    return new CloseableIterator<OrcStruct>() {

        final NullWritable key = recordReader.createKey();

        OrcStruct value = null;

        @Override
        public boolean hasNext() {
            if (value == null) {
                try {
                    // The returned OrcStruct in next() can be kept in memory for a while.
                    // Here, we create a new instance of OrcStruct before calling RecordReader.next(),
                    // so that we avoid sharing the same "value" reference across rows.
                    value = recordReader.createValue();
                    if (!recordReader.next(key, value)) {
                        value = null;
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return value != null;
        }

        @Override
        public OrcStruct next() {
            if (value == null) {
                throw new NoSuchElementException();
            }
            final OrcStruct currentValue = value;
            value = null;
            return currentValue;
        }

        @Override
        public void close() throws IOException {
            closer.close();
        }
    };
}
Also used : Closer(org.apache.druid.java.util.common.io.Closer) Path(org.apache.hadoop.fs.Path) CloseableIterator(org.apache.druid.java.util.common.parsers.CloseableIterator) RecordReader(org.apache.orc.RecordReader) OrcMapredRecordReader(org.apache.orc.mapred.OrcMapredRecordReader) Reader(org.apache.orc.Reader) IntermediateRowParsingReader(org.apache.druid.data.input.IntermediateRowParsingReader) IOException(java.io.IOException) NullWritable(org.apache.hadoop.io.NullWritable) OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription) CleanableFile(org.apache.druid.data.input.InputEntity.CleanableFile) NoSuchElementException(java.util.NoSuchElementException)
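
The same OrcStruct-based row access can be reproduced outside of Druid with just the ORC library. A minimal standalone sketch, using the same OrcMapredRecordReader pattern as above; the input path is hypothetical and error handling is kept minimal:

// Standalone sketch: read every row of an ORC file as OrcStruct via OrcMapredRecordReader.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.mapred.OrcMapredRecordReader;
import org.apache.orc.mapred.OrcStruct;

public class OrcStructDump {
    public static void main(String[] args) throws Exception {
        Path path = new Path("/tmp/example.orc"); // hypothetical input file
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
        OrcMapredRecordReader<OrcStruct> rows =
                new OrcMapredRecordReader<>(reader.rows(reader.options()), reader.getSchema());
        try {
            NullWritable key = rows.createKey();
            OrcStruct value = rows.createValue();
            while (rows.next(key, value)) {
                System.out.println(value); // prints the row's fields
            }
        } finally {
            rows.close();
        }
    }
}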

Example 8 with Reader

use of org.apache.orc.Reader in project hive by apache.

the class TestTxnCommands2 method testAcidOrcWritePreservesFieldNames.

@Test
public void testAcidOrcWritePreservesFieldNames() throws Exception {
    // with vectorization
    String tableName = "acidorcwritefieldnames";
    hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
    runStatementOnDriver("DROP TABLE IF EXISTS " + tableName);
    runStatementOnDriver("CREATE TABLE " + tableName + " (a INT, b STRING) CLUSTERED BY (a) INTO " + BUCKET_COUNT + " BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
    runStatementOnDriver("INSERT INTO " + tableName + " VALUES (1, 'foo'), (2, 'bar')");
    tableName = "acidorcwritefieldnames_complex";
    runStatementOnDriver("DROP TABLE IF EXISTS " + tableName);
    runStatementOnDriver("CREATE TABLE " + tableName + " (a INT, b STRING, s STRUCT<c:int, si:STRUCT<d:double," + "e:float>>) CLUSTERED BY (a) INTO " + BUCKET_COUNT + " BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
    runStatementOnDriver("INSERT INTO " + tableName + " select a, b, named_struct('c',10,'si'," + "named_struct('d',cast(1.0 as double),'e',cast(2.0 as float))) from acidorcwritefieldnames");
    FileSystem fs = FileSystem.get(hiveConf);
    FileStatus[] fileStatuses = fs.globStatus(new Path(getWarehouseDir() + "/" + tableName + "/" + AcidUtils.DELTA_PREFIX + "*/" + AcidUtils.BUCKET_PREFIX + "*"));
    Assert.assertEquals(BUCKET_COUNT, fileStatuses.length);
    OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(hiveConf);
    for (FileStatus fileStatus : fileStatuses) {
        Reader r = OrcFile.createReader(fileStatus.getPath(), readerOptions);
        TypeDescription rowSchema = r.getSchema().getChildren().get(5);
        Assert.assertEquals("struct<a:int,b:string,s:struct<c:int,si:struct<d:double,e:float>>>", rowSchema.toString());
    }
    // without vectorization
    tableName = "acidorcwritefieldnames";
    hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
    runStatementOnDriver("DROP TABLE IF EXISTS " + tableName);
    runStatementOnDriver("CREATE TABLE " + tableName + " (a INT, b STRING) CLUSTERED BY (a) INTO " + BUCKET_COUNT + " BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
    runStatementOnDriver("INSERT INTO " + tableName + " VALUES (1, 'foo'), (2, 'bar')");
    tableName = "acidorcwritefieldnames_complex";
    runStatementOnDriver("DROP TABLE IF EXISTS " + tableName);
    runStatementOnDriver("CREATE TABLE " + tableName + " (a INT, b STRING, s STRUCT<c:int, si:STRUCT<d:double," + "e:float>>) CLUSTERED BY (a) INTO " + BUCKET_COUNT + " BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
    runStatementOnDriver("INSERT INTO " + tableName + " select a, b, named_struct('c',10,'si'," + "named_struct('d',cast(1.0 as double),'e',cast(2.0 as float))) from acidorcwritefieldnames");
    fs = FileSystem.get(hiveConf);
    fileStatuses = fs.globStatus(new Path(getWarehouseDir() + "/" + tableName + "/" + AcidUtils.DELTA_PREFIX + "*/" + AcidUtils.BUCKET_PREFIX + "*"));
    Assert.assertEquals(BUCKET_COUNT, fileStatuses.length);
    readerOptions = OrcFile.readerOptions(hiveConf);
    for (FileStatus fileStatus : fileStatuses) {
        Reader r = OrcFile.createReader(fileStatus.getPath(), readerOptions);
        TypeDescription rowSchema = r.getSchema().getChildren().get(5);
        Assert.assertEquals("struct<a:int,b:string,s:struct<c:int,si:struct<d:double,e:float>>>", rowSchema.toString());
    }
    hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) OrcFile(org.apache.orc.OrcFile) FileSystem(org.apache.hadoop.fs.FileSystem) Reader(org.apache.orc.Reader) TypeDescription(org.apache.orc.TypeDescription) Test(org.junit.Test)
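
The index 5 used above corresponds to the position of the user row struct inside Hive's ACID wrapper schema (operation, originalTransaction, bucket, rowId, currentTransaction, row). A small sketch that dumps this layout for any ACID ORC delta file, with a hypothetical local path:

// Sketch: print the ACID wrapper columns and the nested user row schema of an ORC delta file.
// The path is illustrative; child index 5 is the "row" struct holding the user columns.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;

public class DumpAcidSchema {
    public static void main(String[] args) throws Exception {
        Reader reader = OrcFile.createReader(
                new Path("/tmp/delta_0000001_0000001/bucket_00000"), // hypothetical delta file
                OrcFile.readerOptions(new Configuration()));
        TypeDescription schema = reader.getSchema();
        for (int i = 0; i < schema.getFieldNames().size(); i++) {
            System.out.println(i + ": " + schema.getFieldNames().get(i)
                    + " -> " + schema.getChildren().get(i));
        }
    }
}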

Example 9 with Reader

use of org.apache.orc.Reader in project hive by apache.

the class TestCrudCompactorOnTez method checkBloomFilterInAcidFile.

private void checkBloomFilterInAcidFile(FileSystem fs, Path bucketFilePath) throws IOException {
    Reader orcReader = OrcFile.createReader(bucketFilePath, OrcFile.readerOptions(fs.getConf()).filesystem(fs));
    StripeInformation stripe = orcReader.getStripes().get(0);
    try (RecordReaderImpl rows = (RecordReaderImpl) orcReader.rows()) {
        boolean bloomFilter = rows.readStripeFooter(stripe).getStreamsList().stream().anyMatch(s -> s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8 || s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER);
        Assert.assertTrue("Bloom filter is missing", bloomFilter);
    }
}
Also used : Reader(org.apache.orc.Reader) ProtoMessageReader(org.apache.tez.dag.history.logging.proto.ProtoMessageReader) RecordReader(org.apache.orc.RecordReader) StripeInformation(org.apache.orc.StripeInformation) RecordReaderImpl(org.apache.orc.impl.RecordReaderImpl)
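
For context, the BLOOM_FILTER_UTF8 streams checked above are only present if the writer was configured to produce them. A hedged sketch of writing a small file with a bloom filter on one column; the output path, schema, and column name are illustrative:

// Sketch: write an ORC file with a bloom filter on column "a", so that a stripe-footer
// check like the one above would find BLOOM_FILTER_UTF8 streams.
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class WriteWithBloomFilter {
    public static void main(String[] args) throws Exception {
        TypeDescription schema = TypeDescription.fromString("struct<a:int,b:string>");
        Writer writer = OrcFile.createWriter(
                new Path("/tmp/bloom.orc"), // illustrative output path
                OrcFile.writerOptions(new Configuration())
                        .setSchema(schema)
                        .bloomFilterColumns("a")   // build bloom filters for column a
                        .bloomFilterFpp(0.05));    // target false-positive probability
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector a = (LongColumnVector) batch.cols[0];
        BytesColumnVector b = (BytesColumnVector) batch.cols[1];
        for (int i = 0; i < 10; i++) {
            int row = batch.size++;
            a.vector[row] = i;
            byte[] bytes = ("row-" + i).getBytes(StandardCharsets.UTF_8);
            b.setRef(row, bytes, 0, bytes.length);
        }
        writer.addRowBatch(batch);
        writer.close();
    }
}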

Example 10 with Reader

use of org.apache.orc.Reader in project hive by apache.

the class TestCrudCompactorOnTez method checkBucketIdAndRowIdInAcidFile.

/**
 * Read the bucket file and
 * 1. make sure that the bucket property in each row matches the file name.
 *    For example, if the bucketId is 0, check file bucket_00000 to make sure that the third
 *    column contains only the value 536870912.
 * 2. make sure that rowIds are in ascending order within each writeId.
 * @param fs file system
 * @param path where to look for the bucket file
 * @param bucketId bucket id to check, e.g. 0
 */
private void checkBucketIdAndRowIdInAcidFile(FileSystem fs, Path path, int bucketId) throws IOException {
    Path bucketFilePath = AcidUtils.createBucketFile(path, bucketId);
    Reader orcReader = OrcFile.createReader(bucketFilePath, OrcFile.readerOptions(fs.getConf()).filesystem(fs));
    TypeDescription schema = orcReader.getSchema();
    try (RecordReader rows = orcReader.rows()) {
        VectorizedRowBatch batch = schema.createRowBatch();
        rows.nextBatch(batch);
        // check that bucket property in each row matches the bucket in the file name
        long[] bucketIdVector = ((LongColumnVector) batch.cols[2]).vector;
        for (int i = 0; i < batch.count(); i++) {
            Assert.assertEquals(bucketId, decodeBucketProperty(bucketIdVector[i]));
        }
        // check that writeIds, then rowIds are sorted in ascending order
        long[] writeIdVector = ((LongColumnVector) batch.cols[1]).vector;
        long[] rowIdVector = ((LongColumnVector) batch.cols[3]).vector;
        long writeId = writeIdVector[0];
        long rowId = 0;
        for (int i = 0; i < batch.count(); i++) {
            long currentWriteId = writeIdVector[i];
            long currentRowId = rowIdVector[i];
            if (writeId == writeIdVector[i]) {
                Assert.assertTrue(rowId <= currentRowId);
                rowId = currentRowId;
            } else {
                Assert.assertTrue(writeId < currentWriteId);
                writeId = currentWriteId;
                rowId = 0;
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) ProtoMessageReader(org.apache.tez.dag.history.logging.proto.ProtoMessageReader) TypeDescription(org.apache.orc.TypeDescription) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
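
The decodeBucketProperty helper called above is not shown in this excerpt. A hypothetical sketch of what such decoding can look like using Hive's BucketCodec (the 536870912 mentioned in the javadoc, i.e. 0x20000000, is the encoded property for bucket 0); this mirrors, but is not necessarily identical to, the test's private helper:

// Hypothetical decoding helper. Hive packs the codec version, writer (bucket) id and
// statement id into a single int; BucketCodec knows the bit layout.
import org.apache.hadoop.hive.ql.io.BucketCodec;

final class BucketPropertyDecoder {
    static int decodeBucketProperty(long encoded) {
        int property = (int) encoded;
        return BucketCodec.determineVersion(property).decodeWriterId(property);
    }
}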

Aggregations

Reader (org.apache.orc.Reader) 10
RecordReader (org.apache.orc.RecordReader) 8
Path (org.apache.hadoop.fs.Path) 5
OrcFile (org.apache.orc.OrcFile) 4
File (java.io.File) 3
IOException (java.io.IOException) 3
Configuration (org.apache.hadoop.conf.Configuration) 3
TypeDescription (org.apache.orc.TypeDescription) 3
ProtoMessageReader (org.apache.tez.dag.history.logging.proto.ProtoMessageReader) 2
InvocationTargetException (java.lang.reflect.InvocationTargetException) 1
NoSuchElementException (java.util.NoSuchElementException) 1
CleanableFile (org.apache.druid.data.input.InputEntity.CleanableFile) 1
IntermediateRowParsingReader (org.apache.druid.data.input.IntermediateRowParsingReader) 1
Closer (org.apache.druid.java.util.common.io.Closer) 1
CloseableIterator (org.apache.druid.java.util.common.parsers.CloseableIterator) 1
Predicate (org.apache.flink.orc.OrcFilters.Predicate) 1
Record (org.apache.flink.orc.data.Record) 1
GenericRowData (org.apache.flink.table.data.GenericRowData) 1
RowData (org.apache.flink.table.data.RowData) 1
FileStatus (org.apache.hadoop.fs.FileStatus) 1