Use of org.apache.orc.Reader in project flink by apache.
The class OrcBulkWriterTestUtil, method validate:
public static void validate(File files, List<Record> expected) throws IOException {
    final File[] buckets = files.listFiles();
    assertNotNull(buckets);
    assertEquals(1, buckets.length);
    final File[] partFiles = buckets[0].listFiles();
    assertNotNull(partFiles);
    for (File partFile : partFiles) {
        assertTrue(partFile.length() > 0);
        OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration());
        Reader reader = OrcFile.createReader(new org.apache.hadoop.fs.Path(partFile.toURI()), readerOptions);
        assertEquals(3, reader.getNumberOfRows());
        assertEquals(2, reader.getSchema().getFieldNames().size());
        assertSame(reader.getCompressionKind(), CompressionKind.LZ4);
        assertTrue(reader.hasMetadataValue(USER_METADATA_KEY));
        assertTrue(reader.getMetadataKeys().contains(USER_METADATA_KEY));
        List<Record> results = getResults(reader);
        assertEquals(3, results.size());
        assertEquals(results, expected);
    }
}
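The assertions above lean on the org.apache.orc.Reader inspection API: row count, schema, compression kind, and user metadata. As a rough standalone sketch, assuming only an existing ORC file whose path is passed on the command line, the same calls can be used to dump that information:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;

public class OrcMetadataDump {

    public static void main(String[] args) throws IOException {
        // Path to an existing ORC file; adjust to your environment.
        Path path = new Path(args[0]);
        try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()))) {
            System.out.println("rows        : " + reader.getNumberOfRows());
            System.out.println("schema      : " + reader.getSchema());
            CompressionKind kind = reader.getCompressionKind();
            System.out.println("compression : " + kind);
            // User metadata keys written via Writer#addUserMetadata
            for (String key : reader.getMetadataKeys()) {
                System.out.println("metadata key: " + key);
            }
        }
    }
}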
Use of org.apache.orc.Reader in project druid by druid-io.
The class OrcReader, method intermediateRowIterator:
@Override
protected CloseableIterator<OrcStruct> intermediateRowIterator() throws IOException {
    final Closer closer = Closer.create();
    // We fetch here to cache a copy locally. However, this might need to be changed if we want to split an ORC file
    // into several InputSplits in the future.
    final byte[] buffer = new byte[InputEntity.DEFAULT_FETCH_BUFFER_SIZE];
    final CleanableFile file = closer.register(source.fetch(temporaryDirectory, buffer));
    final Path path = new Path(file.file().toURI());
    final ClassLoader currentClassLoader = Thread.currentThread().getContextClassLoader();
    final Reader reader;
    try {
        Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
        reader = closer.register(OrcFile.createReader(path, OrcFile.readerOptions(conf)));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassLoader);
    }
    // The line below gets the schema to read all columns.
    // This could be improved in the future by projecting only the columns users want.
    final TypeDescription schema = reader.getSchema();
    final RecordReader batchReader = reader.rows(reader.options());
    final OrcMapredRecordReader<OrcStruct> recordReader = new OrcMapredRecordReader<>(batchReader, schema);
    closer.register(recordReader::close);
    return new CloseableIterator<OrcStruct>() {

        final NullWritable key = recordReader.createKey();

        OrcStruct value = null;

        @Override
        public boolean hasNext() {
            if (value == null) {
                try {
                    // The OrcStruct returned by next() can be kept in memory for a while.
                    // Here, we create a new instance of OrcStruct before calling RecordReader.next(),
                    // so that we avoid sharing the same "value" reference across rows.
                    value = recordReader.createValue();
                    if (!recordReader.next(key, value)) {
                        value = null;
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return value != null;
        }

        @Override
        public OrcStruct next() {
            if (value == null) {
                throw new NoSuchElementException();
            }
            final OrcStruct currentValue = value;
            value = null;
            return currentValue;
        }

        @Override
        public void close() throws IOException {
            closer.close();
        }
    };
}
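The iterator hands each row back as an org.apache.orc.mapred.OrcStruct, with field values wrapped in Hadoop Writables. Below is a minimal sketch of the same wiring outside Druid, assuming an ORC file path on the command line and the same OrcMapredRecordReader constructor visibility as the ORC version Druid uses:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcMapredRecordReader;
import org.apache.orc.mapred.OrcStruct;

public class OrcStructScan {

    public static void main(String[] args) throws IOException {
        // Path to an existing ORC file; adjust to your environment.
        Path path = new Path(args[0]);
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
        TypeDescription schema = reader.getSchema();
        // Same wiring as the Druid snippet: a row-batch reader wrapped into a record reader.
        OrcMapredRecordReader<OrcStruct> recordReader =
                new OrcMapredRecordReader<>(reader.rows(reader.options()), schema);
        try {
            NullWritable key = recordReader.createKey();
            OrcStruct value = recordReader.createValue();
            while (recordReader.next(key, value)) {
                // Field values come back as Hadoop Writable wrappers (IntWritable, Text, ...).
                System.out.println(value.getFieldValue(0));
            }
        } finally {
            recordReader.close();
        }
    }
}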
Use of org.apache.orc.Reader in project hive by apache.
The class TestTxnCommands2, method testAcidOrcWritePreservesFieldNames:
@Test
public void testAcidOrcWritePreservesFieldNames() throws Exception {
    // with vectorization
    String tableName = "acidorcwritefieldnames";
    hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
    runStatementOnDriver("DROP TABLE IF EXISTS " + tableName);
    runStatementOnDriver("CREATE TABLE " + tableName + " (a INT, b STRING) CLUSTERED BY (a) INTO " + BUCKET_COUNT + " BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
    runStatementOnDriver("INSERT INTO " + tableName + " VALUES (1, 'foo'), (2, 'bar')");
    tableName = "acidorcwritefieldnames_complex";
    runStatementOnDriver("DROP TABLE IF EXISTS " + tableName);
    runStatementOnDriver("CREATE TABLE " + tableName + " (a INT, b STRING, s STRUCT<c:int, si:STRUCT<d:double," + "e:float>>) CLUSTERED BY (a) INTO " + BUCKET_COUNT + " BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
    runStatementOnDriver("INSERT INTO " + tableName + " select a, b, named_struct('c',10,'si'," + "named_struct('d',cast(1.0 as double),'e',cast(2.0 as float))) from acidorcwritefieldnames");
    FileSystem fs = FileSystem.get(hiveConf);
    FileStatus[] fileStatuses = fs.globStatus(new Path(getWarehouseDir() + "/" + tableName + "/" + AcidUtils.DELTA_PREFIX + "*/" + AcidUtils.BUCKET_PREFIX + "*"));
    Assert.assertEquals(BUCKET_COUNT, fileStatuses.length);
    OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(hiveConf);
    for (FileStatus fileStatus : fileStatuses) {
        Reader r = OrcFile.createReader(fileStatus.getPath(), readerOptions);
        TypeDescription rowSchema = r.getSchema().getChildren().get(5);
        Assert.assertEquals("struct<a:int,b:string,s:struct<c:int,si:struct<d:double,e:float>>>", rowSchema.toString());
    }
    // without vectorization
    tableName = "acidorcwritefieldnames";
    hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
    runStatementOnDriver("DROP TABLE IF EXISTS " + tableName);
    runStatementOnDriver("CREATE TABLE " + tableName + " (a INT, b STRING) CLUSTERED BY (a) INTO " + BUCKET_COUNT + " BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
    runStatementOnDriver("INSERT INTO " + tableName + " VALUES (1, 'foo'), (2, 'bar')");
    tableName = "acidorcwritefieldnames_complex";
    runStatementOnDriver("DROP TABLE IF EXISTS " + tableName);
    runStatementOnDriver("CREATE TABLE " + tableName + " (a INT, b STRING, s STRUCT<c:int, si:STRUCT<d:double," + "e:float>>) CLUSTERED BY (a) INTO " + BUCKET_COUNT + " BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')");
    runStatementOnDriver("INSERT INTO " + tableName + " select a, b, named_struct('c',10,'si'," + "named_struct('d',cast(1.0 as double),'e',cast(2.0 as float))) from acidorcwritefieldnames");
    fs = FileSystem.get(hiveConf);
    fileStatuses = fs.globStatus(new Path(getWarehouseDir() + "/" + tableName + "/" + AcidUtils.DELTA_PREFIX + "*/" + AcidUtils.BUCKET_PREFIX + "*"));
    Assert.assertEquals(BUCKET_COUNT, fileStatuses.length);
    readerOptions = OrcFile.readerOptions(hiveConf);
    for (FileStatus fileStatus : fileStatuses) {
        Reader r = OrcFile.createReader(fileStatus.getPath(), readerOptions);
        TypeDescription rowSchema = r.getSchema().getChildren().get(5);
        Assert.assertEquals("struct<a:int,b:string,s:struct<c:int,si:struct<d:double,e:float>>>", rowSchema.toString());
    }
    hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
}
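The index 5 in r.getSchema().getChildren().get(5) is not arbitrary: an ACID ORC file wraps every user row in a struct whose fields are operation, originalTransaction, bucket, rowId, currentTransaction and row, so the sixth child is the user-visible row type whose field names the test verifies. Below is a small sketch, assuming only a path to such a bucket file, that prints both levels of the schema:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;

public class AcidSchemaDump {

    public static void main(String[] args) throws IOException {
        // Path to a bucket file inside a delta directory, e.g. .../delta_.../bucket_00000
        Path bucketFile = new Path(args[0]);
        Reader reader = OrcFile.createReader(bucketFile, OrcFile.readerOptions(new Configuration()));
        TypeDescription acidSchema = reader.getSchema();
        // Expected: operation, originalTransaction, bucket, rowId, currentTransaction, row
        List<String> acidFields = acidSchema.getFieldNames();
        System.out.println("ACID wrapper fields: " + acidFields);
        // The user-visible row type is the last child, index 5.
        TypeDescription rowSchema = acidSchema.getChildren().get(5);
        System.out.println("row schema: " + rowSchema);
    }
}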
Use of org.apache.orc.Reader in project hive by apache.
The class TestCrudCompactorOnTez, method checkBloomFilterInAcidFile:
private void checkBloomFilterInAcidFile(FileSystem fs, Path bucketFilePath) throws IOException {
    Reader orcReader = OrcFile.createReader(bucketFilePath, OrcFile.readerOptions(fs.getConf()).filesystem(fs));
    StripeInformation stripe = orcReader.getStripes().get(0);
    try (RecordReaderImpl rows = (RecordReaderImpl) orcReader.rows()) {
        boolean bloomFilter = rows.readStripeFooter(stripe).getStreamsList().stream().anyMatch(s -> s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8 || s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER);
        Assert.assertTrue("Bloom filter is missing", bloomFilter);
    }
}
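The check only asserts that bloom filter streams are present in the first stripe's footer; in the test they are produced by Hive itself, typically requested through table properties such as 'orc.bloom.filter.columns'. The sketch below is a hypothetical standalone writer rather than the test's setup, showing the core ORC writer options that cause such streams to be emitted (output path, schema and column name are made up for illustration):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class BloomFilterWriteSketch {

    public static void main(String[] args) throws IOException {
        Path out = new Path("/tmp/bloom_example.orc");
        TypeDescription schema = TypeDescription.fromString("struct<a:bigint>");
        Writer writer = OrcFile.createWriter(out,
                OrcFile.writerOptions(new Configuration())
                        .setSchema(schema)
                        // Ask for a bloom filter on column "a"; this is what makes
                        // BLOOM_FILTER / BLOOM_FILTER_UTF8 streams appear in the stripe footer.
                        .bloomFilterColumns("a")
                        .bloomFilterFpp(0.05));
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector col = (LongColumnVector) batch.cols[0];
        for (long i = 0; i < 1000; i++) {
            col.vector[batch.size++] = i;
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }
        if (batch.size > 0) {
            writer.addRowBatch(batch);
        }
        writer.close();
    }
}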
Use of org.apache.orc.Reader in project hive by apache.
The class TestCrudCompactorOnTez, method checkBucketIdAndRowIdInAcidFile:
/**
 * Read file, and
 * 1. make sure that the bucket property in each row matches the file name.
 *    For example, if the bucketId is 0, we check file bucket_00000 to make sure that the third
 *    column contains only the value 536870912.
 * 2. make sure that rowIds are in ascending order
 * @param fs file system
 * @param path where to look for the bucket file
 * @param bucketId bucket Id to check, e.g. 0.
 */
private void checkBucketIdAndRowIdInAcidFile(FileSystem fs, Path path, int bucketId) throws IOException {
    Path bucketFilePath = AcidUtils.createBucketFile(path, bucketId);
    Reader orcReader = OrcFile.createReader(bucketFilePath, OrcFile.readerOptions(fs.getConf()).filesystem(fs));
    TypeDescription schema = orcReader.getSchema();
    try (RecordReader rows = orcReader.rows()) {
        VectorizedRowBatch batch = schema.createRowBatch();
        rows.nextBatch(batch);
        // check that the bucket property in each row matches the bucket in the file name
        long[] bucketIdVector = ((LongColumnVector) batch.cols[2]).vector;
        for (int i = 0; i < batch.count(); i++) {
            Assert.assertEquals(bucketId, decodeBucketProperty(bucketIdVector[i]));
        }
        // check that writeIds, then rowIds, are sorted in ascending order
        long[] writeIdVector = ((LongColumnVector) batch.cols[1]).vector;
        long[] rowIdVector = ((LongColumnVector) batch.cols[3]).vector;
        long writeId = writeIdVector[0];
        long rowId = 0;
        for (int i = 0; i < batch.count(); i++) {
            long currentWriteId = writeIdVector[i];
            long currentRowId = rowIdVector[i];
            if (writeId == writeIdVector[i]) {
                Assert.assertTrue(rowId <= currentRowId);
                rowId = currentRowId;
            } else {
                Assert.assertTrue(writeId < currentWriteId);
                writeId = currentWriteId;
                rowId = 0;
            }
        }
    }
}
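decodeBucketProperty is a helper of the test class; the value 536870912 (0x20000000) mentioned in the Javadoc is the encoded bucket property for bucket 0 under bucket codec version 1. A hedged sketch of what such a decode typically delegates to, Hive's BucketCodec (the helper name decodeBucketId below is made up for illustration):

import org.apache.hadoop.hive.ql.io.BucketCodec;

public class BucketPropertyDecodeSketch {

    // Decodes the writer/bucket id from the encoded "bucket" column of an ACID row.
    static int decodeBucketId(long bucketProperty) {
        int encoded = (int) bucketProperty;
        // The codec version is stored in the top bits of the property itself.
        return BucketCodec.determineVersion(encoded).decodeWriterId(encoded);
    }

    public static void main(String[] args) {
        // 536870912 == 0x20000000: codec V1, bucket 0 (matches the Javadoc above).
        System.out.println(decodeBucketId(536870912L));
    }
}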