Use of org.apache.orc.StripeInformation in project hive by apache.
From the class TestOrcFile, the method testWithoutIndex: writes a Snappy-compressed file with the row index disabled and uses StripeInformation to verify that the first stripe has data but no index bytes.
/**
 * Read and write a randomly generated snappy file.
 * @throws Exception
 */
@Test
public void testWithoutIndex() throws Exception {
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(InnerStruct.class,
        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(inspector)
          .stripeSize(5000)
          .compress(CompressionKind.SNAPPY)
          .bufferSize(1000)
          .rowIndexStride(0));
  Random rand = new Random(24);
  for (int i = 0; i < 10000; ++i) {
    InnerStruct row = new InnerStruct(rand.nextInt(), Integer.toBinaryString(rand.nextInt()));
    for (int j = 0; j < 5; ++j) {
      writer.addRow(row);
    }
  }
  writer.close();
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  assertEquals(50000, reader.getNumberOfRows());
  assertEquals(0, reader.getRowIndexStride());
  StripeInformation stripe = reader.getStripes().iterator().next();
  assertEquals(true, stripe.getDataLength() != 0);
  assertEquals(0, stripe.getIndexLength());
  RecordReader rows = reader.rows();
  rand = new Random(24);
  OrcStruct row = null;
  for (int i = 0; i < 10000; ++i) {
    int intVal = rand.nextInt();
    String strVal = Integer.toBinaryString(rand.nextInt());
    for (int j = 0; j < 5; ++j) {
      assertEquals(true, rows.hasNext());
      row = (OrcStruct) rows.next(row);
      assertEquals(intVal, ((IntWritable) row.getFieldValue(0)).get());
      assertEquals(strVal, row.getFieldValue(1).toString());
    }
  }
  assertEquals(false, rows.hasNext());
  rows.close();
}
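For reference, the StripeInformation accessors exercised across these examples can be summarized in a minimal sketch (the file path below is hypothetical, not taken from the test):

Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
    OrcFile.readerOptions(conf).filesystem(fs));
for (StripeInformation stripe : reader.getStripes()) {
  long offset = stripe.getOffset();            // byte offset of the stripe within the file
  long indexBytes = stripe.getIndexLength();   // length of the row index streams (0 when the index is disabled)
  long dataBytes = stripe.getDataLength();     // length of the data streams
  long footerBytes = stripe.getFooterLength(); // length of the stripe footer
  long totalBytes = stripe.getLength();        // total stripe length: index + data + footer
  long rows = stripe.getNumberOfRows();        // number of rows stored in the stripe
}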
Use of org.apache.orc.StripeInformation in project hive by apache.
From the class TestOrcSerDeStats, the method testSerdeStatsOldFormat: iterates the stripes of an ORC v0.11 file to check row counts and contiguous stripe offsets, then verifies column statistics; the final cast to BinaryColumnStatistics is expected to throw ClassCastException.
@Test(expected = ClassCastException.class)
public void testSerdeStatsOldFormat() throws Exception {
  Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
  Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  int stripeCount = 0;
  int rowCount = 0;
  long currentOffset = -1;
  for (StripeInformation stripe : reader.getStripes()) {
    stripeCount += 1;
    rowCount += stripe.getNumberOfRows();
    if (currentOffset < 0) {
      currentOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength()
          + stripe.getFooterLength();
    } else {
      assertEquals(currentOffset, stripe.getOffset());
      currentOffset += stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
    }
  }
  assertEquals(reader.getNumberOfRows(), rowCount);
  assertEquals(6615000, reader.getRawDataSize());
  assertEquals(2, stripeCount);
  // check the stats
  ColumnStatistics[] stats = reader.getStatistics();
  assertEquals(7500, stats[1].getNumberOfValues());
  assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
  assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
  assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
  assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
  assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
  assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
  assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
  assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000", stats[3].toString());
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMaximum());
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMinimum());
  assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
  assertEquals("count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807",
      stats[5].toString());
  // a delta is required when comparing doubles with JUnit 4's assertEquals
  assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.00001);
  assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.00001);
  assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
  assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0", stats[7].toString());
  assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum());
  assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum());
  assertEquals(0, ((StringColumnStatistics) stats[9]).getSum());
  assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
  // old orc format will not have binary statistics. toString() will show only
  // the general column statistics
  assertEquals("count: 7500 hasNull: true", stats[8].toString());
  // since old orc format doesn't support binary statistics,
  // this should throw ClassCastException
  assertEquals(5, ((BinaryColumnStatistics) stats[8]).getSum());
  reader.close();
}
Use of org.apache.orc.StripeInformation in project hive by apache.
From the class TestVectorizedOrcAcidRowBatchReader, the method testDeleteEventOriginalFiltering2: uses stripe offsets and lengths to build OrcSplits at different positions relative to stripe boundaries and validates the resulting key intervals.
private void testDeleteEventOriginalFiltering2() throws Exception {
  boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
  conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
  // Need to use a bigger row than DummyRow for the writer to flush the stripes
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  Properties properties = new Properties();
  OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(properties, conf);
  writerOptions.inspector(bigOriginalRowInspector).stripeSize(1).batchSize(1);
  String originalFile = "000000_0";
  Path originalFilePath = new Path(root, originalFile);
  byte[] data = new byte[1000];
  Writer writer = OrcFile.createWriter(originalFilePath, writerOptions);
  writer.addRow(new BigOriginalRow(data));
  writer.addRow(new BigOriginalRow(data));
  writer.addRow(new BigOriginalRow(data));
  writer.close();
  Reader reader = OrcFile.createReader(originalFilePath, OrcFile.readerOptions(conf));
  List<StripeInformation> stripes = reader.getStripes();
  // Make sure 3 stripes are created
  assertEquals(3, stripes.size());
  FileStatus fileStatus = fs.getFileStatus(originalFilePath);
  long fileLength = fileStatus.getLen();
  // Set vector mode to true in the map work so that we can generate the syntheticProps
  MapWork mapWork = new MapWork();
  mapWork.setVectorMode(true);
  VectorizedRowBatchCtx vrbContext = new VectorizedRowBatchCtx();
  mapWork.setVectorizedRowBatchCtx(vrbContext);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
  Utilities.setMapWork(conf, mapWork);
  OrcSplit.OffsetAndBucketProperty syntheticProps =
      VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(fileStatus, root, true, true, conf);
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).bucket(0);
  int bucketProperty = BucketCodec.V1.encode(options);
  // 1. Splits within a stripe
  // A split that's completely within the 2nd stripe
  StripeInformation stripe = stripes.get(1);
  OrcSplit split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50,
      stripe.getLength() - 100, new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2),
      new RecordIdentifier(0, bucketProperty, 1), filterOn);
  // A split that's completely within the last stripe
  stripe = stripes.get(2);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 3),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
  // 2. Splits starting at a stripe boundary
  // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
  stripe = stripes.get(0);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset(), stripe.getLength() - 50,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for the 1st stripe
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0),
      new RecordIdentifier(0, bucketProperty, 0), filterOn);
  // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
  stripe = stripes.get(1);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset(), stripe.getLength() + 50,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
  // 3. Splits ending at a stripe boundary
  // A split that starts before the last stripe starts and ends at the last stripe boundary
  stripe = stripes.get(2);
  split = new OrcSplit(originalFilePath, null, stripe.getOffset() - 50, stripe.getLength() + 50,
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for the last stripe
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
  // A split that starts after the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset() + 50,
      reader.getContentLength() - 50, new String[] { "localhost" }, null, true, true,
      getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
  // The key interval for the last 2 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
  // A split that starts where the 1st stripe starts and ends where the last stripe ends
  split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset(), reader.getContentLength(),
      new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0),
      fileLength, fileLength, root, syntheticProps);
  // The key interval for all 3 stripes
  validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0),
      new RecordIdentifier(0, bucketProperty, 2), filterOn);
}
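The expectations above are consistent with the rule that a stripe is processed by the split whose byte range contains the stripe's starting offset. A minimal sketch of that rule (stripesForSplit is a hypothetical helper, not part of the Hive test):

static List<StripeInformation> stripesForSplit(List<StripeInformation> stripes,
    long splitStart, long splitLength) {
  long splitEnd = splitStart + splitLength;
  List<StripeInformation> covered = new ArrayList<>();
  for (StripeInformation stripe : stripes) {
    // a stripe belongs to the split that contains its first byte
    if (stripe.getOffset() >= splitStart && stripe.getOffset() < splitEnd) {
      covered.add(stripe);
    }
  }
  return covered;
}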
Use of org.apache.orc.StripeInformation in project hive by apache.
From the class FixAcidKeyIndex, the method recoverFile: reads each stripe of a damaged ACID file as a raw byte buffer, appends it to a new file with appendStripe, and rewrites the acid key index in the user metadata.
static void recoverFile(Configuration conf, Path inputPath, String backup) throws IOException {
  FileSystem fs = inputPath.getFileSystem(conf);
  Path recoveredPath = getRecoveryFile(inputPath);
  try (Reader reader = OrcFile.createReader(fs, inputPath)) {
    if (OrcInputFormat.isOriginal(reader)) {
      System.out.println(inputPath + " is not an acid file. No need to recover.");
      return;
    }
    AcidKeyIndexValidationResult validationResult = validate(conf, inputPath);
    if (validationResult.isValid) {
      System.out.println(inputPath + " has a valid acid key index. No need to recover.");
      return;
    }
    System.out.println("Recovering " + inputPath);
    // make sure that file does not exist
    try {
      fs.delete(recoveredPath, false);
    } catch (FileNotFoundException e) {
      // no problem, we're just making sure the file doesn't exist
    }
    // Writer should match the orc configuration from the original file
    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf)
        .compress(reader.getCompression())
        .version(reader.getFileVersion())
        .rowIndexStride(reader.getRowIndexStride())
        .inspector(reader.getObjectInspector());
    // compression buffer size should only be set if compression is enabled
    if (reader.getCompression() != org.apache.hadoop.hive.ql.io.orc.CompressionKind.NONE) {
      writerOptions.bufferSize(reader.getCompressionSize()).enforceBufferSize();
    }
    try (Writer writer = OrcFile.createWriter(recoveredPath, writerOptions)) {
      List<StripeInformation> stripes = reader.getStripes();
      List<StripeStatistics> stripeStats = reader.getOrcProtoStripeStatistics();
      try (FSDataInputStream inputStream = fs.open(inputPath)) {
        for (int idx = 0; idx < stripes.size(); ++idx) {
          // initialize buffer to read the entire stripe.
          StripeInformation stripe = stripes.get(idx);
          int stripeLength = (int) stripe.getLength();
          byte[] buffer = new byte[stripeLength];
          inputStream.readFully(stripe.getOffset(), buffer, 0, stripeLength);
          // append the stripe buffer to the new ORC file
          writer.appendStripe(buffer, 0, buffer.length, stripe, stripeStats.get(idx));
        }
      }
      // Add the rest of the metadata keys.
      for (String metadataKey : reader.getMetadataKeys()) {
        if (!metadataKey.equals(OrcRecordUpdater.ACID_KEY_INDEX_NAME)) {
          writer.addUserMetadata(metadataKey, reader.getMetadataValue(metadataKey));
        }
      }
      StringBuilder sb = new StringBuilder();
      validationResult.recordIdentifiers.stream().forEach(ri -> sb.append(ri.getWriteId())
          .append(",").append(ri.getBucketProperty()).append(",").append(ri.getRowId()).append(";"));
      // Finally add the fixed acid key index.
      writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME, UTF8.encode(sb.toString()));
    }
  }
  // Confirm the file is really fixed, and replace the old file.
  AcidKeyIndexValidationResult fileFixed = validate(conf, recoveredPath);
  if (fileFixed.isValid) {
    Path backupDataPath;
    String scheme = inputPath.toUri().getScheme();
    String authority = inputPath.toUri().getAuthority();
    String filePath = inputPath.toUri().getPath();
    // use the same filesystem as input file if backup-path is not explicitly specified
    if (backup.equals(DEFAULT_BACKUP_PATH)) {
      backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
    } else {
      backupDataPath = Path.mergePaths(new Path(backup), inputPath);
    }
    // Move data file to backup path
    moveFiles(fs, inputPath, backupDataPath);
    // finally move recovered file to actual file
    moveFiles(fs, recoveredPath, inputPath);
    System.out.println("Fixed acid key index for " + inputPath);
  } else {
    System.out.println("Unable to fix acid key index for " + inputPath);
  }
}
Use of org.apache.orc.StripeInformation in project hive by apache.
From the class TestCrudCompactorOnTez, the method checkBloomFilterInAcidFile: reads the footer of the first stripe and asserts that a bloom filter stream is present.
private void checkBloomFilterInAcidFile(FileSystem fs, Path bucketFilePath) throws IOException {
  Reader orcReader = OrcFile.createReader(bucketFilePath,
      OrcFile.readerOptions(fs.getConf()).filesystem(fs));
  StripeInformation stripe = orcReader.getStripes().get(0);
  try (RecordReaderImpl rows = (RecordReaderImpl) orcReader.rows()) {
    boolean bloomFilter = rows.readStripeFooter(stripe).getStreamsList().stream()
        .anyMatch(s -> s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8
            || s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER);
    Assert.assertTrue("Bloom filter is missing", bloomFilter);
  }
}
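For context, bloom filter streams appear in a stripe footer only if the file was written with bloom filters enabled. A minimal sketch using the core org.apache.orc writer API (the path and schema here are hypothetical, and this is not the code the compaction test uses to produce its files):

TypeDescription schema = TypeDescription.fromString("struct<id:int,name:string>");
Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
    OrcFile.writerOptions(conf)
        .setSchema(schema)
        .bloomFilterColumns("id")   // columns to build bloom filter streams for
        .bloomFilterFpp(0.05));     // target false-positive probability
writer.close();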