use of org.apache.orc.StripeInformation in project hive by apache.
the class OrcRawRecordMerger method discoverKeyBounds.
/**
 * Find the key range for the split (of the base). These are used to filter delta files since
 * both are sorted by key.
 *
 * <p>The stripes of the file are walked in order and classified by their start offset: stripes
 * starting before the split's offset are counted as preceding it, stripes starting before the
 * split's max offset are counted as belonging to it, and the walk stops at the first stripe
 * starting at or after the max offset.
 *
 * @param reader the reader whose stripes and key index are inspected
 * @param options the options for reading with; supplies the split's offset and max offset
 * @return an interval whose minimum is the key-index entry of the last stripe preceding the
 *         split ({@code null} when no stripe precedes it) and whose maximum is the key-index
 *         entry of the last stripe counted for the split ({@code null} when no stripe starts
 *         at or after the split's max offset)
 * @throws IOException declared for the key-index lookup and stripe access
 */
private KeyInterval discoverKeyBounds(Reader reader, Reader.Options options) throws IOException {
  final RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
  final long splitStart = options.getOffset();
  final long splitEnd = options.getMaxOffset();

  int stripesBefore = 0;
  int stripesInside = 0;
  // Stays true unless we meet a stripe that begins at or past the end of the split.
  boolean reachesLastStripe = true;

  for (StripeInformation stripe : reader.getStripes()) {
    final long stripeStart = stripe.getOffset();
    if (stripeStart < splitStart) {
      stripesBefore++;
      continue;
    }
    if (stripeStart >= splitEnd) {
      reachesLastStripe = false;
      break;
    }
    stripesInside++;
  }

  // The lower bound comes from the stripe immediately before the split, if there is one.
  final RecordIdentifier minKey = stripesBefore == 0 ? null : keyIndex[stripesBefore - 1];
  // The upper bound is only needed when further stripes follow the split.
  final RecordIdentifier maxKey =
      reachesLastStripe ? null : keyIndex[stripesBefore + stripesInside - 1];
  return new KeyInterval(minKey, maxKey);
}
use of org.apache.orc.StripeInformation in project hive by apache.
the class TestOrcFile method testReadFormat_0_11.
/**
 * Reads the classpath resource {@code orc-file-11-format.orc} and verifies its stripe layout,
 * column statistics, schema (via object inspectors), and the contents of the first row and of
 * row 7499.
 */
@Test
public void testReadFormat_0_11() throws Exception {
Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(fs));
int stripeCount = 0;
int rowCount = 0;
// -1 marks "no stripe seen yet"; afterwards holds the offset at which the next stripe must start
long currentOffset = -1;
for (StripeInformation stripe : reader.getStripes()) {
stripeCount += 1;
rowCount += stripe.getNumberOfRows();
if (currentOffset < 0) {
// first stripe: remember where it ends (offset + index + data + footer lengths)
currentOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
} else {
// every later stripe must begin exactly where the previous one ended
assertEquals(currentOffset, stripe.getOffset());
currentOffset += stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
}
}
// per-stripe row counts must add up to the file's row count
assertEquals(reader.getNumberOfRows(), rowCount);
assertEquals(2, stripeCount);
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(7500, stats[1].getNumberOfValues());
assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000", stats[3].toString());
assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMaximum());
assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMinimum());
// the sum is reported as undefined for this column, and is absent from toString() below
assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
assertEquals("count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807", stats[5].toString());
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0", stats[7].toString());
assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
// check the inspectors
StructObjectInspector readerInspector = (StructObjectInspector) reader.getObjectInspector();
assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory());
assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint," + "int1:int,long1:bigint,float1:float,double1:double,bytes1:" + "binary,string1:string,middle:struct<list:array<struct<int1:int," + "string1:string>>>,list:array<struct<int1:int,string1:string>>," + "map:map<string,struct<int1:int,string1:string>>,ts:timestamp," + "decimal1:decimal(38,10)>", readerInspector.getTypeName());
List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
// one typed inspector per top-level field, looked up by field name
BooleanObjectInspector bo = (BooleanObjectInspector) readerInspector.getStructFieldRef("boolean1").getFieldObjectInspector();
ByteObjectInspector by = (ByteObjectInspector) readerInspector.getStructFieldRef("byte1").getFieldObjectInspector();
ShortObjectInspector sh = (ShortObjectInspector) readerInspector.getStructFieldRef("short1").getFieldObjectInspector();
IntObjectInspector in = (IntObjectInspector) readerInspector.getStructFieldRef("int1").getFieldObjectInspector();
LongObjectInspector lo = (LongObjectInspector) readerInspector.getStructFieldRef("long1").getFieldObjectInspector();
FloatObjectInspector fl = (FloatObjectInspector) readerInspector.getStructFieldRef("float1").getFieldObjectInspector();
DoubleObjectInspector dbl = (DoubleObjectInspector) readerInspector.getStructFieldRef("double1").getFieldObjectInspector();
BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector.getStructFieldRef("bytes1").getFieldObjectInspector();
StringObjectInspector st = (StringObjectInspector) readerInspector.getStructFieldRef("string1").getFieldObjectInspector();
StructObjectInspector mid = (StructObjectInspector) readerInspector.getStructFieldRef("middle").getFieldObjectInspector();
List<? extends StructField> midFields = mid.getAllStructFieldRefs();
ListObjectInspector midli = (ListObjectInspector) midFields.get(0).getFieldObjectInspector();
// inner is the element struct of middle.list; it is also used below to read the elements
// of the top-level list and the values of the map
StructObjectInspector inner = (StructObjectInspector) midli.getListElementObjectInspector();
List<? extends StructField> inFields = inner.getAllStructFieldRefs();
ListObjectInspector li = (ListObjectInspector) readerInspector.getStructFieldRef("list").getFieldObjectInspector();
MapObjectInspector ma = (MapObjectInspector) readerInspector.getStructFieldRef("map").getFieldObjectInspector();
TimestampObjectInspector tso = (TimestampObjectInspector) readerInspector.getStructFieldRef("ts").getFieldObjectInspector();
HiveDecimalObjectInspector dco = (HiveDecimalObjectInspector) readerInspector.getStructFieldRef("decimal1").getFieldObjectInspector();
StringObjectInspector mk = (StringObjectInspector) ma.getMapKeyObjectInspector();
RecordReader rows = reader.rows();
Object row = rows.next(null);
assertNotNull(row);
// check the contents of the first row
assertEquals(false, bo.get(readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals(1, by.get(readerInspector.getStructFieldData(row, fields.get(1))));
assertEquals(1024, sh.get(readerInspector.getStructFieldData(row, fields.get(2))));
assertEquals(65536, in.get(readerInspector.getStructFieldData(row, fields.get(3))));
assertEquals(Long.MAX_VALUE, lo.get(readerInspector.getStructFieldData(row, fields.get(4))));
assertEquals(1.0, fl.get(readerInspector.getStructFieldData(row, fields.get(5))), 0.00001);
assertEquals(-15.0, dbl.get(readerInspector.getStructFieldData(row, fields.get(6))), 0.00001);
assertEquals(bytes(0, 1, 2, 3, 4), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(7))));
assertEquals("hi", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(8))));
// middle.list of the first row
List<?> midRow = midli.getList(mid.getStructFieldData(readerInspector.getStructFieldData(row, fields.get(9)), midFields.get(0)));
assertNotNull(midRow);
assertEquals(2, midRow.size());
assertEquals(1, in.get(inner.getStructFieldData(midRow.get(0), inFields.get(0))));
assertEquals("bye", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(0), inFields.get(1))));
assertEquals(2, in.get(inner.getStructFieldData(midRow.get(1), inFields.get(0))));
assertEquals("sigh", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(1), inFields.get(1))));
// top-level list of the first row
List<?> list = li.getList(readerInspector.getStructFieldData(row, fields.get(10)));
assertEquals(2, list.size());
assertEquals(3, in.get(inner.getStructFieldData(list.get(0), inFields.get(0))));
assertEquals("good", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(0), inFields.get(1))));
assertEquals(4, in.get(inner.getStructFieldData(list.get(1), inFields.get(0))));
assertEquals("bad", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(1), inFields.get(1))));
Map<?, ?> map = ma.getMap(readerInspector.getStructFieldData(row, fields.get(11)));
assertEquals(0, map.size());
assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"), tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(12))));
assertEquals(HiveDecimal.create("12345678.6547456"), dco.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(13))));
// check the contents of row 7499 (reached by seeking; hasNext() is asserted false after it)
assertEquals(true, rows.hasNext());
rows.seekToRow(7499);
row = rows.next(null);
assertEquals(true, bo.get(readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals(100, by.get(readerInspector.getStructFieldData(row, fields.get(1))));
assertEquals(2048, sh.get(readerInspector.getStructFieldData(row, fields.get(2))));
assertEquals(65536, in.get(readerInspector.getStructFieldData(row, fields.get(3))));
assertEquals(Long.MAX_VALUE, lo.get(readerInspector.getStructFieldData(row, fields.get(4))));
assertEquals(2.0, fl.get(readerInspector.getStructFieldData(row, fields.get(5))), 0.00001);
assertEquals(-5.0, dbl.get(readerInspector.getStructFieldData(row, fields.get(6))), 0.00001);
assertEquals(bytes(), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(7))));
assertEquals("bye", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(8))));
midRow = midli.getList(mid.getStructFieldData(readerInspector.getStructFieldData(row, fields.get(9)), midFields.get(0)));
assertNotNull(midRow);
assertEquals(2, midRow.size());
assertEquals(1, in.get(inner.getStructFieldData(midRow.get(0), inFields.get(0))));
assertEquals("bye", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(0), inFields.get(1))));
assertEquals(2, in.get(inner.getStructFieldData(midRow.get(1), inFields.get(0))));
assertEquals("sigh", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(1), inFields.get(1))));
list = li.getList(readerInspector.getStructFieldData(row, fields.get(10)));
assertEquals(3, list.size());
assertEquals(100000000, in.get(inner.getStructFieldData(list.get(0), inFields.get(0))));
assertEquals("cat", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(0), inFields.get(1))));
assertEquals(-100000, in.get(inner.getStructFieldData(list.get(1), inFields.get(0))));
assertEquals("in", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(1), inFields.get(1))));
assertEquals(1234, in.get(inner.getStructFieldData(list.get(2), inFields.get(0))));
assertEquals("hat", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(2), inFields.get(1))));
map = ma.getMap(readerInspector.getStructFieldData(row, fields.get(11)));
assertEquals(2, map.size());
// found[0] / found[1] track that "chani" / "mauddib" are each seen exactly once;
// iteration order of the map is not assumed
boolean[] found = new boolean[2];
for (Object key : map.keySet()) {
String str = mk.getPrimitiveJavaObject(key);
if (str.equals("chani")) {
assertEquals(false, found[0]);
assertEquals(5, in.get(inner.getStructFieldData(map.get(key), inFields.get(0))));
// the value's string1 field is expected to equal the map key itself
assertEquals(str, st.getPrimitiveJavaObject(inner.getStructFieldData(map.get(key), inFields.get(1))));
found[0] = true;
} else if (str.equals("mauddib")) {
assertEquals(false, found[1]);
assertEquals(1, in.get(inner.getStructFieldData(map.get(key), inFields.get(0))));
assertEquals(str, st.getPrimitiveJavaObject(inner.getStructFieldData(map.get(key), inFields.get(1))));
found[1] = true;
} else {
throw new IllegalArgumentException("Unknown key " + str);
}
}
assertEquals(true, found[0]);
assertEquals(true, found[1]);
assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"), tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(12))));
assertEquals(HiveDecimal.create("12345678.6547457"), dco.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(13))));
// handle the close up
assertEquals(false, rows.hasNext());
rows.close();
}
use of org.apache.orc.StripeInformation in project hive by apache.
the class TestOrcFile method testSeek.
/**
 * Writes {@code COUNT} pseudo-random {@code BigRow} rows to {@code testFilePath}, reads them
 * back in reverse order by seeking to each row individually, and then checks seeking inside a
 * reader that is restricted to a byte range and to a subset of columns.
 */
@Test
public void testSeek() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(200000).bufferSize(65536).rowIndexStride(1000));
// fixed seed so the generated data is reproducible from run to run
Random rand = new Random(42);
final int COUNT = 32768;
long[] intValues = new long[COUNT];
double[] doubleValues = new double[COUNT];
String[] stringValues = new String[COUNT];
BytesWritable[] byteValues = new BytesWritable[COUNT];
String[] words = new String[128];
for (int i = 0; i < words.length; ++i) {
words[i] = Integer.toHexString(rand.nextInt());
}
// int and string values are generated in pairs: entries 2i and 2i+1 hold the same value
for (int i = 0; i < COUNT / 2; ++i) {
intValues[2 * i] = rand.nextLong();
intValues[2 * i + 1] = intValues[2 * i];
stringValues[2 * i] = words[rand.nextInt(words.length)];
stringValues[2 * i + 1] = stringValues[2 * i];
}
for (int i = 0; i < COUNT; ++i) {
doubleValues[i] = rand.nextDouble();
byte[] buf = new byte[20];
rand.nextBytes(buf);
byteValues[i] = new BytesWritable(buf);
}
for (int i = 0; i < COUNT; ++i) {
writer.addRow(createRandomRow(intValues, doubleValues, stringValues, byteValues, words, i));
}
writer.close();
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(COUNT, reader.getNumberOfRows());
RecordReader rows = reader.rows();
OrcStruct row = null;
// walk the file backwards, seeking to every row before reading it
for (int i = COUNT - 1; i >= 0; --i) {
// we load the previous buffer of rows
// (with i in [0, COUNT) this condition only holds for i == COUNT - 1, i.e. a single
// extra seek to row 0 on the first iteration)
if (i % COUNT == COUNT - 1) {
rows.seekToRow(i - (COUNT - 1));
}
rows.seekToRow(i);
row = (OrcStruct) rows.next(row);
// regenerate the row that was written at index i and compare field by field
BigRow expected = createRandomRow(intValues, doubleValues, stringValues, byteValues, words, i);
// assertEquals(expected, row);
assertEquals(expected.boolean1.booleanValue(), ((BooleanWritable) row.getFieldValue(0)).get());
assertEquals(expected.byte1.byteValue(), ((ByteWritable) row.getFieldValue(1)).get());
assertEquals(expected.short1.shortValue(), ((ShortWritable) row.getFieldValue(2)).get());
assertEquals(expected.int1.intValue(), ((IntWritable) row.getFieldValue(3)).get());
assertEquals(expected.long1.longValue(), ((LongWritable) row.getFieldValue(4)).get());
assertEquals(expected.float1, ((FloatWritable) row.getFieldValue(5)).get(), 0.0001);
assertEquals(expected.double1, ((DoubleWritable) row.getFieldValue(6)).get(), 0.0001);
assertEquals(expected.bytes1, row.getFieldValue(7));
assertEquals(expected.string1, row.getFieldValue(8));
List<InnerStruct> expectedList = expected.middle.list;
List<OrcStruct> actualList = (List<OrcStruct>) ((OrcStruct) row.getFieldValue(9)).getFieldValue(0);
compareList(expectedList, actualList);
compareList(expected.list, (List<OrcStruct>) row.getFieldValue(10));
}
rows.close();
// Inspect the first five stripes (the loop assumes the file has at least five) to find:
//  - the offsets of stripe 2 and stripe 4 (zero-based)
//  - the zero-based row number of the last row in stripe 2
Iterator<StripeInformation> stripeIterator = reader.getStripes().iterator();
long offsetOfStripe2 = 0;
long offsetOfStripe4 = 0;
long lastRowOfStripe2 = 0;
for (int i = 0; i < 5; ++i) {
StripeInformation stripe = stripeIterator.next();
if (i < 2) {
// rows in stripes 0 and 1 all precede stripe 2
lastRowOfStripe2 += stripe.getNumberOfRows();
} else if (i == 2) {
offsetOfStripe2 = stripe.getOffset();
// -1 turns the running row count into the index of stripe 2's last row
lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
} else if (i == 4) {
offsetOfStripe4 = stripe.getOffset();
}
}
// column inclusion mask, one flag per entry of reader.getStatistics()
boolean[] columns = new boolean[reader.getStatistics().length];
// long column
columns[5] = true;
// text column
columns[9] = true;
// restrict the reader to the bytes from the start of stripe 2 up to the start of stripe 4
rows = reader.rowsOptions(new Reader.Options().range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2).include(columns));
rows.seekToRow(lastRowOfStripe2);
// read two consecutive rows: the last row of stripe 2 and the row that follows it
for (int i = 0; i < 2; ++i) {
row = (OrcStruct) rows.next(row);
BigRow expected = createRandomRow(intValues, doubleValues, stringValues, byteValues, words, (int) (lastRowOfStripe2 + i));
// only the two included columns are checked (struct fields 4 and 8)
assertEquals(expected.long1.longValue(), ((LongWritable) row.getFieldValue(4)).get());
assertEquals(expected.string1, row.getFieldValue(8));
}
rows.close();
}
use of org.apache.orc.StripeInformation in project hive by apache.
the class TestOrcFile method testZeroCopySeek.
/**
 * Same scenario as the plain seek test: writes {@code COUNT} pseudo-random {@code BigRow} rows,
 * reads them back in reverse order by seeking to each row, and then seeks inside a reader
 * restricted to a byte range and a subset of columns. The intent stated by the inline comments
 * is to run these checks with the zero-copy record reader enabled.
 */
@Test
public void testZeroCopySeek() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(200000).bufferSize(65536).rowIndexStride(1000));
// fixed seed so the generated data is reproducible from run to run
Random rand = new Random(42);
final int COUNT = 32768;
long[] intValues = new long[COUNT];
double[] doubleValues = new double[COUNT];
String[] stringValues = new String[COUNT];
BytesWritable[] byteValues = new BytesWritable[COUNT];
String[] words = new String[128];
for (int i = 0; i < words.length; ++i) {
words[i] = Integer.toHexString(rand.nextInt());
}
// int and string values are generated in pairs: entries 2i and 2i+1 hold the same value
for (int i = 0; i < COUNT / 2; ++i) {
intValues[2 * i] = rand.nextLong();
intValues[2 * i + 1] = intValues[2 * i];
stringValues[2 * i] = words[rand.nextInt(words.length)];
stringValues[2 * i + 1] = stringValues[2 * i];
}
for (int i = 0; i < COUNT; ++i) {
doubleValues[i] = rand.nextDouble();
byte[] buf = new byte[20];
rand.nextBytes(buf);
byteValues[i] = new BytesWritable(buf);
}
for (int i = 0; i < COUNT; ++i) {
writer.addRow(createRandomRow(intValues, doubleValues, stringValues, byteValues, words, i));
}
writer.close();
writer = null;
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(COUNT, reader.getNumberOfRows());
/* enable zero copy record reader */
// NOTE(review): this local `conf` shadows the `conf` used earlier in the method, and it is not
// passed to reader.rows() or to rowsOptions(...) below. From this method alone it is not
// evident that the zero-copy setting reaches the record reader - confirm.
Configuration conf = new Configuration();
conf.setBoolean(OrcConf.USE_ZEROCOPY.getHiveConfName(), true);
RecordReader rows = reader.rows();
/* all tests are identical to the other seek() tests */
OrcStruct row = null;
// walk the file backwards, seeking to every row before reading it
for (int i = COUNT - 1; i >= 0; --i) {
// we load the previous buffer of rows
// (with i in [0, COUNT) this condition only holds for i == COUNT - 1, i.e. a single
// extra seek to row 0 on the first iteration)
if (i % COUNT == COUNT - 1) {
rows.seekToRow(i - (COUNT - 1));
}
rows.seekToRow(i);
row = (OrcStruct) rows.next(row);
// regenerate the row that was written at index i and compare field by field
BigRow expected = createRandomRow(intValues, doubleValues, stringValues, byteValues, words, i);
assertEquals(expected.boolean1.booleanValue(), ((BooleanWritable) row.getFieldValue(0)).get());
assertEquals(expected.byte1.byteValue(), ((ByteWritable) row.getFieldValue(1)).get());
assertEquals(expected.short1.shortValue(), ((ShortWritable) row.getFieldValue(2)).get());
assertEquals(expected.int1.intValue(), ((IntWritable) row.getFieldValue(3)).get());
assertEquals(expected.long1.longValue(), ((LongWritable) row.getFieldValue(4)).get());
assertEquals(expected.float1.floatValue(), ((FloatWritable) row.getFieldValue(5)).get(), 0.0001);
assertEquals(expected.double1.doubleValue(), ((DoubleWritable) row.getFieldValue(6)).get(), 0.0001);
assertEquals(expected.bytes1, row.getFieldValue(7));
assertEquals(expected.string1, row.getFieldValue(8));
List<InnerStruct> expectedList = expected.middle.list;
List<OrcStruct> actualList = (List) ((OrcStruct) row.getFieldValue(9)).getFieldValue(0);
compareList(expectedList, actualList);
compareList(expected.list, (List) row.getFieldValue(10));
}
rows.close();
// Inspect the first five stripes (the loop assumes the file has at least five) to find:
//  - the offsets of stripe 2 and stripe 4 (zero-based)
//  - the zero-based row number of the last row in stripe 2
Iterator<StripeInformation> stripeIterator = reader.getStripes().iterator();
long offsetOfStripe2 = 0;
long offsetOfStripe4 = 0;
long lastRowOfStripe2 = 0;
for (int i = 0; i < 5; ++i) {
StripeInformation stripe = stripeIterator.next();
if (i < 2) {
// rows in stripes 0 and 1 all precede stripe 2
lastRowOfStripe2 += stripe.getNumberOfRows();
} else if (i == 2) {
offsetOfStripe2 = stripe.getOffset();
// -1 turns the running row count into the index of stripe 2's last row
lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
} else if (i == 4) {
offsetOfStripe4 = stripe.getOffset();
}
}
// column inclusion mask, one flag per entry of reader.getStatistics()
boolean[] columns = new boolean[reader.getStatistics().length];
// long column
columns[5] = true;
// text column
columns[9] = true;
/* use zero copy record reader */
// restrict the reader to the bytes from the start of stripe 2 up to the start of stripe 4
rows = reader.rowsOptions(new Reader.Options().range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2).include(columns));
rows.seekToRow(lastRowOfStripe2);
// read two consecutive rows: the last row of stripe 2 and the row that follows it
for (int i = 0; i < 2; ++i) {
row = (OrcStruct) rows.next(row);
BigRow expected = createRandomRow(intValues, doubleValues, stringValues, byteValues, words, (int) (lastRowOfStripe2 + i));
// only the two included columns are checked (struct fields 4 and 8)
assertEquals(expected.long1.longValue(), ((LongWritable) row.getFieldValue(4)).get());
assertEquals(expected.string1, row.getFieldValue(8));
}
rows.close();
}
use of org.apache.orc.StripeInformation in project hive by apache.
the class TestOrcFile method testUnionAndTimestamp.
/**
 * We test union, timestamp, and decimal separately since we need to make the
 * object inspector manually. (The Hive reflection-based doesn't handle
 * them properly.)
 *
 * <p>The test writes 5077 rows of {@code struct<time, union<int,string>, decimal>} to
 * {@code testFilePath}, then verifies column-inclusion masks, decimal statistics, the stripe
 * layout, and finally reads every row back in the order it was written.
 */
@Test
public void testUnionAndTimestamp() throws Exception {
// Hand-built schema; type ids: 0 = root struct, 1 = timestamp, 2 = union, 3 = int,
// 4 = string, 5 = decimal (the root's subtypes are 1, 2 and 5; the union's are 3 and 4).
List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT).addFieldNames("time").addFieldNames("union").addFieldNames("decimal").addSubtypes(1).addSubtypes(2).addSubtypes(5).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.TIMESTAMP).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.UNION).addSubtypes(3).addSubtypes(4).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.DECIMAL).build());
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = OrcStruct.createObjectInspector(0, types);
}
// running maximum of the decimals written; starts at the largest fixed value written below
HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(1000).compress(CompressionKind.NONE).batchSize(1000).bufferSize(100).blockPadding(false));
// A single row and a single union object are mutated and re-added for every row written.
OrcStruct row = new OrcStruct(3);
OrcUnion union = new OrcUnion();
// row 0: timestamp, union tag 0 (int 42), decimal
row.setFieldValue(1, union);
row.setFieldValue(0, new TimestampWritableV2(Timestamp.valueOf("2000-03-12 15:00:00")));
HiveDecimal value = HiveDecimal.create("12345678.6547456");
row.setFieldValue(2, new HiveDecimalWritable(value));
union.set((byte) 0, new IntWritable(42));
writer.addRow(row);
// row 1: timestamp with nanoseconds, union tag 1 (string), negative decimal
row.setFieldValue(0, new TimestampWritableV2(Timestamp.valueOf("2000-03-20 12:00:00.123456789")));
union.set((byte) 1, new Text("hello"));
value = HiveDecimal.create("-5643.234");
row.setFieldValue(2, new HiveDecimalWritable(value));
writer.addRow(row);
// row 2: all three fields null
row.setFieldValue(0, null);
row.setFieldValue(1, null);
row.setFieldValue(2, null);
writer.addRow(row);
// rows 3 and 4: non-null union holding a null value, once per tag
row.setFieldValue(1, union);
union.set((byte) 0, null);
writer.addRow(row);
union.set((byte) 1, null);
writer.addRow(row);
// row 5: epoch-like timestamp, int union, large decimal
union.set((byte) 0, new IntWritable(200000));
row.setFieldValue(0, new TimestampWritableV2(Timestamp.valueOf("1970-01-01 00:00:00")));
value = HiveDecimal.create("10000000000000000000");
row.setFieldValue(2, new HiveDecimalWritable(value));
writer.addRow(row);
// rows 6..73: one row per year 1970..2037, alternating the union tag by year parity,
// with pseudo-random decimals (seeded so the read-back below can regenerate them)
Random rand = new Random(42);
for (int i = 1970; i < 2038; ++i) {
row.setFieldValue(0, new TimestampWritableV2(Timestamp.valueOf(i + "-05-05 12:34:56." + i)));
if ((i & 1) == 0) {
union.set((byte) 0, new IntWritable(i * i));
} else {
union.set((byte) 1, new Text(Integer.toString(i * i)));
}
value = HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
row.setFieldValue(2, new HiveDecimalWritable(value));
if (maxValue.compareTo(value) < 0) {
maxValue = value;
}
writer.addRow(row);
}
// let's add a lot of constant rows to test the rle
row.setFieldValue(0, null);
union.set((byte) 0, new IntWritable(1732050807));
row.setFieldValue(2, null);
for (int i = 0; i < 5000; ++i) {
writer.addRow(row);
}
// three trailing rows with distinct int values
union.set((byte) 0, new IntWritable(0));
writer.addRow(row);
union.set((byte) 0, new IntWritable(10));
writer.addRow(row);
union.set((byte) 0, new IntWritable(138));
writer.addRow(row);
writer.close();
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
// note: the schema is taken from the (already closed) writer, not from the reader
TypeDescription schema = writer.getSchema();
assertEquals(5, schema.getMaximumId());
// includeColumns yields one flag per type id (0..5); selecting "union" also selects
// its two child types (ids 3 and 4)
boolean[] expected = new boolean[] { false, false, false, false, false, false };
boolean[] included = OrcUtils.includeColumns("", schema);
assertEquals(true, Arrays.equals(expected, included));
expected = new boolean[] { false, true, false, false, false, true };
included = OrcUtils.includeColumns("time,decimal", schema);
assertEquals(true, Arrays.equals(expected, included));
expected = new boolean[] { false, false, true, true, true, false };
included = OrcUtils.includeColumns("union", schema);
assertEquals(true, Arrays.equals(expected, included));
assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
// 6 hand-written rows + 68 yearly rows + 5000 constant rows + 3 trailing rows
assertEquals(5077, reader.getNumberOfRows());
DecimalColumnStatistics stats = (DecimalColumnStatistics) reader.getStatistics()[5];
// non-null decimals were written in rows 0, 1, 5 and in the 68 yearly rows
assertEquals(71, stats.getNumberOfValues());
assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum());
assertEquals(maxValue, stats.getMaximum());
// TODO: fix this
// assertEquals(null,stats.getSum());
int stripeCount = 0;
int rowCount = 0;
// -1 marks "no stripe seen yet"; afterwards holds the offset at which the next stripe must start
long currentOffset = -1;
for (StripeInformation stripe : reader.getStripes()) {
stripeCount += 1;
rowCount += stripe.getNumberOfRows();
if (currentOffset < 0) {
currentOffset = stripe.getOffset() + stripe.getLength();
} else {
// every later stripe must begin exactly where the previous one ended
assertEquals(currentOffset, stripe.getOffset());
currentOffset += stripe.getLength();
}
}
assertEquals(reader.getNumberOfRows(), rowCount);
assertEquals(2, stripeCount);
// the end of the last stripe must coincide with the reported content length
assertEquals(reader.getContentLength(), currentOffset);
RecordReader rows = reader.rows();
assertEquals(0, rows.getRowNumber());
assertEquals(0.0, rows.getProgress(), 0.000001);
assertEquals(true, rows.hasNext());
// read back row 0
row = (OrcStruct) rows.next(null);
assertEquals(1, rows.getRowNumber());
inspector = reader.getObjectInspector();
assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>", inspector.getTypeName());
assertEquals(new TimestampWritableV2(Timestamp.valueOf("2000-03-12 15:00:00")), row.getFieldValue(0));
union = (OrcUnion) row.getFieldValue(1);
assertEquals(0, union.getTag());
assertEquals(new IntWritable(42), union.getObject());
assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")), row.getFieldValue(2));
// read back row 1
// NOTE(review): from here on `union` is often asserted without being re-fetched from `row`;
// this appears to rely on rows.next(row) updating the previously returned OrcUnion in
// place - confirm against the record reader implementation.
row = (OrcStruct) rows.next(row);
assertEquals(2, rows.getRowNumber());
assertEquals(new TimestampWritableV2(Timestamp.valueOf("2000-03-20 12:00:00.123456789")), row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(new Text("hello"), union.getObject());
assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), row.getFieldValue(2));
// read back row 2 (all null)
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
assertEquals(null, row.getFieldValue(1));
assertEquals(null, row.getFieldValue(2));
// read back row 3; the union field was null in the previous row, so it is fetched again
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
union = (OrcUnion) row.getFieldValue(1);
assertEquals(0, union.getTag());
assertEquals(null, union.getObject());
assertEquals(null, row.getFieldValue(2));
// read back row 4
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(null, union.getObject());
assertEquals(null, row.getFieldValue(2));
// read back row 5
row = (OrcStruct) rows.next(row);
assertEquals(new TimestampWritableV2(Timestamp.valueOf("1970-01-01 00:00:00")), row.getFieldValue(0));
assertEquals(new IntWritable(200000), union.getObject());
assertEquals(new HiveDecimalWritable(HiveDecimal.create("10000000000000000000")), row.getFieldValue(2));
// re-seed so the same decimal sequence as in the write loop is regenerated
rand = new Random(42);
for (int i = 1970; i < 2038; ++i) {
row = (OrcStruct) rows.next(row);
assertEquals(new TimestampWritableV2(Timestamp.valueOf(i + "-05-05 12:34:56." + i)), row.getFieldValue(0));
if ((i & 1) == 0) {
assertEquals(0, union.getTag());
assertEquals(new IntWritable(i * i), union.getObject());
} else {
assertEquals(1, union.getTag());
assertEquals(new Text(Integer.toString(i * i)), union.getObject());
}
assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18))), row.getFieldValue(2));
}
// the 5000 constant rows
for (int i = 0; i < 5000; ++i) {
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(1732050807), union.getObject());
}
// the three trailing rows
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(0), union.getObject());
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(10), union.getObject());
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(138), union.getObject());
// the reader must now be exhausted and report full progress
assertEquals(false, rows.hasNext());
assertEquals(1.0, rows.getProgress(), 0.00001);
assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
// seeking back after exhaustion must still work: re-read row 1
rows.seekToRow(1);
row = (OrcStruct) rows.next(row);
assertEquals(new TimestampWritableV2(Timestamp.valueOf("2000-03-20 12:00:00.123456789")), row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(new Text("hello"), union.getObject());
assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), row.getFieldValue(2));
rows.close();
}
Aggregations