Example 61 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestOrcFile, method columnProjection.

@Test
public void columnProjection() throws Exception {
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(1000).compress(CompressionKind.NONE).bufferSize(100).rowIndexStride(1000));
    Random r1 = new Random(1);
    Random r2 = new Random(2);
    int x;
    int minInt = 0, maxInt = 0;
    String y;
    String minStr = null, maxStr = null;
    for (int i = 0; i < 21000; ++i) {
        x = r1.nextInt();
        y = Long.toHexString(r2.nextLong());
        if (i == 0 || x < minInt) {
            minInt = x;
        }
        if (i == 0 || x > maxInt) {
            maxInt = x;
        }
        if (i == 0 || y.compareTo(minStr) < 0) {
            minStr = y;
        }
        if (i == 0 || y.compareTo(maxStr) > 0) {
            maxStr = y;
        }
        writer.addRow(inner(x, y));
    }
    writer.close();
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    // check out the statistics
    ColumnStatistics[] stats = reader.getStatistics();
    assertEquals(3, stats.length);
    for (ColumnStatistics s : stats) {
        assertEquals(21000, s.getNumberOfValues());
        if (s instanceof IntegerColumnStatistics) {
            assertEquals(minInt, ((IntegerColumnStatistics) s).getMinimum());
            assertEquals(maxInt, ((IntegerColumnStatistics) s).getMaximum());
        } else if (s instanceof StringColumnStatistics) {
            assertEquals(maxStr, ((StringColumnStatistics) s).getMaximum());
            assertEquals(minStr, ((StringColumnStatistics) s).getMinimum());
        }
    }
    // check out the types
    List<OrcProto.Type> types = reader.getTypes();
    assertEquals(3, types.size());
    assertEquals(OrcProto.Type.Kind.STRUCT, types.get(0).getKind());
    assertEquals(2, types.get(0).getSubtypesCount());
    assertEquals(1, types.get(0).getSubtypes(0));
    assertEquals(2, types.get(0).getSubtypes(1));
    assertEquals(OrcProto.Type.Kind.INT, types.get(1).getKind());
    assertEquals(0, types.get(1).getSubtypesCount());
    assertEquals(OrcProto.Type.Kind.STRING, types.get(2).getKind());
    assertEquals(0, types.get(2).getSubtypesCount());
    // read the contents and make sure they match; index 0 of the include
    // array is the root struct, so rows1 projects only the int column
    // (type id 1) and rows2 only the string column (type id 2)
    RecordReader rows1 = reader.rows(new boolean[] { true, true, false });
    RecordReader rows2 = reader.rows(new boolean[] { true, false, true });
    r1 = new Random(1);
    r2 = new Random(2);
    OrcStruct row1 = null;
    OrcStruct row2 = null;
    for (int i = 0; i < 21000; ++i) {
        assertEquals(true, rows1.hasNext());
        assertEquals(true, rows2.hasNext());
        row1 = (OrcStruct) rows1.next(row1);
        row2 = (OrcStruct) rows2.next(row2);
        assertEquals(r1.nextInt(), ((IntWritable) row1.getFieldValue(0)).get());
        assertEquals(Long.toHexString(r2.nextLong()), row2.getFieldValue(1).toString());
    }
    assertEquals(false, rows1.hasNext());
    assertEquals(false, rows2.hasNext());
    rows1.close();
    rows2.close();
}
Also used: DecimalColumnStatistics (org.apache.orc.DecimalColumnStatistics), BooleanColumnStatistics (org.apache.orc.BooleanColumnStatistics), StringColumnStatistics (org.apache.orc.StringColumnStatistics), DoubleColumnStatistics (org.apache.orc.DoubleColumnStatistics), IntegerColumnStatistics (org.apache.orc.IntegerColumnStatistics), ColumnStatistics (org.apache.orc.ColumnStatistics), BinaryColumnStatistics (org.apache.orc.BinaryColumnStatistics), HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector), BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector), ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector), StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector), ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector), IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector), LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector), BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector), ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector), DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector), TimestampObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector), Random (java.util.Random), Test (org.junit.Test)
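
A minimal, self-contained sketch of the pattern all of these examples share: getReflectionObjectInspector derives a StructObjectInspector from a plain Java class, one struct field per instance field. The Point class and the class name below are hypothetical, not from the Hive test suite; the synchronized block mirrors the tests, which serialize access because the factory caches inspectors.

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

public class ReflectionInspectorSketch {

    // Hypothetical row class; each non-static field becomes a struct field.
    static class Point {
        int x;
        String label;
    }

    public static void main(String[] args) {
        ObjectInspector inspector;
        // Same locking discipline as the tests above.
        synchronized (ReflectionInspectorSketch.class) {
            inspector = ObjectInspectorFactory.getReflectionObjectInspector(
                Point.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        }
        StructObjectInspector structInspector = (StructObjectInspector) inspector;
        for (StructField field : structInspector.getAllStructFieldRefs()) {
            // Prints "x: int" and then "label: string".
            System.out.println(field.getFieldName() + ": "
                + field.getFieldObjectInspector().getTypeName());
        }
    }
}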

Example 62 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestOrcFile, method testZeroCopySeek.

@Test
public void testZeroCopySeek() throws Exception {
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(200000).bufferSize(65536).rowIndexStride(1000));
    Random rand = new Random(42);
    final int COUNT = 32768;
    long[] intValues = new long[COUNT];
    double[] doubleValues = new double[COUNT];
    String[] stringValues = new String[COUNT];
    BytesWritable[] byteValues = new BytesWritable[COUNT];
    String[] words = new String[128];
    for (int i = 0; i < words.length; ++i) {
        words[i] = Integer.toHexString(rand.nextInt());
    }
    for (int i = 0; i < COUNT / 2; ++i) {
        intValues[2 * i] = rand.nextLong();
        intValues[2 * i + 1] = intValues[2 * i];
        stringValues[2 * i] = words[rand.nextInt(words.length)];
        stringValues[2 * i + 1] = stringValues[2 * i];
    }
    for (int i = 0; i < COUNT; ++i) {
        doubleValues[i] = rand.nextDouble();
        byte[] buf = new byte[20];
        rand.nextBytes(buf);
        byteValues[i] = new BytesWritable(buf);
    }
    for (int i = 0; i < COUNT; ++i) {
        writer.addRow(createRandomRow(intValues, doubleValues, stringValues, byteValues, words, i));
    }
    writer.close();
    writer = null;
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    assertEquals(COUNT, reader.getNumberOfRows());
    /* enable zero copy record reader */
    Configuration conf = new Configuration();
    conf.setBoolean(OrcConf.USE_ZEROCOPY.getHiveConfName(), true);
    RecordReader rows = reader.rows();
    /* all tests are identical to the other seek() tests */
    OrcStruct row = null;
    for (int i = COUNT - 1; i >= 0; --i) {
        // we load the previous buffer of rows
        if (i % COUNT == COUNT - 1) {
            rows.seekToRow(i - (COUNT - 1));
        }
        rows.seekToRow(i);
        row = (OrcStruct) rows.next(row);
        BigRow expected = createRandomRow(intValues, doubleValues, stringValues, byteValues, words, i);
        assertEquals(expected.boolean1.booleanValue(), ((BooleanWritable) row.getFieldValue(0)).get());
        assertEquals(expected.byte1.byteValue(), ((ByteWritable) row.getFieldValue(1)).get());
        assertEquals(expected.short1.shortValue(), ((ShortWritable) row.getFieldValue(2)).get());
        assertEquals(expected.int1.intValue(), ((IntWritable) row.getFieldValue(3)).get());
        assertEquals(expected.long1.longValue(), ((LongWritable) row.getFieldValue(4)).get());
        assertEquals(expected.float1.floatValue(), ((FloatWritable) row.getFieldValue(5)).get(), 0.0001);
        assertEquals(expected.double1.doubleValue(), ((DoubleWritable) row.getFieldValue(6)).get(), 0.0001);
        assertEquals(expected.bytes1, row.getFieldValue(7));
        assertEquals(expected.string1, row.getFieldValue(8));
        List<InnerStruct> expectedList = expected.middle.list;
        List<OrcStruct> actualList = (List) ((OrcStruct) row.getFieldValue(9)).getFieldValue(0);
        compareList(expectedList, actualList);
        compareList(expected.list, (List) row.getFieldValue(10));
    }
    rows.close();
    Iterator<StripeInformation> stripeIterator = reader.getStripes().iterator();
    long offsetOfStripe2 = 0;
    long offsetOfStripe4 = 0;
    long lastRowOfStripe2 = 0;
    for (int i = 0; i < 5; ++i) {
        StripeInformation stripe = stripeIterator.next();
        if (i < 2) {
            lastRowOfStripe2 += stripe.getNumberOfRows();
        } else if (i == 2) {
            offsetOfStripe2 = stripe.getOffset();
            lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
        } else if (i == 4) {
            offsetOfStripe4 = stripe.getOffset();
        }
    }
    boolean[] columns = new boolean[reader.getStatistics().length];
    // long column
    columns[5] = true;
    // text column
    columns[9] = true;
    /* use zero copy record reader */
    rows = reader.rowsOptions(new Reader.Options().range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2).include(columns));
    rows.seekToRow(lastRowOfStripe2);
    for (int i = 0; i < 2; ++i) {
        row = (OrcStruct) rows.next(row);
        BigRow expected = createRandomRow(intValues, doubleValues, stringValues, byteValues, words, (int) (lastRowOfStripe2 + i));
        assertEquals(expected.long1.longValue(), ((LongWritable) row.getFieldValue(4)).get());
        assertEquals(expected.string1, row.getFieldValue(8));
    }
    rows.close();
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), Random (java.util.Random), List (java.util.List), ArrayList (java.util.ArrayList), HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector), BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector), ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector), StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector), ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector), IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector), LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector), BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector), ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector), DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector), TimestampObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector), BytesWritable (org.apache.hadoop.io.BytesWritable), StripeInformation (org.apache.orc.StripeInformation), Test (org.junit.Test)
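
One wiring detail worth flagging in the test above: the local Configuration is declared after the reader has already been created from the enclosing test's conf, so the zero-copy flag it sets never reaches that reader's options. A minimal standalone sketch of the ordering under which the flag does take effect; the class name is hypothetical and args[0] is assumed to be the path of an existing ORC file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.orc.OrcConf;

public class ZeroCopyReaderSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Set the flag before the reader is constructed so that
        // OrcFile.readerOptions(conf) captures it.
        conf.setBoolean(OrcConf.USE_ZEROCOPY.getHiveConfName(), true);
        Path path = new Path(args[0]);
        FileSystem fs = path.getFileSystem(conf);
        Reader reader = OrcFile.createReader(path,
            OrcFile.readerOptions(conf).filesystem(fs));
        RecordReader rows = reader.rows();
        System.out.println("rows in file: " + reader.getNumberOfRows());
        rows.close();
    }
}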

Example 63 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestInputOutputFormat, method testDefaultTypes.

@Test
public void testDefaultTypes() throws Exception {
    Properties properties = new Properties();
    properties.setProperty("columns", "str,str2");
    properties.setProperty("columns.types", "string:string");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    AbstractSerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, StringRow.class, true, properties, Reporter.NULL);
    writer.write(serde.serialize(new StringRow("owen"), inspector));
    writer.write(serde.serialize(new StringRow("beth"), inspector));
    writer.write(serde.serialize(new StringRow("laurel"), inspector));
    writer.write(serde.serialize(new StringRow("hazen"), inspector));
    writer.write(serde.serialize(new StringRow("colin"), inspector));
    writer.write(serde.serialize(new StringRow("miles"), inspector));
    writer.close(true);
    serde = new OrcSerde();
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<str:string,str2:string>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    // read the whole file
    conf.set("columns", StringRow.getColumnNamesProperty());
    conf.set("columns.types", StringRow.getColumnTypesProperty());
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(true, reader.next(key, value));
    assertEquals("owen", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("beth", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("laurel", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("hazen", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("colin", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("miles", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(false, reader.next(key, value));
    reader.close();
}
Also used: NullWritable (org.apache.hadoop.io.NullWritable), Writable (org.apache.hadoop.io.Writable), IntWritable (org.apache.hadoop.io.IntWritable), Properties (java.util.Properties), StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector), AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe), InputSplit (org.apache.hadoop.mapred.InputSplit), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
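
The assertion on the inspector's type name follows directly from how Hive parses the two table properties: "columns" carries comma-separated field names and "columns.types" colon-separated type strings. A standalone sketch of that mapping, using Hive's serde2 type-info helpers; the class name is hypothetical.

import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class ColumnsPropertySketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("columns", "str,str2");
        props.setProperty("columns.types", "string:string");
        // Split the names on commas; the type parser handles the colons.
        List<String> names = Arrays.asList(props.getProperty("columns").split(","));
        List<TypeInfo> types = TypeInfoUtils.getTypeInfosFromTypeString(
            props.getProperty("columns.types"));
        TypeInfo rowType = TypeInfoFactory.getStructTypeInfo(names, types);
        // Prints struct<str:string,str2:string>, matching the assertion above.
        System.out.println(rowType.getTypeName());
    }
}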

Example 64 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestInputOutputFormat, method testSplitElimination.

@Test
public void testSplitElimination() throws Exception {
    Properties properties = new Properties();
    properties.setProperty("columns", "z,r");
    properties.setProperty("columns.types", "int:struct<x:int,y:int>");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    AbstractSerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    conf.setInt("mapred.max.split.size", 50);
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    SearchArgument sarg = SearchArgumentFactory.newBuilder().startAnd().lessThan("z", PredicateLeaf.Type.LONG, new Long(0)).end().build();
    conf.set("sarg.pushdown", toKryo(sarg));
    conf.set("hive.io.file.readcolumn.names", "z,r");
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(0, splits.length);
}
Also used: SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument), Properties (java.util.Properties), AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe), RecordWriter (org.apache.hadoop.mapred.RecordWriter), InputSplit (org.apache.hadoop.mapred.InputSplit), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
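
The SearchArgument does the elimination work here: the builder assembles a predicate tree that split generation checks against stripe-level column statistics, and since none of the written values are negative, the z < 0 predicate can never match and zero splits survive. A minimal sketch of the same builder API with a slightly richer, hypothetical predicate (the "name" column does not exist in the test data):

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

public class SargSketch {
    public static void main(String[] args) {
        // (z < 0) AND (name IS NOT NULL)
        SearchArgument sarg = SearchArgumentFactory.newBuilder()
            .startAnd()
                .lessThan("z", PredicateLeaf.Type.LONG, 0L)
                .startNot()
                    .isNull("name", PredicateLeaf.Type.STRING)
                .end()
            .end()
            .build();
        // Prints a textual rendering of the predicate tree.
        System.out.println(sarg);
    }
}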

Example 65 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestOrcFileStripeMergeRecordReader, method createOrcFile.

private void createOrcFile(int stripSize, int numberOfRows) throws IOException {
    ObjectInspector inspector;
    synchronized (TestOrcFileStripeMergeRecordReader.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(StringIntIntIntRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(tmpPath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(stripSize).compress(CompressionKind.ZLIB).bufferSize(5000).rowIndexStride(1000));
    Random rand = new Random(157);
    for (int i = 0; i < numberOfRows; i++) {
        writer.addRow(new StringIntIntIntRow(Integer.toBinaryString(i), rand.nextInt(), rand.nextInt(), rand.nextInt()));
    }
    writer.close();
}
Also used: ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), Random (java.util.Random)
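
Since the helper exists to control stripe layout for the stripe-merge record reader, a small companion sketch (assuming a path and Configuration like the tmpPath and conf above; the class and method names are hypothetical) that reads the stripe boundaries back:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.orc.StripeInformation;

public class StripeDumpSketch {
    // Prints the offset, length, and row count of every stripe in the file.
    static void dumpStripes(Path path, Configuration conf) throws IOException {
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        for (StripeInformation stripe : reader.getStripes()) {
            System.out.println("offset=" + stripe.getOffset()
                + " length=" + stripe.getLength()
                + " rows=" + stripe.getNumberOfRows());
        }
    }
}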

Aggregations

StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 73 uses
Test (org.junit.Test): 64 uses
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 60 uses
Configuration (org.apache.hadoop.conf.Configuration): 25 uses
StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector): 25 uses
InputSplit (org.apache.hadoop.mapred.InputSplit): 25 uses
BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector): 24 uses
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector): 23 uses
Properties (java.util.Properties): 20 uses
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector): 20 uses
ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector): 18 uses
BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector): 18 uses
ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector): 18 uses
DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector): 18 uses
FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector): 18 uses
HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector): 18 uses
LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector): 18 uses
ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector): 18 uses
TimestampObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector): 18 uses
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 18 uses