
Example 1 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestInputOutputFormat, method testVectorReaderNoFooterSerialize.

@Test
public void testVectorReaderNoFooterSerialize() throws Exception {
    MockFileSystem fs = new MockFileSystem(conf);
    MockPath mockPath = new MockPath(fs, "mock:///mocktable3");
    conf.set("hive.orc.splits.include.file.footer", "false");
    conf.set("mapred.input.dir", mockPath.toString());
    conf.set("fs.defaultFS", "mock:///");
    conf.set("fs.mock.impl", MockFileSystem.class.getName());
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "mocktable3", inspector, true, 0);
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    int readOpsBefore = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsBefore = statistics.getReadOps();
        }
    }
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
    for (InputSplit split : splits) {
        assertTrue("OrcSplit is expected", split instanceof OrcSplit);
        // ETL strategies will have start=3 (start of first stripe)
        assertTrue(split.toString().contains("start=3"));
        assertTrue(split.toString().contains("hasFooter=false"));
        assertTrue(split.toString().contains("hasBase=true"));
        assertTrue(split.toString().contains("deltas=0"));
        if (split instanceof OrcSplit) {
            assertFalse("No footer serialize test for vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
        }
        orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL);
    }
    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsDelta = statistics.getReadOps() - readOpsBefore;
        }
    }
    // call-1: open to read footer - split 1 => mock:/mocktable3/0_0
    // call-2: open to read data - split 1 => mock:/mocktable3/0_0
    // call-3: open to read footer - split 2 => mock:/mocktable3/0_1
    // call-4: open to read data - split 2 => mock:/mocktable3/0_1
    assertEquals(4, readOpsDelta);
    // revert back to local fs
    conf.set("fs.defaultFS", "file:///");
}
Also used: Path (org.apache.hadoop.fs.Path), FileSystem (org.apache.hadoop.fs.FileSystem), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), RecordWriter (org.apache.hadoop.mapred.RecordWriter), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
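
The pattern above repeats in every example: acquire a StructObjectInspector for a plain Java class, then hand it to the ORC writer. As a minimal, hedged sketch of just that API (the ReflectionInspectorSketch and SimpleRow classes below are hypothetical, not part of the Hive tests), getReflectionObjectInspector builds a struct inspector from the class's fields via reflection, and the inspector can then read those fields generically:

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;

public class ReflectionInspectorSketch {

    // Hypothetical row class; the factory maps its fields to struct fields by reflection.
    public static class SimpleRow {
        public int x;
        public int y;
        public SimpleRow(int x, int y) { this.x = x; this.y = y; }
    }

    public static void main(String[] args) {
        // The Hive tests synchronize on a shared class before calling the factory,
        // because the returned inspectors are cached; a standalone caller can do the same.
        StructObjectInspector inspector;
        synchronized (ReflectionInspectorSketch.class) {
            inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
                SimpleRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        }
        SimpleRow row = new SimpleRow(1, 2);
        // Read each field generically through the inspector instead of touching the POJO directly.
        for (StructField field : inspector.getAllStructFieldRefs()) {
            Object data = inspector.getStructFieldData(row, field);
            int value = ((IntObjectInspector) field.getFieldObjectInspector()).get(data);
            System.out.println(field.getFieldName() + " = " + value);
        }
    }
}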

Example 2 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestInputOutputFormat, method testMROutput.

@Test
public void testMROutput() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    AbstractSerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "z,r");
    properties.setProperty("columns.types", "int:struct<x:int,y:int>");
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(1));
    conf.set("columns", "z,r");
    conf.set("columns.types", "int:struct<x:int,y:int>");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector();
    List<? extends StructField> inFields = inner.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
    while (reader.next(key, value)) {
        assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
        Object sub = inspector.getStructFieldData(value, fields.get(1));
        assertEquals(3 * rowNum + 1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0))));
        assertEquals(3 * rowNum + 2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1))));
        rowNum += 1;
    }
    assertEquals(3, rowNum);
    reader.close();
}
Also used: IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector), Properties (java.util.Properties), AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe), RecordWriter (org.apache.hadoop.mapred.RecordWriter), InputSplit (org.apache.hadoop.mapred.InputSplit), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
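
Example 2 relies on getReflectionObjectInspector recursing into reference-typed fields, so a class that holds another class yields a struct-of-struct inspector (the test's NestedRow corresponds to int:struct<x:int,y:int>). Below is a small hedged sketch of walking such a nested inspector; Outer and Inner are hypothetical classes, not the test's NestedRow:

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;

public class NestedInspectorSketch {

    // Hypothetical nested POJOs used only for this sketch.
    public static class Inner {
        public int x;
        public int y;
        public Inner(int x, int y) { this.x = x; this.y = y; }
    }

    public static class Outer {
        public int z;
        public Inner r;
        public Outer(int z, Inner r) { this.z = z; this.r = r; }
    }

    public static void main(String[] args) {
        StructObjectInspector outer = (StructObjectInspector)
            ObjectInspectorFactory.getReflectionObjectInspector(
                Outer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
        // The field "r" is itself a struct, so its field inspector is another StructObjectInspector.
        StructField rField = outer.getStructFieldRef("r");
        StructObjectInspector inner = (StructObjectInspector) rField.getFieldObjectInspector();

        Outer row = new Outer(1, new Inner(2, 3));
        Object innerData = outer.getStructFieldData(row, rField);
        StructField xField = inner.getStructFieldRef("x");
        IntObjectInspector intOI = (IntObjectInspector) xField.getFieldObjectInspector();
        System.out.println("r.x = " + intOI.get(inner.getStructFieldData(innerData, xField)));
    }
}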

Example 3 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestInputOutputFormat, method testACIDReaderNoFooterSerialize.

@Test
public void testACIDReaderNoFooterSerialize() throws Exception {
    MockFileSystem fs = new MockFileSystem(conf);
    MockPath mockPath = new MockPath(fs, "mock:///mocktable5");
    conf.set("hive.transactional.table.scan", "true");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    conf.set("hive.orc.splits.include.file.footer", "false");
    conf.set("mapred.input.dir", mockPath.toString());
    conf.set("fs.defaultFS", "mock:///");
    conf.set("fs.mock.impl", MockFileSystem.class.getName());
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    int readOpsBefore = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsBefore = statistics.getReadOps();
        }
    }
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
    for (InputSplit split : splits) {
        assertTrue("OrcSplit is expected", split instanceof OrcSplit);
        // ETL strategies will have start=3 (start of first stripe)
        assertTrue(split.toString().contains("start=3"));
        assertTrue(split.toString().contains("hasFooter=false"));
        assertTrue(split.toString().contains("hasBase=true"));
        assertTrue(split.toString().contains("deltas=0"));
        if (split instanceof OrcSplit) {
            assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
        }
        orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
    }
    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsDelta = statistics.getReadOps() - readOpsBefore;
        }
    }
    // call-1: open to read footer - split 1 => mock:/mocktable5/0_0
    // call-2: open to read data - split 1 => mock:/mocktable5/0_0
    // call-3: open to read footer - split 2 => mock:/mocktable5/0_1
    // call-4: open to read data - split 2 => mock:/mocktable5/0_1
    assertEquals(4, readOpsDelta);
    // revert back to local fs
    conf.set("fs.defaultFS", "file:///");
}
Also used: Path (org.apache.hadoop.fs.Path), FileSystem (org.apache.hadoop.fs.FileSystem), InputSplit (org.apache.hadoop.mapred.InputSplit), RecordWriter (org.apache.hadoop.mapred.RecordWriter), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
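
Both NoFooterSerialize tests measure I/O the same way: read the per-scheme counters from FileSystem.getAllStatistics() before and after the operation and assert on the delta of getReadOps(). The following stripped-down sketch isolates that measurement pattern; doSomeReads() is a placeholder for whatever is being measured:

import org.apache.hadoop.fs.FileSystem;

public class ReadOpsDeltaSketch {

    // Returns the read-op counter for the given filesystem scheme, or -1 if no
    // statistics have been recorded for that scheme yet.
    static int readOps(String scheme) {
        int ops = -1;
        for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
            if (statistics.getScheme().equalsIgnoreCase(scheme)) {
                ops = statistics.getReadOps();
            }
        }
        return ops;
    }

    public static void main(String[] args) {
        int before = readOps("mock");
        // doSomeReads();  // placeholder for the code under measurement
        int after = readOps("mock");
        System.out.println("read ops delta = " + (after - before));
    }
}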

Example 4 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class TestInputOutputFormat, method testRowNumberUniquenessInDifferentSplits.

/**
   * also see {@link TestOrcFile#testPredicatePushdown()}
   * This tests that {@link RecordReader#getRowNumber()} works with multiple splits
   * @throws Exception
   */
@Test
public void testRowNumberUniquenessInDifferentSplits() throws Exception {
    Properties properties = new Properties();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // Save the conf variable values so that they can be restored later.
    long oldDefaultStripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getHiveConfName(), -1L);
    long oldMaxSplitSize = conf.getLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, -1L);
    // Set the conf variable values for this test.
    // 10000 bytes per stripe
    long newStripeSize = 10000L;
    // 100 bytes per split
    long newMaxSplitSize = 100L;
    conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), newStripeSize);
    conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, newMaxSplitSize);
    AbstractSerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL);
    // The following loop should create 20 stripes in the orc file.
    for (int i = 0; i < newStripeSize * 10; ++i) {
        writer.write(serde.serialize(new MyRow(i, i + 1), inspector));
    }
    writer.close(true);
    serde = new OrcSerde();
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    int numExpectedSplits = 20;
    InputSplit[] splits = in.getSplits(conf, numExpectedSplits);
    assertEquals(numExpectedSplits, splits.length);
    for (int i = 0; i < numExpectedSplits; ++i) {
        OrcSplit split = (OrcSplit) splits[i];
        Reader.Options orcReaderOptions = new Reader.Options();
        orcReaderOptions.range(split.getStart(), split.getLength());
        OrcFile.ReaderOptions qlReaderOptions = OrcFile.readerOptions(conf).maxLength(split.getFileLength());
        Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions);
        RecordReader recordReader = reader.rowsOptions(orcReaderOptions);
        for (int j = 0; recordReader.hasNext(); j++) {
            long rowNum = (i * 5000) + j;
            long rowNumActual = recordReader.getRowNumber();
            assertEquals("rowNum=" + rowNum, rowNum, rowNumActual);
            Object row = recordReader.next(null);
        }
        recordReader.close();
    }
    // Reset the conf variable values that we changed for this test.
    if (oldDefaultStripeSize != -1L) {
        conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), oldDefaultStripeSize);
    } else {
        // this means that nothing was set for default stripe size previously, so we should unset it.
        conf.unset(OrcConf.STRIPE_SIZE.getHiveConfName());
    }
    if (oldMaxSplitSize != -1L) {
        conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, oldMaxSplitSize);
    } else {
        // this means that nothing was set for max split size previously, so we should unset it.
        conf.unset(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname);
    }
}
Also used: Properties (java.util.Properties), AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe), InputSplit (org.apache.hadoop.mapred.InputSplit), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
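
The heart of Example 4 is the older org.apache.hadoop.hive.ql.io.orc read path: open a Reader, restrict it to a byte range with Reader.Options.range (the test uses each split's start and length), and track absolute positions with RecordReader.getRowNumber(). Below is a condensed, hedged sketch of that read path; the file path is a placeholder and the range here simply covers the whole file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;

public class RowNumberSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.orc");  // placeholder path
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        // Restrict the reader to a byte range, as the test does per split; here, the whole file.
        RecordReader rows = reader.rowsOptions(new Reader.Options().range(0, Long.MAX_VALUE));
        Object row = null;
        while (rows.hasNext()) {
            // getRowNumber() reports the absolute row index within the file, which is what
            // the test asserts stays unique and ordered across splits.
            long rowNumber = rows.getRowNumber();
            row = rows.next(row);
            System.out.println("row " + rowNumber + ": " + row);
        }
        rows.close();
    }
}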

Example 5 with ObjectInspectorFactory.getReflectionObjectInspector

Use of org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector in project hive by apache.

From the class OrcFileGenerator, method generateOrcFile.

/**
   * Generates an orc file based on the provided record class in the specified file system
   * at the output path.
   *
   * @param conf the configuration used to initialize the orc writer
   * @param fs the file system that will contain the generated orc file
   * @param outputPath the path where the generated orc will be placed
   * @param recordClass a class that defines the record format for the generated orc file; this
   * class must have exactly one constructor.
   */
public static void generateOrcFile(Configuration conf, FileSystem fs, Path outputPath, Class recordClass) throws IOException, InstantiationException, IllegalAccessException, InvocationTargetException {
    ObjectInspector inspector;
    synchronized (TestVectorizedORCReader.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(recordClass, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(fs, outputPath, conf, inspector, 100000, CompressionKind.ZLIB, 10000, 10000);
    try {
        Constructor[] constructors = recordClass.getConstructors();
        if (constructors.length != 1) {
            throw new UnsupportedOperationException("The provided recordClass must have exactly one constructor.");
        }
        BatchDataDistribution[] dataDist = BatchDataDistribution.values();
        Class[] columns = constructors[0].getParameterTypes();
        for (int i = 0; i < dataDist.length * 3; i++) {
            Object[][] rows = new Object[columns.length][VectorizedRowBatch.DEFAULT_SIZE];
            for (int c = 0; c < columns.length; c++) {
                if (!TYPE_TO_BATCH_GEN_MAP.containsKey(columns[c])) {
                    throw new UnsupportedOperationException("No batch generator defined for type " + columns[c].getName());
                }
                rows[c] = TYPE_TO_BATCH_GEN_MAP.get(columns[c]).generateBatch(dataDist[(i + c) % dataDist.length]);
            }
            for (int r = 0; r < VectorizedRowBatch.DEFAULT_SIZE; r++) {
                Object[] row = new Object[columns.length];
                for (int c = 0; c < columns.length; c++) {
                    row[c] = rows[c][r];
                }
                writer.addRow(constructors[0].newInstance(row));
            }
        }
    } finally {
        writer.close();
    }
}
Also used: ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), Constructor (java.lang.reflect.Constructor), TestVectorizedORCReader (org.apache.hadoop.hive.ql.io.orc.TestVectorizedORCReader), Writer (org.apache.hadoop.hive.ql.io.orc.Writer)
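
A hedged usage sketch for generateOrcFile follows. Everything in it is hypothetical: the caller class, the SampleRecord class, and the assumption that Long is among the constructor parameter types covered by OrcFileGenerator's internal TYPE_TO_BATCH_GEN_MAP. The sketch also assumes it is compiled in the same package as OrcFileGenerator, so no extra import is needed for it:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GenerateOrcFileSketch {

    // Hypothetical record class: exactly one constructor, whose parameter types must all
    // have batch generators registered in OrcFileGenerator's TYPE_TO_BATCH_GEN_MAP.
    public static class SampleRecord {
        public Long col1;
        public SampleRecord(Long col1) { this.col1 = col1; }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        Path out = new Path("/tmp/generated.orc");  // placeholder output path
        OrcFileGenerator.generateOrcFile(conf, fs, out, SampleRecord.class);
    }
}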

Aggregations

StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 72 usages
Test (org.junit.Test): 63 usages
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 59 usages
Path (org.apache.hadoop.fs.Path): 26 usages
StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector): 25 usages
InputSplit (org.apache.hadoop.mapred.InputSplit): 25 usages
Configuration (org.apache.hadoop.conf.Configuration): 24 usages
BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector): 24 usages
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector): 23 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 21 usages
Properties (java.util.Properties): 20 usages
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector): 20 usages
ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector): 18 usages
BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector): 18 usages
ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector): 18 usages
DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector): 18 usages
FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector): 18 usages
HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector): 18 usages
LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector): 18 usages
ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector): 18 usages