Example 66 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class TestInputOutputFormat, method testCombinationInputFormat.

// test non-vectorized, non-acid, combine
@Test
public void testCombinationInputFormat() throws Exception {
    // get the object inspector for MyRow
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "combination", inspector, false, 1);
    // write the orc file to the mock file system
    Path partDir = new Path(conf.get("mapred.input.dir"));
    Writer writer = OrcFile.createWriter(new Path(partDir, "0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    Path path = new Path("mock:/combination/p=0/0_0");
    setBlocks(path, conf, new MockBlock("host0", "host1"));
    MockFileSystem mockFs = (MockFileSystem) partDir.getFileSystem(conf);
    int length0 = getLength(path, conf);
    writer = OrcFile.createWriter(new Path(partDir, "1_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 10; i < 20; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    Path path1 = new Path("mock:/combination/p=0/1_0");
    setBlocks(path1, conf, new MockBlock("host1", "host2"));
    // call getsplits
    HiveInputFormat<?, ?> inputFormat = new CombineHiveInputFormat<WritableComparable, Writable>();
    InputSplit[] splits = inputFormat.getSplits(conf, 1);
    assertEquals(1, splits.length);
    CombineHiveInputFormat.CombineHiveInputSplit split = (CombineHiveInputFormat.CombineHiveInputSplit) splits[0];
    // check split
    assertEquals(2, split.getNumPaths());
    assertEquals(partDir.toString() + "/0_0", split.getPath(0).toString());
    assertEquals(partDir.toString() + "/1_0", split.getPath(1).toString());
    assertEquals(length0, split.getLength(0));
    assertEquals(getLength(path1, conf), split.getLength(1));
    assertEquals(0, split.getOffset(0));
    assertEquals(0, split.getOffset(1));
    // hadoop-1 gets 3 and hadoop-2 gets 0. *sigh*
    // best answer would be 1.
    assertTrue(3 >= split.getLocations().length);
    // read split
    org.apache.hadoop.mapred.RecordReader<CombineHiveKey, OrcStruct> reader = inputFormat.getRecordReader(split, conf, Reporter.NULL);
    CombineHiveKey key = reader.createKey();
    OrcStruct value = reader.createValue();
    for (int i = 0; i < 20; i++) {
        assertEquals(true, reader.next(key, value));
        assertEquals(i, ((IntWritable) value.getFieldValue(0)).get());
    }
    assertEquals(false, reader.next(key, value));
}
Also used : CombineHiveInputFormat(org.apache.hadoop.hive.ql.io.CombineHiveInputFormat) CombineHiveKey(org.apache.hadoop.hive.shims.CombineHiveKey) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) RecordWriter(org.apache.hadoop.mapred.RecordWriter) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
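
All of these examples follow the same classic mapred contract: ask the InputFormat for splits, then open a RecordReader per split and iterate with createKey/createValue and next(). As a minimal, self-contained sketch of that loop (illustrative class and method names, not taken from the Hive test):

import java.io.IOException;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class SplitScan {

    // Reads every record of every split produced by a mapred InputFormat and
    // returns the total record count; the test above unrolls this loop by hand
    // for its single combined split and 20 expected rows.
    static <K, V> long countRecords(InputFormat<K, V> format, JobConf conf, int numSplitsHint) throws IOException {
        long count = 0;
        for (InputSplit split : format.getSplits(conf, numSplitsHint)) {
            RecordReader<K, V> reader = format.getRecordReader(split, conf, Reporter.NULL);
            K key = reader.createKey();
            V value = reader.createValue();
            try {
                while (reader.next(key, value)) {
                    count++;
                }
            } finally {
                reader.close();
            }
        }
        return count;
    }
}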

Example 67 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class TestInputOutputFormat, method testRowNumberUniquenessInDifferentSplits.

/**
 * also see {@link TestOrcFile#testPredicatePushdown()}
 * This tests that {@link RecordReader#getRowNumber()} works with multiple splits
 * @throws Exception
 */
@Test
public void testRowNumberUniquenessInDifferentSplits() throws Exception {
    Properties properties = new Properties();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // Save the conf variable values so that they can be restored later.
    long oldDefaultStripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getHiveConfName(), -1L);
    long oldMaxSplitSize = conf.getLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, -1L);
    // Set the conf variable values for this test.
    // 10000 bytes per stripe
    long newStripeSize = 10000L;
    // 100 bytes per split
    long newMaxSplitSize = 100L;
    conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), newStripeSize);
    conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, newMaxSplitSize);
    AbstractSerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL);
    // The following loop should create 20 stripes in the orc file.
    for (int i = 0; i < newStripeSize * 10; ++i) {
        writer.write(serde.serialize(new MyRow(i, i + 1), inspector));
    }
    writer.close(true);
    serde = new OrcSerde();
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    int numExpectedSplits = 20;
    InputSplit[] splits = in.getSplits(conf, numExpectedSplits);
    assertEquals(numExpectedSplits, splits.length);
    for (int i = 0; i < numExpectedSplits; ++i) {
        OrcSplit split = (OrcSplit) splits[i];
        Reader.Options orcReaderOptions = new Reader.Options();
        orcReaderOptions.range(split.getStart(), split.getLength());
        OrcFile.ReaderOptions qlReaderOptions = OrcFile.readerOptions(conf).maxLength(split.getFileLength());
        Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions);
        RecordReader recordReader = reader.rowsOptions(orcReaderOptions);
        for (int j = 0; recordReader.hasNext(); j++) {
            long rowNum = (i * 5000) + j;
            long rowNumActual = recordReader.getRowNumber();
            assertEquals("rowNum=" + rowNum, rowNum, rowNumActual);
            Object row = recordReader.next(null);
        }
        recordReader.close();
    }
    // Reset the conf variable values that we changed for this test.
    if (oldDefaultStripeSize != -1L) {
        conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), oldDefaultStripeSize);
    } else {
        // this means that nothing was set for default stripe size previously, so we should unset it.
        conf.unset(OrcConf.STRIPE_SIZE.getHiveConfName());
    }
    if (oldMaxSplitSize != -1L) {
        conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, oldMaxSplitSize);
    } else {
        // this means that nothing was set for the max split size previously, so we should unset it.
        conf.unset(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname);
    }
}
Also used : Properties(java.util.Properties) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) InputSplit(org.apache.hadoop.mapred.InputSplit) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
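
The property under test is that getRowNumber() returns file-absolute row positions, so numbers never collide across splits of the same file. A compact way to state that property directly, continuing from the splits array above (a sketch using the same Reader/RecordReader API as the test, plus java.util.HashSet; not part of the original test):

// Collect every getRowNumber() value across all splits and assert that they
// are globally unique, i.e. row numbers are file-absolute rather than
// relative to the split's byte range.
Set<Long> seenRowNumbers = new HashSet<>();
for (InputSplit s : splits) {
    OrcSplit orcSplit = (OrcSplit) s;
    Reader r = OrcFile.createReader(orcSplit.getPath(), OrcFile.readerOptions(conf).maxLength(orcSplit.getFileLength()));
    RecordReader rows = r.rowsOptions(new Reader.Options().range(orcSplit.getStart(), orcSplit.getLength()));
    while (rows.hasNext()) {
        assertTrue("duplicate row number across splits", seenRowNumbers.add(rows.getRowNumber()));
        rows.next(null);
    }
    rows.close();
}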

Example 68 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class TestInputOutputFormat, method testVectorizationWithBuckets.

/**
 * Test vectorization with bucketed input, non-acid, non-combine.
 * @throws Exception
 */
@Test
public void testVectorizationWithBuckets() throws Exception {
    // get the object inspector for MyRow
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "vectorBuckets", inspector, true, 1);
    // write the orc file to the mock file system
    Path path = new Path(conf.get("mapred.input.dir") + "/0_0");
    Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    setBlocks(path, conf, new MockBlock("host0", "host1"));
    // call getsplits
    conf.setInt(hive_metastoreConstants.BUCKET_COUNT, 3);
    HiveInputFormat<?, ?> inputFormat = new HiveInputFormat<WritableComparable, Writable>();
    InputSplit[] splits = inputFormat.getSplits(conf, 10);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader = inputFormat.getRecordReader(splits[0], conf, Reporter.NULL);
    NullWritable key = reader.createKey();
    VectorizedRowBatch value = reader.createValue();
    assertEquals(true, reader.next(key, value));
    assertEquals(10, value.count());
    LongColumnVector col0 = (LongColumnVector) value.cols[0];
    for (int i = 0; i < 10; i++) {
        assertEquals("checking " + i, i, col0.vector[i]);
    }
    assertEquals(false, reader.next(key, value));
}
Also used : NullWritable(org.apache.hadoop.io.NullWritable) CombineHiveInputFormat(org.apache.hadoop.hive.ql.io.CombineHiveInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) RecordWriter(org.apache.hadoop.mapred.RecordWriter) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
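
The assertion loop above can index col0.vector directly because the batch is dense (no selected vector) and has no nulls. A more general consumer has to honor selectedInUse, isRepeating and the null mask; a minimal sketch of such a helper (illustrative, not part of the test):

// Sum column 0 of a VectorizedRowBatch while honoring selectedInUse,
// isRepeating and the null mask.
static long sumColumn0(VectorizedRowBatch batch) {
    LongColumnVector col = (LongColumnVector) batch.cols[0];
    long sum = 0;
    for (int i = 0; i < batch.size; i++) {
        int row = batch.selectedInUse ? batch.selected[i] : i;
        int valueRow = col.isRepeating ? 0 : row;
        if (col.noNulls || !col.isNull[valueRow]) {
            sum += col.vector[valueRow];
        }
    }
    return sum;
}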

Example 69 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class TestInputOutputFormat, method testCombinationInputFormatWithAcid.

// test non-vectorized, acid, combine
@Test
public void testCombinationInputFormatWithAcid() throws Exception {
    // get the object inspector for MyRow
    StructObjectInspector inspector;
    final int PARTITIONS = 2;
    final int BUCKETS = 3;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "combinationAcid", inspector, false, PARTITIONS);
    // write the orc file to the mock file system
    Path[] partDir = new Path[PARTITIONS];
    String[] paths = conf.getStrings("mapred.input.dir");
    for (int p = 0; p < PARTITIONS; ++p) {
        partDir[p] = new Path(paths[p]);
    }
    // write a base file in partition 0
    OrcRecordUpdater writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf).maximumWriteId(10).writingBase(true).bucket(0).inspector(inspector).finalDestination(partDir[0]));
    for (int i = 0; i < 10; ++i) {
        writer.insert(10, new MyRow(i, 2 * i));
    }
    writer.close(false);
    // base file
    Path base0 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00000");
    setBlocks(base0, conf, new MockBlock("host1", "host2"));
    // write a second base file (bucket 1) in partition 0
    writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf).maximumWriteId(10).writingBase(true).bucket(1).inspector(inspector).finalDestination(partDir[0]));
    for (int i = 10; i < 20; ++i) {
        writer.insert(10, new MyRow(i, 2 * i));
    }
    writer.close(false);
    Path base1 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00001");
    setBlocks(base1, conf, new MockBlock("host1", "host2"));
    // write three files in partition 1
    for (int bucket = 0; bucket < BUCKETS; ++bucket) {
        Path path = new Path(partDir[1], "00000" + bucket + "_0");
        Writer orc = OrcFile.createWriter(path, OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
        orc.addRow(new MyRow(1, 2));
        orc.close();
        setBlocks(path, conf, new MockBlock("host3", "host4"));
    }
    // call getsplits
    conf.setInt(hive_metastoreConstants.BUCKET_COUNT, BUCKETS);
    HiveInputFormat<?, ?> inputFormat = new CombineHiveInputFormat<WritableComparable, Writable>();
    InputSplit[] splits = inputFormat.getSplits(conf, 1);
    assertEquals(3, splits.length);
    HiveInputFormat.HiveInputSplit split = (HiveInputFormat.HiveInputSplit) splits[0];
    assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", split.inputFormatClassName());
    assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00000", split.getPath().toString());
    assertEquals(0, split.getStart());
    assertEquals(679, split.getLength());
    split = (HiveInputFormat.HiveInputSplit) splits[1];
    assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", split.inputFormatClassName());
    assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00001", split.getPath().toString());
    assertEquals(0, split.getStart());
    assertEquals(705, split.getLength());
    CombineHiveInputFormat.CombineHiveInputSplit combineSplit = (CombineHiveInputFormat.CombineHiveInputSplit) splits[2];
    assertEquals(BUCKETS, combineSplit.getNumPaths());
    for (int bucket = 0; bucket < BUCKETS; ++bucket) {
        assertEquals("mock:/combinationAcid/p=1/00000" + bucket + "_0", combineSplit.getPath(bucket).toString());
        assertEquals(0, combineSplit.getOffset(bucket));
        assertEquals(241, combineSplit.getLength(bucket));
    }
    String[] hosts = combineSplit.getLocations();
    assertEquals(2, hosts.length);
}
Also used : CombineHiveInputFormat(org.apache.hadoop.hive.ql.io.CombineHiveInputFormat) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) RecordWriter(org.apache.hadoop.mapred.RecordWriter) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
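
Both OrcRecordUpdater calls above write base files (writingBase(true)) for buckets 0 and 1. For contrast, writing a true delta for the same partition would look roughly like this (a sketch against the same AcidOutputFormat.Options builder; the write id 11 and the row values are made-up illustrative values, and the exact delta directory name also depends on the statement id):

// Write a delta instead of a base, producing a delta_0000011_0000011-style
// directory next to the base_0000010 files created above.
OrcRecordUpdater deltaWriter = new OrcRecordUpdater(partDir[0],
    new AcidOutputFormat.Options(conf)
        .minimumWriteId(11).maximumWriteId(11)
        .writingBase(false)
        .bucket(1)
        .inspector(inspector)
        .finalDestination(partDir[0]));
deltaWriter.insert(11, new MyRow(20, 40));
deltaWriter.close(false);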

Example 70 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

The class TestInputOutputFormat, method testVectorReaderNoFooterSerialize.

@Test
public void testVectorReaderNoFooterSerialize() throws Exception {
    MockFileSystem fs = new MockFileSystem(conf);
    MockPath mockPath = new MockPath(fs, "mock:///mocktable3");
    conf.set("hive.orc.splits.include.file.footer", "false");
    conf.set("mapred.input.dir", mockPath.toString());
    conf.set("fs.defaultFS", "mock:///");
    conf.set("fs.mock.impl", MockFileSystem.class.getName());
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "mocktable3", inspector, true, 0);
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    int readOpsBefore = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsBefore = statistics.getReadOps();
        }
    }
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
    for (InputSplit split : splits) {
        assertTrue("OrcSplit is expected", split instanceof OrcSplit);
        // ETL strategies will have start=3 (start of first stripe)
        assertTrue(split.toString().contains("start=3"));
        assertTrue(split.toString().contains("hasFooter=false"));
        assertTrue(split.toString().contains("hasBase=true"));
        assertTrue(split.toString().contains("deltas=0"));
        if (split instanceof OrcSplit) {
            assertFalse("No footer serialize test for vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
        }
        orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL);
    }
    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsDelta = statistics.getReadOps() - readOpsBefore;
        }
    }
    // call-1: open to read footer - split 1 => mock:/mocktable3/0_0
    // call-2: open to read data - split 1 => mock:/mocktable3/0_0
    // call-3: open to read footer - split 2 => mock:/mocktable3/0_1
    // call-4: open to read data - split 2 => mock:/mocktable3/0_1
    assertEquals(4, readOpsDelta);
    // revert back to local fs
    conf.set("fs.defaultFS", "file:///");
}
Also used : JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) RecordWriter(org.apache.hadoop.mapred.RecordWriter) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
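
The four extra read operations break down as one footer read plus one data read per file, because hive.orc.splits.include.file.footer is false. As a contrast (a sketch, not part of the test), enabling the flag asks OrcInputFormat to serialize the ORC tail into each split, which should drop the separate footer-open per file:

// With footer serialization enabled, each OrcSplit is expected to carry the
// file footer itself, so getRecordReader should not need to re-open the file
// just to read the tail.
conf.set("hive.orc.splits.include.file.footer", "true");
InputSplit[] footerSplits = new OrcInputFormat().getSplits(conf, 2);
for (InputSplit s : footerSplits) {
    assertTrue(((OrcSplit) s).hasFooter());
}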

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit) 161
Path (org.apache.hadoop.fs.Path) 57
JobConf (org.apache.hadoop.mapred.JobConf) 56
Test (org.junit.Test) 49
IOException (java.io.IOException) 47
ArrayList (java.util.ArrayList) 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 27
FileSplit (org.apache.hadoop.mapred.FileSplit) 24
FileSystem (org.apache.hadoop.fs.FileSystem) 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat) 21
InputFormat (org.apache.hadoop.mapred.InputFormat) 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter) 19
NullWritable (org.apache.hadoop.io.NullWritable) 18
Text (org.apache.hadoop.io.Text) 18
Configuration (org.apache.hadoop.conf.Configuration) 14
LongWritable (org.apache.hadoop.io.LongWritable) 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat) 10
Properties (java.util.Properties) 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint) 9
HashMap (java.util.HashMap) 8