
Example 61 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class TestSymlinkTextInputFormat method testCombine.

/**
 * Test combining symlink text input files. Two input directories, each
 * containing one file, plus one symlink file that references both files.
 * Without combining, this would normally produce at least two splits.
 */
public void testCombine() throws Exception {
    JobConf newJob = new JobConf(job);
    FileSystem fs = dataDir1.getFileSystem(newJob);
    int symbolLinkedFileSize = 0;
    Path dir1_file1 = new Path(dataDir1, "combinefile1_1");
    writeTextFile(dir1_file1, "dir1_file1_line1\n" + "dir1_file1_line2\n");
    symbolLinkedFileSize += fs.getFileStatus(dir1_file1).getLen();
    Path dir2_file1 = new Path(dataDir2, "combinefile2_1");
    writeTextFile(dir2_file1, "dir2_file1_line1\n" + "dir2_file1_line2\n");
    symbolLinkedFileSize += fs.getFileStatus(dir2_file1).getLen();
    // A symlink file referencing one file from each of the two data
    // directories (combinefile1_1 and combinefile2_1).
    writeSymlinkFile(new Path(symlinkDir, "symlink_file"), new Path(dataDir1, "combinefile1_1"), new Path(dataDir2, "combinefile2_1"));
    HiveConf hiveConf = new HiveConf(TestSymlinkTextInputFormat.class);
    hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
    HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVE_REWORK_MAPREDWORK, true);
    HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, false);
    Driver drv = new Driver(hiveConf);
    String tblName = "text_symlink_text";
    String createSymlinkTableCmd = "create table " + tblName + " (key int) stored as " + " inputformat 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' " + " outputformat 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'";
    SessionState.start(hiveConf);
    boolean tblCreated = false;
    try {
        int ecode = 0;
        ecode = drv.run(createSymlinkTableCmd).getResponseCode();
        if (ecode != 0) {
            throw new Exception("Create table command: " + createSymlinkTableCmd + " failed with exit code= " + ecode);
        }
        tblCreated = true;
        String loadFileCommand = "LOAD DATA LOCAL INPATH '" + new Path(symlinkDir, "symlink_file").toString() + "' INTO TABLE " + tblName;
        ecode = drv.run(loadFileCommand).getResponseCode();
        if (ecode != 0) {
            throw new Exception("Load data command: " + loadFileCommand + " failed with exit code= " + ecode);
        }
        String cmd = "select key*1 from " + tblName;
        ecode = drv.compile(cmd);
        if (ecode != 0) {
            throw new Exception("Select compile: " + cmd + " failed with exit code= " + ecode);
        }
        // create scratch dir
        Context ctx = new Context(newJob);
        Path emptyScratchDir = ctx.getMRTmpPath();
        FileSystem fileSys = emptyScratchDir.getFileSystem(newJob);
        fileSys.mkdirs(emptyScratchDir);
        QueryPlan plan = drv.getPlan();
        MapRedTask selectTask = (MapRedTask) plan.getRootTasks().get(0);
        List<Path> inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir, ctx, false);
        Utilities.setInputPaths(newJob, inputPaths);
        Utilities.setMapRedWork(newJob, selectTask.getWork(), ctx.getMRTmpPath());
        CombineHiveInputFormat combineInputFormat = ReflectionUtils.newInstance(CombineHiveInputFormat.class, newJob);
        InputSplit[] retSplits = combineInputFormat.getSplits(newJob, 1);
        assertEquals(1, retSplits.length);
    } catch (Exception e) {
        e.printStackTrace();
        fail("Caught exception " + e);
    } finally {
        if (tblCreated) {
            drv.run("drop table text_symlink_text").getResponseCode();
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) Driver(org.apache.hadoop.hive.ql.Driver) QueryPlan(org.apache.hadoop.hive.ql.QueryPlan) IOException(java.io.IOException) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) FileSystem(org.apache.hadoop.fs.FileSystem) HiveConf(org.apache.hadoop.hive.conf.HiveConf) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
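
The writeTextFile and writeSymlinkFile helpers used above are defined elsewhere in TestSymlinkTextInputFormat and are not part of this snippet. A minimal sketch of what such helpers might look like, assuming the symlink file is simply a plain text file listing one target path per line (the format SymlinkTextInputFormat consumes); the names and details here are illustrative, not copied from the original test:

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helpers, sketched for illustration only.
public final class SymlinkTestHelpers {

    // Writes the given content to a new text file at 'path'.
    public static void writeTextFile(FileSystem fs, Path path, String content) throws IOException {
        try (OutputStreamWriter writer =
                new OutputStreamWriter(fs.create(path), StandardCharsets.UTF_8)) {
            writer.write(content);
        }
    }

    // Writes a "symlink" file: one target path per line.
    public static void writeSymlinkFile(FileSystem fs, Path symlinkPath, Path... targets) throws IOException {
        try (OutputStreamWriter writer =
                new OutputStreamWriter(fs.create(symlinkPath), StandardCharsets.UTF_8)) {
            for (Path target : targets) {
                writer.write(target.toString());
                writer.write("\n");
            }
        }
    }

    private SymlinkTestHelpers() {
    }
}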

Example 62 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class TestSymlinkTextInputFormat method testAccuracy1.

/**
 * Test scenario: two data directories and one symlink file containing two
 * paths, each pointing to a file in one of the data directories.
 */
public void testAccuracy1() throws IOException {
    // First data dir, contains 2 files.
    FileSystem fs = dataDir1.getFileSystem(job);
    int symbolLinkedFileSize = 0;
    Path dir1_file1 = new Path(dataDir1, "file1");
    writeTextFile(dir1_file1, "dir1_file1_line1\n" + "dir1_file1_line2\n");
    symbolLinkedFileSize += fs.getFileStatus(dir1_file1).getLen();
    Path dir1_file2 = new Path(dataDir1, "file2");
    writeTextFile(dir1_file2, "dir1_file2_line1\n" + "dir1_file2_line2\n");
    // Second data dir, contains 2 files.
    Path dir2_file1 = new Path(dataDir2, "file1");
    writeTextFile(dir2_file1, "dir2_file1_line1\n" + "dir2_file1_line2\n");
    Path dir2_file2 = new Path(dataDir2, "file2");
    writeTextFile(dir2_file2, "dir2_file2_line1\n" + "dir2_file2_line2\n");
    symbolLinkedFileSize += fs.getFileStatus(dir2_file2).getLen();
    // A symlink file referencing the first file from the first dir and the
    // second file from the second dir.
    writeSymlinkFile(new Path(symlinkDir, "symlink_file"), new Path(dataDir1, "file1"), new Path(dataDir2, "file2"));
    SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
    // test content summary
    ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
    assertEquals(symbolLinkedFileSize, cs.getLength());
    assertEquals(2, cs.getFileCount());
    assertEquals(0, cs.getDirectoryCount());
    FileInputFormat.setInputPaths(job, symlinkDir);
    InputSplit[] splits = inputFormat.getSplits(job, 2);
    log.info("Number of splits: " + splits.length);
    // Read all values.
    List<String> received = new ArrayList<String>();
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, reporter);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            received.add(value.toString());
        }
        reader.close();
    }
    List<String> expected = new ArrayList<String>();
    expected.add("dir1_file1_line1");
    expected.add("dir1_file1_line2");
    expected.add("dir2_file2_line1");
    expected.add("dir2_file2_line2");
    assertEquals(expected, received);
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) FileSystem(org.apache.hadoop.fs.FileSystem) ContentSummary(org.apache.hadoop.fs.ContentSummary) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit)
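
The job, reporter, dataDir1, dataDir2, symlinkDir, and fileSystem fields used throughout these tests come from the test fixture's setup, which is not shown in these snippets. A rough sketch of the kind of setup the examples assume; the paths and values here are assumptions for illustration, not taken from the original source:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;

// Illustrative fixture; field names mirror the test, values are assumed.
public class SymlinkTestFixture {
    JobConf job;
    FileSystem fileSystem;
    Path dataDir1;
    Path dataDir2;
    Path symlinkDir;
    Reporter reporter;

    public void setUp() throws IOException {
        job = new JobConf();
        fileSystem = FileSystem.getLocal(job);
        Path baseDir = new Path(System.getProperty("test.tmp.dir", "/tmp"), "symlink-test");
        dataDir1 = new Path(baseDir, "datadir1");
        dataDir2 = new Path(baseDir, "datadir2");
        symlinkDir = new Path(baseDir, "symlinkdir");
        reporter = Reporter.NULL;
        // The data directories must exist before the tests write files into them.
        fileSystem.mkdirs(dataDir1);
        fileSystem.mkdirs(dataDir2);
    }
}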

Example 63 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class TestSymlinkTextInputFormat method testAccuracy2.

/**
 * Scenario: empty input directory, i.e. no symlink file.
 *
 * Expected: should return an empty result set without throwing an exception.
 */
public void testAccuracy2() throws IOException {
    fileSystem.mkdirs(symlinkDir);
    FileInputFormat.setInputPaths(job, symlinkDir);
    SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
    ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
    assertEquals(0, cs.getLength());
    assertEquals(0, cs.getFileCount());
    assertEquals(0, cs.getDirectoryCount());
    InputSplit[] splits = inputFormat.getSplits(job, 2);
    log.info("Number of splits: " + splits.length);
    // Read all values.
    List<String> received = new ArrayList<String>();
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, reporter);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            received.add(value.toString());
        }
        reader.close();
    }
    List<String> expected = new ArrayList<String>();
    assertEquals(expected, received);
}
Also used : ContentSummary(org.apache.hadoop.fs.ContentSummary) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 64 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class TestInputOutputFormat method testACIDReaderNoFooterSerialize.

@Test
public void testACIDReaderNoFooterSerialize() throws Exception {
    MockFileSystem fs = new MockFileSystem(conf);
    MockPath mockPath = new MockPath(fs, "mock:///mocktable5");
    conf.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    conf.set("hive.orc.splits.include.file.footer", "false");
    conf.set("mapred.input.dir", mockPath.toString());
    conf.set("fs.defaultFS", "mock:///");
    conf.set("fs.mock.impl", MockFileSystem.class.getName());
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    int readOpsBefore = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsBefore = statistics.getReadOps();
        }
    }
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
    for (InputSplit split : splits) {
        assertTrue("OrcSplit is expected", split instanceof OrcSplit);
        // ETL strategies will have start=3 (start of first stripe)
        assertTrue(split.toString().contains("start=3"));
        assertTrue(split.toString().contains("hasFooter=false"));
        assertTrue(split.toString().contains("hasBase=true"));
        assertTrue(split.toString().contains("deltas=0"));
        assertTrue(split.toString().contains("isOriginal=true"));
        if (split instanceof OrcSplit) {
            assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
        }
        orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
    }
    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsDelta = statistics.getReadOps() - readOpsBefore;
        }
    }
    // call-1: open to read footer - split 1 => mock:/mocktable5/0_0
    // call-2: open to read data - split 1 => mock:/mocktable5/0_0
    // call-3: getAcidState - split 1 => mock:/mocktable5 (to compute offset for original read)
    // call-4: open to read footer - split 2 => mock:/mocktable5/0_1
    // call-5: open to read data - split 2 => mock:/mocktable5/0_1
    // call-6: getAcidState - split 2 => mock:/mocktable5 (to compute offset for original read)
    // call-7: open to read footer - split 2 => mock:/mocktable5/0_0 (to get row count)
    // call-8: file status - split 2 => mock:/mocktable5/0_0
    assertEquals(8, readOpsDelta);
    // revert back to local fs
    conf.set("fs.defaultFS", "file:///");
}
Also used : InputSplit(org.apache.hadoop.mapred.InputSplit) RecordWriter(org.apache.hadoop.mapred.RecordWriter) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
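
MyRow and its getColumnNamesProperty / getColumnTypesProperty helpers belong to the enclosing TestInputOutputFormat class and are not shown here. A plausible minimal sketch, assuming a simple two-column integer row whose fields feed both the reflection-based ObjectInspector and the schema-evolution properties; the exact original definition may differ:

// Hypothetical stand-in for the MyRow helper used by the ORC tests above.
public class MyRow {
    int x;
    int y;

    public MyRow(int x, int y) {
        this.x = x;
        this.y = y;
    }

    // Column names, as set on IOConstants.SCHEMA_EVOLUTION_COLUMNS.
    public static String getColumnNamesProperty() {
        return "x,y";
    }

    // Column types, as set on IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES.
    public static String getColumnTypesProperty() {
        return "int:int";
    }
}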

Example 65 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class TestInputOutputFormat method testSplitGenReadOpsLocalCacheChangeFileLen.

@Test
public void testSplitGenReadOpsLocalCacheChangeFileLen() throws Exception {
    MockFileSystem fs = new MockFileSystem(conf);
    // creates the static cache
    MockPath mockPath = new MockPath(fs, "mock:///mocktbl1");
    conf.set("mapred.input.dir", mockPath.toString());
    conf.set("fs.defaultFS", "mock:///");
    conf.set("fs.mock.impl", MockFileSystem.class.getName());
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    int readOpsBefore = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsBefore = statistics.getReadOps();
        }
    }
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsDelta = statistics.getReadOps() - readOpsBefore;
        }
    }
    // call-1: listLocatedStatus - mock:/mocktbl1
    // call-2: check side file for mock:/mocktbl1/0_0
    // call-3: open - mock:/mocktbl1/0_0
    // call-4: check side file for  mock:/mocktbl1/0_1
    // call-5: open - mock:/mocktbl1/0_1
    assertEquals(5, readOpsDelta);
    // change file length and look for cache misses
    fs.clear();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 100; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 100; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsBefore = statistics.getReadOps();
        }
    }
    orcInputFormat = new OrcInputFormat();
    splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsDelta = statistics.getReadOps() - readOpsBefore;
        }
    }
    // call-1: listLocatedStatus - mock:/mocktbl1
    // call-2: check side file for mock:/mocktbl1/0_0
    // call-3: open - mock:/mocktbl1/0_0
    // call-4: check side file for  mock:/mocktbl1/0_1
    // call-5: open - mock:/mocktbl1/0_1
    assertEquals(5, readOpsDelta);
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsBefore = statistics.getReadOps();
        }
    }
    orcInputFormat = new OrcInputFormat();
    splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsDelta = statistics.getReadOps() - readOpsBefore;
        }
    }
    // call-1: listLocatedStatus - mock:/mocktbl1
    assertEquals(1, readOpsDelta);
    // revert back to local fs
    conf.set("fs.defaultFS", "file:///");
}
Also used : InputSplit(org.apache.hadoop.mapred.InputSplit) RecordWriter(org.apache.hadoop.mapred.RecordWriter) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
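
The read-op pattern above (a second full scan after the files are rewritten with a different length, then a single listLocatedStatus once the cache is warm again) suggests a split-generation cache whose entries are validated against each file's size and modification time. A simplified illustration of that idea, not the actual Hive implementation:

import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;

import org.apache.hadoop.fs.FileStatus;

// Sketch of a per-file cache keyed by path, length, and modification time,
// so a rewritten file (different length) misses the cache and is re-read.
public class FooterCacheSketch<V> {

    private static final class Key {
        final String path;
        final long length;
        final long modificationTime;

        Key(FileStatus status) {
            this.path = status.getPath().toString();
            this.length = status.getLen();
            this.modificationTime = status.getModificationTime();
        }

        @Override
        public boolean equals(Object o) {
            if (!(o instanceof Key)) {
                return false;
            }
            Key other = (Key) o;
            return length == other.length
                && modificationTime == other.modificationTime
                && path.equals(other.path);
        }

        @Override
        public int hashCode() {
            return Objects.hash(path, length, modificationTime);
        }
    }

    private final Map<Key, V> cache = new ConcurrentHashMap<>();

    // Returns the cached value for this file status, loading it on a miss.
    public V get(FileStatus status, Function<FileStatus, V> loader) {
        return cache.computeIfAbsent(new Key(status), k -> loader.apply(status));
    }
}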

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit) 161
Path (org.apache.hadoop.fs.Path) 57
JobConf (org.apache.hadoop.mapred.JobConf) 56
Test (org.junit.Test) 49
IOException (java.io.IOException) 47
ArrayList (java.util.ArrayList) 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 27
FileSplit (org.apache.hadoop.mapred.FileSplit) 24
FileSystem (org.apache.hadoop.fs.FileSystem) 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat) 21
InputFormat (org.apache.hadoop.mapred.InputFormat) 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter) 19
NullWritable (org.apache.hadoop.io.NullWritable) 18
Text (org.apache.hadoop.io.Text) 18
Configuration (org.apache.hadoop.conf.Configuration) 14
LongWritable (org.apache.hadoop.io.LongWritable) 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat) 10
Properties (java.util.Properties) 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint) 9
HashMap (java.util.HashMap) 8