
Example 1 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hadoop by apache.

Class TestAutoInputFormat, method testFormat:

@SuppressWarnings({ "unchecked", "deprecation" })
@Test
public void testFormat() throws IOException {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path txtFile = new Path(dir, "auto.txt");
    Path seqFile = new Path(dir, "auto.seq");
    fs.delete(dir, true);
    FileInputFormat.setInputPaths(job, dir);
    Writer txtWriter = new OutputStreamWriter(fs.create(txtFile));
    try {
        for (int i = 0; i < LINES_COUNT; i++) {
            txtWriter.write("" + (10 * i));
            txtWriter.write("\n");
        }
    } finally {
        txtWriter.close();
    }
    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, seqFile, IntWritable.class, LongWritable.class);
    try {
        for (int i = 0; i < RECORDS_COUNT; i++) {
            IntWritable key = new IntWritable(11 * i);
            LongWritable value = new LongWritable(12 * i);
            seqWriter.append(key, value);
        }
    } finally {
        seqWriter.close();
    }
    AutoInputFormat format = new AutoInputFormat();
    InputSplit[] splits = format.getSplits(job, SPLITS_COUNT);
    for (InputSplit split : splits) {
        RecordReader reader = format.getRecordReader(split, job, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                if (key instanceof LongWritable) {
                    assertEquals("Wrong value class.", Text.class, value.getClass());
                    assertTrue("Invalid value", Integer.parseInt(((Text) value).toString()) % 10 == 0);
                } else {
                    assertEquals("Wrong key class.", IntWritable.class, key.getClass());
                    assertEquals("Wrong value class.", LongWritable.class, value.getClass());
                    assertTrue("Invalid key.", ((IntWritable) key).get() % 11 == 0);
                    assertTrue("Invalid value.", ((LongWritable) value).get() % 12 == 0);
                }
            }
        } finally {
            reader.close();
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), AutoInputFormat (org.apache.hadoop.streaming.AutoInputFormat), RecordReader (org.apache.hadoop.mapred.RecordReader), Text (org.apache.hadoop.io.Text), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), OutputStreamWriter (java.io.OutputStreamWriter), Writer (java.io.Writer), LongWritable (org.apache.hadoop.io.LongWritable), IntWritable (org.apache.hadoop.io.IntWritable), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), Test (org.junit.Test)
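The test above exercises the standard old-API read loop for any org.apache.hadoop.mapred.InputFormat: ask the format for splits, open a RecordReader per split, then iterate with createKey/createValue/next. Below is a minimal sketch of that same pattern against a plain TextInputFormat; the input path /tmp/demo-input is a placeholder, not part of the Hadoop test.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitReadSketch {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf();
        // Hypothetical input directory; point this at real data.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/demo-input"));

        TextInputFormat format = new TextInputFormat();
        // TextInputFormat is JobConfigurable; configure() wires up compression codecs.
        format.configure(conf);

        // Ask for a hint of 2 splits; the format may return more or fewer.
        InputSplit[] splits = format.getSplits(conf, 2);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader =
                format.getRecordReader(split, conf, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value)) {
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}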

Example 2 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hadoop by apache.

Class TestDelegatingInputFormat, method testSplitting:

@Test
public void testSplitting() throws Exception {
    JobConf conf = new JobConf();
    MiniDFSCluster dfs = null;
    try {
        dfs = new MiniDFSCluster.Builder(conf)
            .numDataNodes(4)
            .racks(new String[] { "/rack0", "/rack0", "/rack1", "/rack1" })
            .hosts(new String[] { "host0", "host1", "host2", "host3" })
            .build();
        FileSystem fs = dfs.getFileSystem();
        Path path = getPath("/foo/bar", fs);
        Path path2 = getPath("/foo/baz", fs);
        Path path3 = getPath("/bar/bar", fs);
        Path path4 = getPath("/bar/baz", fs);
        final int numSplits = 100;
        MultipleInputs.addInputPath(conf, path, TextInputFormat.class, MapClass.class);
        MultipleInputs.addInputPath(conf, path2, TextInputFormat.class, MapClass2.class);
        MultipleInputs.addInputPath(conf, path3, KeyValueTextInputFormat.class, MapClass.class);
        MultipleInputs.addInputPath(conf, path4, TextInputFormat.class, MapClass2.class);
        DelegatingInputFormat inFormat = new DelegatingInputFormat();
        InputSplit[] splits = inFormat.getSplits(conf, numSplits);
        int[] bins = new int[3];
        for (InputSplit split : splits) {
            assertTrue(split instanceof TaggedInputSplit);
            final TaggedInputSplit tis = (TaggedInputSplit) split;
            int index = -1;
            if (tis.getInputFormatClass().equals(KeyValueTextInputFormat.class)) {
                // path3
                index = 0;
            } else if (tis.getMapperClass().equals(MapClass.class)) {
                // path
                index = 1;
            } else {
                // path2 and path4
                index = 2;
            }
            bins[index]++;
        }
        // Each bin is a unique Mapper/InputFormat pair, so it should have been
        // assigned numSplits splits, regardless of how many paths use that pair.
        for (int count : bins) {
            assertEquals(numSplits, count);
        }
    } finally {
        if (dfs != null) {
            dfs.shutdown();
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), MiniDFSCluster (org.apache.hadoop.hdfs.MiniDFSCluster), FileSystem (org.apache.hadoop.fs.FileSystem), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), Test (org.junit.Test)
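For reference, the registration side that this test exercises looks like the sketch below: each MultipleInputs.addInputPath call ties a path to its own InputFormat and Mapper, and the old mapred API then switches the job over to DelegatingInputFormat, whose splits are the TaggedInputSplit instances checked above. The paths and the two mapper classes here are hypothetical.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsSketch {

    // Hypothetical mapper for TextInputFormat records (byte offset, line).
    public static class LineMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, LongWritable> {
        public void map(LongWritable key, Text value,
                OutputCollector<Text, LongWritable> out, Reporter reporter)
                throws IOException {
            out.collect(value, key);
        }
    }

    // Hypothetical mapper for KeyValueTextInputFormat records (key, value).
    public static class KeyValueMapper extends MapReduceBase
            implements Mapper<Text, Text, Text, LongWritable> {
        public void map(Text key, Text value,
                OutputCollector<Text, LongWritable> out, Reporter reporter)
                throws IOException {
            out.collect(key, new LongWritable(value.getLength()));
        }
    }

    public static void configure(JobConf conf) {
        // Each path gets its own InputFormat and Mapper; addInputPath also sets the
        // job's InputFormat to DelegatingInputFormat and its Mapper to DelegatingMapper.
        MultipleInputs.addInputPath(conf, new Path("/data/plain"),
            TextInputFormat.class, LineMapper.class);
        MultipleInputs.addInputPath(conf, new Path("/data/keyed"),
            KeyValueTextInputFormat.class, KeyValueMapper.class);
    }
}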

Example 3 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hadoop by apache.

Class TestPipesNonJavaInputFormat, method testFormat:

/**
 * Test PipesNonJavaInputFormat.
 */
@Test
public void testFormat() throws IOException {
    PipesNonJavaInputFormat inputFormat = new PipesNonJavaInputFormat();
    JobConf conf = new JobConf();
    Reporter reporter = mock(Reporter.class);
    RecordReader<FloatWritable, NullWritable> reader = inputFormat.getRecordReader(new FakeSplit(), conf, reporter);
    assertEquals(0.0f, reader.getProgress(), 0.001);
    // input and output files
    File input1 = new File(workSpace + File.separator + "input1");
    if (!input1.getParentFile().exists()) {
        Assert.assertTrue(input1.getParentFile().mkdirs());
    }
    if (!input1.exists()) {
        Assert.assertTrue(input1.createNewFile());
    }
    File input2 = new File(workSpace + File.separator + "input2");
    if (!input2.exists()) {
        Assert.assertTrue(input2.createNewFile());
    }
    // set data for splits
    conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, StringUtils.escapeString(input1.getAbsolutePath()) + "," + StringUtils.escapeString(input2.getAbsolutePath()));
    InputSplit[] splits = inputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    PipesNonJavaInputFormat.PipesDummyRecordReader dummyRecordReader = new PipesNonJavaInputFormat.PipesDummyRecordReader(conf, splits[0]);
    // empty dummyRecordReader
    assertNull(dummyRecordReader.createKey());
    assertNull(dummyRecordReader.createValue());
    assertEquals(0, dummyRecordReader.getPos());
    assertEquals(0.0, dummyRecordReader.getProgress(), 0.001);
    // test method next
    assertTrue(dummyRecordReader.next(new FloatWritable(2.0f), NullWritable.get()));
    assertEquals(2.0, dummyRecordReader.getProgress(), 0.001);
    dummyRecordReader.close();
}
Also used: FloatWritable (org.apache.hadoop.io.FloatWritable), NullWritable (org.apache.hadoop.io.NullWritable), Reporter (org.apache.hadoop.mapred.Reporter), FakeSplit (org.apache.hadoop.mapred.pipes.TestPipeApplication.FakeSplit), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), File (java.io.File), Test (org.junit.Test)
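A side note on the configuration near the end of the test: writing FileInputFormat.INPUT_DIR by hand with StringUtils.escapeString is equivalent to the more common setInputPaths call, which does the escaping and comma-joining itself. A minimal sketch, assuming the same two files:

import java.io.File;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputDirSketch {
    // Registers both input files on the job; setInputPaths escapes each path and
    // joins them with commas into the same input-dir property the test sets manually.
    public static void register(JobConf conf, File input1, File input2) {
        FileInputFormat.setInputPaths(conf,
            new Path(input1.getAbsolutePath()),
            new Path(input2.getAbsolutePath()));
    }
}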

Example 4 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

Class GenericUDTFGetSplits, method getSplits:

public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
    DAG dag = DAG.create(work.getName());
    dag.setCredentials(job.getCredentials());
    DagUtils utils = DagUtils.getInstance();
    Context ctx = new Context(job);
    MapWork mapWork = (MapWork) work.getAllWork().get(0);
    // A bunch of things get set up in the Context based on conf, but we need only
    // the MR tmp directory for the following method.
    JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
    // TODO: should we also whitelist input formats here? from mapred.input.format.class
    Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
    FileSystem fs = scratchDir.getFileSystem(job);
    try {
        LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
        Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr, new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
        String vertexName = wx.getName();
        dag.addVertex(wx);
        utils.addCredentials(mapWork, dag);
        // We have the DAG; now proceed to get the splits:
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
        HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
        List<Event> eventList = splitGenerator.initialize();
        InputSplit[] result = new InputSplit[eventList.size() - 1];
        InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
        List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
        Preconditions.checkState(hints.size() == eventList.size() - 1);
        if (LOG.isDebugEnabled()) {
            LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
        }
        LlapCoordinator coordinator = LlapCoordinator.getInstance();
        if (coordinator == null) {
            throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
        }
        // See the discussion in the implementation as to why we generate app ID.
        ApplicationId applicationId = coordinator.createExtClientAppId();
        // This assumes LLAP cluster owner is always the HS2 user.
        String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
        String queryUser = null;
        byte[] tokenBytes = null;
        LlapSigner signer = null;
        if (UserGroupInformation.isSecurityEnabled()) {
            signer = coordinator.getLlapSigner(job);
            // 1. Generate the token for query user (applies to all splits).
            queryUser = SessionState.getUserFromAuthenticator();
            if (queryUser == null) {
                queryUser = UserGroupInformation.getCurrentUser().getUserName();
                LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
            }
            LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
            // We put the query user, not LLAP user, into the message and token.
            Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
            LOG.info("Created the token for remote user: {}", token);
            bos.reset();
            token.write(dos);
            tokenBytes = bos.toByteArray();
        } else {
            queryUser = UserGroupInformation.getCurrentUser().getUserName();
        }
        LOG.info("Number of splits: " + (eventList.size() - 1));
        SignedMessage signedSvs = null;
        for (int i = 0; i < eventList.size() - 1; i++) {
            TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
            // 2. Generate the vertex/submit information for all events.
            if (i == 0) {
                // The queryId could either be picked up from the current request being processed, or
                // generated. The current request isn't exactly correct since the query is 'done' once we
                // return the results. Generating a new one has the added benefit of working once this
                // is moved out of a UDTF into a proper API.
                // Setting this to the generated AppId which is unique.
                // Despite the differences in TaskSpec, the vertex spec should be the same.
                signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
            }
            SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
            byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
            // 3. Generate input event.
            SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
            // 4. Make location hints.
            SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
            result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
        }
        return result;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used: Vertex (org.apache.tez.dag.api.Vertex), DAG (org.apache.tez.dag.api.DAG), TaskSpec (org.apache.tez.runtime.api.impl.TaskSpec), TaskSpecBuilder (org.apache.tez.dag.api.TaskSpecBuilder), TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint), Event (org.apache.tez.runtime.api.Event), InputConfigureVertexTasksEvent (org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent), InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent), SubmitWorkInfo (org.apache.hadoop.hive.llap.SubmitWorkInfo), LlapInputSplit (org.apache.hadoop.hive.llap.LlapInputSplit), LlapCoordinator (org.apache.hadoop.hive.llap.coordinator.LlapCoordinator), LlapSigner (org.apache.hadoop.hive.llap.security.LlapSigner), SignedMessage (org.apache.hadoop.hive.llap.security.LlapSigner.SignedMessage), LlapTokenIdentifier (org.apache.hadoop.hive.llap.security.LlapTokenIdentifier), LlapTokenLocalClient (org.apache.hadoop.hive.llap.security.LlapTokenLocalClient), HiveSplitGenerator (org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator), DagUtils (org.apache.hadoop.hive.ql.exec.tez.DagUtils), Context (org.apache.hadoop.hive.ql.Context), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), SplitLocationInfo (org.apache.hadoop.mapred.SplitLocationInfo), InputSplit (org.apache.hadoop.mapred.InputSplit), JobConf (org.apache.hadoop.mapred.JobConf), FileSystem (org.apache.hadoop.fs.FileSystem), Path (org.apache.hadoop.fs.Path), LocalResource (org.apache.hadoop.yarn.api.records.LocalResource), ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId), IOException (java.io.IOException), FileNotFoundException (java.io.FileNotFoundException), URISyntaxException (java.net.URISyntaxException), LoginException (javax.security.auth.login.LoginException), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), UDFArgumentException (org.apache.hadoop.hive.ql.exec.UDFArgumentException), UDFArgumentLengthException (org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException), UDFArgumentTypeException (org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException), CommandNeedRetryException (org.apache.hadoop.hive.ql.CommandNeedRetryException)
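Whatever the concrete class (LlapInputSplit here, FileSplit elsewhere), the InputSplit[] a caller gets back can be inspected through the generic org.apache.hadoop.mapred.InputSplit interface alone. A small, hypothetical consumer-side sketch:

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.mapred.InputSplit;

public class SplitInspector {
    // Prints the size and preferred locations of each split, e.g. to decide
    // where to schedule the task that will consume it.
    public static void describe(InputSplit[] splits) throws IOException {
        for (InputSplit split : splits) {
            System.out.println(split.getClass().getSimpleName()
                + " length=" + split.getLength()
                + " locations=" + Arrays.toString(split.getLocations()));
        }
    }
}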

Example 5 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

Class GenericUDTFGetSplits, method process:

@Override
public void process(Object[] arguments) throws HiveException {
    String query = stringOI.getPrimitiveJavaObject(arguments[0]);
    int num = intOI.get(arguments[1]);
    PlanFragment fragment = createPlanFragment(query, num);
    TezWork tezWork = fragment.work;
    Schema schema = fragment.schema;
    try {
        for (InputSplit s : getSplits(jc, num, tezWork, schema)) {
            Object[] os = new Object[1];
            bos.reset();
            s.write(dos);
            byte[] frozen = bos.toByteArray();
            os[0] = frozen;
            forward(os);
        }
    } catch (Exception e) {
        throw new HiveException(e);
    }
}
Also used: HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema), Schema (org.apache.hadoop.hive.llap.Schema), LlapInputSplit (org.apache.hadoop.hive.llap.LlapInputSplit), InputSplit (org.apache.hadoop.mapred.InputSplit), TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint), TezWork (org.apache.hadoop.hive.ql.plan.TezWork), LoginException (javax.security.auth.login.LoginException), URISyntaxException (java.net.URISyntaxException), FileNotFoundException (java.io.FileNotFoundException), IOException (java.io.IOException), UDFArgumentException (org.apache.hadoop.hive.ql.exec.UDFArgumentException), UDFArgumentLengthException (org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException), UDFArgumentTypeException (org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException), CommandNeedRetryException (org.apache.hadoop.hive.ql.CommandNeedRetryException)
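The bos.reset() / s.write(dos) step works because every mapred InputSplit is a Writable; a consumer reverses it with readFields on an instance of the matching split class. The sketch below shows that round trip with a FileSplit as a stand-in (deserializing the LlapInputSplit built above would follow the same pattern); bos and dos here are local streams, unlike the fields used in the UDTF.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class SplitRoundTrip {
    // Serializes a split to bytes and reads it back, mirroring what the UDTF
    // forwards to its caller and what the receiving side must do with the bytes.
    public static FileSplit roundTrip(FileSplit original) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bos));
        byte[] frozen = bos.toByteArray();

        // The reader must construct the right split class before calling readFields;
        // the placeholder values are overwritten by the deserialized fields.
        FileSplit copy = new FileSplit(new Path("placeholder"), 0L, 0L, new String[0]);
        copy.readFields(new DataInputStream(new ByteArrayInputStream(frozen)));
        return copy;
    }

    public static void main(String[] args) throws IOException {
        FileSplit split = new FileSplit(new Path("/tmp/data.txt"), 0L, 1024L,
            new String[] { "host1" });
        FileSplit copy = roundTrip(split);
        System.out.println(copy.getPath() + " len=" + copy.getLength());
    }
}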

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 153 usages
Path (org.apache.hadoop.fs.Path): 71 usages
JobConf (org.apache.hadoop.mapred.JobConf): 50 usages
Test (org.junit.Test): 47 usages
IOException (java.io.IOException): 43 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 32 usages
ArrayList (java.util.ArrayList): 28 usages
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27 usages
FileSplit (org.apache.hadoop.mapred.FileSplit): 22 usages
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 20 usages
InputFormat (org.apache.hadoop.mapred.InputFormat): 19 usages
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19 usages
Text (org.apache.hadoop.io.Text): 18 usages
NullWritable (org.apache.hadoop.io.NullWritable): 17 usages
LongWritable (org.apache.hadoop.io.LongWritable): 11 usages
Configuration (org.apache.hadoop.conf.Configuration): 10 usages
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10 usages
Properties (java.util.Properties): 9 usages
HashMap (java.util.HashMap): 8 usages
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 8 usages