Example 26 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project hive by apache.

the class TestRCFileMapReduceInputFormat method writeThenReadByRecordReader.

private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber, long maxSplitSize, CompressionCodec codec) throws IOException, InterruptedException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = null;
        cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        writer.append(bytes);
    }
    writer.close();
    RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable> inputFormat = new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
    Configuration jonconf = new Configuration(cloneConf);
    jonconf.set("mapred.input.dir", testDir.toString());
    JobContext context = new Job(jonconf);
    HiveConf.setLongVar(context.getConfiguration(), HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);
    List<InputSplit> splits = inputFormat.getSplits(context);
    assertEquals("splits length should be " + splitNumber, splits.size(), splitNumber);
    int readCount = 0;
    for (int i = 0; i < splits.size(); i++) {
        TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext(jonconf, new TaskAttemptID());
        RecordReader<LongWritable, BytesRefArrayWritable> rr = inputFormat.createRecordReader(splits.get(i), tac);
        rr.initialize(splits.get(i), tac);
        while (rr.nextKeyValue()) {
            readCount++;
        }
    }
    assertEquals("readCount should be equal to writeCount", readCount, writeCount);
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), BytesRefArrayWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), RCFile (org.apache.hadoop.hive.ql.io.RCFile), LongWritable (org.apache.hadoop.io.LongWritable), JobContext (org.apache.hadoop.mapreduce.JobContext), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), BytesRefWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
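The TaskAttemptID-specific step here is wrapping a default-constructed TaskAttemptID in a TaskAttemptContext so the mapreduce-API RecordReader can run outside a real job; the test does this through the HCat shim. A minimal sketch of the same idea against Hadoop's own TaskAttemptContextImpl (the class and method names below are illustrative, not Hive code):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class DummyAttemptContextSketch {
    // A default-constructed TaskAttemptID carries no real attempt state; it is
    // enough to satisfy RecordReader.initialize(...) in tests and standalone tools.
    public static TaskAttemptContext newDummyContext(Configuration conf) {
        return new TaskAttemptContextImpl(conf, new TaskAttemptID());
    }
}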

Example 27 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project hive by apache.

the class LlapBaseInputFormat method getRecordReader.

@SuppressWarnings("unchecked")
@Override
public RecordReader<NullWritable, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    LlapInputSplit llapSplit = (LlapInputSplit) split;
    // Set conf to use LLAP user rather than current user for LLAP Zk registry.
    HiveConf.setVar(job, HiveConf.ConfVars.LLAP_ZK_REGISTRY_USER, llapSplit.getLlapUser());
    SubmitWorkInfo submitWorkInfo = SubmitWorkInfo.fromBytes(llapSplit.getPlanBytes());
    ServiceInstance serviceInstance = getServiceInstance(job, llapSplit);
    String host = serviceInstance.getHost();
    int llapSubmitPort = serviceInstance.getRpcPort();
    LOG.info("Found service instance for host " + host + " with rpc port " + llapSubmitPort + " and outputformat port " + serviceInstance.getOutputFormatPort());
    byte[] llapTokenBytes = llapSplit.getTokenBytes();
    Token<LlapTokenIdentifier> llapToken = null;
    if (llapTokenBytes != null) {
        DataInputBuffer in = new DataInputBuffer();
        in.reset(llapTokenBytes, 0, llapTokenBytes.length);
        llapToken = new Token<LlapTokenIdentifier>();
        llapToken.readFields(in);
    }
    LlapRecordReaderTaskUmbilicalExternalResponder umbilicalResponder = new LlapRecordReaderTaskUmbilicalExternalResponder();
    LlapTaskUmbilicalExternalClient llapClient = new LlapTaskUmbilicalExternalClient(job, submitWorkInfo.getTokenIdentifier(), submitWorkInfo.getToken(), umbilicalResponder, llapToken);
    llapClient.init(job);
    llapClient.start();
    int attemptNum = 0;
    // Use task attempt number from conf if provided
    TaskAttemptID taskAttemptId = TaskAttemptID.forName(job.get(MRJobConfig.TASK_ATTEMPT_ID));
    if (taskAttemptId != null) {
        attemptNum = taskAttemptId.getId();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Setting attempt number to " + attemptNum + " from task attempt ID in conf: " + job.get(MRJobConfig.TASK_ATTEMPT_ID));
        }
    }
    SubmitWorkRequestProto request = constructSubmitWorkRequestProto(submitWorkInfo, llapSplit.getSplitNum(), attemptNum, llapClient.getAddress(), submitWorkInfo.getToken(), llapSplit.getFragmentBytes(), llapSplit.getFragmentBytesSignature(), job);
    llapClient.submitWork(request, host, llapSubmitPort);
    Socket socket = new Socket(host, serviceInstance.getOutputFormatPort());
    LOG.debug("Socket connected");
    SignableVertexSpec vertex = SignableVertexSpec.parseFrom(submitWorkInfo.getVertexBinary());
    String fragmentId = Converters.createTaskAttemptId(vertex.getQueryIdentifier(), vertex.getVertexIndex(), request.getFragmentNumber(), request.getAttemptNumber()).toString();
    OutputStream socketStream = socket.getOutputStream();
    LlapOutputSocketInitMessage.Builder builder = LlapOutputSocketInitMessage.newBuilder().setFragmentId(fragmentId);
    if (llapSplit.getTokenBytes() != null) {
        builder.setToken(ByteString.copyFrom(llapSplit.getTokenBytes()));
    }
    builder.build().writeDelimitedTo(socketStream);
    socketStream.flush();
    LOG.info("Registered id: " + fragmentId);
    @SuppressWarnings("rawtypes") LlapBaseRecordReader recordReader = new LlapBaseRecordReader(socket.getInputStream(), llapSplit.getSchema(), Text.class, job, llapClient, (java.io.Closeable) socket);
    umbilicalResponder.setRecordReader(recordReader);
    return recordReader;
}
Also used: LlapTokenIdentifier (org.apache.hadoop.hive.llap.security.LlapTokenIdentifier), LlapOutputSocketInitMessage (org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.LlapOutputSocketInitMessage), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), TezTaskAttemptID (org.apache.tez.dag.records.TezTaskAttemptID), OutputStream (java.io.OutputStream), ServiceInstance (org.apache.hadoop.hive.llap.registry.ServiceInstance), ByteString (com.google.protobuf.ByteString), LlapTaskUmbilicalExternalClient (org.apache.hadoop.hive.llap.ext.LlapTaskUmbilicalExternalClient), DataInputBuffer (org.apache.hadoop.io.DataInputBuffer), SubmitWorkRequestProto (org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.SubmitWorkRequestProto), SignableVertexSpec (org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.SignableVertexSpec), Socket (java.net.Socket)
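The TaskAttemptID usage above is limited to recovering the attempt number: TaskAttemptID.forName parses the string stored under MRJobConfig.TASK_ATTEMPT_ID, and getId() yields the attempt counter. A standalone sketch of that parsing (the attempt string is the sample format from the Hadoop javadoc):

import org.apache.hadoop.mapreduce.TaskAttemptID;

public class AttemptIdParseSketch {
    public static void main(String[] args) {
        // Textual form: attempt_<jtIdentifier>_<jobId>_<m|r>_<taskId>_<attemptNum>
        TaskAttemptID id = TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0");
        System.out.println("job:     " + id.getJobID());   // job_200707121733_0003
        System.out.println("task:    " + id.getTaskID());  // task_200707121733_0003_m_000005
        System.out.println("attempt: " + id.getId());      // 0
    }
}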

Example 28 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project cassandra by apache.

the class CqlInputFormat method getSplits.

// Old Hadoop API
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    TaskAttemptContext tac = HadoopCompat.newTaskAttemptContext(jobConf, new TaskAttemptID());
    List<org.apache.hadoop.mapreduce.InputSplit> newInputSplits = this.getSplits(tac);
    InputSplit[] oldInputSplits = new InputSplit[newInputSplits.size()];
    for (int i = 0; i < newInputSplits.size(); i++) oldInputSplits[i] = (ColumnFamilySplit) newInputSplits.get(i);
    return oldInputSplits;
}
Also used: TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), org.apache.cassandra.hadoop (org.apache.cassandra.hadoop), InputSplit (org.apache.hadoop.mapred.InputSplit)
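HadoopCompat.newTaskAttemptContext is Cassandra's Hadoop-version shim; on Hadoop 2 the same old-API-to-new-API bridge can be written directly against TaskAttemptContextImpl, since TaskAttemptContext extends JobContext. A sketch, with TextInputFormat standing in for an arbitrary mapreduce InputFormat (not the Cassandra implementation):

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class OldToNewSplitsSketch {
    // Wraps an old-API JobConf in a new-API context; the dummy TaskAttemptID
    // exists only so the context can be constructed before any task runs.
    public static List<InputSplit> splitsFor(JobConf jobConf) throws IOException {
        TaskAttemptContext tac = new TaskAttemptContextImpl(jobConf, new TaskAttemptID());
        return new TextInputFormat().getSplits(tac);
    }
}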

Example 29 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project flink by apache.

the class HadoopOutputFormatBase method finalizeGlobal.

@Override
public void finalizeGlobal(int parallelism) throws IOException {
    JobContext jobContext;
    TaskAttemptContext taskContext;
    try {
        TaskAttemptID taskAttemptID = TaskAttemptID.forName("attempt__0000_r_" + String.format("%" + (6 - Integer.toString(1).length()) + "s", " ").replace(" ", "0") + Integer.toString(1) + "_0");
        jobContext = HadoopUtils.instantiateJobContext(this.configuration, new JobID());
        taskContext = HadoopUtils.instantiateTaskAttemptContext(this.configuration, taskAttemptID);
        this.outputCommitter = this.mapreduceOutputFormat.getOutputCommitter(taskContext);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    jobContext.getCredentials().addAll(this.credentials);
    Credentials currentUserCreds = getCredentialsFromUGI(UserGroupInformation.getCurrentUser());
    if (currentUserCreds != null) {
        jobContext.getCredentials().addAll(currentUserCreds);
    }
    // finalize HDFS output format
    if (this.outputCommitter != null) {
        this.outputCommitter.commitJob(jobContext);
    }
}
Also used: TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), JobContext (org.apache.hadoop.mapreduce.JobContext), JobID (org.apache.hadoop.mapreduce.JobID), IOException (java.io.IOException), Credentials (org.apache.hadoop.security.Credentials)
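The format string here zero-pads the task number so the name parses back to a reduce attempt of job 0, i.e. attempt__0000_r_000001_0. An equivalent and arguably clearer construction goes through the TaskAttemptID constructor; this is a sketch of that alternative, not Flink's actual code:

import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;

public class AttemptIdBuildSketch {
    public static void main(String[] args) {
        // Empty jtIdentifier, job 0, reduce task 1, attempt 0; toString()
        // should render as "attempt__0000_r_000001_0".
        TaskAttemptID id = new TaskAttemptID("", 0, TaskType.REDUCE, 1, 0);
        System.out.println(id);
    }
}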

Example 30 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project jena by apache.

the class AbstractBlankNodeTests method blank_node_divergence_02.

/**
     * Test that starts with two blank nodes with the same identity in a single
     * file, splits them over two files and shows that they diverge in the
     * subsequent job when the JENA-820 workaround is not enabled
     * 
     * @throws IOException
     * @throws InterruptedException
     */
@Test
public void blank_node_divergence_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);
        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());
        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }
        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);
        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines. However it is the default behaviour
        // because when we start from external inputs we want them to be
        // file scoped.
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        // Make sure JENA-820 flag is disabled
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, false);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());
        // With the JENA-820 workaround disabled, expect the blank nodes to diverge
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should have diverged
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), Node (org.apache.jena.graph.Node), ArrayList (java.util.ArrayList), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), LongWritable (org.apache.hadoop.io.LongWritable), JobContext (org.apache.hadoop.mapreduce.JobContext), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), HashSet (java.util.HashSet), Path (org.apache.hadoop.fs.Path), JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), File (java.io.File), FileAttribute (java.nio.file.attribute.FileAttribute), Test (org.junit.Test)
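createAttemptID(int, int, int) is a helper from the Jena test harness, while the second read pass simply uses new TaskAttemptID(). For reference, an attempt ID with explicit job, task, and attempt numbers can be assembled from JobID and TaskID as in the hypothetical helper below (not Jena's implementation):

import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;

public class AttemptIdFactorySketch {
    // Builds a map-task attempt ID from explicit job, task and attempt numbers;
    // the "test" jtIdentifier is an arbitrary placeholder.
    public static TaskAttemptID newAttemptID(int jobNum, int taskNum, int attemptNum) {
        TaskID taskId = new TaskID(new JobID("test", jobNum), TaskType.MAP, taskNum);
        return new TaskAttemptID(taskId, attemptNum);
    }
}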

Aggregations

TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 78
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 35
Test (org.junit.Test): 34
Configuration (org.apache.hadoop.conf.Configuration): 28
Path (org.apache.hadoop.fs.Path): 25
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 22
IOException (java.io.IOException): 19
JobID (org.apache.hadoop.mapreduce.JobID): 16
TaskID (org.apache.hadoop.mapreduce.TaskID): 15
File (java.io.File): 14
Job (org.apache.hadoop.mapreduce.Job): 14
ArrayList (java.util.ArrayList): 13
JobContext (org.apache.hadoop.mapreduce.JobContext): 12
LongWritable (org.apache.hadoop.io.LongWritable): 11
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 10
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 9
TaskAttemptInfo (org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo): 8
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl): 8
HashMap (java.util.HashMap): 7