Search in sources :

Example 26 with JobContextImpl

use of org.apache.hadoop.mapreduce.task.JobContextImpl in project hadoop by apache.

the class TestCopyCommitter method testDeleteMissing.

@Test
public void testDeleteMissing() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();
    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
        OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
        fs = FileSystem.get(conf);
        sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
        targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
        String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
        fs.rename(new Path(targetBaseAdd), new Path(targetBase));
        DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
        options.setSyncFolder(true);
        options.setDeleteMissing(true);
        options.appendToConf(conf);
        CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
        Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
        listing.buildListing(listingFile, options);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        //Test for idempotent commit
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
    } catch (Throwable e) {
        LOG.error("Exception encountered while testing for delete missing", e);
        Assert.fail("Delete missing failure");
    } finally {
        TestDistCpUtils.delete(fs, "/tmp1");
        conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) CopyListing(org.apache.hadoop.tools.CopyListing) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing) DistCpOptions(org.apache.hadoop.tools.DistCpOptions) FileSystem(org.apache.hadoop.fs.FileSystem) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing)

Example 27 with JobContextImpl

use of org.apache.hadoop.mapreduce.task.JobContextImpl in project hadoop by apache.

the class TestCopyOutputFormat method testCheckOutputSpecs.

@Test
public void testCheckOutputSpecs() {
    try {
        OutputFormat outputFormat = new CopyOutputFormat();
        Job job = Job.getInstance(new Configuration());
        JobID jobID = new JobID("200707121733", 1);
        try {
            JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
            outputFormat.checkOutputSpecs(context);
            Assert.fail("No checking for invalid work/commit path");
        } catch (IllegalStateException ignore) {
        }
        CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
        try {
            JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
            outputFormat.checkOutputSpecs(context);
            Assert.fail("No checking for invalid commit path");
        } catch (IllegalStateException ignore) {
        }
        job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, "");
        CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
        try {
            JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
            outputFormat.checkOutputSpecs(context);
            Assert.fail("No checking for invalid work path");
        } catch (IllegalStateException ignore) {
        }
        CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
        CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
        try {
            JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
            outputFormat.checkOutputSpecs(context);
        } catch (IllegalStateException ignore) {
            Assert.fail("Output spec check failed.");
        }
    } catch (IOException e) {
        LOG.error("Exception encountered while testing checkoutput specs", e);
        Assert.fail("Checkoutput Spec failure");
    } catch (InterruptedException e) {
        LOG.error("Exception encountered while testing checkoutput specs", e);
        Assert.fail("Checkoutput Spec failure");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) Test(org.junit.Test)

Example 28 with JobContextImpl

use of org.apache.hadoop.mapreduce.task.JobContextImpl in project carbondata by apache.

the class CarbonInputFormat method getTableBlockInfo.

/**
   * Below method will be used to get the table block info
   *
   * @param job       job context
   * @param segmentId number of segment id
   * @return list of table block
   * @throws IOException
   */
private List<TableBlockInfo> getTableBlockInfo(JobContext job, TableSegmentUniqueIdentifier tableSegmentUniqueIdentifier, Set<SegmentTaskIndexStore.TaskBucketHolder> taskKeys, UpdateVO updateDetails, SegmentUpdateStatusManager updateStatusManager, String segmentId, Set<SegmentTaskIndexStore.TaskBucketHolder> validTaskKeys) throws IOException {
    List<TableBlockInfo> tableBlockInfoList = new ArrayList<TableBlockInfo>();
    // get file location of all files of given segment
    JobContext newJob = new JobContextImpl(new Configuration(job.getConfiguration()), job.getJobID());
    newJob.getConfiguration().set(CarbonInputFormat.INPUT_SEGMENT_NUMBERS, tableSegmentUniqueIdentifier.getSegmentId() + "");
    // identify table blocks
    for (InputSplit inputSplit : getSplitsInternal(newJob)) {
        CarbonInputSplit carbonInputSplit = (CarbonInputSplit) inputSplit;
        // then add as TableInfo object.
        if (isValidBlockBasedOnUpdateDetails(taskKeys, carbonInputSplit, updateDetails, updateStatusManager, segmentId, validTaskKeys)) {
            BlockletInfos blockletInfos = new BlockletInfos(carbonInputSplit.getNumberOfBlocklets(), 0, carbonInputSplit.getNumberOfBlocklets());
            tableBlockInfoList.add(new TableBlockInfo(carbonInputSplit.getPath().toString(), carbonInputSplit.getStart(), tableSegmentUniqueIdentifier.getSegmentId(), carbonInputSplit.getLocations(), carbonInputSplit.getLength(), blockletInfos, carbonInputSplit.getVersion(), carbonInputSplit.getBlockStorageIdMap()));
        }
    }
    return tableBlockInfoList;
}
Also used : TableBlockInfo(org.apache.carbondata.core.datastore.block.TableBlockInfo) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) BlockletInfos(org.apache.carbondata.core.datastore.block.BlockletInfos) JobContext(org.apache.hadoop.mapreduce.JobContext) InputSplit(org.apache.hadoop.mapreduce.InputSplit)

Example 29 with JobContextImpl

use of org.apache.hadoop.mapreduce.task.JobContextImpl in project jena by apache.

the class AbstractBlankNodeTests method blank_node_identity_02.

/**
     * Test that starts with two blank nodes in two different files and checks
     * that writing them to a single file does not conflate them
     * 
     * @throws IOException
     * @throws InterruptedException
     */
@Test
public void blank_node_identity_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
    // Temporary files
    File a = File.createTempFile("bnode_identity", getInitialInputExtension());
    File b = File.createTempFile("bnode_identity", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Same blank node but in different files so must be treated as
        // different blank nodes and not converge
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        writeTuples(a, tuples);
        tuples.clear();
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(b, tuples);
        // Set up fake job which will process the two files
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());
        // Prepare the output writing - putting all output to a single file
        OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
        TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 2, 1));
        RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            // output
            while (reader.nextKeyValue()) {
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
            }
        }
        writer.close(outputTaskContext);
        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);
        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // The Blank nodes should have been given separate identities so we
        // should not be conflating them, this is the opposite problem to
        // that described in JENA-820
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());
        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes must not diverge
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        b.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) Node(org.apache.jena.graph.Node) ArrayList(java.util.ArrayList) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) LongWritable(org.apache.hadoop.io.LongWritable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) File(java.io.File) InputSplit(org.apache.hadoop.mapreduce.InputSplit) FileAttribute(java.nio.file.attribute.FileAttribute) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 30 with JobContextImpl

use of org.apache.hadoop.mapreduce.task.JobContextImpl in project jena by apache.

the class AbstractBlankNodeTests method blank_node_divergence_01.

/**
     * Test that starts with two blank nodes with the same identity in a single
     * file, splits them over two files and checks that we can workaround
     * JENA-820 successfully by setting the
     * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job
     * 
     * @throws IOException
     * @throws InterruptedException
     */
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);
        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());
        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }
        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);
        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        // Enabling this flag works around the JENA-820 issue
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());
        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should not have diverged
        Assert.assertEquals(1, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) Node(org.apache.jena.graph.Node) ArrayList(java.util.ArrayList) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) LongWritable(org.apache.hadoop.io.LongWritable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) File(java.io.File) FileAttribute(java.nio.file.attribute.FileAttribute) Test(org.junit.Test)

Aggregations

JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl)32 Configuration (org.apache.hadoop.conf.Configuration)29 Job (org.apache.hadoop.mapreduce.Job)22 JobContext (org.apache.hadoop.mapreduce.JobContext)22 TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)21 TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl)21 Path (org.apache.hadoop.fs.Path)17 File (java.io.File)16 IOException (java.io.IOException)11 RecordWriter (org.apache.hadoop.mapreduce.RecordWriter)10 MapFile (org.apache.hadoop.io.MapFile)9 InputSplit (org.apache.hadoop.mapreduce.InputSplit)8 TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID)8 LongWritable (org.apache.hadoop.io.LongWritable)7 FileSystem (org.apache.hadoop.fs.FileSystem)6 Test (org.junit.Test)6 DistCpOptions (org.apache.hadoop.tools.DistCpOptions)5 FileAttribute (java.nio.file.attribute.FileAttribute)4 ArrayList (java.util.ArrayList)4 HashSet (java.util.HashSet)4