
Example 61 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project jena by apache.

the class AbstractBlankNodeTests method blank_node_divergence_01.

/**
     * Test that starts with two blank nodes with the same identity in a single
     * file, splits them over two files, and checks that we can work around
     * JENA-820 successfully by setting the
     * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job.
     * 
     * @throws IOException
     * @throws InterruptedException
     */
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);
        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());
        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }
        // Promote the intermediate outputs from temporary status so they can
        // serve as inputs to the subsequent job
        promoteInputs(intermediateOutputDir);
        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820, at this point the blank nodes are
        // consistent; however, when we read them from different files they
        // are by default treated as different nodes, so the blank nodes
        // diverge, which is incorrect and undesirable behaviour in
        // multi-stage pipelines
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        // Enabling this flag works around the JENA-820 issue
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());
        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should not have diverged
        Assert.assertEquals(1, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) Node(org.apache.jena.graph.Node) ArrayList(java.util.ArrayList) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) LongWritable(org.apache.hadoop.io.LongWritable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) File(java.io.File) FileAttribute(java.nio.file.attribute.FileAttribute) Test(org.junit.Test)
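
The pattern these test examples share is driving an InputFormat by hand, with no cluster: create a Job purely for its Configuration and JobID, wrap those in a JobContextImpl, and call getSplits() directly. A minimal self-contained sketch of that pattern (a distillation under stated assumptions: TextInputFormat and a placeholder path stand in for the RDF-specific formats above):

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;

public class FakeJobContextSketch {
    public static void main(String[] args) throws Exception {
        // The Job is never submitted; it only supplies a Configuration and a JobID
        Configuration config = new Configuration(true);
        Job job = Job.getInstance(config);
        job.setInputFormatClass(TextInputFormat.class);
        // Placeholder path: point this at real input before running
        FileInputFormat.setInputPaths(job, new Path("/tmp/fake-job-input"));
        // JobContextImpl turns the (configuration, job ID) pair into the
        // JobContext that InputFormat.getSplits() expects
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        List<InputSplit> splits = new TextInputFormat().getSplits(context);
        System.out.println("Computed " + splits.size() + " splits");
    }
}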

Example 62 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project jena by apache.

the class AbstractNodeTupleInputFormatTests method testSplitInputs.

protected final void testSplitInputs(Configuration config, File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException {
    // Set up fake job
    InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    for (File input : inputs) {
        this.addInputPath(input, job.getConfiguration(), job);
    }
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length);
    // Check splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(expectedSplits, splits.size());
    // Check tuples
    int count = 0;
    for (InputSplit split : splits) {
        // Validate split
        Assert.assertTrue(this.isValidSplit(split, config));
        // Read split
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        count += this.countTuples(reader);
    }
    Assert.assertEquals(expectedTuples, count);
}
Also used : JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) LongWritable(org.apache.hadoop.io.LongWritable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) File(java.io.File) InputSplit(org.apache.hadoop.mapreduce.InputSplit)

Example 63 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project jena by apache.

the class AbstractNodeTupleInputFormatTests method testMultipleInputs.

/**
     * Runs a multiple input test
     * 
     * @param inputs
     *            Inputs
     * @param expectedSplits
     *            Number of splits expected
     * @param expectedTuples
     *            Number of tuples expected
     * @throws IOException
     * @throws InterruptedException
     */
protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException {
    // Prepare configuration and inputs
    Configuration config = this.prepareConfiguration();
    // Set up fake job
    InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    for (File input : inputs) {
        this.addInputPath(input, job.getConfiguration(), job);
    }
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length);
    NLineInputFormat.setNumLinesPerSplit(job, expectedTuples);
    // Check splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(expectedSplits, splits.size());
    // Check tuples
    int count = 0;
    for (InputSplit split : splits) {
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        count += this.countTuples(reader);
    }
    Assert.assertEquals(expectedTuples, count);
}
Also used : JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) LongWritable(org.apache.hadoop.io.LongWritable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) File(java.io.File) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
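
Both helpers above pin the expected split count via NLineInputFormat.setNumLinesPerSplit. A standalone sketch of that knob in isolation (the file contents and counts are hypothetical, not from the Jena tests):

import java.io.FileWriter;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;

public class NLineSplitSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical input: 10 lines in a local scratch file
        java.io.File input = java.io.File.createTempFile("nline", ".txt");
        try (FileWriter writer = new FileWriter(input)) {
            for (int i = 0; i < 10; i++) {
                writer.write("line " + i + "\n");
            }
        }
        Job job = Job.getInstance();
        FileInputFormat.setInputPaths(job, new Path(input.getAbsolutePath()));
        // 5 lines per split over 10 lines of input yields 2 splits
        NLineInputFormat.setNumLinesPerSplit(job, 5);
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        List<InputSplit> splits = new NLineInputFormat().getSplits(context);
        System.out.println("Splits: " + splits.size());
    }
}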

Example 64 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project incubator-rya by apache.

the class GraphXEdgeInputFormatTest method testInputFormat.

@SuppressWarnings("rawtypes")
@Test
public void testInputFormat() throws Exception {
    RyaStatement input = RyaStatement.builder()
            .setSubject(new RyaURI("http://www.google.com"))
            .setPredicate(new RyaURI("http://some_other_uri"))
            .setObject(new RyaURI("http://www.yahoo.com"))
            .setColumnVisibility(new byte[0])
            .setValue(new byte[0])
            .build();
    apiImpl.add(input);
    Job jobConf = Job.getInstance();
    GraphXEdgeInputFormat.setMockInstance(jobConf, instance.getInstanceName());
    GraphXEdgeInputFormat.setConnectorInfo(jobConf, username, password);
    GraphXEdgeInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO);
    GraphXEdgeInputFormat.setInputTableName(jobConf, table);
    GraphXEdgeInputFormat.setScanIsolation(jobConf, false);
    GraphXEdgeInputFormat.setLocalIterators(jobConf, false);
    GraphXEdgeInputFormat.setOfflineTableScan(jobConf, false);
    GraphXEdgeInputFormat inputFormat = new GraphXEdgeInputFormat();
    JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID());
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(1, splits.size());
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(), new TaskAttemptID(new TaskID(), 1));
    RecordReader ryaStatementRecordReader = inputFormat.createRecordReader(splits.get(0), taskAttemptContext);
    ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext);
    List<Edge> results = new ArrayList<Edge>();
    while (ryaStatementRecordReader.nextKeyValue()) {
        Edge writable = (Edge) ryaStatementRecordReader.getCurrentValue();
        long srcId = writable.srcId();
        long destId = writable.dstId();
        // The edge attribute is never populated in this test, so it stays null
        RyaTypeWritable rtw = null;
        Object text = ryaStatementRecordReader.getCurrentKey();
        Edge<RyaTypeWritable> edge = new Edge<RyaTypeWritable>(srcId, destId, rtw);
        results.add(edge);
        System.out.println(text);
    }
    System.out.println(results.size());
    System.out.println(results);
    Assert.assertEquals(2, results.size());
}
Also used : JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) TaskID(org.apache.hadoop.mapreduce.TaskID) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) RecordReader(org.apache.hadoop.mapreduce.RecordReader) ArrayList(java.util.ArrayList) RyaStatement(org.apache.rya.api.domain.RyaStatement) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) RyaURI(org.apache.rya.api.domain.RyaURI) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Edge(org.apache.spark.graphx.Edge) Test(org.junit.Test)

Example 65 with JobContext

use of org.apache.hadoop.mapreduce.JobContext in project cdap by caskdata.

the class HiveStreamInputFormat method getSplitFinder.

private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
    // first get the context we are in
    ContextManager.Context context = ContextManager.getContext(conf);
    Preconditions.checkNotNull(context);
    StreamConfig streamConfig = context.getStreamConfig(getStreamId(conf));
    // make sure we get the current generation so we don't read events that occurred before a truncate.
    Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(), StreamUtils.getGeneration(streamConfig));
    StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());
    // Get the Hive table path for the InputSplit created. It is just to satisfy Hive; the InputFormat never uses it.
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
    final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {

        @Override
        public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime, long start, long length, @Nullable String[] locations) {
            return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime, start, length, locations);
        }
    });
}
Also used : Path(org.apache.hadoop.fs.Path) StreamConfig(co.cask.cdap.data2.transaction.stream.StreamConfig) StreamInputSplitFinder(co.cask.cdap.data.stream.StreamInputSplitFinder) ContextManager(co.cask.cdap.hive.context.ContextManager) JobContext(org.apache.hadoop.mapreduce.JobContext) InputSplit(org.apache.hadoop.mapred.InputSplit) Location(org.apache.twill.filesystem.Location)
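
The ShimLoader call above is how Hive papers over the fact that JobContext was a concrete class in Hadoop 1.x and became an interface in 2.x. Targeting the Hadoop 2.x API alone, a minimal sketch of the same table-path lookup without the shim layer (the class and method names here are illustrative, not what the Hive shims do internally):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;

public class DirectJobContextSketch {
    // Hadoop 2.x only: construct the JobContext directly instead of via ShimLoader
    static Path[] tablePaths(JobConf conf) throws java.io.IOException {
        Job job = Job.getInstance(conf);
        JobContext jobContext = new JobContextImpl(job.getConfiguration(), job.getJobID());
        return FileInputFormat.getInputPaths(jobContext);
    }
}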

Aggregations

JobContext (org.apache.hadoop.mapreduce.JobContext): 85
Configuration (org.apache.hadoop.conf.Configuration): 41
Job (org.apache.hadoop.mapreduce.Job): 35
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 34
Test (org.junit.Test): 31
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl): 29
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 28
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 25
Path (org.apache.hadoop.fs.Path): 24
IOException (java.io.IOException): 22
File (java.io.File): 19
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 16
ArrayList (java.util.ArrayList): 13
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter): 11
JobConf (org.apache.hadoop.mapred.JobConf): 10
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter): 10
LongWritable (org.apache.hadoop.io.LongWritable): 9
MapFile (org.apache.hadoop.io.MapFile): 9
JobID (org.apache.hadoop.mapreduce.JobID): 7
FileSystem (org.apache.hadoop.fs.FileSystem): 6