Use of org.apache.hadoop.mapreduce.JobContext in project jena by apache: class AbstractBlankNodeTests, method blank_node_divergence_01.
/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and checks that we can work around
 * JENA-820 successfully by setting the
 * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);
        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());
        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }
        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);
        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        // Enabling this flag works around the JENA-820 issue
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());
        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should not have diverged
        Assert.assertEquals(1, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
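The essence of the workaround this test verifies is a single configuration flag set before the JobContext is built. A minimal sketch of that step, assuming the same RdfIOConstants and JobContextImpl classes the test already uses (the helper method name is invented for illustration):

private static JobContext contextWithGlobalBNodeIdentity(Configuration config) throws IOException {
    // Make blank node identity global across all input files in the job (the JENA-820 workaround)
    config.setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
    Job job = Job.getInstance(config);
    return new JobContextImpl(job.getConfiguration(), job.getJobID());
}

Splits generated from such a context then share one blank node allocation scope rather than one per input file, which is exactly what the final assertion on nodes.size() checks.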
Use of org.apache.hadoop.mapreduce.JobContext in project jena by apache: class AbstractNodeTupleInputFormatTests, method testSplitInputs.
protected final void testSplitInputs(Configuration config, File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException {
    // Set up fake job
    InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    for (File input : inputs) {
        this.addInputPath(input, job.getConfiguration(), job);
    }
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length);
    // Check splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(expectedSplits, splits.size());
    // Check tuples
    int count = 0;
    for (InputSplit split : splits) {
        // Validate split
        Assert.assertTrue(this.isValidSplit(split, config));
        // Read split
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        count += this.countTuples(reader);
    }
    Assert.assertEquals(expectedTuples, count);
}
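The countTuples helper used above belongs to the test base class and is not shown here; a plausible stand-in is simply draining the RecordReader (a sketch only, not the project's actual implementation):

private int countRecords(RecordReader<LongWritable, T> reader) throws IOException, InterruptedException {
    int count = 0;
    // Advance through every key/value pair produced for the split
    while (reader.nextKeyValue()) {
        count++;
    }
    reader.close();
    return count;
}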
Use of org.apache.hadoop.mapreduce.JobContext in project jena by apache: class AbstractNodeTupleInputFormatTests, method testMultipleInputs.
/**
 * Runs a multiple input test
 *
 * @param inputs
 *            Inputs
 * @param expectedSplits
 *            Number of splits expected
 * @param expectedTuples
 *            Number of tuples expected
 * @throws IOException
 * @throws InterruptedException
 */
protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException {
    // Prepare configuration and inputs
    Configuration config = this.prepareConfiguration();
    // Set up fake job
    InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    for (File input : inputs) {
        this.addInputPath(input, job.getConfiguration(), job);
    }
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length);
    NLineInputFormat.setNumLinesPerSplit(job, expectedTuples);
    // Check splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(expectedSplits, splits.size());
    // Check tuples
    int count = 0;
    for (InputSplit split : splits) {
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        count += this.countTuples(reader);
    }
    Assert.assertEquals(expectedTuples, count);
}
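A concrete subclass would call this helper with its own test files; the sketch below is purely illustrative (the file names, split count, and tuple count are invented, not taken from the project):

@Test
public void multiple_inputs_example() throws IOException, InterruptedException {
    // Hypothetical inputs: two files expected to yield two splits and 200 tuples in total
    File[] inputs = new File[] { new File("input-a.nt"), new File("input-b.nt") };
    this.testMultipleInputs(inputs, 2, 200);
}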
Use of org.apache.hadoop.mapreduce.JobContext in project incubator-rya by apache: class GraphXEdgeInputFormatTest, method testInputFormat.
@SuppressWarnings("rawtypes")
@Test
public void testInputFormat() throws Exception {
    RyaStatement input = RyaStatement.builder()
            .setSubject(new RyaURI("http://www.google.com"))
            .setPredicate(new RyaURI("http://some_other_uri"))
            .setObject(new RyaURI("http://www.yahoo.com"))
            .setColumnVisibility(new byte[0])
            .setValue(new byte[0])
            .build();
    apiImpl.add(input);
    Job jobConf = Job.getInstance();
    GraphXEdgeInputFormat.setMockInstance(jobConf, instance.getInstanceName());
    GraphXEdgeInputFormat.setConnectorInfo(jobConf, username, password);
    GraphXEdgeInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO);
    GraphXEdgeInputFormat.setInputTableName(jobConf, table);
    GraphXEdgeInputFormat.setInputTableName(jobConf, table);
    GraphXEdgeInputFormat.setScanIsolation(jobConf, false);
    GraphXEdgeInputFormat.setLocalIterators(jobConf, false);
    GraphXEdgeInputFormat.setOfflineTableScan(jobConf, false);
    GraphXEdgeInputFormat inputFormat = new GraphXEdgeInputFormat();
    JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID());
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(1, splits.size());
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(), new TaskAttemptID(new TaskID(), 1));
    RecordReader reader = inputFormat.createRecordReader(splits.get(0), taskAttemptContext);
    RecordReader ryaStatementRecordReader = (RecordReader) reader;
    ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext);
    List<Edge> results = new ArrayList<Edge>();
    while (ryaStatementRecordReader.nextKeyValue()) {
        Edge writable = (Edge) ryaStatementRecordReader.getCurrentValue();
        long srcId = writable.srcId();
        long destId = writable.dstId();
        RyaTypeWritable rtw = null;
        Object text = ryaStatementRecordReader.getCurrentKey();
        Edge<RyaTypeWritable> edge = new Edge<RyaTypeWritable>(srcId, destId, rtw);
        results.add(edge);
        System.out.println(text);
    }
    System.out.println(results.size());
    System.out.println(results);
    Assert.assertTrue(results.size() == 2);
}
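Stripped of the Accumulo-specific setup, this Rya test drives its InputFormat exactly the way the Jena tests above do: build a Job, wrap its configuration in a JobContextImpl, ask the format for splits, then pull records through a RecordReader per split. A generic version of that driver loop, assuming only standard Hadoop mapreduce classes, looks roughly like this:

private static <K, V> List<V> readAllValues(InputFormat<K, V> inputFormat, Job job) throws IOException, InterruptedException {
    List<V> values = new ArrayList<>();
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    for (InputSplit split : inputFormat.getSplits(context)) {
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        while (reader.nextKeyValue()) {
            // Note: many readers reuse the value object, so callers may need to copy it before storing
            values.add(reader.getCurrentValue());
        }
        reader.close();
    }
    return values;
}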
Use of org.apache.hadoop.mapreduce.JobContext in project cdap by caskdata: class HiveStreamInputFormat, method getSplitFinder.
private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
    // first get the context we are in
    ContextManager.Context context = ContextManager.getContext(conf);
    Preconditions.checkNotNull(context);
    StreamConfig streamConfig = context.getStreamConfig(getStreamId(conf));
    // make sure we get the current generation so we don't read events that occurred before a truncate.
    Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(), StreamUtils.getGeneration(streamConfig));
    StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());
    // Get the Hive table path for the InputSplit created. It is just to satisfy hive. The InputFormat never uses it.
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
    final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {
        @Override
        public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime, long start, long length, @Nullable String[] locations) {
            return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime, start, length, locations);
        }
    });
}
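Unlike the tests above, which instantiate org.apache.hadoop.mapreduce.task.JobContextImpl directly, this method obtains its JobContext through Hive's ShimLoader so that the context matches the Hadoop version Hive is running against. If version compatibility were not a concern, the same step could be written with a direct construction, as in the earlier examples (a sketch only; it deliberately bypasses the shim layer):

// Direct construction of the JobContext used only to resolve the table paths
Job job = Job.getInstance(conf);
JobContext jobContext = new JobContextImpl(job.getConfiguration(), job.getJobID());
final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);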