Use of org.apache.hadoop.mapreduce.task.JobContextImpl in project hadoop by apache.
The class TestCopyCommitter, method testDeleteMissing.
@Test
public void testDeleteMissing() {
  TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
  JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(),
      taskAttemptContext.getTaskAttemptID().getJobID());
  Configuration conf = jobContext.getConfiguration();
  String sourceBase;
  String targetBase;
  FileSystem fs = null;
  try {
    OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
    fs = FileSystem.get(conf);
    sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    fs.rename(new Path(targetBaseAdd), new Path(targetBase));
    DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)),
        new Path("/out"));
    options.setSyncFolder(true);
    options.setDeleteMissing(true);
    options.appendToConf(conf);
    CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
    Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
    listing.buildListing(listingFile, options);
    conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
    conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
    committer.commitJob(jobContext);
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
    // Test for idempotent commit
    committer.commitJob(jobContext);
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
  } catch (Throwable e) {
    LOG.error("Exception encountered while testing for delete missing", e);
    Assert.fail("Delete missing failure");
  } finally {
    TestDistCpUtils.delete(fs, "/tmp1");
    conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
  }
}
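The test above relies on a getTaskAttemptContext(config) helper and a config field that are defined elsewhere in TestCopyCommitter and are not shown in this excerpt. As a rough orientation only, a minimal sketch of what such a helper could look like, assuming the standard org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl and TaskAttemptID constructors; the id values are purely illustrative:

// Hypothetical sketch only - the real TestCopyCommitter defines its own helper.
private static TaskAttemptContext getTaskAttemptContext(Configuration conf) {
  // jtIdentifier, job id, task type, task id, attempt id (illustrative values)
  TaskAttemptID attemptId = new TaskAttemptID("200707121733", 1, TaskType.MAP, 1, 1);
  return new TaskAttemptContextImpl(conf, attemptId);
}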
Use of org.apache.hadoop.mapreduce.task.JobContextImpl in project hadoop by apache.
The class TestCopyOutputFormat, method testCheckOutputSpecs.
@Test
public void testCheckOutputSpecs() {
  try {
    OutputFormat outputFormat = new CopyOutputFormat();
    Job job = Job.getInstance(new Configuration());
    JobID jobID = new JobID("200707121733", 1);
    try {
      JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
      outputFormat.checkOutputSpecs(context);
      Assert.fail("No checking for invalid work/commit path");
    } catch (IllegalStateException ignore) {
    }
    CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
    try {
      JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
      outputFormat.checkOutputSpecs(context);
      Assert.fail("No checking for invalid commit path");
    } catch (IllegalStateException ignore) {
    }
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, "");
    CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
    try {
      JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
      outputFormat.checkOutputSpecs(context);
      Assert.fail("No checking for invalid work path");
    } catch (IllegalStateException ignore) {
    }
    CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
    CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
    try {
      JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
      outputFormat.checkOutputSpecs(context);
    } catch (IllegalStateException ignore) {
      Assert.fail("Output spec check failed.");
    }
  } catch (IOException e) {
    LOG.error("Exception encountered while testing checkoutput specs", e);
    Assert.fail("Checkoutput Spec failure");
  } catch (InterruptedException e) {
    LOG.error("Exception encountered while testing checkoutput specs", e);
    Assert.fail("Checkoutput Spec failure");
  }
}
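The three negative cases above repeat the same build-a-fresh-JobContext / expect-IllegalStateException block. As a design note, they could be collapsed into a small helper; this is only a sketch using the same classes the test already uses, not code from the Hadoop repository:

// Sketch of a helper that expects checkOutputSpecs to reject the current configuration.
private static void assertSpecCheckFails(OutputFormat<?, ?> outputFormat, Job job,
    JobID jobID, String failureMessage) throws IOException, InterruptedException {
  try {
    JobContext context = new JobContextImpl(job.getConfiguration(), jobID);
    outputFormat.checkOutputSpecs(context);
    Assert.fail(failureMessage);
  } catch (IllegalStateException expected) {
    // expected: work/commit path is missing or invalid
  }
}

Each negative case then reduces to a single call, e.g. assertSpecCheckFails(outputFormat, job, jobID, "No checking for invalid commit path").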
Use of org.apache.hadoop.mapreduce.task.JobContextImpl in project carbondata by apache.
The class CarbonInputFormat, method getTableBlockInfo.
/**
 * Returns the table block info for all files of the given segment.
 *
 * @param job job context
 * @param segmentId the segment id
 * @return list of table block info
 * @throws IOException
 */
private List<TableBlockInfo> getTableBlockInfo(JobContext job,
    TableSegmentUniqueIdentifier tableSegmentUniqueIdentifier,
    Set<SegmentTaskIndexStore.TaskBucketHolder> taskKeys, UpdateVO updateDetails,
    SegmentUpdateStatusManager updateStatusManager, String segmentId,
    Set<SegmentTaskIndexStore.TaskBucketHolder> validTaskKeys) throws IOException {
  List<TableBlockInfo> tableBlockInfoList = new ArrayList<TableBlockInfo>();
  // get file location of all files of given segment
  JobContext newJob = new JobContextImpl(new Configuration(job.getConfiguration()), job.getJobID());
  newJob.getConfiguration().set(CarbonInputFormat.INPUT_SEGMENT_NUMBERS,
      tableSegmentUniqueIdentifier.getSegmentId() + "");
  // identify table blocks
  for (InputSplit inputSplit : getSplitsInternal(newJob)) {
    CarbonInputSplit carbonInputSplit = (CarbonInputSplit) inputSplit;
    // if the block is still valid according to the update details, add it as a TableBlockInfo
    if (isValidBlockBasedOnUpdateDetails(taskKeys, carbonInputSplit, updateDetails,
        updateStatusManager, segmentId, validTaskKeys)) {
      BlockletInfos blockletInfos = new BlockletInfos(carbonInputSplit.getNumberOfBlocklets(), 0,
          carbonInputSplit.getNumberOfBlocklets());
      tableBlockInfoList.add(
          new TableBlockInfo(carbonInputSplit.getPath().toString(), carbonInputSplit.getStart(),
              tableSegmentUniqueIdentifier.getSegmentId(), carbonInputSplit.getLocations(),
              carbonInputSplit.getLength(), blockletInfos, carbonInputSplit.getVersion(),
              carbonInputSplit.getBlockStorageIdMap()));
    }
  }
  return tableBlockInfoList;
}
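The pattern worth noting here is that the method never mutates the caller's JobContext: it copies the Configuration, overrides the segment filter on the copy, and wraps the copy in a fresh JobContextImpl with the same job id. Distilled to its core (only names already visible in the method above, plus illustrative local variable names):

// Copy the configuration so the override stays local to this lookup.
Configuration scopedConf = new Configuration(job.getConfiguration());
scopedConf.set(CarbonInputFormat.INPUT_SEGMENT_NUMBERS, segmentId);
// A fresh context that shares the job id but not the mutated configuration.
JobContext scopedJob = new JobContextImpl(scopedConf, job.getJobID());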
Use of org.apache.hadoop.mapreduce.task.JobContextImpl in project jena by apache.
The class AbstractBlankNodeTests, method blank_node_identity_02.
/**
* Test that starts with two blank nodes in two different files and checks
* that writing them to a single file does not conflate them
*
* @throws IOException
* @throws InterruptedException
*/
@Test
public void blank_node_identity_02() throws IOException, InterruptedException {
  Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
  Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
  // Temporary files
  File a = File.createTempFile("bnode_identity", getInitialInputExtension());
  File b = File.createTempFile("bnode_identity", getInitialInputExtension());
  File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile();
  try {
    // Prepare the input data
    // Same blank node but in different files, so it must be treated as
    // two different blank nodes that do not converge
    List<T> tuples = new ArrayList<>();
    Node bnode = NodeFactory.createBlankNode();
    Node pred = NodeFactory.createURI("http://example.org/predicate");
    tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
    writeTuples(a, tuples);
    tuples.clear();
    tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
    writeTuples(b, tuples);
    // Set up fake job which will process the two files
    Configuration config = new Configuration(true);
    InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    NLineInputFormat.setNumLinesPerSplit(job, 100);
    FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath()));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    // Get the splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(2, splits.size());
    // Prepare the output writing - putting all output to a single file
    OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
    TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 2, 1));
    RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
    for (InputSplit split : splits) {
      // Initialize the input reading
      TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
      RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
      reader.initialize(split, inputTaskContext);
      // Copy the input to the single output file
      while (reader.nextKeyValue()) {
        writer.write(reader.getCurrentKey(), reader.getCurrentValue());
      }
    }
    writer.close(outputTaskContext);
    // Promote outputs from temporary status
    promoteInputs(intermediateOutputDir);
    // Now we need to create a subsequent job that reads the
    // intermediate outputs
    // The blank nodes should have been given separate identities, so we
    // should not be conflating them; this is the opposite problem to
    // that described in JENA-820
    LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
    job = Job.getInstance(config);
    inputFormat = createIntermediateInputFormat();
    job.setInputFormatClass(inputFormat.getClass());
    NLineInputFormat.setNumLinesPerSplit(job, 100);
    FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
    context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    // Get the splits
    splits = inputFormat.getSplits(context);
    Assert.assertEquals(1, splits.size());
    // Expect to end up with two distinct blank nodes
    Set<Node> nodes = new HashSet<Node>();
    for (InputSplit split : splits) {
      TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
      RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
      reader.initialize(split, inputTaskContext);
      while (reader.nextKeyValue()) {
        nodes.add(getSubject(reader.getCurrentValue().get()));
      }
    }
    // Nodes must not have been conflated
    Assert.assertEquals(2, nodes.size());
  } finally {
    a.delete();
    b.delete();
    deleteDirectory(intermediateOutputDir);
  }
}
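Both Jena tests call a createAttemptID(jobId, taskId, attemptId) helper that AbstractBlankNodeTests defines elsewhere; it is not shown in this excerpt. A purely hypothetical sketch of such a helper, assuming the standard mapreduce TaskAttemptID constructor and an arbitrary jtIdentifier string:

// Hypothetical sketch only - the real AbstractBlankNodeTests supplies its own implementation.
protected TaskAttemptID createAttemptID(int jobId, int taskId, int attemptId) {
  return new TaskAttemptID("blank-node-tests", jobId, TaskType.MAP, taskId, attemptId);
}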
Use of org.apache.hadoop.mapreduce.task.JobContextImpl in project jena by apache.
The class AbstractBlankNodeTests, method blank_node_divergence_01.
/**
* Test that starts with two blank nodes with the same identity in a single
file, splits them over two files and checks that we can work around
* JENA-820 successfully by setting the
* {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job
*
* @throws IOException
* @throws InterruptedException
*/
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
  Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
  Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
  // Temporary files
  File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
  File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();
  try {
    // Prepare the input data
    // Two mentions of the same blank node in the same file
    List<T> tuples = new ArrayList<>();
    Node bnode = NodeFactory.createBlankNode();
    Node pred = NodeFactory.createURI("http://example.org/predicate");
    tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
    tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
    writeTuples(a, tuples);
    // Set up fake job which will process the file as a single split
    Configuration config = new Configuration(true);
    InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    NLineInputFormat.setNumLinesPerSplit(job, 100);
    FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    // Get the splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(1, splits.size());
    for (InputSplit split : splits) {
      // Initialize the input reading
      TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
      RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
      reader.initialize(split, inputTaskContext);
      // Copy the input to the output - each triple goes to a separate
      // output file
      // This is how we force multiple files to be produced
      int taskID = 1;
      while (reader.nextKeyValue()) {
        // Prepare the output writing
        OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
        TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1));
        RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
        writer.write(reader.getCurrentKey(), reader.getCurrentValue());
        writer.close(outputTaskContext);
      }
    }
    // Promote outputs from temporary status
    promoteInputs(intermediateOutputDir);
    // Now we need to create a subsequent job that reads the
    // intermediate outputs
    // As described in JENA-820, at this point the blank nodes are
    // consistent; however, when read back from different files they are
    // by default treated as different nodes, so the blank nodes
    // diverge, which is incorrect and undesirable behaviour in
    // multi-stage pipelines
    LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
    job = Job.getInstance(config);
    inputFormat = createIntermediateInputFormat();
    job.setInputFormatClass(inputFormat.getClass());
    FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
    // Enabling this flag works around the JENA-820 issue
    job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
    context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    // Get the splits
    splits = inputFormat.getSplits(context);
    Assert.assertEquals(2, splits.size());
    // Expect to end up with a single blank node
    Set<Node> nodes = new HashSet<Node>();
    for (InputSplit split : splits) {
      TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
      RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
      reader.initialize(split, inputTaskContext);
      while (reader.nextKeyValue()) {
        nodes.add(getSubject(reader.getCurrentValue().get()));
      }
    }
    // Nodes should not have diverged
    Assert.assertEquals(1, nodes.size());
  } finally {
    a.delete();
    deleteDirectory(intermediateOutputDir);
  }
}
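Distilled from the test above, the JENA-820 workaround itself is just one configuration flag set on the follow-up job before its JobContext is built (the followUp variable names below are illustrative):

// Ask the RDF input formats to map blank node labels to a job-wide identity,
// so the same label read from different intermediate files yields the same node.
Job followUp = Job.getInstance(config);
followUp.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
JobContext followUpContext = new JobContextImpl(followUp.getConfiguration(), followUp.getJobID());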