Use of org.apache.hadoop.mapreduce.TaskAttemptID in project hive by apache.
From the class TestRCFileMapReduceInputFormat, method writeThenReadByRecordReader:
private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber,
        long maxSplitSize, CompressionCodec codec) throws IOException, InterruptedException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        writer.append(bytes);
    }
    writer.close();
    RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable> inputFormat =
            new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
    Configuration jobConf = new Configuration(cloneConf);
    jobConf.set("mapred.input.dir", testDir.toString());
    JobContext context = new Job(jobConf);
    HiveConf.setLongVar(context.getConfiguration(), HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);
    List<InputSplit> splits = inputFormat.getSplits(context);
    assertEquals("splits length should be " + splitNumber, splitNumber, splits.size());
    int readCount = 0;
    for (int i = 0; i < splits.size(); i++) {
        TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim()
                .createTaskAttemptContext(jobConf, new TaskAttemptID());
        RecordReader<LongWritable, BytesRefArrayWritable> rr = inputFormat.createRecordReader(splits.get(i), tac);
        rr.initialize(splits.get(i), tac);
        while (rr.nextKeyValue()) {
            readCount++;
        }
    }
    assertEquals("readCount should be equal to writeCount", writeCount, readCount);
}
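The TaskAttemptID usage that matters here is in the read-back loop: a blank new TaskAttemptID() is enough to build a TaskAttemptContext and drive a mapreduce-API RecordReader outside of a running job (Hive routes the construction through its shim layer for Hadoop-version compatibility). A minimal sketch of the same pattern against plain Hadoop 2 classes; TextInputFormat, the DirectRecordReaderDemo class name, and the local input path are illustrative choices, not part of the Hive test:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class DirectRecordReaderDemo {
    /** Counts records under 'input' by driving the new-API RecordReader directly. */
    public static long countRecords(Configuration conf, Path input) throws Exception {
        Job job = Job.getInstance(conf);
        FileInputFormat.setInputPaths(job, input);
        TextInputFormat inputFormat = new TextInputFormat();
        long count = 0;
        for (InputSplit split : inputFormat.getSplits(job)) {
            // A dummy TaskAttemptID satisfies the TaskAttemptContext contract
            // when no real task is running.
            TaskAttemptContext tac = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, tac);
            reader.initialize(split, tac);
            while (reader.nextKeyValue()) {
                count++;
            }
            reader.close();
        }
        return count;
    }
}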
Use of org.apache.hadoop.mapreduce.TaskAttemptID in project hive by apache.
From the class LlapBaseInputFormat, method getRecordReader:
@SuppressWarnings("unchecked")
@Override
public RecordReader<NullWritable, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    LlapInputSplit llapSplit = (LlapInputSplit) split;
    // Set conf to use LLAP user rather than current user for LLAP Zk registry.
    HiveConf.setVar(job, HiveConf.ConfVars.LLAP_ZK_REGISTRY_USER, llapSplit.getLlapUser());
    SubmitWorkInfo submitWorkInfo = SubmitWorkInfo.fromBytes(llapSplit.getPlanBytes());
    ServiceInstance serviceInstance = getServiceInstance(job, llapSplit);
    String host = serviceInstance.getHost();
    int llapSubmitPort = serviceInstance.getRpcPort();
    LOG.info("Found service instance for host " + host + " with rpc port " + llapSubmitPort
            + " and outputformat port " + serviceInstance.getOutputFormatPort());
    byte[] llapTokenBytes = llapSplit.getTokenBytes();
    Token<LlapTokenIdentifier> llapToken = null;
    if (llapTokenBytes != null) {
        DataInputBuffer in = new DataInputBuffer();
        in.reset(llapTokenBytes, 0, llapTokenBytes.length);
        llapToken = new Token<LlapTokenIdentifier>();
        llapToken.readFields(in);
    }
    LlapRecordReaderTaskUmbilicalExternalResponder umbilicalResponder =
            new LlapRecordReaderTaskUmbilicalExternalResponder();
    LlapTaskUmbilicalExternalClient llapClient = new LlapTaskUmbilicalExternalClient(job,
            submitWorkInfo.getTokenIdentifier(), submitWorkInfo.getToken(), umbilicalResponder, llapToken);
    llapClient.init(job);
    llapClient.start();
    int attemptNum = 0;
    // Use task attempt number from conf if provided
    TaskAttemptID taskAttemptId = TaskAttemptID.forName(job.get(MRJobConfig.TASK_ATTEMPT_ID));
    if (taskAttemptId != null) {
        attemptNum = taskAttemptId.getId();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Setting attempt number to " + attemptNum + " from task attempt ID in conf: "
                    + job.get(MRJobConfig.TASK_ATTEMPT_ID));
        }
    }
    SubmitWorkRequestProto request = constructSubmitWorkRequestProto(submitWorkInfo, llapSplit.getSplitNum(),
            attemptNum, llapClient.getAddress(), submitWorkInfo.getToken(), llapSplit.getFragmentBytes(),
            llapSplit.getFragmentBytesSignature(), job);
    llapClient.submitWork(request, host, llapSubmitPort);
    Socket socket = new Socket(host, serviceInstance.getOutputFormatPort());
    LOG.debug("Socket connected");
    SignableVertexSpec vertex = SignableVertexSpec.parseFrom(submitWorkInfo.getVertexBinary());
    String fragmentId = Converters.createTaskAttemptId(vertex.getQueryIdentifier(), vertex.getVertexIndex(),
            request.getFragmentNumber(), request.getAttemptNumber()).toString();
    OutputStream socketStream = socket.getOutputStream();
    LlapOutputSocketInitMessage.Builder builder = LlapOutputSocketInitMessage.newBuilder().setFragmentId(fragmentId);
    if (llapSplit.getTokenBytes() != null) {
        builder.setToken(ByteString.copyFrom(llapSplit.getTokenBytes()));
    }
    builder.build().writeDelimitedTo(socketStream);
    socketStream.flush();
    LOG.info("Registered id: " + fragmentId);
    @SuppressWarnings("rawtypes")
    LlapBaseRecordReader recordReader = new LlapBaseRecordReader(socket.getInputStream(), llapSplit.getSchema(),
            Text.class, job, llapClient, (java.io.Closeable) socket);
    umbilicalResponder.setRecordReader(recordReader);
    return recordReader;
}
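Here TaskAttemptID.forName(...) parses the attempt_<jtId>_<jobId>_<m|r>_<task>_<attempt> string that MapReduce publishes under mapreduce.task.attempt.id, and only the trailing attempt number is kept. A small sketch of that round trip; the AttemptNumberFromConf class and the example ID string are made up for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TaskAttemptID;

public class AttemptNumberFromConf {
    /** Returns the attempt number from the configured task attempt ID, or 0 if none is set. */
    public static int attemptNumber(Configuration conf) {
        TaskAttemptID id = TaskAttemptID.forName(conf.get(MRJobConfig.TASK_ATTEMPT_ID));
        return id == null ? 0 : id.getId();
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration(false);
        // Example value only; inside a running task MapReduce sets this for you.
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, "attempt_1465000000000_0001_m_000002_3");
        TaskAttemptID id = TaskAttemptID.forName(conf.get(MRJobConfig.TASK_ATTEMPT_ID));
        System.out.println(id.getJobID());          // job_1465000000000_0001
        System.out.println(id.getTaskID().getId()); // 2 (the task number)
        System.out.println(id.getId());             // 3 (the attempt number used above)
        System.out.println(attemptNumber(conf));    // 3
    }
}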
Use of org.apache.hadoop.mapreduce.TaskAttemptID in project cassandra by apache.
From the class CqlInputFormat, method getSplits:
// Old Hadoop API
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    TaskAttemptContext tac = HadoopCompat.newTaskAttemptContext(jobConf, new TaskAttemptID());
    List<org.apache.hadoop.mapreduce.InputSplit> newInputSplits = this.getSplits(tac);
    InputSplit[] oldInputSplits = new InputSplit[newInputSplits.size()];
    for (int i = 0; i < newInputSplits.size(); i++) {
        oldInputSplits[i] = (ColumnFamilySplit) newInputSplits.get(i);
    }
    return oldInputSplits;
}
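HadoopCompat only hides how a TaskAttemptContext is constructed across Hadoop versions; the underlying bridge from the old mapred API to the new mapreduce API is a synthetic context built from a blank TaskAttemptID. A generic sketch of that bridge, assuming Hadoop 2's TaskAttemptContextImpl (the NewApiSplitBridge name is illustrative):

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public final class NewApiSplitBridge {
    private NewApiSplitBridge() {
    }

    /** Computes splits with a new-API InputFormat on behalf of an old-API caller. */
    public static List<InputSplit> splitsForOldApi(JobConf jobConf, InputFormat<?, ?> newApiFormat)
            throws IOException, InterruptedException {
        // No real task exists yet, so a blank TaskAttemptID is enough to build the context;
        // TaskAttemptContext extends JobContext, which is all getSplits() needs.
        TaskAttemptContext tac = new TaskAttemptContextImpl(jobConf, new TaskAttemptID());
        return newApiFormat.getSplits(tac);
    }
}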
Use of org.apache.hadoop.mapreduce.TaskAttemptID in project flink by apache.
From the class HadoopOutputFormatBase, method finalizeGlobal:
@Override
public void finalizeGlobal(int parallelism) throws IOException {
    JobContext jobContext;
    TaskAttemptContext taskContext;
    try {
        TaskAttemptID taskAttemptID = TaskAttemptID.forName("attempt__0000_r_"
                + String.format("%" + (6 - Integer.toString(1).length()) + "s", " ").replace(" ", "0")
                + Integer.toString(1) + "_0");
        jobContext = HadoopUtils.instantiateJobContext(this.configuration, new JobID());
        taskContext = HadoopUtils.instantiateTaskAttemptContext(this.configuration, taskAttemptID);
        this.outputCommitter = this.mapreduceOutputFormat.getOutputCommitter(taskContext);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    jobContext.getCredentials().addAll(this.credentials);
    Credentials currentUserCreds = getCredentialsFromUGI(UserGroupInformation.getCurrentUser());
    if (currentUserCreds != null) {
        jobContext.getCredentials().addAll(currentUserCreds);
    }
    // finalize HDFS output format
    if (this.outputCommitter != null) {
        this.outputCommitter.commitJob(jobContext);
    }
}
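The string being parsed above evaluates to attempt__0000_r_000001_0: an empty jobtracker identifier, job 0, reduce task 1, attempt 0. The same ID can be built directly from its parts with the public TaskAttemptID constructor; a sketch of the equivalence, not what Flink itself does:

import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;

public class AttemptIdConstruction {
    public static void main(String[] args) {
        // Same ID the string concatenation above produces:
        // empty jobtracker identifier, job 0, reduce task 1, attempt 0.
        TaskAttemptID byParts = new TaskAttemptID("", 0, TaskType.REDUCE, 1, 0);
        TaskAttemptID byString = TaskAttemptID.forName("attempt__0000_r_000001_0");
        System.out.println(byParts);                  // attempt__0000_r_000001_0
        System.out.println(byParts.equals(byString)); // true
    }
}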
Use of org.apache.hadoop.mapreduce.TaskAttemptID in project jena by apache.
From the class AbstractBlankNodeTests, method blank_node_divergence_02:
/**
 * Test that starts with two mentions of the same blank node in a single
 * file, splits them over two files and shows that they diverge in the
 * subsequent job when the JENA-820 workaround is not enabled
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_divergence_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());
    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);
        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());
        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext =
                    new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            // Copy the input to the output - each triple goes to a separate output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext =
                        new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }
        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);
        // Now we need to create a subsequent job that reads the intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines. However it is the default behaviour
        // because when we start from external inputs we want them to be
        // file scoped.
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        // Make sure JENA-820 flag is disabled
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, false);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());
        // Collect the subjects seen across the two splits; the blank node is
        // expected to diverge into two distinct nodes
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext =
                    new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should have diverged
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
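createAttemptID(...) is a helper of the test class; all it ultimately has to supply are the components of a mapreduce TaskAttemptID. A hypothetical stand-in, assuming a (job, task, attempt) argument order and map-type tasks, is sketched below; giving each RecordWriter its own attempt ID this way is what forces every tuple into a separate output file in the copy loop above.

import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;

public class AttemptIds {
    /**
     * Hypothetical stand-in for the test's createAttemptID() helper: builds a
     * distinct TaskAttemptID for a synthetic (job, task, attempt) triple.
     * The "test" jobtracker identifier and TaskType.MAP are assumptions.
     */
    public static TaskAttemptID createAttemptID(int jobId, int taskId, int attempt) {
        return new TaskAttemptID("test", jobId, TaskType.MAP, taskId, attempt);
    }

    public static void main(String[] args) {
        System.out.println(createAttemptID(1, 1, 1)); // attempt_test_0001_m_000001_1
        System.out.println(createAttemptID(1, 2, 1)); // attempt_test_0001_m_000002_1
    }
}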