use of org.apache.hadoop.mapreduce.lib.output.TextOutputFormat in project hadoop by apache.
the class TestRecovery method writeOutput.
private void writeOutput(TaskAttempt attempt, Configuration conf) throws Exception {
  TaskAttemptContext tContext = new TaskAttemptContextImpl(conf,
      TypeConverter.fromYarn(attempt.getID()));

  TextOutputFormat<?, ?> theOutputFormat = new TextOutputFormat();
  RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(tContext);

  NullWritable nullWritable = NullWritable.get();
  try {
    // Mix real keys/values with null and NullWritable to exercise the
    // separator handling in TextOutputFormat's LineRecordWriter.
    theRecordWriter.write(key1, val1);
    theRecordWriter.write(null, nullWritable);
    theRecordWriter.write(null, val1);
    theRecordWriter.write(nullWritable, val2);
    theRecordWriter.write(key2, nullWritable);
    theRecordWriter.write(key1, null);
    theRecordWriter.write(null, null);
    theRecordWriter.write(key2, val2);
  } finally {
    theRecordWriter.close(tContext);
  }

  // Commit the task's output through the committer configured for this context.
  OutputFormat outputFormat = ReflectionUtils.newInstance(
      tContext.getOutputFormatClass(), conf);
  OutputCommitter committer = outputFormat.getOutputCommitter(tContext);
  committer.commitTask(tContext);
}
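The mix of null and NullWritable arguments above exercises TextOutputFormat's LineRecordWriter, which skips a missing key or value, writes the tab separator only when both sides are present, and emits no line at all when both are missing. A minimal standalone sketch of that behavior (the output path and attempt ID here are illustrative, not taken from the test):

public class TextOutputNullDemo {
  public static void main(String[] args) throws Exception {
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    // Standard output-directory key; the local path is hypothetical.
    conf.set("mapreduce.output.fileoutputformat.outputdir", "file:///tmp/text-output-demo");
    org.apache.hadoop.mapreduce.TaskAttemptContext ctx =
        new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(conf,
            org.apache.hadoop.mapreduce.TaskAttemptID.forName(
                "attempt_200707121733_0001_m_000000_0"));

    org.apache.hadoop.mapreduce.lib.output.TextOutputFormat<
        org.apache.hadoop.io.Writable, org.apache.hadoop.io.Writable> format =
            new org.apache.hadoop.mapreduce.lib.output.TextOutputFormat<>();
    org.apache.hadoop.mapreduce.RecordWriter<
        org.apache.hadoop.io.Writable, org.apache.hadoop.io.Writable> writer =
            format.getRecordWriter(ctx);
    try {
      writer.write(new org.apache.hadoop.io.Text("k"),
          new org.apache.hadoop.io.Text("v"));                    // line: "k\tv"
      writer.write(null, new org.apache.hadoop.io.Text("v"));     // line: "v" (no key, no tab)
      writer.write(new org.apache.hadoop.io.Text("k"),
          org.apache.hadoop.io.NullWritable.get());               // line: "k"
      writer.write(null, null);                                   // no line written at all
    } finally {
      writer.close(ctx);
    }
  }
}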
use of org.apache.hadoop.mapreduce.lib.output.TextOutputFormat in project tez by apache.
the class WordCount method createDAG.
private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions) throws IOException {
  // Create the descriptor that describes the input data to Tez. Using MRInput to read text
  // data from the given input path. The TextInputFormat is used to read the text data.
  DataSourceDescriptor dataSource = MRInput
      .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
      .groupSplits(!isDisableSplitGrouping())
      .generateSplitsInAM(!isGenerateSplitInClient())
      .build();

  // Create a descriptor that describes the output data to Tez. Using MROutput to write text
  // data to the given output path. The TextOutputFormat is used to write the text data.
  DataSinkDescriptor dataSink = MROutput
      .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
      .build();

  // Create a vertex that reads the data from the data source and tokenizes it using the
  // TokenProcessor. The number of tasks that will do the work for this vertex will be decided
  // using the information provided by the data source descriptor.
  Vertex tokenizerVertex = Vertex
      .create(TOKENIZER, ProcessorDescriptor.create(TokenProcessor.class.getName()))
      .addDataSource(INPUT, dataSource);

  // Create the edge that represents the movement and semantics of data between the producer
  // Tokenizer vertex and the consumer Summation vertex. To perform the summation in parallel,
  // the tokenized data is partitioned by word so that a given word always goes to the same
  // partition, and the counts for each word are grouped together. To achieve this we use an
  // edge that contains an input/output pair that handles partitioning and grouping of
  // key-value data. The helper OrderedPartitionedKVEdgeConfig creates such an edge:
  // internally, it sets up matching Tez inputs and outputs that perform this logic. We
  // specify the key, value and partitioner types. Here the key type is Text (for the word),
  // the value type is IntWritable (for the count), and we use a hash-based partitioner. The
  // helper is a convenience; the edge could instead be configured by setting up the input,
  // output, etc. individually. The setFromConfiguration call is optional and allows
  // overriding the config options with command-line parameters.
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Create a vertex that reads the tokenized data and calculates the sum using the
  // SumProcessor. The number of tasks that do the work of this vertex depends on the number
  // of partitions used to distribute the sum processing. In this case, it has been made
  // configurable via the numPartitions parameter.
  Vertex summationVertex = Vertex
      .create(SUMMATION, ProcessorDescriptor.create(SumProcessor.class.getName()),
          numPartitions)
      .addDataSink(OUTPUT, dataSink);

  // No need to add the jar containing this class, as it is assumed to be part of the Tez
  // jars. Otherwise we would have to add the jars for this code as local files to the
  // vertices.

  // Create the DAG, add the vertices, and connect the producer and consumer vertices via
  // the edge.
  DAG dag = DAG.create("WordCount");
  dag.addVertex(tokenizerVertex)
      .addVertex(summationVertex)
      .addEdge(Edge.create(tokenizerVertex, summationVertex,
          edgeConf.createDefaultEdgeProperty()));
  return dag;
}
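Once built, a DAG like this is typically submitted through a TezClient. A minimal sketch of the driver side, assuming createDAG above is in scope and the paths and partition count come from the caller (error handling trimmed):

public int run(String inputPath, String outputPath, int numPartitions) throws Exception {
  TezConfiguration tezConf = new TezConfiguration();
  TezClient tezClient = TezClient.create("WordCount", tezConf);
  tezClient.start();
  try {
    DAG dag = createDAG(tezConf, inputPath, outputPath, numPartitions);
    // Submit the DAG and block until it completes.
    org.apache.tez.dag.api.client.DAGClient dagClient = tezClient.submitDAG(dag);
    org.apache.tez.dag.api.client.DAGStatus status = dagClient.waitForCompletion();
    return status.getState() == org.apache.tez.dag.api.client.DAGStatus.State.SUCCEEDED
        ? 0 : 1;
  } finally {
    tezClient.stop();
  }
}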
use of org.apache.hadoop.mapreduce.lib.output.TextOutputFormat in project hadoop by apache.
the class TestRecovery method writeBadOutput.
private void writeBadOutput(TaskAttempt attempt, Configuration conf) throws Exception {
  TaskAttemptContext tContext = new TaskAttemptContextImpl(conf,
      TypeConverter.fromYarn(attempt.getID()));

  TextOutputFormat<?, ?> theOutputFormat = new TextOutputFormat();
  RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(tContext);

  NullWritable nullWritable = NullWritable.get();
  try {
    // Same shape as writeOutput, but with the key/value pairings swapped so the
    // committed output differs from the expected ("good") output.
    theRecordWriter.write(key2, val2);
    theRecordWriter.write(null, nullWritable);
    theRecordWriter.write(null, val2);
    theRecordWriter.write(nullWritable, val1);
    theRecordWriter.write(key1, nullWritable);
    theRecordWriter.write(key2, null);
    theRecordWriter.write(null, null);
    theRecordWriter.write(key1, val1);
  } finally {
    theRecordWriter.close(tContext);
  }

  // Commit the task's output through the committer configured for this context.
  OutputFormat outputFormat = ReflectionUtils.newInstance(
      tContext.getOutputFormatClass(), conf);
  OutputCommitter committer = outputFormat.getOutputCommitter(tContext);
  committer.commitTask(tContext);
}
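Both helpers call commitTask directly. For context, the full commit protocol that the committer walks through in a normal job looks roughly like this; a sketch of the generic OutputCommitter API, not code from TestRecovery, with the contexts assumed to be provided:

/** Illustrative commit sequence for a configured OutputFormat. */
static void runCommitProtocol(org.apache.hadoop.mapreduce.OutputFormat<?, ?> outputFormat,
    org.apache.hadoop.mapreduce.JobContext jobContext,
    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext) throws Exception {
  org.apache.hadoop.mapreduce.OutputCommitter committer =
      outputFormat.getOutputCommitter(taskContext);
  committer.setupJob(jobContext);        // e.g. FileOutputCommitter creates _temporary
  committer.setupTask(taskContext);
  // ... the task writes its records through a RecordWriter here ...
  if (committer.needsTaskCommit(taskContext)) {
    committer.commitTask(taskContext);   // promote this attempt's files
  }
  committer.commitJob(jobContext);       // finalize output, write the _SUCCESS marker
}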
use of org.apache.hadoop.mapreduce.lib.output.TextOutputFormat in project elephant-bird by twitter.
the class TestLzoTextInputFormat method createTestInput.
/**
 * Creates an lzo file with random data.
 *
 * @param outputDir Output directory.
 * @param fs File system we're using.
 * @param attemptContext Task attempt context, contains task id etc.
 * @param charsToOutput Approximate number of characters to write.
 * @return MD5 digest of the data written.
 * @throws IOException
 * @throws InterruptedException
 */
private byte[] createTestInput(Path outputDir, FileSystem fs,
    TaskAttemptContext attemptContext, int charsToOutput)
    throws IOException, InterruptedException {
  TextOutputFormat<Text, Text> output = new TextOutputFormat<Text, Text>();
  RecordWriter<Text, Text> rw = null;
  md5_.reset();
  try {
    rw = output.getRecordWriter(attemptContext);
    char[] chars = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6".toCharArray();
    Random r = new Random(System.currentTimeMillis());
    Text key = new Text();
    Text value = new Text();
    int charsMax = chars.length - 1;
    for (int i = 0; i < charsToOutput; ) {
      i += fillText(chars, r, charsMax, key);
      i += fillText(chars, r, charsMax, value);
      rw.write(key, value);
      md5_.update(key.getBytes(), 0, key.getLength());
      // text output format writes tab between the key and value
      md5_.update("\t".getBytes("UTF-8"));
      md5_.update(value.getBytes(), 0, value.getLength());
    }
  } finally {
    if (rw != null) {
      rw.close(attemptContext);
      OutputCommitter committer = output.getOutputCommitter(attemptContext);
      committer.commitTask(attemptContext);
      committer.commitJob(attemptContext);
    }
  }
  byte[] result = md5_.digest();
  md5_.reset();
  return result;
}
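The fillText helper is not part of this excerpt. A plausible shape for it, inferred from the call sites above (hypothetical, not the actual elephant-bird implementation):

// Hypothetical sketch: fills the Text with a short random string drawn from
// chars and returns how many characters were produced.
private int fillText(char[] chars, Random r, int charsMax, Text text) {
  StringBuilder sb = new StringBuilder();
  int stringLength = r.nextInt(charsMax) + 1;   // at least one character
  for (int j = 0; j < stringLength; j++) {
    sb.append(chars[r.nextInt(charsMax)]);
  }
  text.set(sb.toString());
  return stringLength;                          // counted toward charsToOutput
}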