
Example 1 with FileOutputFormat

Use of org.apache.hadoop.mapreduce.lib.output.FileOutputFormat in project spark-dataflow by cloudera.

From the class HadoopFileFormatPipelineTest, method testSequenceFile.

@Test
public void testSequenceFile() throws Exception {
    // Write the test fixture to inputFile, then run it through the pipeline.
    populateFile();
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    // The double cast works around Java's restriction on casting a raw Class
    // directly to a parameterized Class<? extends FileInputFormat<IntWritable, Text>>.
    @SuppressWarnings("unchecked")
    Class<? extends FileInputFormat<IntWritable, Text>> inputFormatClass =
        (Class<? extends FileInputFormat<IntWritable, Text>>) (Class<?>) SequenceFileInputFormat.class;
    HadoopIO.Read.Bound<IntWritable, Text> read =
        HadoopIO.Read.from(inputFile.getAbsolutePath(), inputFormatClass, IntWritable.class, Text.class);
    PCollection<KV<IntWritable, Text>> input = p.apply(read);
    @SuppressWarnings("unchecked")
    Class<? extends FileOutputFormat<IntWritable, Text>> outputFormatClass =
        (Class<? extends FileOutputFormat<IntWritable, Text>>) (Class<?>) TemplatedSequenceFileOutputFormat.class;
    @SuppressWarnings("unchecked")
    HadoopIO.Write.Bound<IntWritable, Text> write =
        HadoopIO.Write.to(outputFile.getAbsolutePath(), outputFormatClass, IntWritable.class, Text.class);
    // Force a single shard so the output can be read back as one SequenceFile.
    input.apply(write.withoutSharding());
    EvaluationResult res = SparkPipelineRunner.create().run(p);
    res.close();
    // Read the output back directly with a SequenceFile.Reader and verify every record.
    IntWritable key = new IntWritable();
    Text value = new Text();
    try (Reader reader = new Reader(new Configuration(), Reader.file(new Path(outputFile.toURI())))) {
        int i = 0;
        while (reader.next(key, value)) {
            assertEquals(i, key.get());
            assertEquals("value-" + i, value.toString());
            i++;
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), IntWritable (org.apache.hadoop.io.IntWritable), Reader (org.apache.hadoop.io.SequenceFile.Reader), Text (org.apache.hadoop.io.Text), FileInputFormat (org.apache.hadoop.mapreduce.lib.input.FileInputFormat), SequenceFileInputFormat (org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat), FileOutputFormat (org.apache.hadoop.mapreduce.lib.output.FileOutputFormat), Pipeline (com.google.cloud.dataflow.sdk.Pipeline), KV (com.google.cloud.dataflow.sdk.values.KV), Test (org.junit.Test)
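
The populateFile() helper is not shown in this result. Judging from the assertions above (key i paired with value "value-" + i), a minimal sketch of such a fixture writer might look like the following; the record count, the inputFile field, and the use of the Hadoop 2.x SequenceFile.createWriter(Configuration, Writer.Option...) overload are assumptions, not the project's actual code, and the snippet additionally relies on org.apache.hadoop.io.SequenceFile and java.io.IOException.

// Hypothetical sketch of the populateFile() fixture, inferred from the
// assertions in testSequenceFile (key = i, value = "value-" + i).
private void populateFile() throws IOException {
    Configuration conf = new Configuration();
    try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(new Path(inputFile.toURI())),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(Text.class))) {
        for (int i = 0; i < 5; i++) { // assumed record count
            writer.append(new IntWritable(i), new Text("value-" + i));
        }
    }
}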

Example 2 with FileOutputFormat

Use of org.apache.hadoop.mapreduce.lib.output.FileOutputFormat in project goldenorb by jzachr.

From the class OrbPartition, method dumpData.

private void dumpData() {
    Configuration conf = new Configuration();
    Job job = null;
    JobContext jobContext = null;
    TaskAttemptContext tao = null;
    RecordWriter rw;
    VertexWriter vw;
    FileOutputFormat outputFormat;
    boolean tryAgain = true;
    int count = 0;
    // Retry the dump up to 15 times, backing off one second between attempts.
    while (tryAgain && count < 15) {
        try {
            count++;
            tryAgain = false;
            if (job == null) {
                job = new Job(conf);
                job.setOutputFormatClass(TextOutputFormat.class);
                FileOutputFormat.setOutputPath(job,
                    new Path(getOrbConf().getNameNode() + getOrbConf().getFileOutputPath()));
            }
            if (jobContext == null) {
                jobContext = new JobContext(job.getConfiguration(), new JobID());
            }
            System.out.println(jobContext.getConfiguration().get("mapred.output.dir"));
            // Fabricate a task attempt for this partition so the output format
            // and its committer can be driven by hand, outside a real MapReduce job.
            tao = new TaskAttemptContext(jobContext.getConfiguration(),
                new TaskAttemptID(new TaskID(jobContext.getJobID(), true, getPartitionID()), 0));
            outputFormat = (FileOutputFormat) tao.getOutputFormatClass().newInstance();
            rw = outputFormat.getRecordWriter(tao);
            vw = (VertexWriter) getOrbConf().getVertexOutputFormatClass().newInstance();
            // Convert each vertex to a key/value pair and write it out.
            for (Vertex v : vertices.values()) {
                OrbContext oc = vw.vertexWrite(v);
                rw.write(oc.getKey(), oc.getValue());
            }
            rw.close(tao);
            // Drive the commit protocol by hand: promote this task's temporary
            // output if needed, then clean up the job's working directory.
            FileOutputCommitter cm = (FileOutputCommitter) outputFormat.getOutputCommitter(tao);
            if (cm.needsTaskCommit(tao)) {
                cm.commitTask(tao);
                cm.cleanupJob(jobContext);
            } else {
                cm.cleanupJob(jobContext);
                tryAgain = true;
            }
        } catch (IOException | InstantiationException | IllegalAccessException
                | ClassNotFoundException | InterruptedException e) {
            tryAgain = true;
            e.printStackTrace();
        }
        if (tryAgain) {
            synchronized (this) {
                try {
                    wait(1000); // back off before the next attempt
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), OrbConfiguration (org.goldenorb.conf.OrbConfiguration), IOException (java.io.IOException), Job (org.apache.hadoop.mapreduce.Job), JobID (org.apache.hadoop.mapreduce.JobID), JobContext (org.apache.hadoop.mapreduce.JobContext), TaskID (org.apache.hadoop.mapreduce.TaskID), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), RecordWriter (org.apache.hadoop.mapreduce.RecordWriter), TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat), FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter), OrbContext (org.goldenorb.io.output.OrbContext), VertexWriter (org.goldenorb.io.output.VertexWriter)
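
Note that the direct instantiation of JobContext and TaskAttemptContext above comes from the old Hadoop 0.20/1.x API, where both were concrete classes; in Hadoop 2.x they are interfaces backed by JobContextImpl and TaskAttemptContextImpl. A minimal sketch of the same hand-driven write-and-commit sequence against the Hadoop 2.x classes might look like this; the output path, job identifier, and record values are placeholders, and this is an illustrative sketch rather than goldenorb's code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class ManualCommitSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/manual-commit-demo")); // placeholder path

        // Fabricate job and task attempt contexts, as dumpData does on the old API.
        JobID jobId = new JobID("demo", 1);
        JobContext jobContext = new JobContextImpl(job.getConfiguration(), jobId);
        TaskAttemptID taskId = new TaskAttemptID(new TaskID(jobId, TaskType.MAP, 0), 0);
        TaskAttemptContext tao = new TaskAttemptContextImpl(job.getConfiguration(), taskId);

        TextOutputFormat<NullWritable, Text> outputFormat = new TextOutputFormat<>();
        OutputCommitter cm = outputFormat.getOutputCommitter(tao);
        cm.setupJob(jobContext);
        cm.setupTask(tao);

        // Write into the task's temporary attempt directory.
        RecordWriter<NullWritable, Text> rw = outputFormat.getRecordWriter(tao);
        rw.write(NullWritable.get(), new Text("value-0")); // placeholder record
        rw.close(tao);

        // Promote the task's temporary output, then finalize the job directory.
        if (cm.needsTaskCommit(tao)) {
            cm.commitTask(tao);
        }
        cm.commitJob(jobContext);
    }
}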

Aggregations

Configuration (org.apache.hadoop.conf.Configuration): 2
Path (org.apache.hadoop.fs.Path): 2
FileOutputFormat (org.apache.hadoop.mapreduce.lib.output.FileOutputFormat): 2
Pipeline (com.google.cloud.dataflow.sdk.Pipeline): 1
KV (com.google.cloud.dataflow.sdk.values.KV): 1
IOException (java.io.IOException): 1
IntWritable (org.apache.hadoop.io.IntWritable): 1
Reader (org.apache.hadoop.io.SequenceFile.Reader): 1
Text (org.apache.hadoop.io.Text): 1
Job (org.apache.hadoop.mapreduce.Job): 1
JobContext (org.apache.hadoop.mapreduce.JobContext): 1
JobID (org.apache.hadoop.mapreduce.JobID): 1
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter): 1
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 1
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 1
TaskID (org.apache.hadoop.mapreduce.TaskID): 1
FileInputFormat (org.apache.hadoop.mapreduce.lib.input.FileInputFormat): 1
SequenceFileInputFormat (org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat): 1
FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter): 1
TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat): 1