
Example 86 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project flink by apache.

the class HadoopInputFormatTest method testOpen.

@Test
public void testOpen() throws Exception {
    DummyInputFormat inputFormat = mock(DummyInputFormat.class);
    when(inputFormat.createRecordReader(any(InputSplit.class), any(TaskAttemptContext.class))).thenReturn(new DummyRecordReader());
    HadoopInputSplit inputSplit = mock(HadoopInputSplit.class);
    HadoopInputFormat<String, Long> hadoopInputFormat = setupHadoopInputFormat(inputFormat, Job.getInstance(), null);
    hadoopInputFormat.open(inputSplit);
    verify(inputFormat, times(1)).createRecordReader(any(InputSplit.class), any(TaskAttemptContext.class));
    assertThat(hadoopInputFormat.fetched, is(false));
}
Also used : TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) HadoopInputSplit(org.apache.flink.api.java.hadoop.mapreduce.wrapper.HadoopInputSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
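
For context, a minimal sketch of how this wrapper is typically used from the Flink batch API, reading text files through Hadoop's mapreduce TextInputFormat; the input path is a placeholder. Flink's open() on the wrapper is what ends up calling createRecordReader(InputSplit, TaskAttemptContext), which is exactly the interaction the test above verifies.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class HadoopInputFormatUsage {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        Job job = Job.getInstance();
        // Placeholder input path.
        FileInputFormat.addInputPath(job, new Path("hdfs:///tmp/input"));
        // Wrap the Hadoop mapreduce InputFormat so Flink can drive it.
        HadoopInputFormat<LongWritable, Text> hadoopIF =
                new HadoopInputFormat<>(new TextInputFormat(), LongWritable.class, Text.class, job);
        DataSet<Tuple2<LongWritable, Text>> lines = env.createInput(hadoopIF);
        lines.first(10).print();
    }
}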

Example 87 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.

the class DataReaderSlave method main.

public static void main(String[] args) throws IOException, ClassNotFoundException {
    ObjectInputStream ois = new ObjectInputStream(new FileInputStream(new File(args[0])));
    ReaderContext cntxt = (ReaderContext) ois.readObject();
    ois.close();
    String[] inpSplitsToRead = args[1].split(",");
    List<InputSplit> splits = cntxt.getSplits();
    for (int i = 0; i < inpSplitsToRead.length; i++) {
        InputSplit split = splits.get(Integer.parseInt(inpSplitsToRead[i]));
        HCatReader reader = DataTransferFactory.getHCatReader(split, cntxt.getConf());
        Iterator<HCatRecord> itr = reader.read();
        File f = new File(args[2] + "-" + i);
        f.delete();
        BufferedWriter outFile = new BufferedWriter(new FileWriter(f));
        while (itr.hasNext()) {
            String rec = itr.next().toString().replaceFirst("\\s+$", "");
            System.err.println(rec);
            outFile.write(rec + "\n");
        }
        outFile.close();
    }
}
Also used : FileWriter(java.io.FileWriter) FileInputStream(java.io.FileInputStream) HCatReader(org.apache.hive.hcatalog.data.transfer.HCatReader) BufferedWriter(java.io.BufferedWriter) ReaderContext(org.apache.hive.hcatalog.data.transfer.ReaderContext) File(java.io.File) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord) ObjectInputStream(java.io.ObjectInputStream)
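
The slave above assumes a ReaderContext that was serialized elsewhere. A hedged sketch of the corresponding master side, using the same HCatalog data-transfer API, might look like the following; the database, table name, and metastore URI are placeholder values.

import java.io.File;
import java.io.FileOutputStream;
import java.io.ObjectOutputStream;
import java.util.HashMap;
import java.util.Map;
import org.apache.hive.hcatalog.data.transfer.DataTransferFactory;
import org.apache.hive.hcatalog.data.transfer.HCatReader;
import org.apache.hive.hcatalog.data.transfer.ReadEntity;
import org.apache.hive.hcatalog.data.transfer.ReaderContext;

public class DataReaderMasterSketch {
    public static void main(String[] args) throws Exception {
        // Describe what to read; database and table are placeholders.
        ReadEntity entity = new ReadEntity.Builder()
                .withDatabase("default")
                .withTable("my_table")
                .build();
        Map<String, String> config = new HashMap<>();
        // Assumed metastore location for illustration only.
        config.put("hive.metastore.uris", "thrift://metastore:9083");
        // prepareRead() runs on the master and yields a serializable ReaderContext
        // holding the InputSplits that the slave indexes into.
        HCatReader masterReader = DataTransferFactory.getHCatReader(entity, config);
        ReaderContext context = masterReader.prepareRead();
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(new File(args[0])))) {
            oos.writeObject(context);
        }
    }
}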

Example 88 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.

the class TestE2EScenarios method copyTable.

private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));
    // Test HCatContext
    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get().getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }
    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);
    for (InputSplit split : inpy.getSplits(ijob)) {
        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());
        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);
        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);
        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }
    oc.commitJob(ojob);
}
Also used : OutputCommitter(org.apache.hadoop.mapreduce.OutputCommitter) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HashMap(java.util.HashMap) WritableComparable(org.apache.hadoop.io.WritableComparable) HCatOutputFormat(org.apache.hive.hcatalog.mapreduce.HCatOutputFormat) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HCatInputFormat(org.apache.hive.hcatalog.mapreduce.HCatInputFormat) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
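
createTaskAttemptContext() is a helper defined elsewhere in the test. One plausible shape for it, assuming the plain Hadoop 2 mapreduce API rather than Hive's shim layer (which the real test uses to stay compatible across Hadoop versions), is:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

class TaskAttemptContextHelper {
    // Build a context around the given configuration with a fresh, empty task attempt id.
    static TaskAttemptContext createTaskAttemptContext(Configuration conf) {
        return new TaskAttemptContextImpl(conf, new TaskAttemptID());
    }
}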

Example 89 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.

the class ReaderContextImpl method writeExternal.

@Override
public void writeExternal(ObjectOutput out) throws IOException {
    conf.write(out);
    out.writeInt(splits.size());
    for (InputSplit split : splits) {
        ((HCatSplit) split).write(out);
    }
}
Also used : InputSplit(org.apache.hadoop.mapreduce.InputSplit) HCatSplit(org.apache.hive.hcatalog.mapreduce.HCatSplit)
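
For symmetry, a sketch of the matching readExternal() side, assuming the same conf (a Hadoop Configuration) and splits (a List of InputSplit) fields as the class above; the actual implementation may differ in detail.

@Override
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
    // Configuration implements Writable, so it can read itself back from the stream.
    conf = new Configuration();
    conf.readFields(in);
    int numSplits = in.readInt();
    List<InputSplit> restored = new ArrayList<>(numSplits);
    for (int i = 0; i < numSplits; i++) {
        // HCatSplit has a no-arg constructor for Writable-style deserialization.
        HCatSplit split = new HCatSplit();
        split.readFields(in);
        restored.add(split);
    }
    splits = restored;
}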

Example 90 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project crunch by cloudera.

the class CrunchInputFormat method getSplits.

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = Lists.newArrayList();
    Configuration conf = job.getConfiguration();
    Map<InputBundle, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs.getFormatNodeMap(job);
    // Walk the map of InputFormat bundles to their node-indexed paths
    for (Map.Entry<InputBundle, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
        InputBundle inputBundle = entry.getKey();
        Job jobCopy = new Job(conf);
        InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(inputBundle.getInputFormatClass(), jobCopy.getConfiguration());
        for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
            Integer nodeIndex = nodeEntry.getKey();
            List<Path> paths = nodeEntry.getValue();
            FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            List<InputSplit> pathSplits = format.getSplits(jobCopy);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new CrunchInputSplit(pathSplit, inputBundle.getInputFormatClass(), inputBundle.getExtraConfiguration(), nodeIndex, jobCopy.getConfiguration()));
            }
        }
    }
    return splits;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) InputFormat(org.apache.hadoop.mapreduce.InputFormat) FileInputFormat(org.apache.hadoop.mapreduce.lib.input.FileInputFormat) InputBundle(org.apache.crunch.io.impl.InputBundle) List(java.util.List) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Map(java.util.Map)
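
The read side that such wrapping enables is not shown here. A hypothetical helper (these are not Crunch's actual classes or method names) illustrating the usual delegate pattern, where the tagged InputFormat class is re-instantiated per task and handed the unwrapped split:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;

class DelegatingRecordReaderFactory {
    // Rebuild the record reader for a wrapped split: instantiate the InputFormat
    // class the split was tagged with and delegate to it with the inner split.
    static <K, V> RecordReader<K, V> createDelegateReader(
            InputSplit delegateSplit,
            Class<? extends InputFormat<K, V>> formatClass,
            TaskAttemptContext context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        InputFormat<K, V> format = ReflectionUtils.newInstance(formatClass, conf);
        return format.createRecordReader(delegateSplit, context);
    }
}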

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit): 160
Configuration (org.apache.hadoop.conf.Configuration): 70
Test (org.junit.Test): 68
ArrayList (java.util.ArrayList): 51
Path (org.apache.hadoop.fs.Path): 43
Job (org.apache.hadoop.mapreduce.Job): 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 38
IOException (java.io.IOException): 33
JobContext (org.apache.hadoop.mapreduce.JobContext): 20
LongWritable (org.apache.hadoop.io.LongWritable): 19
FileSystem (org.apache.hadoop.fs.FileSystem): 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit): 13
List (java.util.List): 13
Text (org.apache.hadoop.io.Text): 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 13
DBObject (com.mongodb.DBObject): 10
File (java.io.File): 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest): 9