
Example 56 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project akela by mozilla-metrics, in the class MultiScanTableInputFormat, method getSplits.

/* (non-Javadoc)
	 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
	 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    if (table == null) {
        throw new IOException("No table was provided.");
    }
    Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
    if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
        throw new IOException("Expecting at least one region.");
    }
    Set<InputSplit> splits = new HashSet<InputSplit>();
    for (int i = 0; i < keys.getFirst().length; i++) {
        String regionLocation = table.getRegionLocation(keys.getFirst()[i]).getServerAddress().getHostname();
        for (Scan s : scans) {
            byte[] startRow = s.getStartRow();
            byte[] stopRow = s.getStopRow();
            // determine if the given start and stop keys fall into the region
            if ((startRow.length == 0 || keys.getSecond()[i].length == 0 || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                // the split starts at the later of the scan start and the region start
                byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ? keys.getFirst()[i] : startRow;
                // the split stops at the earlier of the scan stop and the region end (an empty region end means "last region")
                byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
                InputSplit split = new TableSplit(table.getTableName(), splitStart, splitStop, regionLocation);
                splits.add(split);
            }
        }
    }
    return new ArrayList<InputSplit>(splits);
}
Also used: TableSplit (org.apache.hadoop.hbase.mapreduce.TableSplit), ArrayList (java.util.ArrayList), Scan (org.apache.hadoop.hbase.client.Scan), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapreduce.InputSplit), HashSet (java.util.HashSet)
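
The one-line overlap test and boundary clamps above are dense. Below is a minimal standalone sketch of the same logic; the class KeyRanges and its method names are illustrative helpers, not part of akela. As in HBase, an empty byte array means the range is unbounded on that side.

import org.apache.hadoop.hbase.util.Bytes;

// Illustrative sketch: the same empty-key-aware interval intersection
// that getSplits() applies per region and per scan.
public final class KeyRanges {

    /** Returns true if the scan range [startRow, stopRow) overlaps the region [regionStart, regionEnd). */
    static boolean overlaps(byte[] startRow, byte[] stopRow, byte[] regionStart, byte[] regionEnd) {
        boolean startsBeforeRegionEnd =
                startRow.length == 0 || regionEnd.length == 0 || Bytes.compareTo(startRow, regionEnd) < 0;
        boolean stopsAfterRegionStart =
                stopRow.length == 0 || Bytes.compareTo(stopRow, regionStart) > 0;
        return startsBeforeRegionEnd && stopsAfterRegionStart;
    }

    /** The split starts at the later of the scan start and the region start. */
    static byte[] clampStart(byte[] startRow, byte[] regionStart) {
        return (startRow.length == 0 || Bytes.compareTo(regionStart, startRow) >= 0) ? regionStart : startRow;
    }

    /** The split stops at the earlier of the scan stop and the region end; an empty region end means "last region". */
    static byte[] clampStop(byte[] stopRow, byte[] regionEnd) {
        return ((stopRow.length == 0 || Bytes.compareTo(regionEnd, stopRow) <= 0) && regionEnd.length > 0)
                ? regionEnd : stopRow;
    }
}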

Example 57 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project flink by apache, in the class HCatInputFormatBase, method createInputSplits.

@Override
public HadoopInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    configuration.setInt("mapreduce.input.fileinputformat.split.minsize", minNumSplits);
    JobContext jobContext = null;
    try {
        jobContext = HadoopUtils.instantiateJobContext(configuration, new JobID());
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    List<InputSplit> splits;
    try {
        splits = this.hCatInputFormat.getSplits(jobContext);
    } catch (InterruptedException e) {
        throw new IOException("Could not get Splits.", e);
    }
    HadoopInputSplit[] hadoopInputSplits = new HadoopInputSplit[splits.size()];
    for (int i = 0; i < hadoopInputSplits.length; i++) {
        hadoopInputSplits[i] = new HadoopInputSplit(i, splits.get(i), jobContext);
    }
    return hadoopInputSplits;
}
Also used: HadoopInputSplit (org.apache.flink.api.java.hadoop.mapreduce.wrapper.HadoopInputSplit), JobContext (org.apache.hadoop.mapreduce.JobContext), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapreduce.InputSplit), JobID (org.apache.hadoop.mapreduce.JobID), HCatException (org.apache.hive.hcatalog.common.HCatException)
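
The method above follows a common pattern: fetch the framework-native splits once, then pair each split with its index so downstream code can address splits deterministically. Here is a hedged sketch of that pattern in plain Hadoop terms; IndexedSplit and SplitWrapper are hypothetical names for illustration, not Flink classes.

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

// Each Hadoop split is paired with its position in the list.
final class IndexedSplit {
    final int splitNumber;
    final InputSplit hadoopSplit;

    IndexedSplit(int splitNumber, InputSplit hadoopSplit) {
        this.splitNumber = splitNumber;
        this.hadoopSplit = hadoopSplit;
    }
}

final class SplitWrapper {
    static IndexedSplit[] wrap(InputFormat<?, ?> format, JobContext context) throws IOException {
        List<InputSplit> splits;
        try {
            splits = format.getSplits(context);
        } catch (InterruptedException e) {
            // Mirror the snippet above: surface interruption as an IOException.
            throw new IOException("Could not get splits.", e);
        }
        IndexedSplit[] indexed = new IndexedSplit[splits.size()];
        for (int i = 0; i < indexed.length; i++) {
            indexed[i] = new IndexedSplit(i, splits.get(i));
        }
        return indexed;
    }
}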

Example 58 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project alluxio by Alluxio, in the class KeyValueInputFormat, method getSplits.

/**
   * Returns a list of {@link KeyValueInputSplit} where each split is one key-value partition.
   *
   * @param jobContext MapReduce job configuration
   * @return a list of {@link InputSplit}s, one split per partition
   * @throws IOException if information about the partition cannot be retrieved
   */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    // The paths are MapReduce program's inputs specified in
    // {@code mapreduce.input.fileinputformat.inputdir}, each path should be a key-value store.
    Path[] paths = FileInputFormat.getInputPaths(jobContext);
    List<InputSplit> splits = new ArrayList<>();
    try {
        for (Path path : paths) {
            List<PartitionInfo> partitionInfos = mKeyValueMasterClient.getPartitionInfo(new AlluxioURI(path.toString()));
            for (PartitionInfo partitionInfo : partitionInfos) {
                splits.add(new KeyValueInputSplit(partitionInfo));
            }
        }
    } catch (AlluxioException e) {
        throw new IOException(e);
    }
    return splits;
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), PartitionInfo (alluxio.thrift.PartitionInfo), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapreduce.InputSplit), AlluxioURI (alluxio.AlluxioURI), AlluxioException (alluxio.exception.AlluxioException)
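
KeyValueInputFormat maps each partition to exactly one split. For a custom split like KeyValueInputSplit to travel from the client to the tasks, it typically extends the abstract org.apache.hadoop.mapreduce.InputSplit and implements Writable. A minimal sketch, assuming a split identified by a numeric partition id; PartitionIdSplit and its fields are hypothetical stand-ins, not Alluxio code.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;

public class PartitionIdSplit extends InputSplit implements Writable {
    private long partitionId;
    private long length;
    private String[] hosts;

    public PartitionIdSplit() {
        // Required no-arg constructor for Writable deserialization.
    }

    public PartitionIdSplit(long partitionId, long length, String[] hosts) {
        this.partitionId = partitionId;
        this.length = length;
        this.hosts = hosts;
    }

    @Override
    public long getLength() {
        // Size in bytes, used by the framework to order splits.
        return length;
    }

    @Override
    public String[] getLocations() {
        // Hosts where the partition data is local, for scheduling.
        return hosts;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(partitionId);
        out.writeLong(length);
        out.writeInt(hosts.length);
        for (String host : hosts) {
            out.writeUTF(host);
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        partitionId = in.readLong();
        length = in.readLong();
        hosts = new String[in.readInt()];
        for (int i = 0; i < hosts.length; i++) {
            hosts[i] = in.readUTF();
        }
    }
}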

Example 59 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project beam by apache, in the class HadoopInputFormatIOTest, method testComputeSplitsIfGetSplitsReturnsEmptyList.

/**
   * This test validates the behavior of
   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when the
   * Hadoop InputFormat's {@link InputFormat#getSplits(JobContext)} returns an empty list.
   */
@Test
public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
    InputFormat<?, ?> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(new ArrayList<InputSplit>());
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
            new HadoopInputFormatBoundedSource<Text, Employee>(
                    serConf,
                    WritableCoder.of(Text.class),
                    AvroCoder.of(Employee.class),
                    // No key translation required.
                    null,
                    // No value translation required.
                    null,
                    mockInputSplit);
    thrown.expect(IOException.class);
    thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");
    hifSource.setInputFormatObj(mockInputFormat);
    hifSource.computeSplitsIfNecessary();
}
Also used: HadoopInputFormatBoundedSource (org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.HadoopInputFormatBoundedSource), SerializableSplit (org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableSplit), Text (org.apache.hadoop.io.Text), JobContext (org.apache.hadoop.mapreduce.JobContext), InputSplit (org.apache.hadoop.mapreduce.InputSplit), NewObjectsEmployeeInputSplit (org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.NewObjectsEmployeeInputSplit), Test (org.junit.Test)

Example 60 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project beam by apache, in the class HadoopInputFormatIOTest, method testComputeSplitsIfGetSplitsReturnsListHavingNullValues.

/**
   * This test validates the behavior of
   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when the
   * Hadoop InputFormat's {@link InputFormat#getSplits(JobContext)} returns an InputSplit list
   * containing null values.
   */
@Test
public void testComputeSplitsIfGetSplitsReturnsListHavingNullValues() throws Exception {
    // InputSplit list containing a null value.
    InputSplit mockInputSplit = Mockito.mock(InputSplit.class, Mockito.withSettings().extraInterfaces(Writable.class));
    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
    inputSplitList.add(mockInputSplit);
    inputSplitList.add(null);
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(inputSplitList);
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
            new HadoopInputFormatBoundedSource<Text, Employee>(
                    serConf,
                    WritableCoder.of(Text.class),
                    AvroCoder.of(Employee.class),
                    // No key translation required.
                    null,
                    // No value translation required.
                    null,
                    new SerializableSplit());
    thrown.expect(IOException.class);
    thrown.expectMessage("Error in computing splits, split is null in InputSplits list populated " + "by getSplits() : ");
    hifSource.setInputFormatObj(mockInputFormat);
    hifSource.computeSplitsIfNecessary();
}
Also used: ArrayList (java.util.ArrayList), HadoopInputFormatBoundedSource (org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.HadoopInputFormatBoundedSource), Writable (org.apache.hadoop.io.Writable), LongWritable (org.apache.hadoop.io.LongWritable), SerializableSplit (org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableSplit), Text (org.apache.hadoop.io.Text), JobContext (org.apache.hadoop.mapreduce.JobContext), InputSplit (org.apache.hadoop.mapreduce.InputSplit), NewObjectsEmployeeInputSplit (org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.NewObjectsEmployeeInputSplit), Test (org.junit.Test)
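
Taken together, the two tests above pin down the defensive checks that computeSplitsIfNecessary() performs on the result of getSplits(): reject an empty list, and reject any null entry. A minimal sketch of such a validation helper; validateSplits is a hypothetical name, not Beam's actual method, and the error messages mirror what the tests expect (including the original wording).

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.mapreduce.InputSplit;

final class SplitValidation {
    // Fails fast on the two bad shapes exercised by the tests above.
    static void validateSplits(List<InputSplit> splits) throws IOException {
        if (splits == null || splits.isEmpty()) {
            throw new IOException("Error in computing splits, getSplits() returns a empty list");
        }
        for (InputSplit split : splits) {
            if (split == null) {
                throw new IOException(
                        "Error in computing splits, split is null in InputSplits list populated by getSplits() : "
                                + splits);
            }
        }
    }
}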

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit): 160
Configuration (org.apache.hadoop.conf.Configuration): 70
Test (org.junit.Test): 68
ArrayList (java.util.ArrayList): 51
Path (org.apache.hadoop.fs.Path): 43
Job (org.apache.hadoop.mapreduce.Job): 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 38
IOException (java.io.IOException): 33
JobContext (org.apache.hadoop.mapreduce.JobContext): 20
LongWritable (org.apache.hadoop.io.LongWritable): 19
FileSystem (org.apache.hadoop.fs.FileSystem): 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit): 13
List (java.util.List): 13
Text (org.apache.hadoop.io.Text): 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 13
DBObject (com.mongodb.DBObject): 10
File (java.io.File): 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest): 9