Use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
Class TestWALRecordReader, method testPartialRead:
/**
 * Test partial reads from the log based on passed time range.
 * @throws Exception
 */
@Test
public void testPartialRead() throws Exception {
  final WALFactory walfactory = new WALFactory(conf, null, getName());
  WAL log = walfactory.getWAL(info.getEncodedNameAsBytes(), info.getTable().getNamespace());
  // This test depends on timestamp being millisecond based and the filename of the WAL also
  // being millisecond based.
  long ts = System.currentTimeMillis();
  WALEdit edit = new WALEdit();
  edit.add(new KeyValue(rowName, family, Bytes.toBytes("1"), ts, value));
  log.append(info, getWalKey(ts, scopes), edit, true);
  edit = new WALEdit();
  edit.add(new KeyValue(rowName, family, Bytes.toBytes("2"), ts + 1, value));
  log.append(info, getWalKey(ts + 1, scopes), edit, true);
  log.sync();
  LOG.info("Before 1st WAL roll " + log.toString());
  log.rollWriter();
  LOG.info("Past 1st WAL roll " + log.toString());
  Thread.sleep(1);
  long ts1 = System.currentTimeMillis();
  edit = new WALEdit();
  edit.add(new KeyValue(rowName, family, Bytes.toBytes("3"), ts1 + 1, value));
  log.append(info, getWalKey(ts1 + 1, scopes), edit, true);
  edit = new WALEdit();
  edit.add(new KeyValue(rowName, family, Bytes.toBytes("4"), ts1 + 2, value));
  log.append(info, getWalKey(ts1 + 2, scopes), edit, true);
  log.sync();
  log.shutdown();
  walfactory.shutdown();
  LOG.info("Closed WAL " + log.toString());
  WALInputFormat input = new WALInputFormat();
  Configuration jobConf = new Configuration(conf);
  jobConf.set("mapreduce.input.fileinputformat.inputdir", logDir.toString());
  jobConf.setLong(WALInputFormat.END_TIME_KEY, ts);
  // only the 1st file is considered, and only its 1st entry is used
  List<InputSplit> splits = input.getSplits(MapreduceTestingShim.createJobContext(jobConf));
  assertEquals(1, splits.size());
  testSplit(splits.get(0), Bytes.toBytes("1"));
  jobConf.setLong(WALInputFormat.START_TIME_KEY, ts + 1);
  jobConf.setLong(WALInputFormat.END_TIME_KEY, ts1 + 1);
  splits = input.getSplits(MapreduceTestingShim.createJobContext(jobConf));
  // both files need to be considered
  assertEquals(2, splits.size());
  // only the 2nd entry from the 1st file is used
  testSplit(splits.get(0), Bytes.toBytes("2"));
  // only the 1st entry from the 2nd file is used
  testSplit(splits.get(1), Bytes.toBytes("3"));
}
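For context, the WALInputFormat splits above are consumed the same way as the output of any org.apache.hadoop.mapreduce.InputFormat. A minimal standalone sketch of that getSplits/createRecordReader pattern, using TextInputFormat and a hypothetical local input file (the path and class name are illustrative, not part of the test above):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class SplitWalkthrough {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "split-walkthrough");
    // Hypothetical input file; any existing local or HDFS path works.
    FileInputFormat.addInputPath(job, new Path("/tmp/input.txt"));

    TextInputFormat inputFormat = new TextInputFormat();
    // Ask the InputFormat for its logical splits (WALInputFormat does the same,
    // filtered by START_TIME_KEY/END_TIME_KEY).
    List<InputSplit> splits = inputFormat.getSplits(job);

    for (InputSplit split : splits) {
      TaskAttemptContext ctx =
          new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
      // Open a RecordReader over each split and drain it, as the test's testSplit helper does.
      RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(split, ctx);
      rr.initialize(split, ctx);
      while (rr.nextKeyValue()) {
        System.out.println(rr.getCurrentKey() + "\t" + rr.getCurrentValue());
      }
      rr.close();
    }
  }
}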
Use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
Class TestTableSnapshotInputFormat, method verifyWithMockedMapReduce:
private void verifyWithMockedMapReduce(Job job, int numRegions, int expectedNumSplits,
    byte[] startRow, byte[] stopRow) throws IOException, InterruptedException {
  TableSnapshotInputFormat tsif = new TableSnapshotInputFormat();
  List<InputSplit> splits = tsif.getSplits(job);
  Assert.assertEquals(expectedNumSplits, splits.size());
  HBaseTestingUtility.SeenRowTracker rowTracker =
      new HBaseTestingUtility.SeenRowTracker(startRow, stopRow);
  for (int i = 0; i < splits.size(); i++) {
    // validate input split
    InputSplit split = splits.get(i);
    Assert.assertTrue(split instanceof TableSnapshotRegionSplit);
    // validate record reader
    TaskAttemptContext taskAttemptContext = mock(TaskAttemptContext.class);
    when(taskAttemptContext.getConfiguration()).thenReturn(job.getConfiguration());
    RecordReader<ImmutableBytesWritable, Result> rr =
        tsif.createRecordReader(split, taskAttemptContext);
    rr.initialize(split, taskAttemptContext);
    // validate we can read all the data back
    while (rr.nextKeyValue()) {
      byte[] row = rr.getCurrentKey().get();
      verifyRowFromMap(rr.getCurrentKey(), rr.getCurrentValue());
      rowTracker.addRow(row);
    }
    rr.close();
  }
  // validate all rows are seen
  rowTracker.validate();
}
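The test drives TableSnapshotInputFormat by hand with a mocked TaskAttemptContext; in a real job the same input format is wired up through TableMapReduceUtil. A minimal sketch, assuming a snapshot named "my_snapshot" already exists and the restore directory is writable (both values, and RowKeyMapper, are illustrative):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.mapreduce.Job;

public class SnapshotScanJob {
  // Hypothetical mapper: just re-emits each row key.
  static class RowKeyMapper extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable> {
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
        throws IOException, InterruptedException {
      context.write(key, key);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    Job job = Job.getInstance(conf, "snapshot-scan");
    job.setJarByClass(SnapshotScanJob.class);
    // Snapshot name and restore directory are illustrative values.
    TableMapReduceUtil.initTableSnapshotMapperJob("my_snapshot", new Scan(), RowKeyMapper.class,
        ImmutableBytesWritable.class, ImmutableBytesWritable.class, job, true,
        new Path("/tmp/snapshot-restore"));
    job.setNumReduceTasks(0);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}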
Use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
Class TestTableInputFormatScanBase, method testNumOfSplits:
/**
 * Tests a MR scan using data skew auto-balance.
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public void testNumOfSplits(String ratio, int expectedNumOfSplits)
    throws IOException, InterruptedException, ClassNotFoundException {
  String jobName = "TestJobForNumOfSplits";
  LOG.info("Before map/reduce startup - job " + jobName);
  Configuration c = new Configuration(TEST_UTIL.getConfiguration());
  Scan scan = new Scan();
  scan.addFamily(INPUT_FAMILYS[0]);
  scan.addFamily(INPUT_FAMILYS[1]);
  c.set("hbase.mapreduce.input.autobalance", "true");
  c.set("hbase.mapreduce.input.autobalance.maxskewratio", ratio);
  c.set(KEY_STARTROW, "");
  c.set(KEY_LASTROW, "");
  Job job = new Job(c, jobName);
  TableMapReduceUtil.initTableMapperJob(TABLE_NAME.getNameAsString(), scan, ScanMapper.class,
      ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
  TableInputFormat tif = new TableInputFormat();
  tif.setConf(job.getConfiguration());
  Assert.assertEquals(TABLE_NAME, table.getName());
  List<InputSplit> splits = tif.getSplits(job);
  Assert.assertEquals(expectedNumOfSplits, splits.size());
}
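The same auto-balance keys can be set on a plain Configuration without the test harness. A rough sketch, assuming a running HBase cluster and an existing table named "test_table" (both are assumptions, not part of the test above):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;

public class AutoBalanceSplitCount {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    conf.set(TableInputFormat.INPUT_TABLE, "test_table"); // illustrative table name
    // Enable split auto-balancing and cap the allowed skew ratio.
    conf.set("hbase.mapreduce.input.autobalance", "true");
    conf.set("hbase.mapreduce.input.autobalance.maxskewratio", "2.0");
    Job job = Job.getInstance(conf, "autobalance-split-count");
    TableInputFormat tif = new TableInputFormat();
    tif.setConf(job.getConfiguration());
    List<InputSplit> splits = tif.getSplits(job);
    System.out.println("splits: " + splits.size());
  }
}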
Use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.
Class HCatBaseInputFormat, method getSplits:
/**
 * Logically split the set of input files for the job. Returns the
 * underlying InputFormat's splits.
 * @param jobContext the job context object
 * @return the splits, an HCatInputSplit wrapper over the storage
 *         handler InputSplits
 * @throws IOException
 * @throws InterruptedException
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
  Configuration conf = jobContext.getConfiguration();
  // Get the job info from the configuration;
  // throws an exception if not initialized.
  InputJobInfo inputJobInfo;
  try {
    inputJobInfo = getJobInfo(conf);
  } catch (Exception e) {
    throw new IOException(e);
  }
  List<InputSplit> splits = new ArrayList<InputSplit>();
  List<PartInfo> partitionInfoList = inputJobInfo.getPartitions();
  if (partitionInfoList == null) {
    // No partitions match the specified partition filter.
    return splits;
  }
  HiveStorageHandler storageHandler;
  JobConf jobConf;
  // For each matching partition, call getSplits on the underlying InputFormat.
  for (PartInfo partitionInfo : partitionInfoList) {
    jobConf = HCatUtil.getJobConfFromContext(jobContext);
    List<String> setInputPath = setInputPath(jobConf, partitionInfo.getLocation());
    if (setInputPath.isEmpty()) {
      continue;
    }
    Map<String, String> jobProperties = partitionInfo.getJobProperties();
    HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);
    storageHandler = HCatUtil.getStorageHandler(jobConf, partitionInfo);
    // Get the input format.
    Class inputFormatClass = storageHandler.getInputFormatClass();
    org.apache.hadoop.mapred.InputFormat inputFormat =
        getMapRedInputFormat(jobConf, inputFormatClass);
    // Call getSplits on the InputFormat and create an HCatSplit for each
    // underlying split. When the desired number of input splits is missing,
    // use a default number (denoted by zero).
    // TODO(malewicz): Currently each partition is split independently into
    // a desired number. However, we want the union of all partitions to be
    // split into a desired number while maintaining balanced sizes of input
    // splits.
    int desiredNumSplits = conf.getInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 0);
    org.apache.hadoop.mapred.InputSplit[] baseSplits =
        inputFormat.getSplits(jobConf, desiredNumSplits);
    for (org.apache.hadoop.mapred.InputSplit split : baseSplits) {
      splits.add(new HCatSplit(partitionInfo, split));
    }
  }
  return splits;
}
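From the client side, the split-count hint read by this method is just a configuration knob. A minimal sketch, assuming a reachable Hive metastore; the database and table names are illustrative:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class HCatSplitCount {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hint the per-partition split count that getSplits() above reads back;
    // 0 (the default) lets the underlying InputFormat decide.
    conf.setInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 16);
    Job job = Job.getInstance(conf, "hcat-split-count");
    // "default" / "my_table" are illustrative database and table names.
    HCatInputFormat.setInput(job, "default", "my_table");
    List<InputSplit> splits = new HCatInputFormat().getSplits(job);
    System.out.println("splits: " + splits.size());
  }
}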
Use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.
Class HCatSplit, method readFields:
/* (non-Javadoc)
 * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
 */
@SuppressWarnings("unchecked")
@Override
public void readFields(DataInput input) throws IOException {
  String partitionInfoString = WritableUtils.readString(input);
  partitionInfo = (PartInfo) HCatUtil.deserialize(partitionInfoString);
  String baseSplitClassName = WritableUtils.readString(input);
  org.apache.hadoop.mapred.InputSplit split;
  try {
    Class<? extends org.apache.hadoop.mapred.InputSplit> splitClass =
        (Class<? extends org.apache.hadoop.mapred.InputSplit>) JavaUtils.loadClass(baseSplitClassName);
    // Class.forName().newInstance() does not work if the underlying
    // InputSplit has package visibility.
    Constructor<? extends org.apache.hadoop.mapred.InputSplit> constructor =
        splitClass.getDeclaredConstructor(new Class[] {});
    constructor.setAccessible(true);
    split = constructor.newInstance();
    // read baseSplit from input
    ((Writable) split).readFields(input);
    this.baseMapRedSplit = split;
  } catch (Exception e) {
    throw new IOException("Exception from " + baseSplitClassName, e);
  }
}
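readFields() is one half of Hadoop's Writable contract; the other half is write(). A generic round-trip sketch of that contract (using Text rather than HCatSplit, whose fields are built internally):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class WritableRoundTrip {
  // Serialize one Writable and rehydrate another instance from the same bytes.
  static <T extends Writable> void roundTrip(T original, T empty) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    original.write(out);                  // Writable#write
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    empty.readFields(in);                 // Writable#readFields, as HCatSplit implements above
  }

  public static void main(String[] args) throws IOException {
    Text src = new Text("hello");
    Text dst = new Text();
    roundTrip(src, dst);
    System.out.println(dst);              // prints "hello"
  }
}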