Example 31 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.

the class WALInputFormat method getSplits.

/**
   * implementation shared with deprecated HLogInputFormat
   */
List<InputSplit> getSplits(final JobContext context, final String startKey, final String endKey) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    Path[] inputPaths = getInputPaths(conf);
    long startTime = conf.getLong(startKey, Long.MIN_VALUE);
    long endTime = conf.getLong(endKey, Long.MAX_VALUE);
    List<FileStatus> allFiles = new ArrayList<FileStatus>();
    for (Path inputPath : inputPaths) {
        FileSystem fs = inputPath.getFileSystem(conf);
        List<FileStatus> files = getFiles(fs, inputPath, startTime, endTime);
        allFiles.addAll(files);
    }
    List<InputSplit> splits = new ArrayList<InputSplit>(allFiles.size());
    for (FileStatus file : allFiles) {
        splits.add(new WALSplit(file.getPath().toString(), file.getLen(), startTime, endTime));
    }
    return splits;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
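
The startKey and endKey parameters here are configuration key names rather than values; a caller narrows the time range by setting those keys before splits are computed (Example 35 below does this with WALInputFormat.START_TIME_KEY and END_TIME_KEY). A minimal driver sketch, assuming those public constants and using a hypothetical class name and input directory:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.WALInputFormat;
import org.apache.hadoop.mapreduce.Job;

public class WalScanDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Directory holding the WAL files to read; illustrative path.
        conf.set("mapreduce.input.fileinputformat.inputdir", "/hbase/oldWALs");
        // Restrict splits to a time window; getSplits() reads these keys via conf.getLong().
        conf.setLong(WALInputFormat.START_TIME_KEY, 0L);
        conf.setLong(WALInputFormat.END_TIME_KEY, System.currentTimeMillis());

        Job job = Job.getInstance(conf, "wal-scan");
        job.setJarByClass(WalScanDriver.class);
        job.setInputFormatClass(WALInputFormat.class);
        // Mapper/reducer setup omitted; this only illustrates how the split time range is configured.
    }
}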

Example 32 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.

the class MultiTableInputFormatBase method createRecordReader.

/**
   * Builds a TableRecordReader. If no TableRecordReader was provided, uses the
   * default.
   *
   * @param split The split to work with.
   * @param context The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @throws InterruptedException when record reader initialization fails
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *      org.apache.hadoop.mapreduce.InputSplit,
   *      org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
@Override
public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    TableSplit tSplit = (TableSplit) split;
    LOG.info(MessageFormat.format("Input split length: {0} bytes.", tSplit.getLength()));
    if (tSplit.getTable() == null) {
        throw new IOException("Cannot create a record reader because of a" + " previous error. Please look at the previous logs lines from" + " the task's full log for more details.");
    }
    final Connection connection = ConnectionFactory.createConnection(context.getConfiguration());
    Table table = connection.getTable(tSplit.getTable());
    if (this.tableRecordReader == null) {
        this.tableRecordReader = new TableRecordReader();
    }
    final TableRecordReader trr = this.tableRecordReader;
    try {
        Scan sc = tSplit.getScan();
        sc.setStartRow(tSplit.getStartRow());
        sc.setStopRow(tSplit.getEndRow());
        trr.setScan(sc);
        trr.setTable(table);
        return new RecordReader<ImmutableBytesWritable, Result>() {

            @Override
            public void close() throws IOException {
                trr.close();
                connection.close();
            }

            @Override
            public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
                return trr.getCurrentKey();
            }

            @Override
            public Result getCurrentValue() throws IOException, InterruptedException {
                return trr.getCurrentValue();
            }

            @Override
            public float getProgress() throws IOException, InterruptedException {
                return trr.getProgress();
            }

            @Override
            public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException, InterruptedException {
                trr.initialize(inputsplit, context);
            }

            @Override
            public boolean nextKeyValue() throws IOException, InterruptedException {
                return trr.nextKeyValue();
            }
        };
    } catch (IOException ioe) {
        // If there is an exception make sure that all
        // resources are closed and released.
        trr.close();
        connection.close();
        throw ioe;
    }
}
Also used : Table(org.apache.hadoop.hbase.client.Table) RecordReader(org.apache.hadoop.mapreduce.RecordReader) Connection(org.apache.hadoop.hbase.client.Connection) Scan(org.apache.hadoop.hbase.client.Scan) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) IOException(java.io.IOException) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
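
createRecordReader assumes each incoming TableSplit already knows which table it scans; that is normally arranged when the job is configured with one Scan per source table. A hedged sketch of such a setup, assuming tables named t1 and t2 exist and using IdentityTableMapper for brevity:

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.IdentityTableMapper;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;

public class MultiTableScanDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(HBaseConfiguration.create(), "multi-table-scan");
        job.setJarByClass(MultiTableScanDriver.class);

        // One Scan per source table; each scan carries the name of the table it targets.
        Scan scan1 = new Scan();
        scan1.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, TableName.valueOf("t1").getName());
        Scan scan2 = new Scan();
        scan2.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, TableName.valueOf("t2").getName());
        List<Scan> scans = Arrays.asList(scan1, scan2);

        // Wires up the multi-table input format; each resulting TableSplit then
        // flows into createRecordReader shown above.
        TableMapReduceUtil.initTableMapperJob(scans, IdentityTableMapper.class,
            ImmutableBytesWritable.class, Result.class, job);
    }
}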

Example 33 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.

the class TableInputFormatBase method getSplits.

/**
   * Calculates the splits that will serve as input for the map tasks. The
   * number of splits matches the number of regions in a table.
   *
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    boolean closeOnFinish = false;
    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
        initialize(context);
        closeOnFinish = true;
    }
    // null check in case our child overrides getTable to not throw.
    try {
        if (getTable() == null) {
            // initialize() must not have been implemented in the subclass.
            throw new IOException(INITIALIZATION_ERROR);
        }
    } catch (IllegalStateException exception) {
        throw new IOException(INITIALIZATION_ERROR, exception);
    }
    try {
        RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(getRegionLocator(), getAdmin());
        TableName tableName = getTable().getName();
        Pair<byte[][], byte[][]> keys = getStartEndKeys();
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
            HRegionLocation regLoc = getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
            if (null == regLoc) {
                throw new IOException("Expecting at least one region.");
            }
            List<InputSplit> splits = new ArrayList<>(1);
            long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
            TableSplit split = new TableSplit(tableName, scan, HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc.getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
            splits.add(split);
            return splits;
        }
        List<InputSplit> splits = new ArrayList<>(keys.getFirst().length);
        for (int i = 0; i < keys.getFirst().length; i++) {
            if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
                continue;
            }
            byte[] startRow = scan.getStartRow();
            byte[] stopRow = scan.getStopRow();
            // determine if the given start and stop keys fall into the region
            if ((startRow.length == 0 || keys.getSecond()[i].length == 0 || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
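                // Clamp the split to the overlap of the region and the scan range:
                // take the later of the two start keys and the earlier of the two stop keys
                // (an empty key means unbounded).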
                byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ? keys.getFirst()[i] : startRow;
                byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
                HRegionLocation location = getRegionLocator().getRegionLocation(keys.getFirst()[i], false);
                // The below InetSocketAddress creation does a name resolution.
                InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
                if (isa.isUnresolved()) {
                    LOG.warn("Failed resolve " + isa);
                }
                InetAddress regionAddress = isa.getAddress();
                String regionLocation;
                regionLocation = reverseDNS(regionAddress);
                byte[] regionName = location.getRegionInfo().getRegionName();
                String encodedRegionName = location.getRegionInfo().getEncodedName();
                long regionSize = sizeCalculator.getRegionSize(regionName);
                TableSplit split = new TableSplit(tableName, scan, splitStart, splitStop, regionLocation, encodedRegionName, regionSize);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("getSplits: split -> " + i + " -> " + split);
                }
            }
        }
        //The default value of "hbase.mapreduce.input.autobalance" is false, which means not enabled.
        boolean enableAutoBalance = context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false);
        if (enableAutoBalance) {
            long totalRegionSize = 0;
            for (int i = 0; i < splits.size(); i++) {
                TableSplit ts = (TableSplit) splits.get(i);
                totalRegionSize += ts.getLength();
            }
            long averageRegionSize = totalRegionSize / splits.size();
            // the averageRegionSize must be positive.
            if (averageRegionSize <= 0) {
                LOG.warn("The averageRegionSize is not positive: " + averageRegionSize + ", " + "set it to 1.");
                averageRegionSize = 1;
            }
            return calculateRebalancedSplits(splits, context, averageRegionSize);
        } else {
            return splits;
        }
    } finally {
        if (closeOnFinish) {
            closeTable();
        }
    }
}
Also used : RegionSizeCalculator(org.apache.hadoop.hbase.util.RegionSizeCalculator) InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) IOException(java.io.IOException) TableName(org.apache.hadoop.hbase.TableName) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) InputSplit(org.apache.hadoop.mapreduce.InputSplit) InetAddress(java.net.InetAddress)
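
The auto-balance branch above only runs when the configuration flag named in the code comment is set. A hedged driver sketch that enables it, with an illustrative table name and IdentityTableMapper standing in for a real mapper:

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.IdentityTableMapper;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;

public class BalancedScanDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(HBaseConfiguration.create(), "balanced-scan");
        job.setJarByClass(BalancedScanDriver.class);
        // Ask getSplits() above to rebalance the per-region splits around the average region size.
        job.getConfiguration().setBoolean("hbase.mapreduce.input.autobalance", true);
        TableMapReduceUtil.initTableMapperJob("my_table", new Scan(), IdentityTableMapper.class,
            ImmutableBytesWritable.class, Result.class, job);
    }
}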

Example 34 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.

the class OrcNewInputFormat method getSplits.

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("getSplits started");
    }
    Configuration conf = ShimLoader.getHadoopShims().getConfiguration(jobContext);
    List<OrcSplit> splits = OrcInputFormat.generateSplitsInfo(conf, createContext(conf, -1));
    List<InputSplit> result = new ArrayList<InputSplit>(splits.size());
    for (OrcSplit split : splits) {
        result.add(new OrcNewSplit(split));
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("getSplits finished");
    }
    return result;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
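
A hedged sketch of wiring OrcNewInputFormat into a plain new-API MapReduce job; the input path and the row-counting mapper are illustrative, and the (NullWritable, OrcStruct) key/value types reflect what this format is generally expected to emit:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class OrcReadDriver {
    // Hypothetical mapper: just counts rows handed to it as (NullWritable, OrcStruct) pairs.
    public static class RowCountMapper extends Mapper<NullWritable, OrcStruct, NullWritable, NullWritable> {
        @Override
        protected void map(NullWritable key, OrcStruct value, Context context) {
            context.getCounter("orc", "rows").increment(1);
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "orc-row-count");
        job.setJarByClass(OrcReadDriver.class);
        job.setInputFormatClass(OrcNewInputFormat.class);
        job.setMapperClass(RowCountMapper.class);
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path("/data/orc")); // illustrative ORC directory
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}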

Example 35 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.

the class TestWALRecordReader method testWALRecordReader.

/**
   * Test basic functionality
   * @throws Exception
   */
@Test
public void testWALRecordReader() throws Exception {
    final WALFactory walfactory = new WALFactory(conf, null, getName());
    WAL log = walfactory.getWAL(info.getEncodedNameAsBytes(), info.getTable().getNamespace());
    byte[] value = Bytes.toBytes("value");
    final AtomicLong sequenceId = new AtomicLong(0);
    WALEdit edit = new WALEdit();
    edit.add(new KeyValue(rowName, family, Bytes.toBytes("1"), System.currentTimeMillis(), value));
    long txid = log.append(info, getWalKey(System.currentTimeMillis(), scopes), edit, true);
    log.sync(txid);
    // make sure 2nd log gets a later timestamp
    Thread.sleep(1);
    long secondTs = System.currentTimeMillis();
    log.rollWriter();
    edit = new WALEdit();
    edit.add(new KeyValue(rowName, family, Bytes.toBytes("2"), System.currentTimeMillis(), value));
    txid = log.append(info, getWalKey(System.currentTimeMillis(), scopes), edit, true);
    log.sync(txid);
    log.shutdown();
    walfactory.shutdown();
    long thirdTs = System.currentTimeMillis();
    // should have 2 log files now
    WALInputFormat input = new WALInputFormat();
    Configuration jobConf = new Configuration(conf);
    jobConf.set("mapreduce.input.fileinputformat.inputdir", logDir.toString());
    // make sure both logs are found
    List<InputSplit> splits = input.getSplits(MapreduceTestingShim.createJobContext(jobConf));
    assertEquals(2, splits.size());
    // should return exactly one KV
    testSplit(splits.get(0), Bytes.toBytes("1"));
    // same for the 2nd split
    testSplit(splits.get(1), Bytes.toBytes("2"));
    // now test basic time ranges:
    // set an end time; the 2nd log file can be ignored completely.
    jobConf.setLong(WALInputFormat.END_TIME_KEY, secondTs - 1);
    splits = input.getSplits(MapreduceTestingShim.createJobContext(jobConf));
    assertEquals(1, splits.size());
    testSplit(splits.get(0), Bytes.toBytes("1"));
    // now set a start time
    jobConf.setLong(WALInputFormat.END_TIME_KEY, Long.MAX_VALUE);
    jobConf.setLong(WALInputFormat.START_TIME_KEY, thirdTs);
    splits = input.getSplits(MapreduceTestingShim.createJobContext(jobConf));
    // both logs need to be considered
    assertEquals(2, splits.size());
    // but both readers skip all edits
    testSplit(splits.get(0));
    testSplit(splits.get(1));
}
Also used : AtomicLong(java.util.concurrent.atomic.AtomicLong) WAL(org.apache.hadoop.hbase.wal.WAL) KeyValue(org.apache.hadoop.hbase.KeyValue) WALEdit(org.apache.hadoop.hbase.regionserver.wal.WALEdit) Configuration(org.apache.hadoop.conf.Configuration) WALFactory(org.apache.hadoop.hbase.wal.WALFactory) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit) 160
Configuration (org.apache.hadoop.conf.Configuration) 70
Test (org.junit.Test) 68
ArrayList (java.util.ArrayList) 51
Path (org.apache.hadoop.fs.Path) 43
Job (org.apache.hadoop.mapreduce.Job) 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext) 38
IOException (java.io.IOException) 33
JobContext (org.apache.hadoop.mapreduce.JobContext) 20
LongWritable (org.apache.hadoop.io.LongWritable) 19
FileSystem (org.apache.hadoop.fs.FileSystem) 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl) 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit) 13
List (java.util.List) 13
Text (org.apache.hadoop.io.Text) 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit) 13
DBObject (com.mongodb.DBObject) 10
File (java.io.File) 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest) 9