use of org.apache.hadoop.mapreduce.InputSplit in project cdap by caskdata.
the class TaggedInputSplit method readFields.
@SuppressWarnings("unchecked")
@Override
public final void readFields(DataInput in) throws IOException {
  // Read the concrete class of the wrapped split, then any subclass-specific fields.
  Class<? extends InputSplit> inputSplitClass = (Class<? extends InputSplit>) readClass(in);
  readAdditionalFields(in);
  // Re-instantiate the wrapped split and deserialize it via Hadoop's serialization framework.
  inputSplit = ReflectionUtils.newInstance(inputSplitClass, conf);
  SerializationFactory factory = new SerializationFactory(conf);
  Deserializer deserializer = factory.getDeserializer(inputSplitClass);
  deserializer.open((DataInputStream) in);
  inputSplit = (InputSplit) deserializer.deserialize(inputSplit);
}
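For context, this read path mirrors a write path that first records the concrete split class and then hands the wrapped split to the same Hadoop serialization machinery. A minimal sketch of what that counterpart could look like, assuming a Text.writeString-based class-name encoding and a writeAdditionalFields hook (both assumptions, not taken from the snippet above):

  // Hypothetical write-side counterpart to readFields (a sketch, not the CDAP source).
  @SuppressWarnings("unchecked")
  public void write(DataOutput out) throws IOException {
    // Record the concrete class of the wrapped split so readFields can re-instantiate it.
    Text.writeString(out, inputSplit.getClass().getName());
    writeAdditionalFields(out);  // assumed hook, mirroring readAdditionalFields(in)
    // Serialize the wrapped split with Hadoop's pluggable serialization framework.
    SerializationFactory factory = new SerializationFactory(conf);
    Serializer<InputSplit> serializer =
        factory.getSerializer((Class<InputSplit>) inputSplit.getClass());
    serializer.open((DataOutputStream) out);
    serializer.serialize(inputSplit);
  }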
use of org.apache.hadoop.mapreduce.InputSplit in project cdap by caskdata.
the class DelegatingRecordReader method initialize.
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // We need to be sure not to pass the TaggedInputSplit to the underlying RecordReader.
  // Otherwise, it can result in ClassCastExceptions.
  InputSplit inputSplit = ((TaggedInputSplit) split).getInputSplit();
  originalRR.initialize(inputSplit, context);
}
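The unwrap step matters because most concrete readers cast the incoming split to their own split type, so handing them the wrapper would fail at that cast. A minimal illustration of the failure mode, using a hypothetical file-based reader (not taken from either project):

  // Hypothetical line-oriented reader: it casts the split to FileSplit, so handing it the
  // TaggedInputSplit wrapper directly would fail at this cast.
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;  // ClassCastException if split is a TaggedInputSplit
    long start = fileSplit.getStart();
    long length = fileSplit.getLength();
    Path file = fileSplit.getPath();
    // ... open `file` and seek to `start`, reading up to `length` bytes ...
  }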
use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
the class TableInputFormatBase method calculateRebalancedSplits.
/**
 * Calculates the number of MapReduce input splits for the map tasks. The number of
 * MapReduce input splits depends on the average region size and the "data skew ratio" the user
 * set in the configuration.
 *
 * @param list The list of input splits before balancing.
 * @param context The current job context.
 * @param average The average size of all regions.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
 *   org.apache.hadoop.mapreduce.JobContext)
 */
private List<InputSplit> calculateRebalancedSplits(List<InputSplit> list, JobContext context,
    long average) throws IOException {
  List<InputSplit> resultList = new ArrayList<>();
  Configuration conf = context.getConfiguration();
  // The default data skew ratio is 3.
  long dataSkewRatio = conf.getLong(INPUT_AUTOBALANCE_MAXSKEWRATIO, 3);
  // Determines which mode to use: text key mode or binary key mode. The default is text mode.
  boolean isTextKey = context.getConfiguration().getBoolean(TABLE_ROW_TEXTKEY, true);
  long dataSkewThreshold = dataSkewRatio * average;
  int count = 0;
  while (count < list.size()) {
    TableSplit ts = (TableSplit) list.get(count);
    TableName tableName = ts.getTable();
    String regionLocation = ts.getRegionLocation();
    String encodedRegionName = ts.getEncodedRegionName();
    long regionSize = ts.getLength();
    if (regionSize >= dataSkewThreshold) {
      // If the current region size is larger than the data skew threshold,
      // split the region into two MapReduce input splits.
      byte[] splitKey = getSplitKey(ts.getStartRow(), ts.getEndRow(), isTextKey);
      if (Arrays.equals(ts.getEndRow(), splitKey)) {
        // Not splitting since the end key is the same as the split key.
        resultList.add(ts);
      } else {
        // Set the size of each child TableSplit to half of the region size; the exact size of
        // the MapReduce input splits is not far off.
        TableSplit t1 = new TableSplit(tableName, scan, ts.getStartRow(), splitKey,
            regionLocation, regionSize / 2);
        TableSplit t2 = new TableSplit(tableName, scan, splitKey, ts.getEndRow(),
            regionLocation, regionSize - regionSize / 2);
        resultList.add(t1);
        resultList.add(t2);
      }
      count++;
    } else if (regionSize >= average) {
      // If the region size is between the average size and the data skew threshold,
      // make this region exactly one MapReduce input split.
      resultList.add(ts);
      count++;
    } else {
      // If the region is smaller than average, combine it with the following small contiguous
      // regions into one MapReduce input split, as long as the combined size stays within the
      // data skew threshold.
      long totalSize = regionSize;
      byte[] splitStartKey = ts.getStartRow();
      byte[] splitEndKey = ts.getEndRow();
      count++;
      for (; count < list.size(); count++) {
        TableSplit nextRegion = (TableSplit) list.get(count);
        long nextRegionSize = nextRegion.getLength();
        if (totalSize + nextRegionSize <= dataSkewThreshold) {
          totalSize = totalSize + nextRegionSize;
          splitEndKey = nextRegion.getEndRow();
        } else {
          break;
        }
      }
      TableSplit t = new TableSplit(tableName, scan, splitStartKey, splitEndKey, regionLocation,
          encodedRegionName, totalSize);
      resultList.add(t);
    }
  }
  return resultList;
}
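To make the thresholds concrete, here is a small illustration of the three branches with made-up sizes (the numbers are purely hypothetical, not defaults from HBase):

  // Illustration only: with an average region size of 1 GB and the default skew ratio of 3,
  // the data skew threshold works out to 3 GB.
  long average = 1L << 30;               // 1 GB
  long dataSkewThreshold = 3 * average;  // 3 GB
  // A 4 GB region (>= threshold) is cut at getSplitKey(...) into two ~2 GB child splits.
  // A 1.5 GB region (>= average but < threshold) becomes exactly one split.
  // Consecutive 300 MB regions (< average) are merged into one split until adding the next
  // region would push the combined size past the 3 GB threshold.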
use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
the class TableInputFormatBase method createRecordReader.
/**
 * Builds a {@link TableRecordReader}. If no {@link TableRecordReader} was provided, uses
 * the default.
 *
 * @param split The split to work with.
 * @param context The current context.
 * @return The newly created record reader.
 * @throws IOException When creating the reader fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
 *   org.apache.hadoop.mapreduce.InputSplit,
 *   org.apache.hadoop.mapreduce.TaskAttemptContext)
 */
@Override
public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split,
    TaskAttemptContext context) throws IOException {
  // Just in case a subclass is relying on JobConfigurable magic.
  if (table == null) {
    initialize(context);
  }
  // Null check in case our child overrides getTable to not throw.
  try {
    if (getTable() == null) {
      // initialize() must not have been implemented in the subclass.
      throw new IOException(INITIALIZATION_ERROR);
    }
  } catch (IllegalStateException exception) {
    throw new IOException(INITIALIZATION_ERROR, exception);
  }
  TableSplit tSplit = (TableSplit) split;
  LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
  final TableRecordReader trr =
      this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
  // Narrow the configured scan to the key range covered by this split.
  Scan sc = new Scan(this.scan);
  sc.setStartRow(tSplit.getStartRow());
  sc.setStopRow(tSplit.getEndRow());
  trr.setScan(sc);
  trr.setTable(getTable());
  return new RecordReader<ImmutableBytesWritable, Result>() {

    @Override
    public void close() throws IOException {
      trr.close();
      closeTable();
    }

    @Override
    public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
      return trr.getCurrentKey();
    }

    @Override
    public Result getCurrentValue() throws IOException, InterruptedException {
      return trr.getCurrentValue();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return trr.getProgress();
    }

    @Override
    public void initialize(InputSplit inputsplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
      trr.initialize(inputsplit, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      return trr.nextKeyValue();
    }
  };
}
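In practice this record reader is rarely constructed by hand; a typical job wires the table input format up through TableMapReduceUtil, roughly as follows. The job name, table name, and mapper class below are placeholders, not taken from the snippet above:

  // Sketch of a typical job setup that ends up exercising createRecordReader above.
  Configuration conf = HBaseConfiguration.create();
  Job job = Job.getInstance(conf, "my-table-scan");  // placeholder job name
  Scan scan = new Scan();
  scan.setCaching(500);        // larger caching reduces scanner RPC round trips
  scan.setCacheBlocks(false);  // don't fill the block cache from MapReduce scans
  TableMapReduceUtil.initTableMapperJob(
      "my_table",              // placeholder table name
      scan,
      MyMapper.class,          // placeholder TableMapper subclass
      ImmutableBytesWritable.class,
      Result.class,
      job);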
use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
the class MultiTableInputFormatBase method getSplits.
/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table.
 *
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  if (scans.isEmpty()) {
    throw new IOException("No scans were provided.");
  }
  // Group the scans by table so that each table is opened only once.
  Map<TableName, List<Scan>> tableMaps = new HashMap<>();
  for (Scan scan : scans) {
    byte[] tableNameBytes = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
    if (tableNameBytes == null) {
      throw new IOException("A scan object did not have a table name");
    }
    TableName tableName = TableName.valueOf(tableNameBytes);
    List<Scan> scanList = tableMaps.get(tableName);
    if (scanList == null) {
      scanList = new ArrayList<>();
      tableMaps.put(tableName, scanList);
    }
    scanList.add(scan);
  }
  List<InputSplit> splits = new ArrayList<>();
  for (Map.Entry<TableName, List<Scan>> entry : tableMaps.entrySet()) {
    TableName tableName = entry.getKey();
    List<Scan> scanList = entry.getValue();
    try (Connection conn = ConnectionFactory.createConnection(context.getConfiguration());
        Table table = conn.getTable(tableName);
        RegionLocator regionLocator = conn.getRegionLocator(tableName)) {
      RegionSizeCalculator sizeCalculator =
          new RegionSizeCalculator(regionLocator, conn.getAdmin());
      Pair<byte[][], byte[][]> keys = regionLocator.getStartEndKeys();
      for (Scan scan : scanList) {
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
          throw new IOException("Expecting at least one region for table: "
              + tableName.getNameAsString());
        }
        int count = 0;
        byte[] startRow = scan.getStartRow();
        byte[] stopRow = scan.getStopRow();
        for (int i = 0; i < keys.getFirst().length; i++) {
          if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
            continue;
          }
          // Only create a split if the scan's row range overlaps this region.
          if ((startRow.length == 0 || keys.getSecond()[i].length == 0
              || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
              && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
            byte[] splitStart = startRow.length == 0
                || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
                    ? keys.getFirst()[i] : startRow;
            byte[] splitStop = (stopRow.length == 0
                || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
                && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
            HRegionLocation hregionLocation =
                regionLocator.getRegionLocation(keys.getFirst()[i], false);
            String regionHostname = hregionLocation.getHostname();
            HRegionInfo regionInfo = hregionLocation.getRegionInfo();
            String encodedRegionName = regionInfo.getEncodedName();
            long regionSize = sizeCalculator.getRegionSize(regionInfo.getRegionName());
            TableSplit split = new TableSplit(table.getName(), scan, splitStart, splitStop,
                regionHostname, encodedRegionName, regionSize);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
              LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
            }
          }
        }
      }
    }
  }
  return splits;
}
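The method above requires each Scan to carry its table name as an attribute; a multi-table job typically prepares its scans as sketched below. The table names and mapper class are placeholders, and `job` is assumed to be an existing org.apache.hadoop.mapreduce.Job instance:

  // Sketch: set up scans over two tables and hand them to the multi-table mapper job.
  List<Scan> scans = new ArrayList<>();
  for (String name : new String[] { "table_a", "table_b" }) {  // placeholder table names
    Scan scan = new Scan();
    // Without this attribute, getSplits above throws "A scan object did not have a table name".
    scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(name));
    scans.add(scan);
  }
  TableMapReduceUtil.initTableMapperJob(
      scans,
      MyMultiTableMapper.class,  // placeholder TableMapper subclass
      ImmutableBytesWritable.class,
      Result.class,
      job);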