use of org.apache.hadoop.mapreduce.InputSplit in project cdap by caskdata.
the class TaggedInputSplit method readFields.
@SuppressWarnings("unchecked")
@Override
public final void readFields(DataInput in) throws IOException {
  // Read the concrete class of the wrapped split, then any subclass-specific fields.
  Class<? extends InputSplit> inputSplitClass = (Class<? extends InputSplit>) readClass(in);
  readAdditionalFields(in);
  // Re-instantiate the wrapped split and deserialize it via Hadoop's serialization framework.
  inputSplit = ReflectionUtils.newInstance(inputSplitClass, conf);
  SerializationFactory factory = new SerializationFactory(conf);
  Deserializer deserializer = factory.getDeserializer(inputSplitClass);
  deserializer.open((DataInputStream) in);
  inputSplit = (InputSplit) deserializer.deserialize(inputSplit);
}
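For context, this read path mirrors a write path that first records the concrete split class and then hands the wrapped split to the same Hadoop serialization machinery. A minimal sketch of what that counterpart could look like, assuming a Text.writeString-based class-name encoding and a writeAdditionalFields hook (both assumptions, not taken from the snippet above):

  // Hypothetical write-side counterpart to readFields (a sketch, not the CDAP source).
  @SuppressWarnings("unchecked")
  public void write(DataOutput out) throws IOException {
    // Record the concrete class of the wrapped split so readFields can re-instantiate it.
    Text.writeString(out, inputSplit.getClass().getName());
    writeAdditionalFields(out);  // assumed hook, mirroring readAdditionalFields(in)
    // Serialize the wrapped split with Hadoop's pluggable serialization framework.
    SerializationFactory factory = new SerializationFactory(conf);
    Serializer<InputSplit> serializer =
        factory.getSerializer((Class<InputSplit>) inputSplit.getClass());
    serializer.open((DataOutputStream) out);
    serializer.serialize(inputSplit);
  }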
use of org.apache.hadoop.mapreduce.InputSplit in project cdap by caskdata.
the class DelegatingRecordReader method initialize.
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // We need to be sure not to pass the TaggedInputSplit to the underlying RecordReader.
  // Otherwise, it can result in ClassCastExceptions.
  InputSplit inputSplit = ((TaggedInputSplit) split).getInputSplit();
  originalRR.initialize(inputSplit, context);
}
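The unwrap step matters because most concrete readers cast the incoming split to their own split type, so handing them the wrapper would fail at that cast. A minimal illustration of the failure mode, using a hypothetical file-based reader (not taken from either project):

  // Hypothetical line-oriented reader: it casts the split to FileSplit, so handing it the
  // TaggedInputSplit wrapper directly would fail at this cast.
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;  // ClassCastException if split is a TaggedInputSplit
    long start = fileSplit.getStart();
    long length = fileSplit.getLength();
    Path file = fileSplit.getPath();
    // ... open `file` and seek to `start`, reading up to `length` bytes ...
  }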
use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
the class TableInputFormatBase method calculateRebalancedSplits.
/**
 * Calculates the number of MapReduce input splits for the map tasks. The number of
 * MapReduce input splits depends on the average region size and the "data skew ratio" the user
 * set in the configuration.
 *
 * @param list The list of input splits before balancing.
 * @param context The current job context.
 * @param average The average size of all regions.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
 *   org.apache.hadoop.mapreduce.JobContext)
 */
private List<InputSplit> calculateRebalancedSplits(List<InputSplit> list, JobContext context,
    long average) throws IOException {
  List<InputSplit> resultList = new ArrayList<>();
  Configuration conf = context.getConfiguration();
  // The default data skew ratio is 3.
  long dataSkewRatio = conf.getLong(INPUT_AUTOBALANCE_MAXSKEWRATIO, 3);
  // Determines which mode to use: text key mode or binary key mode. The default is text mode.
  boolean isTextKey = context.getConfiguration().getBoolean(TABLE_ROW_TEXTKEY, true);
  long dataSkewThreshold = dataSkewRatio * average;
  int count = 0;
  while (count < list.size()) {
    TableSplit ts = (TableSplit) list.get(count);
    TableName tableName = ts.getTable();
    String regionLocation = ts.getRegionLocation();
    String encodedRegionName = ts.getEncodedRegionName();
    long regionSize = ts.getLength();
    if (regionSize >= dataSkewThreshold) {
      // If the current region size is larger than the data skew threshold,
      // split the region into two MapReduce input splits.
      byte[] splitKey = getSplitKey(ts.getStartRow(), ts.getEndRow(), isTextKey);
      if (Arrays.equals(ts.getEndRow(), splitKey)) {
        // Not splitting since the end key is the same as the split key.
        resultList.add(ts);
      } else {
        // Set the size of each child TableSplit to half of the region size; the exact size of
        // the MapReduce input splits is not far off.
        TableSplit t1 = new TableSplit(tableName, scan, ts.getStartRow(), splitKey,
            regionLocation, regionSize / 2);
        TableSplit t2 = new TableSplit(tableName, scan, splitKey, ts.getEndRow(),
            regionLocation, regionSize - regionSize / 2);
        resultList.add(t1);
        resultList.add(t2);
      }
      count++;
    } else if (regionSize >= average) {
      // If the region size is between the average size and the data skew threshold,
      // make this region exactly one MapReduce input split.
      resultList.add(ts);
      count++;
    } else {
      // If the region is smaller than average, combine it with the following small contiguous
      // regions into one MapReduce input split, as long as the combined size stays within the
      // data skew threshold.
      long totalSize = regionSize;
      byte[] splitStartKey = ts.getStartRow();
      byte[] splitEndKey = ts.getEndRow();
      count++;
      for (; count < list.size(); count++) {
        TableSplit nextRegion = (TableSplit) list.get(count);
        long nextRegionSize = nextRegion.getLength();
        if (totalSize + nextRegionSize <= dataSkewThreshold) {
          totalSize = totalSize + nextRegionSize;
          splitEndKey = nextRegion.getEndRow();
        } else {
          break;
        }
      }
      TableSplit t = new TableSplit(tableName, scan, splitStartKey, splitEndKey, regionLocation,
          encodedRegionName, totalSize);
      resultList.add(t);
    }
  }
  return resultList;
}
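To make the thresholds concrete, here is a small illustration of the three branches with made-up sizes (the numbers are purely hypothetical, not defaults from HBase):

  // Illustration only: with an average region size of 1 GB and the default skew ratio of 3,
  // the data skew threshold works out to 3 GB.
  long average = 1L << 30;               // 1 GB
  long dataSkewThreshold = 3 * average;  // 3 GB
  // A 4 GB region (>= threshold) is cut at getSplitKey(...) into two ~2 GB child splits.
  // A 1.5 GB region (>= average but < threshold) becomes exactly one split.
  // Consecutive 300 MB regions (< average) are merged into one split until adding the next
  // region would push the combined size past the 3 GB threshold.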
use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
the class TableInputFormatBase method createRecordReader.
/**
 * Builds a {@link TableRecordReader}. If no {@link TableRecordReader} was provided, uses
 * the default.
 *
 * @param split The split to work with.
 * @param context The current context.
 * @return The newly created record reader.
 * @throws IOException When creating the reader fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
 *   org.apache.hadoop.mapreduce.InputSplit,
 *   org.apache.hadoop.mapreduce.TaskAttemptContext)
 */
@Override
public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split,
    TaskAttemptContext context) throws IOException {
  // Just in case a subclass is relying on JobConfigurable magic.
  if (table == null) {
    initialize(context);
  }
  // Null check in case our child overrides getTable to not throw.
  try {
    if (getTable() == null) {
      // initialize() must not have been implemented in the subclass.
      throw new IOException(INITIALIZATION_ERROR);
    }
  } catch (IllegalStateException exception) {
    throw new IOException(INITIALIZATION_ERROR, exception);
  }
  TableSplit tSplit = (TableSplit) split;
  LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
  final TableRecordReader trr =
      this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
  // Narrow the configured scan to the key range covered by this split.
  Scan sc = new Scan(this.scan);
  sc.setStartRow(tSplit.getStartRow());
  sc.setStopRow(tSplit.getEndRow());
  trr.setScan(sc);
  trr.setTable(getTable());
  return new RecordReader<ImmutableBytesWritable, Result>() {

    @Override
    public void close() throws IOException {
      trr.close();
      closeTable();
    }

    @Override
    public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
      return trr.getCurrentKey();
    }

    @Override
    public Result getCurrentValue() throws IOException, InterruptedException {
      return trr.getCurrentValue();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return trr.getProgress();
    }

    @Override
    public void initialize(InputSplit inputsplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
      trr.initialize(inputsplit, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      return trr.nextKeyValue();
    }
  };
}
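In practice this record reader is rarely constructed by hand; a typical job wires the table input format up through TableMapReduceUtil, roughly as follows. The job name, table name, and mapper class below are placeholders, not taken from the snippet above:

  // Sketch of a typical job setup that ends up exercising createRecordReader above.
  Configuration conf = HBaseConfiguration.create();
  Job job = Job.getInstance(conf, "my-table-scan");  // placeholder job name
  Scan scan = new Scan();
  scan.setCaching(500);        // larger caching reduces scanner RPC round trips
  scan.setCacheBlocks(false);  // don't fill the block cache from MapReduce scans
  TableMapReduceUtil.initTableMapperJob(
      "my_table",              // placeholder table name
      scan,
      MyMapper.class,          // placeholder TableMapper subclass
      ImmutableBytesWritable.class,
      Result.class,
      job);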
use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
the class MultiTableInputFormatBase method getSplits.
/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table.
 *
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  if (scans.isEmpty()) {
    throw new IOException("No scans were provided.");
  }
  // Group the scans by table so that each table is opened only once.
  Map<TableName, List<Scan>> tableMaps = new HashMap<>();
  for (Scan scan : scans) {
    byte[] tableNameBytes = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
    if (tableNameBytes == null) {
      throw new IOException("A scan object did not have a table name");
    }
    TableName tableName = TableName.valueOf(tableNameBytes);
    List<Scan> scanList = tableMaps.get(tableName);
    if (scanList == null) {
      scanList = new ArrayList<>();
      tableMaps.put(tableName, scanList);
    }
    scanList.add(scan);
  }
  List<InputSplit> splits = new ArrayList<>();
  for (Map.Entry<TableName, List<Scan>> entry : tableMaps.entrySet()) {
    TableName tableName = entry.getKey();
    List<Scan> scanList = entry.getValue();
    try (Connection conn = ConnectionFactory.createConnection(context.getConfiguration());
        Table table = conn.getTable(tableName);
        RegionLocator regionLocator = conn.getRegionLocator(tableName)) {
      RegionSizeCalculator sizeCalculator =
          new RegionSizeCalculator(regionLocator, conn.getAdmin());
      Pair<byte[][], byte[][]> keys = regionLocator.getStartEndKeys();
      for (Scan scan : scanList) {
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
          throw new IOException("Expecting at least one region for table: "
              + tableName.getNameAsString());
        }
        int count = 0;
        byte[] startRow = scan.getStartRow();
        byte[] stopRow = scan.getStopRow();
        for (int i = 0; i < keys.getFirst().length; i++) {
          if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
            continue;
          }
          // Only create a split if the scan's row range overlaps this region.
          if ((startRow.length == 0 || keys.getSecond()[i].length == 0
              || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
              && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
            byte[] splitStart = startRow.length == 0
                || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
                    ? keys.getFirst()[i] : startRow;
            byte[] splitStop = (stopRow.length == 0
                || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
                && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
            HRegionLocation hregionLocation =
                regionLocator.getRegionLocation(keys.getFirst()[i], false);
            String regionHostname = hregionLocation.getHostname();
            HRegionInfo regionInfo = hregionLocation.getRegionInfo();
            String encodedRegionName = regionInfo.getEncodedName();
            long regionSize = sizeCalculator.getRegionSize(regionInfo.getRegionName());
            TableSplit split = new TableSplit(table.getName(), scan, splitStart, splitStop,
                regionHostname, encodedRegionName, regionSize);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
              LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
            }
          }
        }
      }
    }
  }
  return splits;
}
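The method above requires each Scan to carry its table name as an attribute; a multi-table job typically prepares its scans as sketched below. The table names and mapper class are placeholders, and `job` is assumed to be an existing org.apache.hadoop.mapreduce.Job instance:

  // Sketch: set up scans over two tables and hand them to the multi-table mapper job.
  List<Scan> scans = new ArrayList<>();
  for (String name : new String[] { "table_a", "table_b" }) {  // placeholder table names
    Scan scan = new Scan();
    // Without this attribute, getSplits above throws "A scan object did not have a table name".
    scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(name));
    scans.add(scan);
  }
  TableMapReduceUtil.initTableMapperJob(
      scans,
      MyMultiTableMapper.class,  // placeholder TableMapper subclass
      ImmutableBytesWritable.class,
      Result.class,
      job);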