Use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
Class TestWALRecordReader, method testPartialRead:
/**
 * Test partial reads from the log based on passed time range.
 * @throws Exception
 */
@Test
public void testPartialRead() throws Exception {
  final WALFactory walfactory = new WALFactory(conf, null, getName());
  WAL log = walfactory.getWAL(info.getEncodedNameAsBytes(), info.getTable().getNamespace());
  // This test depends on timestamp being millisecond based and the filename of the WAL also
  // being millisecond based.
  long ts = System.currentTimeMillis();
  WALEdit edit = new WALEdit();
  edit.add(new KeyValue(rowName, family, Bytes.toBytes("1"), ts, value));
  log.append(info, getWalKey(ts, scopes), edit, true);
  edit = new WALEdit();
  edit.add(new KeyValue(rowName, family, Bytes.toBytes("2"), ts + 1, value));
  log.append(info, getWalKey(ts + 1, scopes), edit, true);
  log.sync();
  LOG.info("Before 1st WAL roll " + log.toString());
  log.rollWriter();
  LOG.info("Past 1st WAL roll " + log.toString());
  Thread.sleep(1);
  long ts1 = System.currentTimeMillis();
  edit = new WALEdit();
  edit.add(new KeyValue(rowName, family, Bytes.toBytes("3"), ts1 + 1, value));
  log.append(info, getWalKey(ts1 + 1, scopes), edit, true);
  edit = new WALEdit();
  edit.add(new KeyValue(rowName, family, Bytes.toBytes("4"), ts1 + 2, value));
  log.append(info, getWalKey(ts1 + 2, scopes), edit, true);
  log.sync();
  log.shutdown();
  walfactory.shutdown();
  LOG.info("Closed WAL " + log.toString());
  WALInputFormat input = new WALInputFormat();
  Configuration jobConf = new Configuration(conf);
  jobConf.set("mapreduce.input.fileinputformat.inputdir", logDir.toString());
  jobConf.setLong(WALInputFormat.END_TIME_KEY, ts);
  // only the 1st file is considered, and only its 1st entry is used
  List<InputSplit> splits = input.getSplits(MapreduceTestingShim.createJobContext(jobConf));
  assertEquals(1, splits.size());
  testSplit(splits.get(0), Bytes.toBytes("1"));
  jobConf.setLong(WALInputFormat.START_TIME_KEY, ts + 1);
  jobConf.setLong(WALInputFormat.END_TIME_KEY, ts1 + 1);
  splits = input.getSplits(MapreduceTestingShim.createJobContext(jobConf));
  // both files need to be considered
  assertEquals(2, splits.size());
  // only the 2nd entry from the 1st file is used
  testSplit(splits.get(0), Bytes.toBytes("2"));
  // only the 1st entry from the 2nd file is used
  testSplit(splits.get(1), Bytes.toBytes("3"));
}
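For context, the WALInputFormat splits above are consumed the same way as the output of any org.apache.hadoop.mapreduce.InputFormat. A minimal standalone sketch of that getSplits/createRecordReader pattern, using TextInputFormat and a hypothetical local input file (the path and class name are illustrative, not part of the test above):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class SplitWalkthrough {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "split-walkthrough");
    // Hypothetical input file; any existing local or HDFS path works.
    FileInputFormat.addInputPath(job, new Path("/tmp/input.txt"));

    TextInputFormat inputFormat = new TextInputFormat();
    // Ask the InputFormat for its logical splits (WALInputFormat does the same,
    // filtered by START_TIME_KEY/END_TIME_KEY).
    List<InputSplit> splits = inputFormat.getSplits(job);

    for (InputSplit split : splits) {
      TaskAttemptContext ctx =
          new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
      // Open a RecordReader over each split and drain it, as the test's testSplit helper does.
      RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(split, ctx);
      rr.initialize(split, ctx);
      while (rr.nextKeyValue()) {
        System.out.println(rr.getCurrentKey() + "\t" + rr.getCurrentValue());
      }
      rr.close();
    }
  }
}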
Use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
Class TestTableSnapshotInputFormat, method verifyWithMockedMapReduce:
private void verifyWithMockedMapReduce(Job job, int numRegions, int expectedNumSplits,
    byte[] startRow, byte[] stopRow) throws IOException, InterruptedException {
  TableSnapshotInputFormat tsif = new TableSnapshotInputFormat();
  List<InputSplit> splits = tsif.getSplits(job);
  Assert.assertEquals(expectedNumSplits, splits.size());
  HBaseTestingUtility.SeenRowTracker rowTracker =
      new HBaseTestingUtility.SeenRowTracker(startRow, stopRow);
  for (int i = 0; i < splits.size(); i++) {
    // validate input split
    InputSplit split = splits.get(i);
    Assert.assertTrue(split instanceof TableSnapshotRegionSplit);
    // validate record reader
    TaskAttemptContext taskAttemptContext = mock(TaskAttemptContext.class);
    when(taskAttemptContext.getConfiguration()).thenReturn(job.getConfiguration());
    RecordReader<ImmutableBytesWritable, Result> rr =
        tsif.createRecordReader(split, taskAttemptContext);
    rr.initialize(split, taskAttemptContext);
    // validate we can read all the data back
    while (rr.nextKeyValue()) {
      byte[] row = rr.getCurrentKey().get();
      verifyRowFromMap(rr.getCurrentKey(), rr.getCurrentValue());
      rowTracker.addRow(row);
    }
    rr.close();
  }
  // validate all rows are seen
  rowTracker.validate();
}
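The test drives TableSnapshotInputFormat by hand with a mocked TaskAttemptContext; in a real job the same input format is wired up through TableMapReduceUtil. A minimal sketch, assuming a snapshot named "my_snapshot" already exists and the restore directory is writable (both values, and RowKeyMapper, are illustrative):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.mapreduce.Job;

public class SnapshotScanJob {
  // Hypothetical mapper: just re-emits each row key.
  static class RowKeyMapper extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable> {
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
        throws IOException, InterruptedException {
      context.write(key, key);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    Job job = Job.getInstance(conf, "snapshot-scan");
    job.setJarByClass(SnapshotScanJob.class);
    // Snapshot name and restore directory are illustrative values.
    TableMapReduceUtil.initTableSnapshotMapperJob("my_snapshot", new Scan(), RowKeyMapper.class,
        ImmutableBytesWritable.class, ImmutableBytesWritable.class, job, true,
        new Path("/tmp/snapshot-restore"));
    job.setNumReduceTasks(0);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}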
Use of org.apache.hadoop.mapreduce.InputSplit in project hbase by apache.
Class TestTableInputFormatScanBase, method testNumOfSplits:
/**
 * Tests a MR scan using data skew auto-balance.
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public void testNumOfSplits(String ratio, int expectedNumOfSplits)
    throws IOException, InterruptedException, ClassNotFoundException {
  String jobName = "TestJobForNumOfSplits";
  LOG.info("Before map/reduce startup - job " + jobName);
  Configuration c = new Configuration(TEST_UTIL.getConfiguration());
  Scan scan = new Scan();
  scan.addFamily(INPUT_FAMILYS[0]);
  scan.addFamily(INPUT_FAMILYS[1]);
  c.set("hbase.mapreduce.input.autobalance", "true");
  c.set("hbase.mapreduce.input.autobalance.maxskewratio", ratio);
  c.set(KEY_STARTROW, "");
  c.set(KEY_LASTROW, "");
  Job job = new Job(c, jobName);
  TableMapReduceUtil.initTableMapperJob(TABLE_NAME.getNameAsString(), scan, ScanMapper.class,
      ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
  TableInputFormat tif = new TableInputFormat();
  tif.setConf(job.getConfiguration());
  Assert.assertEquals(TABLE_NAME, table.getName());
  List<InputSplit> splits = tif.getSplits(job);
  Assert.assertEquals(expectedNumOfSplits, splits.size());
}
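The same auto-balance keys can be set on a plain Configuration without the test harness. A rough sketch, assuming a running HBase cluster and an existing table named "test_table" (both are assumptions, not part of the test above):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;

public class AutoBalanceSplitCount {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    conf.set(TableInputFormat.INPUT_TABLE, "test_table"); // illustrative table name
    // Enable split auto-balancing and cap the allowed skew ratio.
    conf.set("hbase.mapreduce.input.autobalance", "true");
    conf.set("hbase.mapreduce.input.autobalance.maxskewratio", "2.0");
    Job job = Job.getInstance(conf, "autobalance-split-count");
    TableInputFormat tif = new TableInputFormat();
    tif.setConf(job.getConfiguration());
    List<InputSplit> splits = tif.getSplits(job);
    System.out.println("splits: " + splits.size());
  }
}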
Use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.
Class HCatBaseInputFormat, method getSplits:
/**
 * Logically split the set of input files for the job. Returns the
 * underlying InputFormat's splits.
 * @param jobContext the job context object
 * @return the splits, an HCatInputSplit wrapper over the storage
 *         handler InputSplits
 * @throws IOException
 * @throws InterruptedException
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
  Configuration conf = jobContext.getConfiguration();
  // Get the job info from the configuration;
  // throws an exception if not initialized.
  InputJobInfo inputJobInfo;
  try {
    inputJobInfo = getJobInfo(conf);
  } catch (Exception e) {
    throw new IOException(e);
  }
  List<InputSplit> splits = new ArrayList<InputSplit>();
  List<PartInfo> partitionInfoList = inputJobInfo.getPartitions();
  if (partitionInfoList == null) {
    // No partitions match the specified partition filter.
    return splits;
  }
  HiveStorageHandler storageHandler;
  JobConf jobConf;
  // For each matching partition, call getSplits on the underlying InputFormat.
  for (PartInfo partitionInfo : partitionInfoList) {
    jobConf = HCatUtil.getJobConfFromContext(jobContext);
    List<String> setInputPath = setInputPath(jobConf, partitionInfo.getLocation());
    if (setInputPath.isEmpty()) {
      continue;
    }
    Map<String, String> jobProperties = partitionInfo.getJobProperties();
    HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);
    storageHandler = HCatUtil.getStorageHandler(jobConf, partitionInfo);
    // Get the input format.
    Class inputFormatClass = storageHandler.getInputFormatClass();
    org.apache.hadoop.mapred.InputFormat inputFormat =
        getMapRedInputFormat(jobConf, inputFormatClass);
    // Call getSplits on the InputFormat and create an HCatSplit for each
    // underlying split. When the desired number of input splits is missing,
    // use a default number (denoted by zero).
    // TODO(malewicz): Currently each partition is split independently into
    // a desired number. However, we want the union of all partitions to be
    // split into a desired number while maintaining balanced sizes of input
    // splits.
    int desiredNumSplits = conf.getInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 0);
    org.apache.hadoop.mapred.InputSplit[] baseSplits =
        inputFormat.getSplits(jobConf, desiredNumSplits);
    for (org.apache.hadoop.mapred.InputSplit split : baseSplits) {
      splits.add(new HCatSplit(partitionInfo, split));
    }
  }
  return splits;
}
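From the client side, the split-count hint read by this method is just a configuration knob. A minimal sketch, assuming a reachable Hive metastore; the database and table names are illustrative:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class HCatSplitCount {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hint the per-partition split count that getSplits() above reads back;
    // 0 (the default) lets the underlying InputFormat decide.
    conf.setInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 16);
    Job job = Job.getInstance(conf, "hcat-split-count");
    // "default" / "my_table" are illustrative database and table names.
    HCatInputFormat.setInput(job, "default", "my_table");
    List<InputSplit> splits = new HCatInputFormat().getSplits(job);
    System.out.println("splits: " + splits.size());
  }
}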
Use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.
Class HCatSplit, method readFields:
/* (non-Javadoc)
 * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
 */
@SuppressWarnings("unchecked")
@Override
public void readFields(DataInput input) throws IOException {
  String partitionInfoString = WritableUtils.readString(input);
  partitionInfo = (PartInfo) HCatUtil.deserialize(partitionInfoString);
  String baseSplitClassName = WritableUtils.readString(input);
  org.apache.hadoop.mapred.InputSplit split;
  try {
    Class<? extends org.apache.hadoop.mapred.InputSplit> splitClass =
        (Class<? extends org.apache.hadoop.mapred.InputSplit>) JavaUtils.loadClass(baseSplitClassName);
    // Class.forName().newInstance() does not work if the underlying
    // InputSplit has package visibility.
    Constructor<? extends org.apache.hadoop.mapred.InputSplit> constructor =
        splitClass.getDeclaredConstructor(new Class[] {});
    constructor.setAccessible(true);
    split = constructor.newInstance();
    // read baseSplit from input
    ((Writable) split).readFields(input);
    this.baseMapRedSplit = split;
  } catch (Exception e) {
    throw new IOException("Exception from " + baseSplitClassName, e);
  }
}
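readFields() is one half of Hadoop's Writable contract; the other half is write(). A generic round-trip sketch of that contract (using Text rather than HCatSplit, whose fields are built internally):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class WritableRoundTrip {
  // Serialize one Writable and rehydrate another instance from the same bytes.
  static <T extends Writable> void roundTrip(T original, T empty) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    original.write(out);                  // Writable#write
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    empty.readFields(in);                 // Writable#readFields, as HCatSplit implements above
  }

  public static void main(String[] args) throws IOException {
    Text src = new Text("hello");
    Text dst = new Text();
    roundTrip(src, dst);
    System.out.println(dst);              // prints "hello"
  }
}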