Use of org.apache.hadoop.mapreduce.InputSplit in project phoenix by apache.
In class PhoenixInputFormat, the method generateSplits:
private List<InputSplit> generateSplits(final QueryPlan qplan, final List<KeyRange> splits, Configuration config) throws IOException {
    Preconditions.checkNotNull(qplan);
    Preconditions.checkNotNull(splits);
    // Get the RegionSizeCalculator
    org.apache.hadoop.hbase.client.Connection connection = ConnectionFactory.createConnection(config);
    RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(qplan.getTableRef().getTable().getPhysicalName().toString()));
    RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(regionLocator, connection.getAdmin());
    final List<InputSplit> psplits = Lists.newArrayListWithExpectedSize(splits.size());
    for (List<Scan> scans : qplan.getScans()) {
        // Get the region location
        HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
        String regionLocation = location.getHostname();
        // Get the region size
        long regionSize = sizeCalculator.getRegionSize(location.getRegionInfo().getRegionName());
        // Generate splits based off statistics, or just region splits?
        boolean splitByStats = PhoenixConfigurationUtil.getSplitByStats(config);
        if (splitByStats) {
            for (Scan aScan : scans) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Split for scan : " + aScan + " with scanAttribute : " + aScan.getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : [" + aScan.getCaching() + ", " + aScan.getCacheBlocks() + ", " + aScan.getBatch() + "] and regionLocation : " + regionLocation);
                }
                psplits.add(new PhoenixInputSplit(Collections.singletonList(aScan), regionSize, regionLocation));
            }
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Scan count[" + scans.size() + "] : " + Bytes.toStringBinary(scans.get(0).getStartRow()) + " ~ " + Bytes.toStringBinary(scans.get(scans.size() - 1).getStopRow()));
                LOG.debug("First scan : " + scans.get(0) + " with scanAttribute : " + scans.get(0).getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : " + "[" + scans.get(0).getCaching() + ", " + scans.get(0).getCacheBlocks() + ", " + scans.get(0).getBatch() + "] and regionLocation : " + regionLocation);
                for (int i = 0, limit = scans.size(); i < limit; i++) {
                    LOG.debug("EXPECTED_UPPER_REGION_KEY[" + i + "] : " + Bytes.toStringBinary(scans.get(i).getAttribute(BaseScannerRegionObserver.EXPECTED_UPPER_REGION_KEY)));
                }
            }
            psplits.add(new PhoenixInputSplit(scans, regionSize, regionLocation));
        }
    }
    return psplits;
}
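For orientation, here is a minimal driver sketch of how a MapReduce job ends up in generateSplits(): at submission time the framework calls PhoenixInputFormat.getSplits(), which compiles the query into a QueryPlan and delegates to the method above. Imports are omitted as in the surrounding snippets; PhoenixMapReduceUtil.setInput and the MyDBWritable/MyMapper classes are assumptions for illustration, and the exact setInput overloads vary across Phoenix versions.
// Hypothetical job setup; not taken from the Phoenix sources quoted above.
public static Job configurePhoenixJob(Configuration conf) throws Exception {
    Job job = Job.getInstance(HBaseConfiguration.create(conf), "phoenix-input-split-sketch");
    // Assumed overload: (job, DBWritable class, table name, select query);
    // MyDBWritable and MyMapper are hypothetical user classes.
    PhoenixMapReduceUtil.setInput(job, MyDBWritable.class, "MY_TABLE", "SELECT * FROM MY_TABLE");
    job.setInputFormatClass(PhoenixInputFormat.class);
    job.setMapperClass(MyMapper.class);
    // Whether generateSplits() emits one split per region or one split per
    // guidepost chunk depends on the split-by-stats flag read through
    // PhoenixConfigurationUtil.getSplitByStats(config).
    return job;
}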
Use of org.apache.hadoop.mapreduce.InputSplit in project cdap by caskdata.
In class MapperWrapper, the method createAutoFlushingContext:
private WrappedMapper.Context createAutoFlushingContext(final Context context, final BasicMapReduceTaskContext basicMapReduceContext) {
    // NOTE: auto-flush will be changed to take the size of the buffered data into account,
    // so there is no need to do or test much with the current count-based approach
    final int flushFreq = context.getConfiguration().getInt("c.mapper.flush.freq", 10000);
    @SuppressWarnings("unchecked")
    WrappedMapper.Context flushingContext = new WrappedMapper().new Context(context) {

        private int processedRecords = 0;

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            boolean result = super.nextKeyValue();
            if (++processedRecords > flushFreq) {
                try {
                    LOG.trace("Flushing dataset operations...");
                    basicMapReduceContext.flushOperations();
                } catch (Exception e) {
                    LOG.error("Failed to persist changes", e);
                    throw Throwables.propagate(e);
                }
                processedRecords = 0;
            }
            return result;
        }

        @Override
        public InputSplit getInputSplit() {
            InputSplit inputSplit = super.getInputSplit();
            if (inputSplit instanceof TaggedInputSplit) {
                // expose the delegate InputSplit to the user
                inputSplit = ((TaggedInputSplit) inputSplit).getInputSplit();
            }
            return inputSplit;
        }

        @Override
        public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException {
            InputSplit inputSplit = super.getInputSplit();
            if (inputSplit instanceof MultiInputTaggedSplit) {
                // expose the delegate InputFormat to the user
                return ((MultiInputTaggedSplit) inputSplit).getInputFormatClass();
            }
            return super.getInputFormatClass();
        }
    };
    return flushingContext;
}
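To show where this context is consumed, a hedged sketch of the surrounding run() method (CDAP's actual MapperWrapper.run does more, e.g. class-loader and metrics setup): the wrapped context is handed to the user's Mapper, so every nextKeyValue() call passes through the auto-flushing logic above and getInputSplit() always returns the unwrapped split. The delegate field and the getBasicMapReduceTaskContext accessor are assumptions.
// Simplified sketch only; 'delegate' (the user's Mapper instance) and the
// getBasicMapReduceTaskContext(...) accessor are assumed, not CDAP's real code.
@Override
public void run(Context context) throws IOException, InterruptedException {
    BasicMapReduceTaskContext basicMapReduceContext = getBasicMapReduceTaskContext(context);
    WrappedMapper.Context flushingContext = createAutoFlushingContext(context, basicMapReduceContext);
    delegate.run(flushingContext);
    try {
        // flush once more so operations buffered since the last flushFreq boundary are persisted
        basicMapReduceContext.flushOperations();
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}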
Use of org.apache.hadoop.mapreduce.InputSplit in project cdap by caskdata.
In class StreamInputFormatTest, the method testStreamRecordReader:
@Test
public void testStreamRecordReader() throws Exception {
    File inputDir = tmpFolder.newFolder();
    File partition = new File(inputDir, "1.1000");
    partition.mkdirs();
    File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
    File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());
    // write 1 event
    StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile), Files.newOutputStreamSupplier(indexFile), 100L);
    writer.append(StreamFileTestUtils.createEvent(1000, "test"));
    writer.flush();
    // get splits from the input format. Expect to get 2 splits,
    // one from 0 - some offset and one from offset - Long.MAX_VALUE.
    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    AbstractStreamInputFormat.setStreamId(conf, DUMMY_ID);
    AbstractStreamInputFormat.setStreamPath(conf, inputDir.toURI());
    AbstractStreamInputFormat format = new AbstractStreamInputFormat() {

        @Override
        public AuthorizationEnforcer getAuthorizationEnforcer(TaskAttemptContext context) {
            return new NoOpAuthorizer();
        }

        @Override
        public AuthenticationContext getAuthenticationContext(TaskAttemptContext context) {
            return new AuthenticationTestContext();
        }
    };
    List<InputSplit> splits = format.getSplits(new JobContextImpl(new JobConf(conf), new JobID()));
    Assert.assertEquals(2, splits.size());
    // write another event so that the 2nd split has something to read
    writer.append(StreamFileTestUtils.createEvent(1001, "test"));
    writer.close();
    // create a record reader for the 2nd split
    StreamRecordReader<LongWritable, StreamEvent> recordReader = new StreamRecordReader<>(new IdentityStreamEventDecoder(), new NoOpAuthorizer(), new AuthenticationTestContext(), DUMMY_ID);
    recordReader.initialize(splits.get(1), context);
    // check that we read the 2nd stream event
    Assert.assertTrue(recordReader.nextKeyValue());
    StreamEvent output = recordReader.getCurrentValue();
    Assert.assertEquals(1001, output.getTimestamp());
    Assert.assertEquals("test", Bytes.toString(output.getBody()));
    // check that there is nothing more to read
    Assert.assertFalse(recordReader.nextKeyValue());
}
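As a follow-up sketch (not part of the quoted test and not verified against CDAP), the first split should cover the event written before getSplits() was called, so a reader constructed the same way over splits.get(0) would be expected to see only the timestamp-1000 event:
// Hypothetical continuation of the test above; the assertion values assume the
// first split ends at the flush offset and therefore excludes the second event.
StreamRecordReader<LongWritable, StreamEvent> firstReader = new StreamRecordReader<>(new IdentityStreamEventDecoder(), new NoOpAuthorizer(), new AuthenticationTestContext(), DUMMY_ID);
firstReader.initialize(splits.get(0), context);
Assert.assertTrue(firstReader.nextKeyValue());
Assert.assertEquals(1000, firstReader.getCurrentValue().getTimestamp());
Assert.assertEquals("test", Bytes.toString(firstReader.getCurrentValue().getBody()));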
Use of org.apache.hadoop.mapreduce.InputSplit in project cdap by caskdata.
In class AbstractStreamInputFormat, the method getSplits:
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    long ttl = conf.getLong(STREAM_TTL, Long.MAX_VALUE);
    long endTime = conf.getLong(EVENT_END_TIME, Long.MAX_VALUE);
    long startTime = Math.max(conf.getLong(EVENT_START_TIME, 0L), getCurrentTime() - ttl);
    long maxSplitSize = conf.getLong(MAX_SPLIT_SIZE, Long.MAX_VALUE);
    long minSplitSize = Math.min(conf.getLong(MIN_SPLIT_SIZE, 1L), maxSplitSize);
    StreamInputSplitFinder<InputSplit> splitFinder = StreamInputSplitFinder.builder(URI.create(conf.get(STREAM_PATH)))
        .setStartTime(startTime)
        .setEndTime(endTime)
        .setMinSplitSize(minSplitSize)
        .setMaxSplitSize(maxSplitSize)
        .build(splitFactory);
    return splitFinder.getSplits(conf);
}
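For reference, a sketch of the driver-side configuration this method reads. The setStreamId/setStreamPath setters are the ones used in the test above; writing the time range and split sizes through the EVENT_START_TIME, EVENT_END_TIME and MAX_SPLIT_SIZE constants directly is an assumption (CDAP may expose dedicated setters, and the constants' visibility is not shown here), and streamId/streamDir/startMillis/endMillis are caller-supplied placeholders.
// Assumed configuration sketch; constant visibility and dedicated setter methods
// may differ in the actual AbstractStreamInputFormat.
Configuration conf = new Configuration();
AbstractStreamInputFormat.setStreamId(conf, streamId);
AbstractStreamInputFormat.setStreamPath(conf, streamDir.toURI());
conf.setLong(AbstractStreamInputFormat.EVENT_START_TIME, startMillis);
conf.setLong(AbstractStreamInputFormat.EVENT_END_TIME, endMillis);
conf.setLong(AbstractStreamInputFormat.MAX_SPLIT_SIZE, 128L * 1024 * 1024);
// getSplits() then clamps the start time to (now - STREAM_TTL) and asks
// StreamInputSplitFinder for splits covering [startTime, endTime).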
Use of org.apache.hadoop.mapreduce.InputSplit in project cdap by caskdata.
In class MultiInputFormat, the method getSplits:
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = new ArrayList<>();
    Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration());
    for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) {
        String inputName = mapperInputEntry.getKey();
        MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue();
        String mapperClassName = mapperInput.getMapperClassName();
        Job jobCopy = new Job(job.getConfiguration());
        Configuration confCopy = jobCopy.getConfiguration();
        // set configuration specific for this input onto the jobCopy
        ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy);
        Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName());
        Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName());
        InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy);
        // some input formats need a jobId to compute splits
        jobCopy.setJobID(new JobID(inputName, inputName.hashCode()));
        // Get splits for each input path and tag with InputFormat
        // and Mapper types by wrapping in a MultiInputTaggedSplit.
        List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy);
        for (InputSplit split : formatSplits) {
            splits.add(new MultiInputTaggedSplit(split, confCopy, inputName, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName));
        }
    }
    return splits;
}
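To close the loop, a hedged sketch of the record-reader side that consumes these tagged splits (CDAP's real MultiInputFormat.createRecordReader differs in detail, and it is assumed here that MultiInputTaggedSplit exposes the same getInputSplit() accessor as TaggedInputSplit): the MultiInputTaggedSplit is unwrapped to recover the delegate InputFormat and the original split, mirroring how MapperWrapper.getInputSplit() above unwraps it for the user.
// Simplified, assumed counterpart; it also omits re-applying the per-input
// configuration carried by the tagged split.
@Override
@SuppressWarnings("unchecked")
public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    MultiInputTaggedSplit taggedSplit = (MultiInputTaggedSplit) split;
    InputFormat<K, V> delegateFormat = (InputFormat<K, V>) ReflectionUtils.newInstance(taggedSplit.getInputFormatClass(), context.getConfiguration());
    return delegateFormat.createRecordReader(taggedSplit.getInputSplit(), context);
}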