Use of org.apache.hadoop.mapreduce.JobContext in project druid by druid-io: class DatasourceInputFormat, method getSplits.
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  JobConf conf = new JobConf(context.getConfiguration());
  List<String> dataSources = getDataSources(conf);
  List<InputSplit> splits = new ArrayList<>();
  for (String dataSource : dataSources) {
    List<WindowedDataSegment> segments = getSegments(conf, dataSource);
    if (segments == null || segments.size() == 0) {
      throw new ISE("No segments found to read for dataSource[%s]", dataSource);
    }
    // Note: Each segment is logged separately to avoid creating a huge String if we are loading lots of segments.
    for (int i = 0; i < segments.size(); i++) {
      final WindowedDataSegment segment = segments.get(i);
      logger.info("Segment %,d/%,d for dataSource[%s] has identifier[%s], interval[%s]", i, segments.size(), dataSource, segment.getSegment().getId(), segment.getInterval());
    }
    long maxSize = getMaxSplitSize(conf, dataSource);
    if (maxSize < 0) {
      long totalSize = 0;
      for (WindowedDataSegment segment : segments) {
        totalSize += segment.getSegment().getSize();
      }
      int mapTask = conf.getNumMapTasks();
      if (mapTask > 0) {
        maxSize = totalSize / mapTask;
      }
    }
    if (maxSize > 0) {
      // combining is to happen, let us sort the segments list by size so that they
      // are combined appropriately
      segments.sort(Comparator.comparingLong(s -> s.getSegment().getSize()));
    }
    List<WindowedDataSegment> list = new ArrayList<>();
    long size = 0;
    org.apache.hadoop.mapred.InputFormat fio = supplier.get();
    for (WindowedDataSegment segment : segments) {
      if (size + segment.getSegment().getSize() > maxSize && size > 0) {
        splits.add(toDataSourceSplit(list, fio, conf));
        list = new ArrayList<>();
        size = 0;
      }
      list.add(segment);
      size += segment.getSegment().getSize();
    }
    if (list.size() > 0) {
      splits.add(toDataSourceSplit(list, fio, conf));
    }
  }
  logger.info("Number of splits [%d]", splits.size());
  return splits;
}
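The size-based combining in the loop above can be illustrated in isolation. The following sketch (illustrative only, using plain long sizes instead of WindowedDataSegment) shows how segments accumulate into one split until adding the next segment would exceed maxSize:

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch: groups segment sizes into splits the same way the loop in
// getSplits() does, never emitting an empty split (the size > 0 guard).
public class SplitGrouping {
  static List<List<Long>> group(List<Long> segmentSizes, long maxSize) {
    List<List<Long>> splits = new ArrayList<>();
    List<Long> current = new ArrayList<>();
    long size = 0;
    for (long segmentSize : segmentSizes) {
      // Start a new split once this segment would push the running size past maxSize.
      if (size + segmentSize > maxSize && size > 0) {
        splits.add(current);
        current = new ArrayList<>();
        size = 0;
      }
      current.add(segmentSize);
      size += segmentSize;
    }
    if (!current.isEmpty()) {
      splits.add(current);
    }
    return splits;
  }

  public static void main(String[] args) {
    // Sizes sorted ascending, as getSplits() sorts segments before combining.
    System.out.println(group(List.of(10L, 20L, 30L, 40L), 50L)); // [[10, 20], [30], [40]]
  }
}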
Use of org.apache.hadoop.mapreduce.JobContext in project mongo-hadoop by mongodb: class GridFSInputFormatTest, method mockJobContext.
private static JobContext mockJobContext(final Configuration conf) {
  JobContext context = mock(JobContext.class);
  when(context.getConfiguration()).thenReturn(conf);
  return context;
}
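The test that follows also relies on a mockTaskAttemptContext(conf) helper that is not shown on this page. A plausible Mockito sketch, assuming it simply mirrors mockJobContext (TaskAttemptContext extends JobContext in the mapreduce API, so it also exposes getConfiguration()):

// Hypothetical sketch, not taken from the project: a TaskAttemptContext mock built
// the same way as mockJobContext above. A stubbed getConfiguration() is usually all
// a record reader needs in a unit test.
private static TaskAttemptContext mockTaskAttemptContext(final Configuration conf) {
  TaskAttemptContext context = mock(TaskAttemptContext.class);
  when(context.getConfiguration()).thenReturn(conf);
  return context;
}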
Use of org.apache.hadoop.mapreduce.JobContext in project mongo-hadoop by mongodb: class GridFSInputFormatTest, method testReadBinaryFiles.
@Test
public void testReadBinaryFiles() throws IOException, InterruptedException, URISyntaxException {
  Configuration conf = getConfiguration();
  MongoConfigUtil.setQuery(conf, new BasicDBObject("filename", "orders.bson"));
  MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
  MongoConfigUtil.setGridFSReadBinary(conf, true);
  JobContext context = mockJobContext(conf);
  TaskAttemptContext taskContext = mockTaskAttemptContext(conf);
  List<InputSplit> splits = inputFormat.getSplits(context);
  assertEquals(1, splits.size());
  int i = 0;
  byte[] buff = null;
  for (InputSplit split : splits) {
    GridFSInputFormat.GridFSBinaryRecordReader reader = new GridFSInputFormat.GridFSBinaryRecordReader();
    reader.initialize(split, taskContext);
    for (; reader.nextKeyValue(); ++i) {
      buff = new byte[reader.getCurrentValue().getLength()];
      // BytesWritable.copyBytes does not exist in Hadoop 1.2
      System.arraycopy(reader.getCurrentValue().getBytes(), 0, buff, 0, buff.length);
    }
  }
  // Only one record to read on the split.
  assertEquals(1, i);
  assertNotNull(buff);
  assertEquals(bson.getLength(), buff.length);
}
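The manual System.arraycopy exists because BytesWritable.getBytes() returns the backing buffer, which may be longer than the valid data reported by getLength(). On Hadoop 2.x and later the same copy can be written with copyBytes(); a sketch of the drop-in replacement for the two copy lines above:

// Hadoop 2.x+ alternative to the arraycopy above: BytesWritable.copyBytes()
// returns a new byte[] trimmed to getLength(), whereas getBytes() exposes the
// (possibly padded) backing buffer.
buff = reader.getCurrentValue().copyBytes();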
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache: class KuduInputFormat, method computeSplits.
private List<KuduInputSplit> computeSplits(Configuration conf) throws IOException {
  try (KuduClient client = KuduHiveUtils.getKuduClient(conf)) {
    // Hive depends on FileSplits so we get the dummy Path for the Splits.
    Job job = Job.getInstance(conf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] paths = FileInputFormat.getInputPaths(jobContext);
    Path dummyPath = paths[0];
    String tableName = conf.get(KUDU_TABLE_NAME_KEY);
    if (StringUtils.isEmpty(tableName)) {
      throw new IllegalArgumentException(KUDU_TABLE_NAME_KEY + " is not set.");
    }
    if (!client.tableExists(tableName)) {
      throw new IllegalArgumentException("Kudu table does not exist: " + tableName);
    }
    KuduTable table = client.openTable(tableName);
    List<KuduPredicate> predicates = KuduPredicateHandler.getPredicates(conf, table.getSchema());
    KuduScanToken.KuduScanTokenBuilder tokenBuilder =
        client.newScanTokenBuilder(table).setProjectedColumnNames(getProjectedColumns(conf));
    for (KuduPredicate predicate : predicates) {
      tokenBuilder.addPredicate(predicate);
    }
    List<KuduScanToken> tokens = tokenBuilder.build();
    List<KuduInputSplit> splits = new ArrayList<>(tokens.size());
    for (KuduScanToken token : tokens) {
      List<String> locations = new ArrayList<>(token.getTablet().getReplicas().size());
      for (LocatedTablet.Replica replica : token.getTablet().getReplicas()) {
        locations.add(replica.getRpcHost());
      }
      splits.add(new KuduInputSplit(token, dummyPath, locations.toArray(new String[0])));
    }
    return splits;
  }
}
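Hive goes through ShimLoader to obtain the JobContext so the same code works across Hadoop versions. In the mapreduce ("new") API of Hadoop 2.x a Job is itself a JobContext, so a shim-free sketch of the dummy-path lookup (illustrative only, not the Hive code) could look like this:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Sketch only: resolve the first configured input path directly from the Job,
// which implements JobContext in the Hadoop 2.x mapreduce API.
final class DummyPathResolver {
  static Path firstInputPath(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf);
    Path[] paths = FileInputFormat.getInputPaths(job);
    return paths.length > 0 ? paths[0] : null;
  }
}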
Use of org.apache.hadoop.mapreduce.JobContext in project hive by apache: class HiveHFileOutputFormat, method checkOutputSpecs.
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf jc) throws IOException {
  // delegate to the new api
  Job job = new Job(jc);
  JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
  checkOutputSpecs(jobContext);
}
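The new Job(jc) constructor is deprecated in Hadoop 2.x. A sketch of the same old-to-new API bridge using the Job.getInstance factory (keeping the shim call for the JobContext, as the method above does) might look like this:

// Sketch only: same delegation pattern with the non-deprecated Job factory.
// The old-API JobConf is wrapped into a new-API Job, whose JobContext is then
// handed to the mapreduce-style checkOutputSpecs(JobContext) overload.
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf jc) throws IOException {
  Job job = Job.getInstance(jc);
  JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
  checkOutputSpecs(jobContext);
}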