Use of org.apache.hadoop.mapred.InputSplit in project apex-malhar by apache.
The class MapOperator, method definePartitions.
@SuppressWarnings("rawtypes")
@Override
public Collection<Partition<MapOperator<K1, V1, K2, V2>>> definePartitions(Collection<Partition<MapOperator<K1, V1, K2, V2>>> partitions, PartitioningContext context) {
  int tempPartitionCount = partitionCount;
  Collection c = partitions;
  Collection<Partition<MapOperator<K1, V1, K2, V2>>> operatorPartitions = c;
  Partition<MapOperator<K1, V1, K2, V2>> template;
  Iterator<Partition<MapOperator<K1, V1, K2, V2>>> itr = operatorPartitions.iterator();
  template = itr.next();
  Configuration conf = new Configuration();
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  if (outstream.size() == 0) {
    InputSplit[] splits;
    try {
      splits = getSplits(new JobConf(conf), tempPartitionCount, template.getPartitionedInstance().getDirName());
    } catch (Exception e1) {
      logger.info("can't get splits {}", e1.getMessage());
      throw new RuntimeException(e1);
    }
    Collection<Partition<MapOperator<K1, V1, K2, V2>>> operList = new ArrayList<Partition<MapOperator<K1, V1, K2, V2>>>();
    itr = operatorPartitions.iterator();
    int size = splits.length;
    Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
    // Reuse the existing partitions first, serializing one split into each operator's output stream.
    while (size > 0 && itr.hasNext()) {
      Partition<MapOperator<K1, V1, K2, V2>> p = itr.next();
      MapOperator<K1, V1, K2, V2> opr = p.getPartitionedInstance();
      opr.setInputFormatClass(inputFormatClass);
      opr.setMapClass(mapClass);
      opr.setCombineClass(combineClass);
      opr.setConfigFile(configFile);
      try {
        keySerializer.open(opr.getOutstream());
        keySerializer.serialize(splits[size - 1]);
        opr.setInputSplitClass(splits[size - 1].getClass());
      } catch (IOException e) {
        logger.info("error while serializing {}", e.getMessage());
      }
      size--;
      operList.add(p);
    }
    // If there are more splits than existing partitions, create new operators for the remainder.
    while (size > 0) {
      MapOperator<K1, V1, K2, V2> opr = new MapOperator<K1, V1, K2, V2>();
      opr.setInputFormatClass(inputFormatClass);
      opr.setMapClass(mapClass);
      opr.setCombineClass(combineClass);
      opr.setConfigFile(configFile);
      try {
        keySerializer.open(opr.getOutstream());
        keySerializer.serialize(splits[size - 1]);
        opr.setInputSplitClass(splits[size - 1].getClass());
      } catch (IOException e) {
        logger.info("error while serializing {}", e.getMessage());
      }
      size--;
      operList.add(new DefaultPartition<MapOperator<K1, V1, K2, V2>>(opr));
    }
    try {
      keySerializer.close();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return operList;
  }
  return null;
}
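The serialization step above can be exercised in isolation. Below is a minimal, self-contained sketch (not taken from apex-malhar) of serializing a mapred InputSplit with Hadoop's SerializationFactory, following the same keySerializer.open()/serialize() pattern used in definePartitions(); the FileSplit path is a placeholder.

import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.FileSplit;

public class SplitSerializationSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    SerializationFactory factory = new SerializationFactory(conf);
    // FileSplit is Writable, so the factory resolves this to WritableSerialization.
    Serializer<FileSplit> serializer = factory.getSerializer(FileSplit.class);

    FileSplit split = new FileSplit(new Path("/tmp/input.txt"), 0L, 1024L, new String[0]);

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    serializer.open(out);
    serializer.serialize(split);
    serializer.close();

    System.out.println("Serialized split to " + out.size() + " bytes");
  }
}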
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class FetchOperator, method getNextSplits.
protected FetchInputFormatSplit[] getNextSplits() throws Exception {
  while (getNextPath()) {
    // not using FileInputFormat.setInputPaths() here because it forces a connection to the
    // default file system - which may or may not be online during pure metadata operations
    job.set("mapred.input.dir", StringUtils.escapeString(currPath.toString()));
    // Fetch operator is not vectorized and as such turn vectorization flag off so that
    // non-vectorized record reader is created below.
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
    Class<? extends InputFormat> formatter = currDesc.getInputFileFormatClass();
    Utilities.copyTableJobPropertiesToConf(currDesc.getTableDesc(), job);
    InputFormat inputFormat = getInputFormatFromCache(formatter, job);
    String inputs = processCurrPathForMmWriteIds(inputFormat);
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
      Utilities.FILE_OP_LOGGER.trace("Setting fetch inputs to " + inputs);
    }
    if (inputs == null) {
      return null;
    }
    job.set("mapred.input.dir", inputs);
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    FetchInputFormatSplit[] inputSplits = new FetchInputFormatSplit[splits.length];
    for (int i = 0; i < splits.length; i++) {
      inputSplits[i] = new FetchInputFormatSplit(splits[i], inputFormat);
    }
    if (work.getSplitSample() != null) {
      inputSplits = splitSampling(work.getSplitSample(), inputSplits);
    }
    if (inputSplits.length > 0) {
      if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_IN_TEST)) {
        Arrays.sort(inputSplits, new FetchInputFormatSplitComparator());
      }
      return inputSplits;
    }
  }
  return null;
}
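For reference, here is a minimal sketch (not Hive code) of producing splits through the same mapred API: it sets mapred.input.dir directly, as getNextSplits() does, and asks a TextInputFormat for splits. The input path is a placeholder and must point at existing files for getSplits() to succeed.

import java.io.IOException;

import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;

public class GetSplitsSketch {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    // Same mechanism as FetchOperator above: set the input directory property
    // directly instead of going through FileInputFormat.setInputPaths().
    job.set("mapred.input.dir", StringUtils.escapeString("/tmp/fetch-input"));

    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(job);

    // numSplits is only a hint; the format may return more or fewer splits.
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    for (InputSplit split : splits) {
      System.out.println(split + " length=" + split.getLength());
    }
  }
}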
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class SplitGrouper, method group.
/**
 * Group the splits for each bucket separately, while evenly filling all the
 * available slots with tasks.
 */
public Multimap<Integer, InputSplit> group(Configuration conf, Multimap<Integer, InputSplit> bucketSplitMultimap, int availableSlots, float waves, SplitLocationProvider splitLocationProvider) throws IOException {
  // figure out how many tasks we want for each bucket
  Map<Integer, Integer> bucketTaskMap = estimateBucketSizes(availableSlots, waves, bucketSplitMultimap.asMap());
  // allocate map bucket id to grouped splits
  Multimap<Integer, InputSplit> bucketGroupedSplitMultimap = ArrayListMultimap.<Integer, InputSplit>create();
  // use the tez grouper to combine splits once per bucket
  for (int bucketId : bucketSplitMultimap.keySet()) {
    Collection<InputSplit> inputSplitCollection = bucketSplitMultimap.get(bucketId);
    InputSplit[] rawSplits = inputSplitCollection.toArray(new InputSplit[0]);
    InputSplit[] groupedSplits = tezGrouper.getGroupedSplits(conf, rawSplits, bucketTaskMap.get(bucketId), HiveInputFormat.class.getName(), new ColumnarSplitSizeEstimator(), splitLocationProvider);
    LOG.info("Original split count is " + rawSplits.length + " grouped split count is " + groupedSplits.length + ", for bucket: " + bucketId);
    for (InputSplit inSplit : groupedSplits) {
      bucketGroupedSplitMultimap.put(bucketId, inSplit);
    }
  }
  return bucketGroupedSplitMultimap;
}
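A small sketch (hypothetical, not from Hive) of the bucketSplitMultimap shape that group() consumes: each bucket id maps to the raw InputSplits belonging to that bucket. In Hive these come from the bucketed table's files; the paths below are placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

public class BucketSplitMultimapSketch {
  public static Multimap<Integer, InputSplit> buildBucketSplits() {
    Multimap<Integer, InputSplit> bucketSplits = ArrayListMultimap.create();
    // Bucket 0 gets two file splits, bucket 1 gets one.
    bucketSplits.put(0, new FileSplit(new Path("/warehouse/t/000000_0"), 0L, 128L, new String[0]));
    bucketSplits.put(0, new FileSplit(new Path("/warehouse/t/000000_1"), 0L, 256L, new String[0]));
    bucketSplits.put(1, new FileSplit(new Path("/warehouse/t/000001_0"), 0L, 64L, new String[0]));
    return bucketSplits;
  }
}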
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class CustomPartitionVertex, method processAllEvents.
private void processAllEvents(String inputName, Multimap<Integer, InputSplit> bucketToGroupedSplitMap, boolean secondLevelGroupingDone) throws IOException {
  int totalInputsCount = 0;
  List<Integer> numSplitsForTask = new ArrayList<Integer>();
  for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
    int bucketNum = entry.getKey();
    Collection<InputSplit> initialSplits = entry.getValue();
    finalSplits.addAll(initialSplits);
    for (InputSplit inputSplit : initialSplits) {
      bucketToTaskMap.put(bucketNum, taskCount);
      if (secondLevelGroupingDone) {
        TezGroupedSplit groupedSplit = (TezGroupedSplit) inputSplit;
        numSplitsForTask.add(groupedSplit.getGroupedSplits().size());
        totalInputsCount += groupedSplit.getGroupedSplits().size();
      } else {
        numSplitsForTask.add(1);
        totalInputsCount += 1;
      }
      taskCount++;
    }
  }
  inputNameInputSpecMap.put(inputName, InputSpecUpdate.createPerTaskInputSpecUpdate(numSplitsForTask));
  // Construct the EdgeManager descriptor to be used by all edges which need
  // the routing table.
  EdgeManagerPluginDescriptor hiveEdgeManagerDesc = null;
  if ((vertexType == VertexType.MULTI_INPUT_INITIALIZED_EDGES) || (vertexType == VertexType.INITIALIZED_EDGES)) {
    hiveEdgeManagerDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
    UserPayload payload = getBytePayload(bucketToTaskMap);
    hiveEdgeManagerDesc.setUserPayload(payload);
  }
  // Replace the edge manager for all vertices which have routing type custom.
  for (Entry<String, EdgeProperty> edgeEntry : context.getInputVertexEdgeProperties().entrySet()) {
    if (edgeEntry.getValue().getDataMovementType() == DataMovementType.CUSTOM && edgeEntry.getValue().getEdgeManagerDescriptor().getClassName().equals(CustomPartitionEdge.class.getName())) {
      emMap.put(edgeEntry.getKey(), hiveEdgeManagerDesc);
    }
  }
  LOG.info("Task count is " + taskCount + " for input name: " + inputName);
  List<InputDataInformationEvent> taskEvents = Lists.newArrayListWithCapacity(totalInputsCount);
  // Re-serialize the splits after grouping.
  int count = 0;
  for (InputSplit inputSplit : finalSplits) {
    if (secondLevelGroupingDone) {
      TezGroupedSplit tezGroupedSplit = (TezGroupedSplit) inputSplit;
      for (InputSplit subSplit : tezGroupedSplit.getGroupedSplits()) {
        if (!(subSplit instanceof TezGroupedSplit)) {
          throw new IOException("Unexpected split type found: " + subSplit.getClass().getCanonicalName());
        }
        MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(subSplit);
        InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit.toByteString().asReadOnlyByteBuffer());
        diEvent.setTargetIndex(count);
        taskEvents.add(diEvent);
      }
    } else {
      MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(inputSplit);
      InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit.toByteString().asReadOnlyByteBuffer());
      diEvent.setTargetIndex(count);
      taskEvents.add(diEvent);
    }
    count++;
  }
  // Set the actual events for the tasks.
  LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
  context.addRootInputEvents(inputName, taskEvents);
  if (!inputToGroupedSplitMap.isEmpty()) {
    for (Entry<String, Multimap<Integer, InputSplit>> entry : inputToGroupedSplitMap.entrySet()) {
      processAllSideEvents(entry.getKey(), entry.getValue());
    }
    setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
    inputToGroupedSplitMap.clear();
  }
  // Only done for a bucket map join alone, with no SMB join.
  if (numInputsAffectingRootInputSpecUpdate == 1) {
    setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
    // Send the bucket IDs associated with the tasks; must happen after parallelism is set.
    sendBucketIdsToProcessor();
  }
}
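The re-serialization step can be illustrated on its own. The following is a hedged sketch (not Hive code) that wraps a single mapred InputSplit in an MRSplitProto and builds the corresponding InputDataInformationEvent, mirroring the loop above; the FileSplit path is a placeholder.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.tez.mapreduce.hadoop.MRInputHelpers;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.runtime.api.events.InputDataInformationEvent;

public class SplitEventSketch {
  // Wrap a mapred InputSplit in an MRSplitProto and hand it to a task as an
  // InputDataInformationEvent, the same calls used by processAllEvents().
  static InputDataInformationEvent toEvent(InputSplit split, int taskIndex) throws IOException {
    MRSplitProto proto = MRInputHelpers.createSplitProto(split);
    InputDataInformationEvent event = InputDataInformationEvent.createWithSerializedPayload(
        taskIndex, proto.toByteString().asReadOnlyByteBuffer());
    event.setTargetIndex(taskIndex);
    return event;
  }

  public static void main(String[] args) throws IOException {
    InputSplit split = new FileSplit(new Path("/tmp/part-00000"), 0L, 512L, new String[0]);
    System.out.println(toEvent(split, 0));
  }
}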
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
The class LlapBaseInputFormat, method getSplits.
/**
* Calling getSplits() will open a HiveServer2 connection which should be closed by the calling application
* using LlapBaseInputFormat.close() when the application is done with the splits.
*/
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  List<InputSplit> ins = new ArrayList<InputSplit>();
  if (url == null) {
    url = job.get(URL_KEY);
  }
  if (query == null) {
    query = job.get(QUERY_KEY);
  }
  if (user == null) {
    user = job.get(USER_KEY);
  }
  if (pwd == null) {
    pwd = job.get(PWD_KEY);
  }
  String database = job.get(DB_KEY);
  if (url == null || query == null) {
    throw new IllegalStateException();
  }
  String handleId = job.get(HANDLE_ID);
  if (handleId == null) {
    handleId = UUID.randomUUID().toString();
    LOG.info("Handle ID not specified - generated handle ID {}", handleId);
  }
  try {
    Class.forName(driverName);
  } catch (ClassNotFoundException e) {
    throw new IOException(e);
  }
  LOG.info("Handle ID {}: query={}", handleId, query);
  String escapedQuery = StringUtils.escapeString(query, ESCAPE_CHAR, escapedChars);
  String sql = String.format(SPLIT_QUERY, escapedQuery, numSplits);
  try {
    Connection conn = DriverManager.getConnection(url, user, pwd);
    try (Statement stmt = conn.createStatement()) {
      if (database != null && !database.isEmpty()) {
        stmt.execute("USE " + database);
      }
      ResultSet res = stmt.executeQuery(sql);
      while (res.next()) {
        // deserialize split
        DataInput in = new DataInputStream(res.getBinaryStream(1));
        InputSplitWithLocationInfo is = new LlapInputSplit();
        is.readFields(in);
        ins.add(is);
      }
      res.close();
    } catch (Exception e) {
      LOG.error("Closing connection due to error", e);
      conn.close();
      throw e;
    }
    // Keep connection open to hang on to associated resources (temp tables, locks).
    // Save to connectionMap so it can be closed at user's convenience.
    addConnection(handleId, conn);
  } catch (Exception e) {
    throw new IOException(e);
  }
  return ins.toArray(new InputSplit[ins.size()]);
}
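The readFields() call above relies on InputSplit being Writable. Below is a minimal sketch (not from Hive; a plain FileSplit stands in for LlapInputSplit to keep it self-contained) of the same write/readFields round trip over a byte stream; the path is a placeholder.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class SplitRoundTripSketch {
  public static void main(String[] args) throws IOException {
    FileSplit original = new FileSplit(new Path("/tmp/data.csv"), 0L, 2048L, new String[0]);

    // Write the split out as raw bytes, the way a server could ship it.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));

    // Read it back with readFields(), as getSplits() above does for LlapInputSplit.
    FileSplit copy = new FileSplit();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

    System.out.println(copy.getPath() + " start=" + copy.getStart() + " length=" + copy.getLength());
  }
}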