Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
Class SparkCompiler, method setInputFormat:
@Override
protected void setInputFormat(Task<? extends Serializable> task) {
  if (task instanceof SparkTask) {
    SparkWork work = ((SparkTask) task).getWork();
    List<BaseWork> all = work.getAllWork();
    for (BaseWork w : all) {
      if (w instanceof MapWork) {
        MapWork mapWork = (MapWork) w;
        HashMap<String, Operator<? extends OperatorDesc>> opMap = mapWork.getAliasToWork();
        if (!opMap.isEmpty()) {
          for (Operator<? extends OperatorDesc> op : opMap.values()) {
            setInputFormat(mapWork, op);
          }
        }
      }
    }
  } else if (task instanceof ConditionalTask) {
    List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
    for (Task<? extends Serializable> tsk : listTasks) {
      setInputFormat(tsk);
    }
  }
  if (task.getChildTasks() != null) {
    for (Task<? extends Serializable> childTask : task.getChildTasks()) {
      setInputFormat(childTask);
    }
  }
}
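The method above recurses through three kinds of edges: the works inside a SparkTask, the branch tasks of a ConditionalTask, and the child tasks of any task. The sketch below illustrates the same traversal shape on its own; the SimpleTask and SimpleConditionalTask classes are simplified stand-ins invented for this example, not Hive's Task hierarchy.

import java.util.ArrayList;
import java.util.List;

// Hypothetical, simplified task classes standing in for Hive's Task hierarchy.
class SimpleTask {
  final String name;
  final List<SimpleTask> childTasks = new ArrayList<>();
  SimpleTask(String name) { this.name = name; }
  List<SimpleTask> getChildTasks() { return childTasks; }
}

class SimpleConditionalTask extends SimpleTask {
  final List<SimpleTask> listTasks = new ArrayList<>();
  SimpleConditionalTask(String name) { super(name); }
  List<SimpleTask> getListTasks() { return listTasks; }
}

public class TaskTreeWalk {
  // Visit every task reachable through conditional branches and child links,
  // mirroring the recursion structure of setInputFormat above.
  static void visit(SimpleTask task, List<String> visited) {
    visited.add(task.name);
    if (task instanceof SimpleConditionalTask) {
      for (SimpleTask t : ((SimpleConditionalTask) task).getListTasks()) {
        visit(t, visited);
      }
    }
    for (SimpleTask child : task.getChildTasks()) {
      visit(child, visited);
    }
  }

  public static void main(String[] args) {
    SimpleConditionalTask root = new SimpleConditionalTask("root");
    SimpleTask branch = new SimpleTask("branch");
    root.getListTasks().add(branch);
    branch.getChildTasks().add(new SimpleTask("leaf"));
    List<String> visited = new ArrayList<>();
    visit(root, visited);
    System.out.println(visited); // [root, branch, leaf]
  }
}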
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
Class GenMapRedUtils, method createMRWorkForMergingFiles:
/**
 * Create a MapWork based on the input path, the top operator and the input
 * table descriptor.
 *
 * @param conf
 * @param topOp
 *          the table scan operator that is the root of the MapReduce task.
 * @param fsDesc
 *          the file sink descriptor that serves as the input to this merge task.
 * @return the MapWork for the merge task
 */
private static MapWork createMRWorkForMergingFiles(HiveConf conf, TableScanOperator topOp, FileSinkDesc fsDesc) {
  ArrayList<String> aliases = new ArrayList<String>();
  Path inputDir = StringInternUtils.internUriStringsInPath(fsDesc.getMergeInputDirName());
  String inputDirStr = inputDir.toString().intern();
  TableDesc tblDesc = fsDesc.getTableInfo();
  // dummy alias: just use the input path
  aliases.add(inputDirStr);
  // constructing the default MapredWork
  MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
  MapWork cplan = cMrPlan.getMapWork();
  cplan.addPathToAlias(inputDir, aliases);
  cplan.addPathToPartitionInfo(inputDir, new PartitionDesc(tblDesc, null));
  cplan.getAliasToWork().put(inputDirStr, topOp);
  cplan.setMapperCannotSpanPartns(true);
  return cplan;
}
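The method registers the merge input under three parallel mappings on the MapWork: path to aliases, path to partition descriptor, and alias to the root operator. The following rough, self-contained sketch of that bookkeeping uses plain maps and placeholder String/Object values instead of Hive's Path, PartitionDesc, and Operator types; the class and method names are invented for illustration.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical stand-in for the three lookup tables a MapWork maintains.
public class MergePlanSketch {
  final Map<String, List<String>> pathToAliases = new LinkedHashMap<>();
  final Map<String, String> pathToPartitionInfo = new LinkedHashMap<>();
  final Map<String, Object> aliasToWork = new LinkedHashMap<>();

  // Mirror of createMRWorkForMergingFiles: use the input directory itself
  // as a dummy alias and point it at the root operator of the merge task.
  void addMergeInput(String inputDir, String tableDescriptor, Object topOperator) {
    List<String> aliases = new ArrayList<>();
    aliases.add(inputDir); // dummy alias: just the input path
    pathToAliases.put(inputDir, aliases);
    pathToPartitionInfo.put(inputDir, tableDescriptor);
    aliasToWork.put(inputDir, topOperator);
  }

  public static void main(String[] args) {
    MergePlanSketch plan = new MergePlanSketch();
    plan.addMergeInput("/tmp/hive/merge-input", "orc-table-desc", new Object());
    System.out.println(plan.pathToAliases);
  }
}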
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
Class GenMapRedUtils, method setUnionPlan:
private static void setUnionPlan(GenMRProcContext opProcCtx, boolean local, Task<? extends Serializable> currTask, GenMRUnionCtx uCtx, boolean mergeTask) throws SemanticException {
  TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
  if (currTopOp != null) {
    // A pending top-level table scan feeds the union: add it to the current task's
    // plan (always for a merge task, otherwise only if not already added).
    String currAliasId = opProcCtx.getCurrAliasId();
    if (mergeTask || !opProcCtx.isSeenOp(currTask, currTopOp)) {
      setTaskPlan(currAliasId, currTopOp, currTask, local, opProcCtx);
    }
    currTopOp = null;
    opProcCtx.setCurrTopOp(currTopOp);
  } else {
    // No pending table scan: the union reads intermediate directories produced by
    // earlier tasks. Register each one as an input path of the map work.
    List<String> taskTmpDirLst = uCtx.getTaskTmpDir();
    if ((taskTmpDirLst != null) && !(taskTmpDirLst.isEmpty())) {
      List<TableDesc> tt_descLst = uCtx.getTTDesc();
      assert !taskTmpDirLst.isEmpty() && !tt_descLst.isEmpty();
      assert taskTmpDirLst.size() == tt_descLst.size();
      int size = taskTmpDirLst.size();
      assert local == false;
      List<TableScanOperator> topOperators = uCtx.getListTopOperators();
      MapredWork plan = (MapredWork) currTask.getWork();
      for (int pos = 0; pos < size; pos++) {
        String taskTmpDir = taskTmpDirLst.get(pos);
        Path taskTmpDirPath = new Path(taskTmpDir);
        MapWork mWork = plan.getMapWork();
        if (!mWork.getPathToAliases().containsKey(taskTmpDirPath)) {
          taskTmpDir = taskTmpDir.intern();
          StringInternUtils.internUriStringsInPath(taskTmpDirPath);
          TableDesc tt_desc = tt_descLst.get(pos);
          mWork.addPathToAlias(taskTmpDirPath, taskTmpDir);
          mWork.addPathToPartitionInfo(taskTmpDirPath, new PartitionDesc(tt_desc, null));
          mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos));
        }
      }
    }
  }
}
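The containsKey guard makes the registration idempotent: a temporary directory that is already an input of the map work is not added a second time, and the path string is interned so that many plan objects can share one copy. A small stand-alone sketch of the same guard-then-register idiom, with a plain map in place of the MapWork lookup tables and an invented class name, is:

import java.util.LinkedHashMap;
import java.util.Map;

public class IdempotentRegistration {
  final Map<String, String> pathToAlias = new LinkedHashMap<>();

  // Register a union input directory only if it has not been seen before,
  // interning the path string the way setUnionPlan does.
  void registerOnce(String taskTmpDir) {
    if (!pathToAlias.containsKey(taskTmpDir)) {
      String interned = taskTmpDir.intern();
      pathToAlias.put(interned, interned);
    }
  }

  public static void main(String[] args) {
    IdempotentRegistration reg = new IdempotentRegistration();
    reg.registerOnce("/tmp/hive/union-branch-0");
    reg.registerOnce("/tmp/hive/union-branch-0"); // ignored, already present
    System.out.println(reg.pathToAlias.size()); // 1
  }
}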
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
Class SparkPlanGenerator, method generateParentTran:
// Generate (possibly get from a cached result) parent SparkTran
private SparkTran generateParentTran(SparkPlan sparkPlan, SparkWork sparkWork, BaseWork work) throws Exception {
  if (cloneToWork.containsKey(work)) {
    BaseWork originalWork = cloneToWork.get(work);
    if (workToParentWorkTranMap.containsKey(originalWork)) {
      return workToParentWorkTranMap.get(originalWork);
    }
  }
  SparkTran result;
  if (work instanceof MapWork) {
    result = generateMapInput(sparkPlan, (MapWork) work);
    sparkPlan.addTran(result);
  } else if (work instanceof ReduceWork) {
    boolean toCache = cloneToWork.containsKey(work);
    List<BaseWork> parentWorks = sparkWork.getParents(work);
    SparkEdgeProperty sparkEdgeProperty = sparkWork.getEdgeProperty(parentWorks.get(0), work);
    result = generate(sparkPlan, sparkEdgeProperty, toCache, work.getName());
    sparkPlan.addTran(result);
    for (BaseWork parentWork : parentWorks) {
      sparkPlan.connect(workToTranMap.get(parentWork), result);
    }
  } else {
    throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, but found " + work.getClass().getName());
  }
  if (cloneToWork.containsKey(work)) {
    workToParentWorkTranMap.put(cloneToWork.get(work), result);
  }
  return result;
}
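This method is essentially a memoized factory keyed by the original work: when the given work is a clone, the SparkTran already built for its original is returned instead of being regenerated, and a freshly built result is remembered under the original. A generic sketch of that check-cache, build, then store pattern follows; the MemoizedFactory class and its type parameters are invented for this illustration, not part of Hive.

import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

public class MemoizedFactory<W, T> {
  // clone -> original, mirroring cloneToWork
  private final Map<W, W> cloneToOriginal = new HashMap<>();
  // original -> already-built result, mirroring workToParentWorkTranMap
  private final Map<W, T> builtForOriginal = new HashMap<>();

  public MemoizedFactory(Map<W, W> cloneToOriginal) {
    this.cloneToOriginal.putAll(cloneToOriginal);
  }

  // Return the cached result for a clone's original if present; otherwise
  // build a new one and remember it under the original key.
  public T getOrBuild(W work, Function<W, T> builder) {
    W original = cloneToOriginal.get(work);
    if (original != null && builtForOriginal.containsKey(original)) {
      return builtForOriginal.get(original);
    }
    T result = builder.apply(work);
    if (original != null) {
      builtForOriginal.put(original, result);
    }
    return result;
  }

  public static void main(String[] args) {
    Map<String, String> cloneMap = new HashMap<>();
    cloneMap.put("mapWorkClone", "mapWorkOriginal");
    MemoizedFactory<String, String> f = new MemoizedFactory<>(cloneMap);
    System.out.println(f.getOrBuild("mapWorkClone", w -> "tran-for-" + w));
    System.out.println(f.getOrBuild("mapWorkClone", w -> "rebuilt")); // returns the cached result
  }
}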
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
Class DagUtils, method createVertex:
private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
  Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
  if (mergeJoinWork.getMainWork() instanceof MapWork) {
    List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
    MapWork mapWork = (MapWork) (mergeJoinWork.getMainWork());
    Vertex mergeVx = createVertex(conf, mapWork, fs, mrScratchDir, ctx, vertexType, localResources);
    conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
    // mapreduce.tez.input.initializer.serialize.event.payload should be set
    // to false when using this plug-in to avoid getting a serialized event at run-time.
    conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    for (int i = 0; i < mapWorkList.size(); i++) {
      mapWork = (MapWork) (mapWorkList.get(i));
      conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mapWork.getName());
      conf.set(Utilities.INPUT_NAME, mapWork.getName());
      LOG.info("Going through each work and adding MultiMRInput");
      mergeVx.addDataSource(mapWork.getName(), MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
    }
    // To be populated for SMB joins only for all the small tables
    Map<String, Integer> inputToBucketMap = new HashMap<>();
    if (mergeJoinWork.getMergeJoinOperator().getParentOperators().size() == 1 && mergeJoinWork.getMergeJoinOperator().getOpTraits() != null) {
      // This is an SMB join.
      for (BaseWork work : mapWorkList) {
        MapWork mw = (MapWork) work;
        Map<String, Operator<?>> aliasToWork = mw.getAliasToWork();
        Preconditions.checkState(aliasToWork.size() == 1, "More than 1 alias in SMB mapwork");
        inputToBucketMap.put(mw.getName(), mw.getWorks().get(0).getOpTraits().getNumBuckets());
      }
    }
    VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
    // the +1 to the size is because of the main work.
    CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1, inputToBucketMap);
    DataOutputBuffer dob = new DataOutputBuffer();
    vertexConf.write(dob);
    byte[] userPayload = dob.getData();
    desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
    mergeVx.setVertexManagerPlugin(desc);
    return mergeVx;
  } else {
    return createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), fs, mrScratchDir, ctx, localResources);
  }
}
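The payload handoff at the end follows a common Hadoop/Tez idiom: serialize the configuration through its own write(DataOutput) method into a byte buffer, then wrap the bytes as the vertex manager plugin's user payload. Below is a self-contained approximation using only JDK streams, with ByteArrayOutputStream and DataOutputStream in place of Hadoop's DataOutputBuffer and a plain ByteBuffer in place of a Tez UserPayload; the VertexConfigSketch class and its fields are invented for the example.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;

// Hypothetical configuration object with a Writable-style write method.
class VertexConfigSketch {
  final int numBuckets;
  final String bigTableAlias;

  VertexConfigSketch(int numBuckets, String bigTableAlias) {
    this.numBuckets = numBuckets;
    this.bigTableAlias = bigTableAlias;
  }

  void write(DataOutput out) throws IOException {
    out.writeInt(numBuckets);
    out.writeUTF(bigTableAlias);
  }
}

public class PayloadSketch {
  public static void main(String[] args) throws IOException {
    VertexConfigSketch conf = new VertexConfigSketch(4, "bigTable");

    // Serialize the configuration into a byte array ...
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    conf.write(new DataOutputStream(bytes));

    // ... and wrap it, as createVertex does with UserPayload.create(ByteBuffer.wrap(...)).
    ByteBuffer payload = ByteBuffer.wrap(bytes.toByteArray());

    // Round-trip to show the receiving side can read the same fields back.
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(payload.array()));
    System.out.println(in.readInt() + " " + in.readUTF()); // 4 bigTable
  }
}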