Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class SparkMapRecordHandler, method init.
@Override
public <K, V> void init(JobConf job, OutputCollector<K, V> output, Reporter reporter) throws Exception {
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
  super.init(job, output, reporter);
  try {
    jc = job;
    execContext = new ExecMapperContext(jc);
    // create map and fetch operators
    MapWork mrwork = Utilities.getMapWork(job);
    CompilationOpContext runtimeCtx = new CompilationOpContext();
    if (mrwork.getVectorMode()) {
      mo = new VectorMapOperator(runtimeCtx);
    } else {
      mo = new MapOperator(runtimeCtx);
    }
    mo.setConf(mrwork);
    // initialize map operator
    mo.initialize(jc, null);
    mo.setChildren(job);
    LOG.info(mo.dump(0));
    // initialize map local work
    localWork = mrwork.getMapRedLocalWork();
    execContext.setLocalWork(localWork);
    MapredContext.init(true, new JobConf(jc));
    MapredContext.get().setReporter(reporter);
    mo.passExecContext(execContext);
    mo.initializeLocalWork(jc);
    mo.initializeMapOperator(jc);
    mo.setReporter(rp);
    if (localWork == null) {
      return;
    }
    // The following code is for mapjoin
    // initialize all the dummy ops
    LOG.info("Initializing dummy operator");
    List<Operator<? extends OperatorDesc>> dummyOps = localWork.getDummyParentOp();
    for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
      dummyOp.setExecContext(execContext);
      dummyOp.initialize(jc, null);
    }
  } catch (Throwable e) {
    abort = true;
    if (e instanceof OutOfMemoryError) {
      // Don't create a new object if we are already out of memory
      throw (OutOfMemoryError) e;
    } else {
      throw new RuntimeException("Map operator initialization failed: " + e, e);
    }
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
}
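This handler relies on the MapWork plan having been serialized into the JobConf by the driver (see cloneJobConf further below). A minimal sketch of that round trip, using a placeholder work name and scratch directory and only the Utilities calls that appear in these examples:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.JobConf;

public class MapWorkRoundTripSketch {
  // Sketch only: the scratch directory and work name are placeholders; the
  // calls mirror Utilities.setMapWork(...) in cloneJobConf below and
  // Utilities.getMapWork(...) in init() above.
  static MapWork roundTrip() throws Exception {
    JobConf job = new JobConf();
    Path scratchDir = new Path("/tmp/hive-scratch"); // placeholder scratch dir
    MapWork mapWork = new MapWork();
    mapWork.setName("Map 1");
    // driver side: serialize the plan into the conf
    Utilities.setMapWork(job, mapWork, scratchDir, false);
    // executor side: read the plan back from the conf
    return Utilities.getMapWork(job);
  }
}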
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class SparkPlanGenerator, method generateMapInput.
@SuppressWarnings("unchecked")
private MapInput generateMapInput(SparkPlan sparkPlan, MapWork mapWork) throws Exception {
  JobConf jobConf = cloneJobConf(mapWork);
  Class ifClass = getInputFormat(jobConf, mapWork);
  sc.sc().setCallSite(CallSite.apply(mapWork.getName(), ""));
  JavaPairRDD<WritableComparable, Writable> hadoopRDD;
  if (mapWork.getNumMapTasks() != null) {
    jobConf.setNumMapTasks(mapWork.getNumMapTasks());
    hadoopRDD = sc.hadoopRDD(jobConf, ifClass, WritableComparable.class, Writable.class,
        mapWork.getNumMapTasks());
  } else {
    hadoopRDD = sc.hadoopRDD(jobConf, ifClass, WritableComparable.class, Writable.class);
  }
  boolean toCache = false;
  String tables = mapWork.getAllRootOperators().stream()
      .filter(op -> op instanceof TableScanOperator)
      .map(ts -> ((TableScanDesc) ts.getConf()).getAlias())
      .collect(Collectors.joining(", "));
  String rddName = mapWork.getName() + " (" + tables + ", " + hadoopRDD.getNumPartitions()
      + (toCache ? ", cached)" : ")");
  // Caching is disabled for MapInput due to HIVE-8920
  MapInput result = new MapInput(sparkPlan, hadoopRDD, toCache, rddName);
  return result;
}
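When getNumMapTasks() is non-null, generateMapInput forwards it both to JobConf.setNumMapTasks and as the minPartitions hint of hadoopRDD; otherwise Spark falls back to the input format's own split count. A small sketch, assuming MapWork exposes a setNumMapTasks setter that mirrors the getter used above:

import org.apache.hadoop.hive.ql.plan.MapWork;

public class MapParallelismSketch {
  // Sketch only: setNumMapTasks(...) is assumed to mirror the
  // getNumMapTasks() getter that generateMapInput reads above.
  static void hintMapParallelism(MapWork mapWork, int numTasks) {
    // generateMapInput then passes this value to JobConf.setNumMapTasks(...)
    // and as the minPartitions argument of JavaSparkContext.hadoopRDD(...).
    mapWork.setNumMapTasks(numTasks);
  }
}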
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class SparkPlanGenerator, method cloneJobConf.
@SuppressWarnings({ "unchecked" })
private JobConf cloneJobConf(BaseWork work) throws Exception {
  if (workToJobConf.containsKey(work)) {
    return workToJobConf.get(work);
  }
  JobConf cloned = new JobConf(jobConf);
  // Make sure we'll use a different plan path from the original one
  HiveConf.setVar(cloned, HiveConf.ConfVars.PLAN, "");
  try {
    cloned.setPartitionerClass(JavaUtils.loadClass(HiveConf.getVar(cloned, HiveConf.ConfVars.HIVEPARTITIONER)));
  } catch (ClassNotFoundException e) {
    String msg = "Could not find partitioner class: " + e.getMessage()
        + " which is specified by: " + HiveConf.ConfVars.HIVEPARTITIONER.varname;
    throw new IllegalArgumentException(msg, e);
  }
  if (work instanceof MapWork) {
    MapWork mapWork = (MapWork) work;
    cloned.setBoolean("mapred.task.is.map", true);
    List<Path> inputPaths = Utilities.getInputPaths(cloned, mapWork, scratchDir, context, false);
    Utilities.setInputPaths(cloned, inputPaths);
    Utilities.setMapWork(cloned, mapWork, scratchDir, false);
    Utilities.createTmpDirs(cloned, mapWork);
    if (work instanceof MergeFileWork) {
      MergeFileWork mergeFileWork = (MergeFileWork) work;
      cloned.set(Utilities.MAPRED_MAPPER_CLASS, MergeFileMapper.class.getName());
      cloned.set("mapred.input.format.class", mergeFileWork.getInputformat());
      cloned.setClass("mapred.output.format.class", MergeFileOutputFormat.class, FileOutputFormat.class);
    } else {
      cloned.set(Utilities.MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
    }
    if (mapWork.getMaxSplitSize() != null) {
      HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mapWork.getMaxSplitSize());
    }
    if (mapWork.getMinSplitSize() != null) {
      HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mapWork.getMinSplitSize());
    }
    if (mapWork.getMinSplitSizePerNode() != null) {
      HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE, mapWork.getMinSplitSizePerNode());
    }
    if (mapWork.getMinSplitSizePerRack() != null) {
      HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK, mapWork.getMinSplitSizePerRack());
    }
    // remember the JobConf cloned for each MapWork, so we don't clone it again
    workToJobConf.put(work, cloned);
  } else if (work instanceof ReduceWork) {
    cloned.setBoolean("mapred.task.is.map", false);
    Utilities.setReduceWork(cloned, (ReduceWork) work, scratchDir, false);
    Utilities.createTmpDirs(cloned, (ReduceWork) work);
    cloned.set(Utilities.MAPRED_REDUCER_CLASS, ExecReducer.class.getName());
  }
  return cloned;
}
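Note that only the MapWork branch stores its clone in workToJobConf, so a MapWork is cloned at most once per plan while a ReduceWork is re-cloned on every call. The split-size conf vars forwarded above are driven by knobs on the MapWork itself; a sketch, assuming the MapWork setters mirror the getters that cloneJobConf reads:

import org.apache.hadoop.hive.ql.plan.MapWork;

public class SplitSizeSketch {
  // Sketch only: the set* methods are assumed to mirror the get* methods
  // read in cloneJobConf above; the values are placeholder byte counts.
  static void configureSplits(MapWork mapWork) {
    mapWork.setMaxSplitSize(256L * 1024 * 1024);        // -> MAPREDMAXSPLITSIZE
    mapWork.setMinSplitSize(16L * 1024 * 1024);         // -> MAPREDMINSPLITSIZE
    mapWork.setMinSplitSizePerNode(16L * 1024 * 1024);  // -> MAPREDMINSPLITSIZEPERNODE
    mapWork.setMinSplitSizePerRack(16L * 1024 * 1024);  // -> MAPREDMINSPLITSIZEPERRACK
  }
}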
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class SparkPlanGenerator, method generate.
private SparkTran generate(BaseWork work, SparkWork sparkWork) throws Exception {
  initStatsPublisher(work);
  JobConf newJobConf = cloneJobConf(work);
  checkSpecs(work, newJobConf);
  byte[] confBytes = KryoSerializer.serializeJobConf(newJobConf);
  boolean caching = isCachingWork(work, sparkWork);
  if (work instanceof MapWork) {
    // Create tmp dir for MergeFileWork
    if (work instanceof MergeFileWork) {
      Path outputPath = ((MergeFileWork) work).getOutputDir();
      Path tempOutPath = Utilities.toTempPath(outputPath);
      FileSystem fs = outputPath.getFileSystem(jobConf);
      try {
        if (!fs.exists(tempOutPath)) {
          fs.mkdirs(tempOutPath);
        }
      } catch (IOException e) {
        throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage());
      }
    }
    MapTran mapTran = new MapTran(caching, work.getName());
    HiveMapFunction mapFunc = new HiveMapFunction(confBytes, sparkReporter);
    mapTran.setMapFunction(mapFunc);
    return mapTran;
  } else if (work instanceof ReduceWork) {
    ReduceTran reduceTran = new ReduceTran(caching, work.getName());
    HiveReduceFunction reduceFunc = new HiveReduceFunction(confBytes, sparkReporter);
    reduceTran.setReduceFunction(reduceFunc);
    return reduceTran;
  } else {
    throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, "
        + "but found " + work.getClass().getName());
  }
}
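The serialized JobConf bytes travel with the Hive map/reduce functions to the executors, where the conf is rebuilt before the record handlers run. A minimal sketch of that round trip, assuming KryoSerializer.deserializeJobConf is the counterpart of the serializeJobConf call used above:

import org.apache.hadoop.hive.ql.exec.spark.KryoSerializer;
import org.apache.hadoop.mapred.JobConf;

public class ConfRoundTripSketch {
  // Sketch only: deserializeJobConf is assumed to be the counterpart of the
  // serializeJobConf call in generate() above; on the executor side the
  // Hive*Function implementations rebuild the JobConf from these bytes.
  static JobConf roundTrip(JobConf newJobConf) {
    byte[] confBytes = KryoSerializer.serializeJobConf(newJobConf);
    return KryoSerializer.deserializeJobConf(confBytes);
  }
}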
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class DagUtils, method createVertex.
/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param work The instance of BaseWork representing the actual work to be performed
 *             by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param fileSystem FS corresponding to scratchDir and LocalResources
 * @param ctx This query's context
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, FileSystem fileSystem,
    Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType,
    Map<String, LocalResource> localResources) throws Exception {
  Vertex v = null;
  // dispatch to the createVertex overload that matches the concrete BaseWork subtype
  if (work instanceof MapWork) {
    v = createVertex(conf, (MapWork) work, fileSystem, scratchDir, ctx, vertexType, localResources);
  } else if (work instanceof ReduceWork) {
    v = createVertex(conf, (ReduceWork) work, fileSystem, scratchDir, ctx, localResources);
  } else if (work instanceof MergeJoinWork) {
    v = createVertex(conf, (MergeJoinWork) work, fileSystem, scratchDir, ctx, vertexType, localResources);
    // set VertexManagerPlugin if this is a cross product destination vertex
    List<String> crossProductSources = new ArrayList<>();
    for (BaseWork parentWork : tezWork.getParents(work)) {
      if (tezWork.getEdgeType(parentWork, work) == EdgeType.XPROD_EDGE) {
        crossProductSources.add(parentWork.getName());
      }
    }
    if (!crossProductSources.isEmpty()) {
      CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
      v.setVertexManagerPlugin(VertexManagerPluginDescriptor.create(
          CartesianProductVertexManager.class.getName())
          .setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
      // parallelism shouldn't be set for cartesian product vertex
    }
  } else {
    // something is seriously wrong if this is happening
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }
  // initialize stats publisher if necessary
  if (work.isGatheringStats()) {
    StatsPublisher statsPublisher;
    StatsFactory factory = StatsFactory.newFactory(conf);
    if (factory != null) {
      StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
      sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
      statsPublisher = factory.getStatsPublisher();
      if (!statsPublisher.init(sCntxt)) {
        // the publisher creates the stats table if it does not exist
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
          throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
        }
      }
    }
  }
  // final vertices need to have at least one output
  if (!hasChildren) {
    v.addDataSink("out_" + work.getName(), new DataSinkDescriptor(
        OutputDescriptor.create(MROutput.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(conf)), null, null));
  }
  return v;
}
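For context, a sketch of how a caller might drive this method for one BaseWork node and add the result to a Tez DAG. The dagUtils instance, conf, scratchDir, fs, ctx, tezWork and localResources are assumed to be prepared elsewhere (as in TezTask), and AUTO_INITIALIZED_EDGES is used only as a placeholder vertex type:

import java.util.Map;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.tez.DagUtils;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.TezWork;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.Vertex;

public class CreateVertexSketch {
  // Sketch only: illustrates the call shape of createVertex as declared above;
  // all inputs are assumed to be prepared by the caller, and the vertex type
  // is a placeholder choice.
  static void addVertexFor(DagUtils dagUtils, DAG dag, JobConf conf, BaseWork work,
      Path scratchDir, FileSystem fs, Context ctx, TezWork tezWork,
      Map<String, LocalResource> localResources) throws Exception {
    // a work node with no children becomes a final vertex and gets an MROutput sink
    boolean hasChildren = !tezWork.getChildren(work).isEmpty();
    Vertex v = dagUtils.createVertex(conf, work, scratchDir, fs, ctx,
        hasChildren, tezWork, TezWork.VertexType.AUTO_INITIALIZED_EDGES, localResources);
    dag.addVertex(v);
  }
}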