
Example 6 with VertexExecutionContext

Use of org.apache.tez.dag.api.Vertex.VertexExecutionContext in project tez by apache.

From class DAG, method createDag:

// create protobuf message describing DAG
@Private
public synchronized DAGPlan createDag(Configuration tezConf, Credentials extraCredentials, Map<String, LocalResource> tezJarResources, LocalResource binaryConfig, boolean tezLrsAsArchive, ServicePluginsDescriptor servicePluginsDescriptor, JavaOptsChecker javaOptsChecker) {
    Deque<String> topologicalVertexStack = verify(true);
    verifyLocalResources(tezConf);
    DAGPlan.Builder dagBuilder = DAGPlan.newBuilder();
    dagBuilder.setName(this.name);
    if (this.callerContext != null) {
        dagBuilder.setCallerContext(DagTypeConverters.convertCallerContextToProto(callerContext));
    }
    if (this.dagInfo != null && !this.dagInfo.isEmpty()) {
        dagBuilder.setDagInfo(this.dagInfo);
    }
    // Setup default execution context.
    VertexExecutionContext defaultContext = getDefaultExecutionContext();
    verifyExecutionContext(defaultContext, servicePluginsDescriptor, "DAGDefault");
    if (defaultContext != null) {
        DAGProtos.VertexExecutionContextProto contextProto = DagTypeConverters.convertToProto(defaultContext);
        dagBuilder.setDefaultExecutionContext(contextProto);
    }
    if (!vertexGroups.isEmpty()) {
        for (VertexGroup av : vertexGroups) {
            GroupInfo groupInfo = av.getGroupInfo();
            PlanVertexGroupInfo.Builder groupBuilder = PlanVertexGroupInfo.newBuilder();
            groupBuilder.setGroupName(groupInfo.getGroupName());
            for (Vertex v : groupInfo.getMembers()) {
                groupBuilder.addGroupMembers(v.getName());
            }
            groupBuilder.addAllOutputs(groupInfo.outputs);
            for (Map.Entry<String, InputDescriptor> entry : groupInfo.edgeMergedInputs.entrySet()) {
                groupBuilder.addEdgeMergedInputs(PlanGroupInputEdgeInfo.newBuilder().setDestVertexName(entry.getKey()).setMergedInput(DagTypeConverters.convertToDAGPlan(entry.getValue())));
            }
            dagBuilder.addVertexGroups(groupBuilder);
        }
    }
    Credentials dagCredentials = new Credentials();
    if (extraCredentials != null) {
        dagCredentials.mergeAll(extraCredentials);
    }
    dagCredentials.mergeAll(credentials);
    if (!commonTaskLocalFiles.isEmpty()) {
        dagBuilder.addAllLocalResource(DagTypeConverters.convertToDAGPlan(commonTaskLocalFiles));
    }
    Preconditions.checkArgument(topologicalVertexStack.size() == vertices.size(), "size of topologicalVertexStack is:" + topologicalVertexStack.size() + " while size of vertices is:" + vertices.size() + ", make sure they are the same in order to sort the vertices");
    while (!topologicalVertexStack.isEmpty()) {
        Vertex vertex = vertices.get(topologicalVertexStack.pop());
        // infer credentials, resources and parallelism from data source
        Resource vertexTaskResource = vertex.getTaskResource();
        if (vertexTaskResource == null) {
            vertexTaskResource = Resource.newInstance(tezConf.getInt(TezConfiguration.TEZ_TASK_RESOURCE_MEMORY_MB, TezConfiguration.TEZ_TASK_RESOURCE_MEMORY_MB_DEFAULT), tezConf.getInt(TezConfiguration.TEZ_TASK_RESOURCE_CPU_VCORES, TezConfiguration.TEZ_TASK_RESOURCE_CPU_VCORES_DEFAULT));
        }
        Map<String, LocalResource> vertexLRs = Maps.newHashMap();
        vertexLRs.putAll(vertex.getTaskLocalFiles());
        List<DataSourceDescriptor> dataSources = vertex.getDataSources();
        for (DataSourceDescriptor dataSource : dataSources) {
            if (dataSource.getCredentials() != null) {
                dagCredentials.addAll(dataSource.getCredentials());
            }
            if (dataSource.getAdditionalLocalFiles() != null) {
                TezCommonUtils.addAdditionalLocalResources(dataSource.getAdditionalLocalFiles(), vertexLRs, "Vertex " + vertex.getName());
            }
        }
        if (tezJarResources != null) {
            TezCommonUtils.addAdditionalLocalResources(tezJarResources, vertexLRs, "Vertex " + vertex.getName());
        }
        if (binaryConfig != null) {
            vertexLRs.put(TezConstants.TEZ_PB_BINARY_CONF_NAME, binaryConfig);
        }
        int vertexParallelism = vertex.getParallelism();
        VertexLocationHint vertexLocationHint = vertex.getLocationHint();
        if (dataSources.size() == 1) {
            DataSourceDescriptor dataSource = dataSources.get(0);
            if (vertexParallelism == -1 && dataSource.getNumberOfShards() > -1) {
                vertexParallelism = dataSource.getNumberOfShards();
            }
            if (vertexLocationHint == null && dataSource.getLocationHint() != null) {
                vertexLocationHint = dataSource.getLocationHint();
            }
        }
        if (vertexParallelism == -1) {
            Preconditions.checkState(vertexLocationHint == null, "Cannot specify vertex location hint without specifying vertex parallelism. Vertex: " + vertex.getName());
        } else if (vertexLocationHint != null) {
            Preconditions.checkState(vertexParallelism == vertexLocationHint.getTaskLocationHints().size(), "vertex task location hint must equal vertex parallelism. Vertex: " + vertex.getName());
        }
        for (DataSinkDescriptor dataSink : vertex.getDataSinks()) {
            if (dataSink.getCredentials() != null) {
                dagCredentials.addAll(dataSink.getCredentials());
            }
        }
        VertexPlan.Builder vertexBuilder = VertexPlan.newBuilder();
        vertexBuilder.setName(vertex.getName());
        // vertex type is implicitly NORMAL until TEZ-46.
        vertexBuilder.setType(PlanVertexType.NORMAL);
        vertexBuilder.setProcessorDescriptor(DagTypeConverters.convertToDAGPlan(vertex.getProcessorDescriptor()));
        // Vertex ExecutionContext setup
        VertexExecutionContext execContext = vertex.getVertexExecutionContext();
        verifyExecutionContext(execContext, servicePluginsDescriptor, vertex.getName());
        if (execContext != null) {
            DAGProtos.VertexExecutionContextProto contextProto = DagTypeConverters.convertToProto(execContext);
            vertexBuilder.setExecutionContext(contextProto);
        }
        if (vertex.getInputs().size() > 0) {
            for (RootInputLeafOutput<InputDescriptor, InputInitializerDescriptor> input : vertex.getInputs()) {
                vertexBuilder.addInputs(DagTypeConverters.convertToDAGPlan(input));
            }
        }
        if (vertex.getOutputs().size() > 0) {
            for (RootInputLeafOutput<OutputDescriptor, OutputCommitterDescriptor> output : vertex.getOutputs()) {
                vertexBuilder.addOutputs(DagTypeConverters.convertToDAGPlan(output));
            }
        }
        if (vertex.getConf() != null && vertex.getConf().size() > 0) {
            ConfigurationProto.Builder confBuilder = ConfigurationProto.newBuilder();
            TezUtils.populateConfProtoFromEntries(vertex.getConf().entrySet(), confBuilder);
            vertexBuilder.setVertexConf(confBuilder);
        }
        // task config
        PlanTaskConfiguration.Builder taskConfigBuilder = PlanTaskConfiguration.newBuilder();
        taskConfigBuilder.setNumTasks(vertexParallelism);
        taskConfigBuilder.setMemoryMb(vertexTaskResource.getMemory());
        taskConfigBuilder.setVirtualCores(vertexTaskResource.getVirtualCores());
        try {
            taskConfigBuilder.setJavaOpts(TezClientUtils.addDefaultsToTaskLaunchCmdOpts(vertex.getTaskLaunchCmdOpts(), tezConf, javaOptsChecker));
        } catch (TezException e) {
            throw new TezUncheckedException("Invalid TaskLaunchCmdOpts defined for Vertex " + vertex.getName() + " : " + e.getMessage(), e);
        }
        taskConfigBuilder.setTaskModule(vertex.getName());
        if (!vertexLRs.isEmpty()) {
            taskConfigBuilder.addAllLocalResource(DagTypeConverters.convertToDAGPlan(vertexLRs));
        }
        Map<String, String> taskEnv = Maps.newHashMap(vertex.getTaskEnvironment());
        TezYARNUtils.setupDefaultEnv(taskEnv, tezConf, TezConfiguration.TEZ_TASK_LAUNCH_ENV, TezConfiguration.TEZ_TASK_LAUNCH_ENV_DEFAULT, TezConfiguration.TEZ_TASK_LAUNCH_CLUSTER_DEFAULT_ENV, TezConfiguration.TEZ_TASK_LAUNCH_CLUSTER_DEFAULT_ENV_DEFAULT, tezLrsAsArchive);
        for (Map.Entry<String, String> entry : taskEnv.entrySet()) {
            PlanKeyValuePair.Builder envSettingBuilder = PlanKeyValuePair.newBuilder();
            envSettingBuilder.setKey(entry.getKey());
            envSettingBuilder.setValue(entry.getValue());
            taskConfigBuilder.addEnvironmentSetting(envSettingBuilder);
        }
        if (vertexLocationHint != null) {
            if (vertexLocationHint.getTaskLocationHints() != null) {
                for (TaskLocationHint hint : vertexLocationHint.getTaskLocationHints()) {
                    PlanTaskLocationHint.Builder taskLocationHintBuilder = PlanTaskLocationHint.newBuilder();
                    // we can allow this later on if needed
                    if (hint.getAffinitizedTask() != null) {
                        throw new TezUncheckedException("Task based affinity may not be specified via the DAG API");
                    }
                    if (hint.getHosts() != null) {
                        taskLocationHintBuilder.addAllHost(hint.getHosts());
                    }
                    if (hint.getRacks() != null) {
                        taskLocationHintBuilder.addAllRack(hint.getRacks());
                    }
                    vertexBuilder.addTaskLocationHint(taskLocationHintBuilder);
                }
            }
        }
        if (vertex.getVertexManagerPlugin() != null) {
            vertexBuilder.setVertexManagerPlugin(DagTypeConverters.convertToDAGPlan(vertex.getVertexManagerPlugin()));
        }
        for (Edge inEdge : vertex.getInputEdges()) {
            vertexBuilder.addInEdgeId(inEdge.getId());
        }
        for (Edge outEdge : vertex.getOutputEdges()) {
            vertexBuilder.addOutEdgeId(outEdge.getId());
        }
        vertexBuilder.setTaskConfig(taskConfigBuilder);
        dagBuilder.addVertex(vertexBuilder);
    }
    for (Edge edge : edges) {
        EdgePlan.Builder edgeBuilder = EdgePlan.newBuilder();
        edgeBuilder.setId(edge.getId());
        edgeBuilder.setInputVertexName(edge.getInputVertex().getName());
        edgeBuilder.setOutputVertexName(edge.getOutputVertex().getName());
        edgeBuilder.setDataMovementType(DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getDataMovementType()));
        edgeBuilder.setDataSourceType(DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getDataSourceType()));
        edgeBuilder.setSchedulingType(DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getSchedulingType()));
        edgeBuilder.setEdgeSource(DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getEdgeSource()));
        edgeBuilder.setEdgeDestination(DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getEdgeDestination()));
        if (edge.getEdgeProperty().getDataMovementType() == DataMovementType.CUSTOM) {
            if (edge.getEdgeProperty().getEdgeManagerDescriptor() != null) {
                edgeBuilder.setEdgeManager(DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getEdgeManagerDescriptor()));
            }
        // else the AM will deal with this.
        }
        dagBuilder.addEdge(edgeBuilder);
    }
    if (dagAccessControls != null) {
        dagBuilder.setAclInfo(DagTypeConverters.convertDAGAccessControlsToProto(dagAccessControls));
    }
    ConfigurationProto.Builder confProtoBuilder = ConfigurationProto.newBuilder();
    if (!this.dagConf.isEmpty()) {
        TezUtils.populateConfProtoFromEntries(this.dagConf.entrySet(), confProtoBuilder);
    }
    // Copy historyLogLevel from tezConf into dagConf if it's not overridden in dagConf.
    String logLevel = this.dagConf.get(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL);
    if (logLevel != null) {
        // Ensure the dagConf value is valid.
        if (!HistoryLogLevel.validateLogLevel(logLevel)) {
            throw new IllegalArgumentException("Config: " + TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL + " is set to invalid value: " + logLevel);
        }
    } else {
        // Validate and set value from tezConf.
        logLevel = tezConf.get(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL);
        if (logLevel != null) {
            if (!HistoryLogLevel.validateLogLevel(logLevel)) {
                throw new IllegalArgumentException("Config: " + TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL + " is set to invalid value: " + logLevel);
            }
            PlanKeyValuePair.Builder kvp = PlanKeyValuePair.newBuilder();
            kvp.setKey(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL);
            kvp.setValue(logLevel);
            confProtoBuilder.addConfKeyValues(kvp);
        }
    }
    dagBuilder.setDagConf(confProtoBuilder);
    if (dagCredentials != null) {
        dagBuilder.setCredentialsBinary(DagTypeConverters.convertCredentialsToProto(dagCredentials));
        TezCommonUtils.logCredentials(LOG, dagCredentials, "dag");
    }
    return dagBuilder.build();
}
Also used:
- PlanTaskLocationHint (org.apache.tez.dag.api.records.DAGProtos.PlanTaskLocationHint)
- PlanTaskConfiguration (org.apache.tez.dag.api.records.DAGProtos.PlanTaskConfiguration)
- VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext)
- ConfigurationProto (org.apache.tez.dag.api.records.DAGProtos.ConfigurationProto)
- DAGPlan (org.apache.tez.dag.api.records.DAGProtos.DAGPlan)
- VertexPlan (org.apache.tez.dag.api.records.DAGProtos.VertexPlan)
- PlanVertexGroupInfo (org.apache.tez.dag.api.records.DAGProtos.PlanVertexGroupInfo)
- GroupInfo (org.apache.tez.dag.api.VertexGroup.GroupInfo)
- DAGProtos (org.apache.tez.dag.api.records.DAGProtos)
- Resource (org.apache.hadoop.yarn.api.records.Resource)
- LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)
- PlanKeyValuePair (org.apache.tez.dag.api.records.DAGProtos.PlanKeyValuePair)
- Map (java.util.Map)
- HashMap (java.util.HashMap)
- BidiMap (org.apache.commons.collections4.BidiMap)
- DualLinkedHashBidiMap (org.apache.commons.collections4.bidimap.DualLinkedHashBidiMap)
- Credentials (org.apache.hadoop.security.Credentials)
- EdgePlan (org.apache.tez.dag.api.records.DAGProtos.EdgePlan)
- Private (org.apache.hadoop.classification.InterfaceAudience.Private)
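
Note that createDag is marked @Private: applications never call it directly, but drive it through the public DAG and Vertex builders, and TezClient invokes it at submission time to produce the DAGPlan protobuf. A minimal sketch of that public-API side, assuming illustrative service names ("myScheduler", "myLauncher", "myTaskComm") and a placeholder processor class:

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.Vertex.VertexExecutionContext;

public class DagWithDefaultContext {

    public static DAG build() {
        // Default execution context, applied to every vertex that does not set its own.
        // The service names are placeholders for plugins registered at client setup.
        VertexExecutionContext defaultCtx =
                VertexExecutionContext.create("myScheduler", "myLauncher", "myTaskComm");
        DAG dag = DAG.create("example-dag");
        dag.setExecutionContext(defaultCtx);
        // "com.example.MyProcessor" is a placeholder processor class name.
        Vertex v1 = Vertex.create("v1",
                ProcessorDescriptor.create("com.example.MyProcessor"),
                10, Resource.newInstance(1024, 1));
        dag.addVertex(v1);
        // TezClient.submitDAG(dag) later triggers createDag() to build the DAGPlan proto.
        return dag;
    }
}

The three service names would have to match plugins registered through the ServicePluginsDescriptor handed to the client; otherwise the verifyExecutionContext checks in createDag reject the context.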

Example 7 with VertexExecutionContext

Use of org.apache.tez.dag.api.Vertex.VertexExecutionContext in project hive by apache.

From class DagUtils, method createVertex:

/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param workUnit The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork workUnit, Path scratchDir, TezWork tezWork, Map<String, LocalResource> localResources) throws Exception {
    Vertex vertex;
    // simply dispatch the call to the right method for the actual (sub-) type of
    // BaseWork.
    VertexType vertexType = tezWork.getVertexType(workUnit);
    if (workUnit instanceof MapWork) {
        vertex = createVertexFromMapWork(conf, (MapWork) workUnit, scratchDir, vertexType);
    } else if (workUnit instanceof ReduceWork) {
        vertex = createVertexFromReduceWork(conf, (ReduceWork) workUnit, scratchDir);
    } else if (workUnit instanceof MergeJoinWork) {
        vertex = createVertexFromMergeWork(conf, (MergeJoinWork) workUnit, scratchDir, vertexType);
        // set VertexManagerPlugin if this is a cross product destination vertex
        List<String> crossProductSources = new ArrayList<>();
        for (BaseWork parentWork : tezWork.getParents(workUnit)) {
            if (tezWork.getEdgeType(parentWork, workUnit) == EdgeType.XPROD_EDGE) {
                crossProductSources.add(parentWork.getName());
            }
        }
        if (!crossProductSources.isEmpty()) {
            CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
            vertex.setVertexManagerPlugin(VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
        // parallelism shouldn't be set for cartesian product vertex
        }
    } else {
        // something is seriously wrong if this is happening
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }
    VertexExecutionContext vertexExecutionContext = createVertexExecutionContext(workUnit);
    vertex.addTaskLocalFiles(localResources);
    vertex.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    vertex.setExecutionContext(vertexExecutionContext);
    // initialize stats publisher if necessary
    if (workUnit.isGatheringStats()) {
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory != null) {
            StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
            sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
            statsPublisher = factory.getStatsPublisher();
            if (!statsPublisher.init(sCntxt)) {
                // init creates the stats table if it does not exist; a failure
                // is fatal only when reliable stats are required
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                }
            }
        }
    }
    final Class outputKlass;
    if (HiveOutputFormatImpl.class.getName().equals(conf.get("mapred.output.format.class"))) {
        // Hive uses this output format when it writes all of its data through the FileSink operator
        outputKlass = NullMROutput.class;
    } else {
        outputKlass = MROutput.class;
    }
    // If there is a fileSink add a DataSink to the vertex
    boolean hasFileSink = workUnit.getAllOperators().stream().anyMatch(o -> o instanceof FileSinkOperator);
    // final vertices need to have at least one output
    boolean endVertex = tezWork.getLeaves().contains(workUnit);
    if (endVertex || hasFileSink) {
        OutputCommitterDescriptor ocd = null;
        String committer = HiveConf.getVar(conf, ConfVars.TEZ_MAPREDUCE_OUTPUT_COMMITTER);
        if (committer != null && !committer.isEmpty()) {
            ocd = OutputCommitterDescriptor.create(committer);
        }
        vertex.addDataSink("out_" + workUnit.getName(), new DataSinkDescriptor(OutputDescriptor.create(outputKlass.getName()).setUserPayload(vertex.getProcessorDescriptor().getUserPayload()), ocd, null));
    }
    return vertex;
}
Also used:
- StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext)
- Vertex (org.apache.tez.dag.api.Vertex)
- PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex)
- MergeJoinWork (org.apache.hadoop.hive.ql.plan.MergeJoinWork)
- HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)
- FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)
- VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext)
- OutputCommitterDescriptor (org.apache.tez.dag.api.OutputCommitterDescriptor)
- ArrayList (java.util.ArrayList)
- CartesianProductVertexManager (org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager)
- ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork)
- DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor)
- StatsPublisher (org.apache.hadoop.hive.ql.stats.StatsPublisher)
- StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory)
- MapWork (org.apache.hadoop.hive.ql.plan.MapWork)
- HiveOutputFormatImpl (org.apache.hadoop.hive.ql.io.HiveOutputFormatImpl)
- CartesianProductConfig (org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig)
- BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)
- VertexType (org.apache.hadoop.hive.ql.plan.TezWork.VertexType)
- TezConfiguration (org.apache.tez.dag.api.TezConfiguration)
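
The createVertexExecutionContext(workUnit) helper called above is Hive-internal and not shown on this page. As a rough, hypothetical sketch of what such a helper can look like using only public Tez factory methods (the "LLAP" service name and the boolean switch are assumptions for illustration, not Hive's actual logic):

import org.apache.tez.dag.api.Vertex.VertexExecutionContext;

final class ExecutionContexts {

    // Hypothetical stand-in for DagUtils.createVertexExecutionContext(work).
    static VertexExecutionContext forWork(boolean runInLlap) {
        if (runInLlap) {
            // Assumed service name; it must match the registered task scheduler,
            // container launcher, and task communicator plugins.
            return VertexExecutionContext.create("LLAP", "LLAP", "LLAP");
        }
        // Otherwise run in plain YARN containers with the default plugins.
        return VertexExecutionContext.createExecuteInContainers(true);
    }
}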

Example 8 with VertexExecutionContext

Use of org.apache.tez.dag.api.Vertex.VertexExecutionContext in project hive by apache.

From class DagUtils, method createVertex:

/*
   * Helper function to create Vertex from MapWork.
   */
private Vertex createVertex(JobConf conf, MapWork mapWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
    Path tezDir = getTezDir(mrScratchDir);
    // set up the operator plan
    Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
    // create the directories FileSinkOperators need
    Utilities.createTmpDirs(conf, mapWork);
    // finally create the vertex
    Vertex map = null;
    // use tez to combine splits
    boolean groupSplitsInInputInitializer;
    DataSourceDescriptor dataSource;
    int numTasks = -1;
    @SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
    boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
    LOG.info("Vertex has custom input? " + vertexHasCustomInput);
    if (vertexHasCustomInput) {
        groupSplitsInInputInitializer = false;
        // grouping happens in execution phase. The input payload should not enable grouping here,
        // it will be enabled in the CustomVertex.
        inputFormatClass = HiveInputFormat.class;
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
        // this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
        // group splits in the input initializer only for HiveInputFormat
        if (inputFormatClass == HiveInputFormat.class) {
            groupSplitsInInputInitializer = true;
        } else {
            groupSplitsInInputInitializer = false;
        }
    }
    if (mapWork instanceof MergeFileWork) {
        Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
        // prepare the tmp output directory. The output tmp directory should
        // exist before jobClose (before renaming after job completion)
        Path tempOutPath = Utilities.toTempPath(outputPath);
        try {
            FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
            if (!tmpOutFS.exists(tempOutPath)) {
                tmpOutFS.mkdirs(tempOutPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
        }
    }
    // remember mapping of plan to input
    conf.set(Utilities.INPUT_NAME, mapWork.getName());
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
        // set up the operator plan. (before setting up splits on the AM)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
        // if splits are generated in the AM, we only need to set up the correct plugin.
        if (groupSplitsInInputInitializer) {
            // Not setting a payload, since the MRInput payload is the same and can be accessed.
            InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
            dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
        } else {
            // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
            if (vertexHasCustomInput) {
                dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            } else {
                dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            }
        }
    } else {
        // Setup client side split generation.
        // we need to set this, because with HS2 and client side split
        // generation we end up not finding the map work. This is
        // because of thread local madness (tez split generation is
        // multi-threaded - HS2 plan cache uses thread locals). Setting
        // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
        // of the map work.
        conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
        conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
        dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf, new Path(tezDir, "split_" + mapWork.getName().replaceAll(" ", "_")), true);
        numTasks = dataSource.getNumberOfShards();
        // set up the operator plan. (after generating splits - that changes configs)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    }
    UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
    String procClassName = MapTezProcessor.class.getName();
    if (mapWork instanceof MergeFileWork) {
        procClassName = MergeFileTezProcessor.class.getName();
    }
    VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
    map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
    map.setTaskEnvironment(getContainerEnvironment(conf, true));
    map.setExecutionContext(executionContext);
    map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    assert mapWork.getAliasToWork().keySet().size() == 1;
    // Add the actual source input
    String alias = mapWork.getAliasToWork().keySet().iterator().next();
    map.addDataSource(alias, dataSource);
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
    localResources.put(getBaseName(appJarLr), appJarLr);
    for (LocalResource lr : additionalLr) {
        localResources.put(getBaseName(lr), lr);
    }
    map.addTaskLocalFiles(localResources);
    return map;
}
Also used:
- Path (org.apache.hadoop.fs.Path)
- Vertex (org.apache.tez.dag.api.Vertex)
- PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex)
- MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork)
- UserPayload (org.apache.tez.dag.api.UserPayload)
- VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext)
- ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)
- HashMap (java.util.HashMap)
- IOException (java.io.IOException)
- LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)
- FileSystem (org.apache.hadoop.fs.FileSystem)
- InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor)
- DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)
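
Condensed from the calls above, the data-source wiring looks roughly like this in isolation (the "src" alias and the raw input format class are placeholders; Hive derives both from the MapWork):

import org.apache.hadoop.mapred.JobConf;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.mapreduce.input.MRInputLegacy;

final class SourceWiring {

    // Sketch: configure an MRInputLegacy data source without initializer-side
    // split grouping, then attach it to an existing vertex under an alias.
    static void attachSource(JobConf conf, Vertex map, Class<?> inputFormatClass) {
        DataSourceDescriptor dataSource = MRInputLegacy
                .createConfigBuilder(conf, inputFormatClass)
                .groupSplits(false)
                .build();
        map.addDataSource("src", dataSource); // "src" stands in for the MapWork alias
    }
}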

Example 9 with VertexExecutionContext

Use of org.apache.tez.dag.api.Vertex.VertexExecutionContext in project hive by apache.

From class DagUtils, method createVertex:

/*
   * Helper function to create Vertex from MapWork.
   */
private Vertex createVertex(JobConf conf, MapWork mapWork, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
    // set up the operator plan
    Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
    // create the directories FileSinkOperators need
    Utilities.createTmpDirs(conf, mapWork);
    // finally create the vertex
    Vertex map = null;
    // use tez to combine splits
    boolean groupSplitsInInputInitializer;
    DataSourceDescriptor dataSource;
    int numTasks = -1;
    @SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
    boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
    LOG.info("Vertex has custom input? " + vertexHasCustomInput);
    if (vertexHasCustomInput) {
        groupSplitsInInputInitializer = false;
        // grouping happens in execution phase. The input payload should not enable grouping here,
        // it will be enabled in the CustomVertex.
        inputFormatClass = HiveInputFormat.class;
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
        // this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
        // group splits in the input initializer only for HiveInputFormat
        if (inputFormatClass == HiveInputFormat.class) {
            groupSplitsInInputInitializer = true;
        } else {
            groupSplitsInInputInitializer = false;
        }
    }
    if (mapWork instanceof MergeFileWork) {
        Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
        // prepare the tmp output directory. The output tmp directory should
        // exist before jobClose (before renaming after job completion)
        Path tempOutPath = Utilities.toTempPath(outputPath);
        try {
            FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
            if (!tmpOutFS.exists(tempOutPath)) {
                tmpOutFS.mkdirs(tempOutPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
        }
    }
    // remember mapping of plan to input
    conf.set(Utilities.INPUT_NAME, mapWork.getName());
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
        // set up the operator plan. (before setting up splits on the AM)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
        // if splits are generated in the AM, we only need to set up the correct plugin.
        if (groupSplitsInInputInitializer) {
            // Not setting a payload, since the MRInput payload is the same and can be accessed.
            InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
            dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
        } else {
            // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
            if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
                // SMB Join.
                dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            } else {
                dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            }
        }
    } else {
        // Setup client side split generation.
        // we need to set this, because with HS2 and client side split
        // generation we end up not finding the map work. This is
        // because of thread local madness (tez split generation is
        // multi-threaded - HS2 plan cache uses thread locals). Setting
        // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
        // of the map work.
        conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
        conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
        InputSplitInfo inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
        InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
        InputDescriptor inputDescriptor = InputDescriptor.create(MRInputLegacy.class.getName()).setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(conf)).setSplits(inputSplitInfo.getSplitsProto()).build().toByteString().asReadOnlyByteBuffer()));
        dataSource = DataSourceDescriptor.create(inputDescriptor, descriptor, null);
        numTasks = inputSplitInfo.getNumTasks();
        // set up the operator plan. (after generating splits - that changes configs)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    }
    UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
    String procClassName = MapTezProcessor.class.getName();
    if (mapWork instanceof MergeFileWork) {
        procClassName = MergeFileTezProcessor.class.getName();
    }
    VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
    map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
    map.setTaskEnvironment(getContainerEnvironment(conf, true));
    map.setExecutionContext(executionContext);
    map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    assert mapWork.getAliasToWork().keySet().size() == 1;
    // Add the actual source input
    String alias = mapWork.getAliasToWork().keySet().iterator().next();
    map.addDataSource(alias, dataSource);
    map.addTaskLocalFiles(localResources);
    return map;
}
Also used:
- Path (org.apache.hadoop.fs.Path)
- InputDescriptor (org.apache.tez.dag.api.InputDescriptor)
- Vertex (org.apache.tez.dag.api.Vertex)
- PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex)
- MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork)
- UserPayload (org.apache.tez.dag.api.UserPayload)
- VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext)
- InputSplitInfo (org.apache.tez.mapreduce.hadoop.InputSplitInfo)
- IOException (java.io.IOException)
- MRInputSplitDistributor (org.apache.tez.mapreduce.common.MRInputSplitDistributor)
- FileSystem (org.apache.hadoop.fs.FileSystem)
- InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor)
- DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)
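
The only substantive difference from Example 8 is this overload's client-side branch: rather than staging splits on disk via configureMRInputWithLegacySplitGeneration, it generates splits into memory and ships them inside the MRInput user payload. A self-contained reduction of that branch, based directly on the calls above (error handling elided):

import org.apache.hadoop.mapred.JobConf;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.InputInitializerDescriptor;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.mapreduce.common.MRInputSplitDistributor;
import org.apache.tez.mapreduce.hadoop.InputSplitInfo;
import org.apache.tez.mapreduce.hadoop.MRInputHelpers;
import org.apache.tez.mapreduce.input.MRInputLegacy;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos;

final class ClientSideSplitSource {

    static DataSourceDescriptor create(JobConf conf) throws Exception {
        // Generate splits on the client and keep them in memory (no HDFS staging).
        InputSplitInfo splitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
        // Serialize the conf plus the splits into the MRInput user payload.
        InputDescriptor input = InputDescriptor.create(MRInputLegacy.class.getName())
                .setUserPayload(UserPayload.create(
                        MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                                .setConfigurationBytes(TezUtils.createByteStringFromConf(conf))
                                .setSplits(splitInfo.getSplitsProto())
                                .build().toByteString().asReadOnlyByteBuffer()));
        // MRInputSplitDistributor hands each task its precomputed split at runtime.
        InputInitializerDescriptor initializer =
                InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
        return DataSourceDescriptor.create(input, initializer, null);
        // splitInfo.getNumTasks() supplies the vertex parallelism, as in the method above.
    }
}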

Example 10 with VertexExecutionContext

Use of org.apache.tez.dag.api.Vertex.VertexExecutionContext in project tez by apache.

From class TestVertexImpl2, method testDefaultExecContextViaDag:

@Test(timeout = 5000)
public void testDefaultExecContextViaDag() {
    VertexExecutionContext defaultExecContext = VertexExecutionContext.create(ExecutionContextTestInfoHolder.append(ExecutionContextTestInfoHolder.TASK_SCHEDULER_NAME_BASE, 0), ExecutionContextTestInfoHolder.append(ExecutionContextTestInfoHolder.CONTAINER_LAUNCHER_NAME_BASE, 2), ExecutionContextTestInfoHolder.append(ExecutionContextTestInfoHolder.TASK_COMM_NAME_BASE, 2));
    ExecutionContextTestInfoHolder info = new ExecutionContextTestInfoHolder(null, defaultExecContext, 3);
    VertexWrapper vertexWrapper = createVertexWrapperForExecutionContextTest(info);
    assertEquals(0, vertexWrapper.vertex.taskSchedulerIdentifier);
    assertEquals(2, vertexWrapper.vertex.containerLauncherIdentifier);
    assertEquals(2, vertexWrapper.vertex.taskCommunicatorIdentifier);
}
Also used: VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext), Test (org.junit.Test)
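
The test verifies that the DAG-level default context is applied when a vertex does not set its own. A minimal sketch of that override behavior via the public API, with all service names and the processor class assumed for illustration:

import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.Vertex.VertexExecutionContext;

final class ContextOverrideDemo {

    static DAG build() {
        DAG dag = DAG.create("ctx-demo");
        // DAG-level default: used by any vertex without its own context.
        dag.setExecutionContext(VertexExecutionContext.create("sched0", "launcher0", "comm0"));
        Vertex v1 = Vertex.create("v1", ProcessorDescriptor.create("com.example.Proc"), 1);
        Vertex v2 = Vertex.create("v2", ProcessorDescriptor.create("com.example.Proc"), 1);
        // v1 inherits the DAG default; v2 overrides all three services.
        v2.setExecutionContext(VertexExecutionContext.create("sched1", "launcher1", "comm1"));
        dag.addVertex(v1);
        dag.addVertex(v2);
        return dag;
    }
}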

Aggregations

- VertexExecutionContext (org.apache.tez.dag.api.Vertex.VertexExecutionContext): 12 uses
- Test (org.junit.Test): 6 uses
- PreWarmVertex (org.apache.tez.dag.api.PreWarmVertex): 5 uses
- Vertex (org.apache.tez.dag.api.Vertex): 5 uses
- HashMap (java.util.HashMap): 4 uses
- LocalResource (org.apache.hadoop.yarn.api.records.LocalResource): 4 uses
- IOException (java.io.IOException): 2 uses
- ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2 uses
- FileSystem (org.apache.hadoop.fs.FileSystem): 2 uses
- Path (org.apache.hadoop.fs.Path): 2 uses
- MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork): 2 uses
- DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor): 2 uses
- InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor): 2 uses
- UserPayload (org.apache.tez.dag.api.UserPayload): 2 uses
- DAGPlan (org.apache.tez.dag.api.records.DAGProtos.DAGPlan): 2 uses
- VertexExecutionContextProto (org.apache.tez.dag.api.records.DAGProtos.VertexExecutionContextProto): 2 uses
- VertexPlan (org.apache.tez.dag.api.records.DAGProtos.VertexPlan): 2 uses
- ArrayList (java.util.ArrayList): 1 use
- Map (java.util.Map): 1 use
- BidiMap (org.apache.commons.collections4.BidiMap): 1 use