Example 1 with Event

Use of org.apache.tez.runtime.api.Event in project hive by apache.

From class GenericUDTFGetSplits, method getSplits:

public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
    DAG dag = DAG.create(work.getName());
    dag.setCredentials(job.getCredentials());
    DagUtils utils = DagUtils.getInstance();
    Context ctx = new Context(job);
    MapWork mapWork = (MapWork) work.getAllWork().get(0);
    // A number of things get set up in the Context based on conf, but we only need
    // the MR tmp directory for the following method.
    JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
    // TODO: should we also whitelist input formats here? from mapred.input.format.class
    Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
    FileSystem fs = scratchDir.getFileSystem(job);
    try {
        LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
        Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr, new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
        String vertexName = wx.getName();
        dag.addVertex(wx);
        utils.addCredentials(mapWork, dag);
        // we have the dag now proceed to get the splits:
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
        HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
        List<Event> eventList = splitGenerator.initialize();
        InputSplit[] result = new InputSplit[eventList.size() - 1];
        InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
        List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
        Preconditions.checkState(hints.size() == eventList.size() - 1);
        if (LOG.isDebugEnabled()) {
            LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
        }
        LlapCoordinator coordinator = LlapCoordinator.getInstance();
        if (coordinator == null) {
            throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
        }
        // See the discussion in the implementation as to why we generate app ID.
        ApplicationId applicationId = coordinator.createExtClientAppId();
        // This assumes LLAP cluster owner is always the HS2 user.
        String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
        String queryUser = null;
        byte[] tokenBytes = null;
        LlapSigner signer = null;
        if (UserGroupInformation.isSecurityEnabled()) {
            signer = coordinator.getLlapSigner(job);
            // 1. Generate the token for query user (applies to all splits).
            queryUser = SessionState.getUserFromAuthenticator();
            if (queryUser == null) {
                queryUser = UserGroupInformation.getCurrentUser().getUserName();
                LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
            }
            LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
            // We put the query user, not LLAP user, into the message and token.
            Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
            LOG.info("Created the token for remote user: {}", token);
            bos.reset();
            token.write(dos);
            tokenBytes = bos.toByteArray();
        } else {
            queryUser = UserGroupInformation.getCurrentUser().getUserName();
        }
        LOG.info("Number of splits: " + (eventList.size() - 1));
        SignedMessage signedSvs = null;
        for (int i = 0; i < eventList.size() - 1; i++) {
            TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
            // 2. Generate the vertex/submit information for all events.
            if (i == 0) {
                // The queryId could either be picked up from the current request being processed, or
                // generated. The current request isn't exactly correct since the query is 'done' once we
                // return the results. Generating a new one has the added benefit of working once this
                // is moved out of a UDTF into a proper API.
                // Setting this to the generated AppId which is unique.
                // Despite the differences in TaskSpec, the vertex spec should be the same.
                signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
            }
            SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
            byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
            // 3. Generate input event.
            SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
            // 4. Make location hints.
            SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
            result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
        }
        return result;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) SubmitWorkInfo(org.apache.hadoop.hive.llap.SubmitWorkInfo) LlapTokenIdentifier(org.apache.hadoop.hive.llap.security.LlapTokenIdentifier) SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) HiveSplitGenerator(org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator) TaskSpecBuilder(org.apache.tez.dag.api.TaskSpecBuilder) LlapSigner(org.apache.hadoop.hive.llap.security.LlapSigner) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) LlapTokenLocalClient(org.apache.hadoop.hive.llap.security.LlapTokenLocalClient) DagUtils(org.apache.hadoop.hive.ql.exec.tez.DagUtils) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Context(org.apache.hadoop.hive.ql.Context) Path(org.apache.hadoop.fs.Path) TaskSpec(org.apache.tez.runtime.api.impl.TaskSpec) SignedMessage(org.apache.hadoop.hive.llap.security.LlapSigner.SignedMessage) DAG(org.apache.tez.dag.api.DAG) IOException(java.io.IOException) LlapCoordinator(org.apache.hadoop.hive.llap.coordinator.LlapCoordinator) LoginException(javax.security.auth.login.LoginException) URISyntaxException(java.net.URISyntaxException) UDFArgumentLengthException(org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) UDFArgumentException(org.apache.hadoop.hive.ql.exec.UDFArgumentException) UDFArgumentTypeException(org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException) CommandNeedRetryException(org.apache.hadoop.hive.ql.CommandNeedRetryException) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) Event(org.apache.tez.runtime.api.Event) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId)
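
The splits returned above are plain org.apache.hadoop.mapred.InputSplit objects (concretely LlapInputSplit), so a caller can inspect them through the stable MapReduce interface. Below is a minimal sketch of such a consumer; the SplitInspector helper is hypothetical, not part of Hive.

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.mapred.InputSplit;

// Hypothetical helper, not Hive code: prints basic information for each split
// returned by getSplits(...) above, using only the generic InputSplit interface.
public final class SplitInspector {
    public static void describe(InputSplit[] splits) throws IOException {
        for (InputSplit split : splits) {
            System.out.println("length=" + split.getLength()
                + ", locations=" + Arrays.toString(split.getLocations()));
        }
    }
}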

Example 2 with Event

Use of org.apache.tez.runtime.api.Event in project hive by apache.

From class AppMasterEventOperator, method closeOp:

@Override
public void closeOp(boolean abort) throws HiveException {
    if (!abort) {
        TezContext context = (TezContext) TezContext.get();
        String vertexName = getConf().getVertexName();
        String inputName = getConf().getInputName();
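        // buffer, hasReachedMaxSize and initDataBuffer(...) are members of
        // AppMasterEventOperator not shown in this excerpt; buffer accumulates the
        // serialized event payload while rows are processed.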
        if (hasReachedMaxSize) {
            initDataBuffer(true);
        }
        byte[] payload = new byte[buffer.getLength()];
        System.arraycopy(buffer.getData(), 0, payload, 0, buffer.getLength());
        Event event = InputInitializerEvent.create(vertexName, inputName, ByteBuffer.wrap(payload, 0, payload.length));
        if (isLogInfoEnabled) {
            LOG.info("Sending Tez event to vertex = " + vertexName + ", input = " + inputName + ". Payload size = " + payload.length);
        }
        context.getTezProcessorContext().sendEvents(Collections.singletonList(event));
    }
}
Also used : InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) Event(org.apache.tez.runtime.api.Event) TezContext(org.apache.hadoop.hive.ql.exec.tez.TezContext)
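
Events created this way are routed by Tez to the InputInitializer of the target vertex's input, which receives them through handleInputInitializerEvent (Hive's HiveSplitGenerator uses these events to drive dynamic partition pruning). Here is a minimal receiver sketch against the public Tez InputInitializer API; the class name and logging are illustrative, not Hive code.

import java.util.List;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.InputInitializer;
import org.apache.tez.runtime.api.InputInitializerContext;
import org.apache.tez.runtime.api.events.InputInitializerEvent;

// Toy receiver: logs the payload size of each event sent by an
// AppMasterEventOperator like the one above.
public class PayloadLoggingInitializer extends InputInitializer {
    public PayloadLoggingInitializer(InputInitializerContext context) {
        super(context);
    }

    @Override
    public List<Event> initialize() throws Exception {
        return null; // real initializers return configure/data events here
    }

    @Override
    public void handleInputInitializerEvent(List<InputInitializerEvent> events) {
        for (InputInitializerEvent e : events) {
            System.out.println("Received event for input " + e.getTargetInputName()
                + ", payload bytes = " + e.getUserPayload().remaining());
        }
    }
}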

Example 3 with Event

Use of org.apache.tez.runtime.api.Event in project hive by apache.

From class HiveSplitGenerator, method createEventList:

private List<Event> createEventList(boolean sendSerializedEvents, InputSplitInfoMem inputSplitInfo) {
    List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo.getNumTasks() + 1);
    InputConfigureVertexTasksEvent configureVertexEvent = InputConfigureVertexTasksEvent.create(inputSplitInfo.getNumTasks(), VertexLocationHint.create(inputSplitInfo.getTaskLocationHints()), InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate());
    events.add(configureVertexEvent);
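    // The configure event is always first in the list; consumers such as
    // GenericUDTFGetSplits.getSplits (Example 1) rely on this ordering.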
    if (sendSerializedEvents) {
        MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
        int count = 0;
        for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
            InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count++, mrSplit.toByteString().asReadOnlyByteBuffer());
            events.add(diEvent);
        }
    } else {
        int count = 0;
        for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
            InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(count++, split);
            events.add(diEvent);
        }
    }
    return events;
}
Also used : MRSplitsProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitsProto) InputSplit(org.apache.hadoop.mapred.InputSplit) InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) Event(org.apache.tez.runtime.api.Event) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)
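
The returned list always has the shape [one InputConfigureVertexTasksEvent, then one InputDataInformationEvent per task]. A small hypothetical walker makes that shape explicit; it mirrors how GenericUDTFGetSplits consumes the list in Example 1, but is not Hive code.

import java.util.List;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent;
import org.apache.tez.runtime.api.events.InputDataInformationEvent;

// Hypothetical consumer: splits the [configure, data...] list produced by
// createEventList back into its two parts.
final class EventListWalker {
    static void walk(List<Event> events) {
        InputConfigureVertexTasksEvent configure =
            (InputConfigureVertexTasksEvent) events.get(0);
        System.out.println("numTasks=" + configure.getNumTasks());
        for (Event e : events.subList(1, events.size())) {
            InputDataInformationEvent diEvent = (InputDataInformationEvent) e;
            System.out.println("sourceIndex=" + diEvent.getSourceIndex());
        }
    }
}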

Example 4 with Event

Use of org.apache.tez.runtime.api.Event in project hive by apache.

From class CustomPartitionVertex, method onRootVertexInitialized:

// One call per root Input
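// Fields referenced below (numInputsSeenSoFar, conf, context, grouper, mainWorkName,
// configureVertexTaskEvent) are members of CustomPartitionVertex not shown in this
// excerpt.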
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
    numInputsSeenSoFar++;
    LOG.info("On root vertex initialized " + inputName);
    try {
        // This is using the payload from the RootVertexInitializer corresponding
        // to inputName. Ideally it should be using its own configuration class,
        // but that means serializing another instance.
        MRInputUserPayloadProto protoPayload = MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
        this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
        /*
         * Currently in Tez, the flow of events is: "Generate Splits -> Initialize
         * Vertex" (with parallelism info obtained from the generate-splits phase).
         * The generate-splits phase groups splits using TezGroupedSplitsInputFormat.
         * However, for bucket map joins the grouping done by this input format is
         * incorrect, because the grouper has no knowledge of buckets. So we
         * initially set the input format to HiveInputFormat (in DagUtils) for the
         * bucket-map-join case in order to obtain un-grouped splits, and then group
         * the splits corresponding to buckets using the Tez grouper, which returns
         * TezGroupedSplits.
         */
        // This assumes that Grouping will always be used.
        // Enabling grouping on the payload.
        MRInputUserPayloadProto updatedPayload = MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
        inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
    boolean dataInformationEventSeen = false;
    Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
    for (Event event : events) {
        if (event instanceof InputConfigureVertexTasksEvent) {
            // No tasks should have been started yet; checked by the initial state check.
            LOG.info("Got an input configure vertex event for input: " + inputName);
            Preconditions.checkState(!dataInformationEventSeen);
            InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
            // The vertex cannot be configured until all DataEvents are seen, so that
            // the routing table can be built.
            configureVertexTaskEvent = cEvent;
            LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
        }
        if (event instanceof InputUpdatePayloadEvent) {
            // this event can never occur. If it does, fail.
            Preconditions.checkState(false);
        } else if (event instanceof InputDataInformationEvent) {
            dataInformationEventSeen = true;
            InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
            FileSplit fileSplit;
            try {
                fileSplit = getFileSplitFromEvent(diEvent);
            } catch (IOException e) {
                throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
            }
            Set<FileSplit> fsList = pathFileSplitsMap.get(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()));
            if (fsList == null) {
                fsList = new TreeSet<FileSplit>(new PathComparatorForSplit());
                pathFileSplitsMap.put(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()), fsList);
            }
            fsList.add(fileSplit);
        }
    }
    LOG.info("Path file splits map for input name: " + inputName + " is " + pathFileSplitsMap);
    Multimap<Integer, InputSplit> bucketToInitialSplitMap = getBucketSplitMapForPath(pathFileSplitsMap);
    try {
        int totalResource = context.getTotalAvailableResource().getMemory();
        int taskResource = context.getVertexTaskResource().getMemory();
        float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
        int availableSlots = totalResource / taskResource;
        LOG.info("Grouping splits. " + availableSlots + " available slots, " + waves + " waves. Bucket initial splits map: " + bucketToInitialSplitMap);
        JobConf jobConf = new JobConf(conf);
        ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
        Multimap<Integer, InputSplit> bucketToGroupedSplitMap = HashMultimap.<Integer, InputSplit>create();
        boolean secondLevelGroupingDone = false;
        if (mainWorkName.isEmpty() || inputName.equals(mainWorkName)) {
            SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
            for (Integer key : bucketToInitialSplitMap.keySet()) {
                InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
                Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, mainWorkName.isEmpty(), splitLocationProvider);
                if (!mainWorkName.isEmpty()) {
                    Multimap<Integer, InputSplit> singleBucketToGroupedSplit = HashMultimap.<Integer, InputSplit>create();
                    singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
                    groupedSplit = grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots, HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
                    secondLevelGroupingDone = true;
                }
                bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
            }
            processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
        } else {
            SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
            // Group the splits per bucket for this side (small-table) input.
            for (Integer key : bucketToInitialSplitMap.keySet()) {
                InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
                Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, false, splitLocationProvider);
                bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
            }
            /*
             * This is the small-table side. In case of SMB join, we need to send each
             * split to the corresponding bucket-based task on the other side. In case
             * a split needs to go to multiple downstream tasks, we need to clone the
             * event and send it to the right destination.
             */
            LOG.info("This is the side work - multi-mr work.");
            processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : Set(java.util.Set) TreeSet(java.util.TreeSet) IOException(java.io.IOException) ByteString(com.google.protobuf.ByteString) TreeMap(java.util.TreeMap) FileSplit(org.apache.hadoop.mapred.FileSplit) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) InputUpdatePayloadEvent(org.apache.tez.runtime.api.events.InputUpdatePayloadEvent) MRInputUserPayloadProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto) Event(org.apache.tez.runtime.api.Event) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) SplitLocationProvider(org.apache.hadoop.mapred.split.SplitLocationProvider) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf)
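
PathComparatorForSplit, used to order each TreeSet of file splits above, is an inner class of CustomPartitionVertex that this excerpt does not show. Here is a sketch of what such a comparator plausibly looks like, assuming the intent is a stable order by file path and then by start offset, so splits of the same bucket file stay contiguous.

import java.util.Comparator;
import org.apache.hadoop.mapred.FileSplit;

// Sketch (assumption, not the actual Hive inner class): orders FileSplits by
// path first, then by start offset within the file.
class PathComparatorForSplit implements Comparator<FileSplit> {
    @Override
    public int compare(FileSplit a, FileSplit b) {
        int byPath = a.getPath().compareTo(b.getPath());
        if (byPath != 0) {
            return byPath;
        }
        return Long.compare(a.getStart(), b.getStart());
    }
}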

Aggregations

Event (org.apache.tez.runtime.api.Event) 4
InputSplit (org.apache.hadoop.mapred.InputSplit) 3
InputConfigureVertexTasksEvent (org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) 3
InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent) 3
IOException (java.io.IOException) 2
JobConf (org.apache.hadoop.mapred.JobConf) 2
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint) 2
VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint) 2
InputInitializerEvent (org.apache.tez.runtime.api.events.InputInitializerEvent) 2
ByteString (com.google.protobuf.ByteString) 1
FileNotFoundException (java.io.FileNotFoundException) 1
URISyntaxException (java.net.URISyntaxException) 1
Set (java.util.Set) 1
TreeMap (java.util.TreeMap) 1
TreeSet (java.util.TreeSet) 1
LoginException (javax.security.auth.login.LoginException) 1
FileSystem (org.apache.hadoop.fs.FileSystem) 1
Path (org.apache.hadoop.fs.Path) 1
LlapInputSplit (org.apache.hadoop.hive.llap.LlapInputSplit) 1
SubmitWorkInfo (org.apache.hadoop.hive.llap.SubmitWorkInfo) 1