Search in sources:

Example 1 with TaskLocationHint

Use of org.apache.tez.dag.api.TaskLocationHint in project hive by apache.

The class GenericUDTFGetSplits, method getSplits.

public InputSplit[] getSplits(JobConf job, int numSplits, TezWork work, Schema schema) throws IOException {
    DAG dag = DAG.create(work.getName());
    dag.setCredentials(job.getCredentials());
    DagUtils utils = DagUtils.getInstance();
    Context ctx = new Context(job);
    MapWork mapWork = (MapWork) work.getAllWork().get(0);
    // A number of things get set up in the context based on the conf, but we only need
    // the MR tmp directory for the following method.
    JobConf wxConf = utils.initializeVertexConf(job, ctx, mapWork);
    // TODO: should we also whitelist input formats here? from mapred.input.format.class
    Path scratchDir = utils.createTezDir(ctx.getMRScratchDir(), job);
    FileSystem fs = scratchDir.getFileSystem(job);
    try {
        LocalResource appJarLr = createJarLocalResource(utils.getExecJarPathLocal(), utils, job);
        Vertex wx = utils.createVertex(wxConf, mapWork, scratchDir, appJarLr, new ArrayList<LocalResource>(), fs, ctx, false, work, work.getVertexType(mapWork));
        String vertexName = wx.getName();
        dag.addVertex(wx);
        utils.addCredentials(mapWork, dag);
        // We have the DAG; now proceed to get the splits:
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS));
        Preconditions.checkState(HiveConf.getBoolVar(wxConf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS));
        HiveSplitGenerator splitGenerator = new HiveSplitGenerator(wxConf, mapWork);
        List<Event> eventList = splitGenerator.initialize();
        InputSplit[] result = new InputSplit[eventList.size() - 1];
        InputConfigureVertexTasksEvent configureEvent = (InputConfigureVertexTasksEvent) eventList.get(0);
        List<TaskLocationHint> hints = configureEvent.getLocationHint().getTaskLocationHints();
        Preconditions.checkState(hints.size() == eventList.size() - 1);
        if (LOG.isDebugEnabled()) {
            LOG.debug("NumEvents=" + eventList.size() + ", NumSplits=" + result.length);
        }
        LlapCoordinator coordinator = LlapCoordinator.getInstance();
        if (coordinator == null) {
            throw new IOException("LLAP coordinator is not initialized; must be running in HS2 with " + ConfVars.LLAP_HS2_ENABLE_COORDINATOR.varname + " enabled");
        }
        // See the discussion in the implementation as to why we generate app ID.
        ApplicationId applicationId = coordinator.createExtClientAppId();
        // This assumes LLAP cluster owner is always the HS2 user.
        String llapUser = UserGroupInformation.getLoginUser().getShortUserName();
        String queryUser = null;
        byte[] tokenBytes = null;
        LlapSigner signer = null;
        if (UserGroupInformation.isSecurityEnabled()) {
            signer = coordinator.getLlapSigner(job);
            // 1. Generate the token for query user (applies to all splits).
            queryUser = SessionState.getUserFromAuthenticator();
            if (queryUser == null) {
                queryUser = UserGroupInformation.getCurrentUser().getUserName();
                LOG.warn("Cannot determine the session user; using " + queryUser + " instead");
            }
            LlapTokenLocalClient tokenClient = coordinator.getLocalTokenClient(job, llapUser);
            // We put the query user, not LLAP user, into the message and token.
            Token<LlapTokenIdentifier> token = tokenClient.createToken(applicationId.toString(), queryUser, true);
            LOG.info("Created the token for remote user: {}", token);
            bos.reset();
            token.write(dos);
            tokenBytes = bos.toByteArray();
        } else {
            queryUser = UserGroupInformation.getCurrentUser().getUserName();
        }
        LOG.info("Number of splits: " + (eventList.size() - 1));
        SignedMessage signedSvs = null;
        for (int i = 0; i < eventList.size() - 1; i++) {
            TaskSpec taskSpec = new TaskSpecBuilder().constructTaskSpec(dag, vertexName, eventList.size() - 1, applicationId, i);
            // 2. Generate the vertex/submit information for all events.
            if (i == 0) {
                // The queryId could either be picked up from the current request being processed, or
                // generated. The current request isn't exactly correct since the query is 'done' once we
                // return the results. Generating a new one has the added benefit of working once this
                // is moved out of a UDTF into a proper API.
                // Setting this to the generated AppId which is unique.
                // Despite the differences in TaskSpec, the vertex spec should be the same.
                signedSvs = createSignedVertexSpec(signer, taskSpec, applicationId, queryUser, applicationId.toString());
            }
            SubmitWorkInfo submitWorkInfo = new SubmitWorkInfo(applicationId, System.currentTimeMillis(), taskSpec.getVertexParallelism(), signedSvs.message, signedSvs.signature);
            byte[] submitWorkBytes = SubmitWorkInfo.toBytes(submitWorkInfo);
            // 3. Generate input event.
            SignedMessage eventBytes = makeEventBytes(wx, vertexName, eventList.get(i + 1), signer);
            // 4. Make location hints.
            SplitLocationInfo[] locations = makeLocationHints(hints.get(i));
            result[i] = new LlapInputSplit(i, submitWorkBytes, eventBytes.message, eventBytes.signature, locations, schema, llapUser, tokenBytes);
        }
        return result;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used : Vertex(org.apache.tez.dag.api.Vertex) SubmitWorkInfo(org.apache.hadoop.hive.llap.SubmitWorkInfo) LlapTokenIdentifier(org.apache.hadoop.hive.llap.security.LlapTokenIdentifier) SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) HiveSplitGenerator(org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator) TaskSpecBuilder(org.apache.tez.dag.api.TaskSpecBuilder) LlapSigner(org.apache.hadoop.hive.llap.security.LlapSigner) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) LlapTokenLocalClient(org.apache.hadoop.hive.llap.security.LlapTokenLocalClient) DagUtils(org.apache.hadoop.hive.ql.exec.tez.DagUtils) LlapInputSplit(org.apache.hadoop.hive.llap.LlapInputSplit) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Context(org.apache.hadoop.hive.ql.Context) Path(org.apache.hadoop.fs.Path) TaskSpec(org.apache.tez.runtime.api.impl.TaskSpec) SignedMessage(org.apache.hadoop.hive.llap.security.LlapSigner.SignedMessage) DAG(org.apache.tez.dag.api.DAG) IOException(java.io.IOException) LlapCoordinator(org.apache.hadoop.hive.llap.coordinator.LlapCoordinator) LoginException(javax.security.auth.login.LoginException) URISyntaxException(java.net.URISyntaxException) UDFArgumentLengthException(org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) UDFArgumentException(org.apache.hadoop.hive.ql.exec.UDFArgumentException) UDFArgumentTypeException(org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException) CommandNeedRetryException(org.apache.hadoop.hive.ql.CommandNeedRetryException) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) Event(org.apache.tez.runtime.api.Event) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId)
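The token serialization in getSplits writes into bos and dos, which are fields of the class that this snippet does not show. As a self-contained illustration, here is a minimal sketch of the same Writable-based serialization using local streams; the class and helper names are hypothetical:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.security.token.Token;

public final class TokenBytes {

    // Serialize a Hadoop token to a byte array, mirroring the bos.reset() /
    // token.write(dos) / bos.toByteArray() sequence in getSplits above.
    static byte[] serialize(Token<?> token) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        DataOutputStream dos = new DataOutputStream(bos);
        // Token implements Writable, so it knows how to write itself.
        token.write(dos);
        dos.flush();
        return bos.toByteArray();
    }
}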

Example 2 with TaskLocationHint

Use of org.apache.tez.dag.api.TaskLocationHint in project hive by apache.

The class GenericUDTFGetSplits, method makeLocationHints.

private SplitLocationInfo[] makeLocationHints(TaskLocationHint hint) {
    Set<String> hosts = hint.getHosts();
    if (hosts.size() != 1) {
        LOG.warn("Bad # of locations: " + hosts.size());
    }
    SplitLocationInfo[] locations = new SplitLocationInfo[hosts.size()];
    int j = 0;
    for (String host : hosts) {
        locations[j++] = new SplitLocationInfo(host, false);
    }
    return locations;
}
Also used : SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint)
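For context, here is a minimal sketch of how a TaskLocationHint is built and then unpacked into SplitLocationInfo objects, the same way makeLocationHints does above; the hostname and class name are placeholders:

import java.util.Collections;
import org.apache.hadoop.mapred.SplitLocationInfo;
import org.apache.tez.dag.api.TaskLocationHint;

public class LocationHintDemo {
    public static void main(String[] args) {
        // Build a single-host hint; the second argument is the (unused) rack set.
        TaskLocationHint hint = TaskLocationHint.createTaskLocationHint(
                Collections.singleton("node-1.example.com"), null);
        SplitLocationInfo[] locations = new SplitLocationInfo[hint.getHosts().size()];
        int j = 0;
        for (String host : hint.getHosts()) {
            // false: the data is on disk at this host, not cached in memory.
            locations[j++] = new SplitLocationInfo(host, false);
        }
        System.out.println(locations[0].getLocation()); // node-1.example.com
    }
}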

Example 3 with TaskLocationHint

Use of org.apache.tez.dag.api.TaskLocationHint in project hive by apache.

The class HiveSplitGenerator, method initialize.

@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
    // Set up the map work for this thread. Pruning modified the work instance to potentially remove
    // partitions. The same work instance must be used when generating splits.
    Utilities.setMapWork(jobConf, work);
    try {
        boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
        // perform dynamic partition pruning
        if (pruner != null) {
            pruner.prune();
        }
        InputSplitInfoMem inputSplitInfo = null;
        boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
        LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
        String realInputFormatName = conf.get("mapred.input.format.class");
        boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
        if (groupingEnabled) {
            // Need to instantiate the realInputFormat
            InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
            int totalResource = 0;
            int taskResource = 0;
            int availableSlots = 0;
            // FIXME. Do the right thing Luke.
            if (getContext() == null) {
                // No vertex context: for now, totalResource == taskResource for LLAP,
                // so assume a single slot.
                availableSlots = 1;
            } else {
                totalResource = getContext().getTotalAvailableResource().getMemory();
                taskResource = getContext().getVertexTaskResource().getMemory();
                availableSlots = totalResource / taskResource;
            }
            if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
                // broken configuration from mapred-default.xml
                final long blockSize = conf.getLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
                final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
                final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
                HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
                LOG.info("The preferred split size is " + preferredSplitSize);
            }
            // Create the un-grouped splits
            float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
            // Raw splits
            InputSplit[] splits = inputFormat.getSplits(jobConf, (int) (availableSlots * waves));
            // Sort the splits, so that subsequent grouping is consistent.
            Arrays.sort(splits, new InputSplitComparator());
            LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
            if (work.getIncludedBuckets() != null) {
                splits = pruneBuckets(work, splits);
            }
            Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
            // And finally return them in a flat array
            InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
            LOG.info("Number of split groups: " + flatSplits.length);
            List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
            inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
        } else {
            // If this is used in the future, make sure to disable grouping in the payload if it isn't already disabled.
            throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
        // inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
        }
        return createEventList(sendSerializedEvents, inputSplitInfo);
    } finally {
        Utilities.clearWork(jobConf);
    }
}
Also used : TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) InputSplitInfoMem(org.apache.tez.mapreduce.hadoop.InputSplitInfoMem) InputFormat(org.apache.hadoop.mapred.InputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit)
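Two pieces of arithmetic in initialize() decide how many raw splits to request and how small a split may be. A minimal sketch with hypothetical numbers; the defaults cited in the comments are the usual Tez/HDFS values and are stated here as assumptions:

public class SplitMath {
    public static void main(String[] args) {
        // Hypothetical resources, illustrating the slot/wave arithmetic above.
        int totalResource = 16384;                            // vertex-wide memory, MB
        int taskResource = 1024;                              // per-task memory, MB
        int availableSlots = totalResource / taskResource;    // 16 concurrent tasks
        float waves = 1.7f;                                   // assumed TEZ_GROUPING_SPLIT_WAVES default
        System.out.println((int) (availableSlots * waves));   // 27 raw splits requested

        // The min-split-size fallback: half a block, capped by Tez's minimum grouping size.
        long blockSize = 128L * 1024 * 1024;                  // assumed dfs.blocksize default
        long minGrouping = 50L * 1024 * 1024;                 // assumed TEZ_GROUPING_SPLIT_MIN_SIZE default
        System.out.println(Math.min(blockSize / 2, minGrouping)); // 52428800 (50 MB)
    }
}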

Example 4 with TaskLocationHint

Use of org.apache.tez.dag.api.TaskLocationHint in project hive by apache.

The class SplitGrouper, method createTaskLocationHints.

/**
   * Create task location hints from a set of input splits
   * @param splits the actual splits
   * @param consistentLocations whether to re-order locations for each split, if it's a file split
   * @return taskLocationHints - 1 per input split specified
   * @throws IOException
   */
public List<TaskLocationHint> createTaskLocationHints(InputSplit[] splits, boolean consistentLocations) throws IOException {
    List<TaskLocationHint> locationHints = Lists.newArrayListWithCapacity(splits.length);
    for (InputSplit split : splits) {
        String rack = (split instanceof TezGroupedSplit) ? ((TezGroupedSplit) split).getRack() : null;
        if (rack == null) {
            String[] locations = split.getLocations();
            if (locations != null && locations.length > 0) {
                // Worthwhile only if there is more than one location, consistent grouping is enabled, and this is a FileSplit
                if (consistentLocations && locations.length > 1 && split instanceof FileSplit) {
                    Arrays.sort(locations);
                    FileSplit fileSplit = (FileSplit) split;
                    Path path = fileSplit.getPath();
                    long startLocation = fileSplit.getStart();
                    int hashCode = Objects.hash(path, startLocation);
                    // floorMod keeps the index non-negative even when the hash is negative.
                    int startIndex = Math.floorMod(hashCode, locations.length);
                    LinkedHashSet<String> locationSet = new LinkedHashSet<>(locations.length);
                    // Set up the locations starting from startIndex, and wrapping around the sorted array.
                    for (int i = 0; i < locations.length; i++) {
                        int index = (startIndex + i) % locations.length;
                        locationSet.add(locations[index]);
                    }
                    locationHints.add(TaskLocationHint.createTaskLocationHint(locationSet, null));
                } else {
                    locationHints.add(TaskLocationHint.createTaskLocationHint(new LinkedHashSet<String>(Arrays.asList(locations)), null));
                }
            } else {
                locationHints.add(TaskLocationHint.createTaskLocationHint(null, null));
            }
        } else {
            locationHints.add(TaskLocationHint.createTaskLocationHint(null, Collections.singleton(rack)));
        }
    }
    return locationHints;
}
Also used : TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) Path(org.apache.hadoop.fs.Path) LinkedHashSet(java.util.LinkedHashSet) TezGroupedSplit(org.apache.hadoop.mapred.split.TezGroupedSplit) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
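The consistent-location trick above rotates each split's sorted location list by a hash of its (path, start offset) identity, so repeated split generation yields the same first-choice host for the same split. A compact, self-contained sketch of just that rotation; the hosts and path are placeholders:

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Objects;

public class ConsistentRotation {
    public static void main(String[] args) {
        String[] locations = {"node-3", "node-1", "node-2"}; // placeholder hosts
        Arrays.sort(locations); // sort first so the rotation is deterministic
        // Hash the split identity (path + start offset, as in the method above);
        // floorMod keeps the start index non-negative for negative hashes.
        int hashCode = Objects.hash("/warehouse/t1/part-00000", 0L);
        int startIndex = Math.floorMod(hashCode, locations.length);
        LinkedHashSet<String> ordered = new LinkedHashSet<>(locations.length);
        for (int i = 0; i < locations.length; i++) {
            ordered.add(locations[(startIndex + i) % locations.length]);
        }
        System.out.println(ordered); // same split identity -> same ordering every run
    }
}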

Aggregations

TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 4
InputSplit (org.apache.hadoop.mapred.InputSplit): 3
Path (org.apache.hadoop.fs.Path): 2
SplitLocationInfo (org.apache.hadoop.mapred.SplitLocationInfo): 2
FileNotFoundException (java.io.FileNotFoundException): 1
IOException (java.io.IOException): 1
URISyntaxException (java.net.URISyntaxException): 1
LinkedHashSet (java.util.LinkedHashSet): 1
LoginException (javax.security.auth.login.LoginException): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
LlapInputSplit (org.apache.hadoop.hive.llap.LlapInputSplit): 1
SubmitWorkInfo (org.apache.hadoop.hive.llap.SubmitWorkInfo): 1
LlapCoordinator (org.apache.hadoop.hive.llap.coordinator.LlapCoordinator): 1
LlapSigner (org.apache.hadoop.hive.llap.security.LlapSigner): 1
SignedMessage (org.apache.hadoop.hive.llap.security.LlapSigner.SignedMessage): 1
LlapTokenIdentifier (org.apache.hadoop.hive.llap.security.LlapTokenIdentifier): 1
LlapTokenLocalClient (org.apache.hadoop.hive.llap.security.LlapTokenLocalClient): 1
CommandNeedRetryException (org.apache.hadoop.hive.ql.CommandNeedRetryException): 1
Context (org.apache.hadoop.hive.ql.Context): 1
UDFArgumentException (org.apache.hadoop.hive.ql.exec.UDFArgumentException): 1