Example 1 with InputSplitInfoMem

Use of org.apache.tez.mapreduce.hadoop.InputSplitInfoMem in project tez by apache.

From the class MRInputAMSplitGenerator, the method initialize:

@Override
public List<Event> initialize() throws Exception {
    StopWatch sw = new StopWatch().start();
    MRInputUserPayloadProto userPayloadProto = MRInputHelpers.parseMRInputPayload(getContext().getInputUserPayload());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time to parse MRInput payload into prot: " + sw.now(TimeUnit.MILLISECONDS));
    }
    sw.reset().start();
    Configuration conf = TezUtils.createConfFromByteString(userPayloadProto.getConfigurationBytes());
    sendSerializedEvents = conf.getBoolean(MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD, MRJobConfig.MR_TEZ_INPUT_INITIALIZER_SERIALIZE_EVENT_PAYLOAD_DEFAULT);
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Emitting serialized splits: " + sendSerializedEvents + " for input " + getContext().getInputName());
        LOG.debug("Time converting ByteString to configuration: " + sw.now(TimeUnit.MILLISECONDS));
    }
    sw.reset().start();
    int totalResource = getContext().getTotalAvailableResource().getMemory();
    int taskResource = getContext().getVertexTaskResource().getMemory();
    float waves = conf.getFloat(TezSplitGrouper.TEZ_GROUPING_SPLIT_WAVES, TezSplitGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
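    // One wave = enough concurrent tasks to fill the available headroom once; scale by the wave factor.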
    int numTasks = (int) ((totalResource * waves) / taskResource);
    boolean groupSplits = userPayloadProto.getGroupingEnabled();
    boolean sortSplits = userPayloadProto.getSortSplitsEnabled();
    LOG.info("Input " + getContext().getInputName() + " asking for " + numTasks + " tasks. Headroom: " + totalResource + ". Task Resource: " + taskResource + ". waves: " + waves + ". groupingEnabled: " + groupSplits + ". SortSplitsEnabled: " + sortSplits);
    // Read all credentials into the credentials instance stored in JobConf.
    JobConf jobConf = new JobConf(conf);
    jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());
    InputSplitInfoMem inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, groupSplits, sortSplits, groupSplits ? numTasks : 0);
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time to create splits to mem: " + sw.now(TimeUnit.MILLISECONDS));
    }
    List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo.getNumTasks() + 1);
    InputConfigureVertexTasksEvent configureVertexEvent = InputConfigureVertexTasksEvent.create(inputSplitInfo.getNumTasks(), VertexLocationHint.create(inputSplitInfo.getTaskLocationHints()), InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate());
    events.add(configureVertexEvent);
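    // Either ship each split as serialized protobuf bytes, or as a live Java object.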
    if (sendSerializedEvents) {
        MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
        int count = 0;
        for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
            // Unnecessary array copy, can be avoided by using ByteBuffer instead of a raw array.
            InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count++, mrSplit.toByteString().asReadOnlyByteBuffer());
            events.add(diEvent);
        }
    } else {
        int count = 0;
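        // Object payloads differ by API vintage: mapreduce (new) vs. mapred (old) splits.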
        if (inputSplitInfo.holdsNewFormatSplits()) {
            for (org.apache.hadoop.mapreduce.InputSplit split : inputSplitInfo.getNewFormatSplits()) {
                InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(count++, split);
                events.add(diEvent);
            }
        } else {
            for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
                InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(count++, split);
                events.add(diEvent);
            }
        }
    }
    return events;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) StopWatch(org.apache.tez.util.StopWatch) MRSplitsProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitsProto) InputSplitInfoMem(org.apache.tez.mapreduce.hadoop.InputSplitInfoMem) MRInputUserPayloadProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto) InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) Event(org.apache.tez.runtime.api.Event) InputDataInformationEvent(org.apache.tez.runtime.api.events.InputDataInformationEvent) InputConfigureVertexTasksEvent(org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent) JobConf(org.apache.hadoop.mapred.JobConf) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)
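
A quick way to sanity-check the wave arithmetic above is to run the formula with concrete numbers. The sketch below is not part of the Tez sources; the class name and resource figures are illustrative (the waves default in TezSplitGrouper is 1.7 at the time of writing).

public class WaveMathSketch {

    // Mirrors the computation above: numTasks = (int) ((totalResource * waves) / taskResource).
    static int desiredTasks(int totalResourceMb, int taskResourceMb, float waves) {
        return (int) ((totalResourceMb * waves) / taskResourceMb);
    }

    public static void main(String[] args) {
        int totalResourceMb = 102400; // 100 GB of cluster headroom (illustrative)
        int taskResourceMb = 1024;    // 1 GB per task container (illustrative)
        float waves = 1.7f;           // TezSplitGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT
        // 100 slots * 1.7 waves = 170 tasks
        System.out.println(desiredTasks(totalResourceMb, taskResourceMb, waves)); // prints 170
    }
}

With grouping enabled, this value is handed to generateInputSplitsToMem as the desired number of split groups; with grouping disabled, 0 is passed instead.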

Example 2 with InputSplitInfoMem

Use of org.apache.tez.mapreduce.hadoop.InputSplitInfoMem in project hive by apache.

From the class HiveSplitGenerator, the method initialize:

@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
    if (getContext() != null) {
        // called from Tez AM.
        prepare(getContext());
    }
    // Setup the map work for this thread. Pruning modified the work instance to potentially remove
    // partitions. The same work instance must be used when generating splits.
    Utilities.setMapWork(jobConf, work);
    try {
        boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
        // perform dynamic partition pruning
        if (pruner != null) {
            pruner.initialize(getContext(), work, jobConf);
            pruner.prune();
        }
        InputSplitInfoMem inputSplitInfo = null;
        boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
        LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
        String realInputFormatName = conf.get("mapred.input.format.class");
        boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
        if (groupingEnabled) {
            // Need to instantiate the realInputFormat
            InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
            int totalResource = 0;
            int taskResource = 0;
            int availableSlots = 0;
            // FIXME. Do the right thing Luke.
            if (getContext() == null) {
                // for now, totalResource = taskResource for llap
                availableSlots = 1;
            }
            if (getContext() != null) {
                totalResource = getContext().getTotalAvailableResource().getMemory();
                taskResource = getContext().getVertexTaskResource().getMemory();
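                // Slots = number of task-sized containers that fit in the AM's current headroom.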
                availableSlots = totalResource / taskResource;
            }
            if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
                // broken configuration from mapred-default.xml
                final long blockSize = conf.getLongBytes(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
                final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
                final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
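                // Overwrite the broken (<= 1) minimum before grouping runs.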
                HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
                LOG.info("The preferred split size is " + preferredSplitSize);
            }
            float waves;
            // Create the un-grouped splits
            if (numSplits.isPresent()) {
                waves = numSplits.get().floatValue() / availableSlots;
            } else {
                waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
            }
            InputSplit[] splits;
            if (generateSingleSplit && conf.get(HiveConf.ConfVars.HIVETEZINPUTFORMAT.varname).equals(HiveInputFormat.class.getName())) {
                MapWork mapWork = Utilities.getMapWork(jobConf);
                List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
                FileSystem fs = paths.get(0).getFileSystem(jobConf);
                FileStatus[] fileStatuses = fs.listStatus(paths.get(0));
                if (fileStatuses.length == 0) {
                    // Generating a single split typically happens when reading data written by order-by queries.
                    // If the order-by query returns no rows, no files will exist in the input path.
                    splits = new InputSplit[0];
                } else {
                    // If files do exist in the input path, there must be exactly one, since this code path is
                    // triggered only for order-by queries, which write a single file (via a single reducer).
                    Preconditions.checkState(paths.size() == 1 && fileStatuses.length == 1 && mapWork.getAliasToPartnInfo().size() == 1, "Requested to generate single split. Paths and fileStatuses are expected to be 1. " + "Got paths: " + paths.size() + " fileStatuses: " + fileStatuses.length);
                    splits = new InputSplit[1];
                    FileStatus fileStatus = fileStatuses[0];
                    BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
                    Set<String> hostsSet = new HashSet<>();
                    for (BlockLocation location : locations) {
                        hostsSet.addAll(Lists.newArrayList(location.getHosts()));
                    }
                    String[] hosts = hostsSet.toArray(new String[0]);
                    FileSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts);
                    String alias = mapWork.getAliases().get(0);
                    PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);
                    String partIF = partDesc.getInputFileFormatClassName();
                    splits[0] = new HiveInputFormat.HiveInputSplit(fileSplit, partIF);
                }
            } else {
                // Raw splits
                splits = inputFormat.getSplits(jobConf, numSplits.orElse(Math.multiplyExact(availableSlots, (int) waves)));
            }
            // Sort the splits, so that subsequent grouping is consistent.
            Arrays.sort(splits, new InputSplitComparator());
            LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
            // increment/set input counters
            InputInitializerContext inputInitializerContext = getContext();
            TezCounters tezCounters = null;
            String counterName;
            String groupName = null;
            String vertexName = null;
            if (inputInitializerContext != null) {
                try {
                    tezCounters = new TezCounters();
                    groupName = HiveInputCounters.class.getName();
                    vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(splits.length);
                    final List<Path> paths = Utilities.getInputPathsTez(jobConf, work);
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_DIRECTORIES.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(paths.size());
                    final Set<String> files = new HashSet<>();
                    for (InputSplit inputSplit : splits) {
                        if (inputSplit instanceof FileSplit) {
                            final FileSplit fileSplit = (FileSplit) inputSplit;
                            final Path path = fileSplit.getPath();
                            // The assumption here is that the path is a file; the only case where it is not is ACID deltas.
                            // The isFile check is avoided here for performance reasons.
                            final String fileStr = path.toString();
                            files.add(fileStr);
                        }
                    }
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_FILES.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(files.size());
                } catch (Exception e) {
                    LOG.warn("Caught exception while trying to update Tez counters", e);
                }
            }
            if (work.getIncludedBuckets() != null) {
                splits = pruneBuckets(work, splits);
            }
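            // Group the raw splits into fewer, larger splits sized by waves and available slots.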
            Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
            // And finally return them in a flat array
            InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
            LOG.info("Number of split groups: " + flatSplits.length);
            if (inputInitializerContext != null) {
                try {
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.GROUPED_INPUT_SPLITS.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).setValue(flatSplits.length);
                    LOG.debug("Published tez counters: {}", tezCounters);
                    inputInitializerContext.addCounters(tezCounters);
                } catch (Exception e) {
                    LOG.warn("Caught exception while trying to update Tez counters", e);
                }
            }
            List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
            inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
        } else {
            // If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled
            throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
        // inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
        }
        return createEventList(sendSerializedEvents, inputSplitInfo);
    } finally {
        Utilities.clearWork(jobConf);
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) BlockLocation(org.apache.hadoop.fs.BlockLocation) FileSplit(org.apache.hadoop.mapred.FileSplit) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) InputSplitInfoMem(org.apache.tez.mapreduce.hadoop.InputSplitInfoMem) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) TezCounters(org.apache.tez.common.counters.TezCounters) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InputFormat(org.apache.hadoop.mapred.InputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)
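
The fallback split-size computation above, Math.min(blockSize / 2, minGrouping), is simple enough to check in isolation. The sketch below is not from the Hive sources; the class name and both byte values are illustrative assumptions.

public class PreferredSplitSizeSketch {

    // Mirrors the fallback above: half a block, capped at the grouping minimum.
    static long preferredSplitSize(long blockSizeBytes, long minGroupingBytes) {
        return Math.min(blockSizeBytes / 2, minGroupingBytes);
    }

    public static void main(String[] args) {
        long blockSize = 128L * 1024 * 1024;  // 128 MB HDFS block size (illustrative)
        long minGrouping = 50L * 1000 * 1000; // assumed TEZ_GROUPING_SPLIT_MIN_SIZE value
        // min(64 MB, 50 MB): the grouping minimum wins here
        System.out.println(preferredSplitSize(blockSize, minGrouping)); // prints 50000000
    }
}

Note that this fallback only runs when MAPREDMINSPLITSIZE is configured at or below 1, i.e. when the value inherited from mapred-default.xml is unusable as a minimum split size.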

Aggregations

VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint)2 InputSplitInfoMem (org.apache.tez.mapreduce.hadoop.InputSplitInfoMem)2 IOException (java.io.IOException)1 HashSet (java.util.HashSet)1 Configuration (org.apache.hadoop.conf.Configuration)1 BlockLocation (org.apache.hadoop.fs.BlockLocation)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat)1 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)1 PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)1 SerDeException (org.apache.hadoop.hive.serde2.SerDeException)1 FileSplit (org.apache.hadoop.mapred.FileSplit)1 InputFormat (org.apache.hadoop.mapred.InputFormat)1 InputSplit (org.apache.hadoop.mapred.InputSplit)1 JobConf (org.apache.hadoop.mapred.JobConf)1 TezCounters (org.apache.tez.common.counters.TezCounters)1 TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint)1 MRInputUserPayloadProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto)1