Example 16 with InputInitializerContext

Use of org.apache.tez.runtime.api.InputInitializerContext in project hive by apache.

In the class TestDynamicPartitionPruner, the method testMissingEvent:

@Test(timeout = 20000)
public void testMissingEvent() throws InterruptedException, IOException, HiveException, SerDeException {
    InputInitializerContext mockInitContext = mock(InputInitializerContext.class);
    doReturn(1).when(mockInitContext).getVertexNumTasks("v1");
    MapWork mapWork = createMockMapWork(new TestSource("v1", 1));
    DynamicPartitionPrunerForEventTesting pruner = new DynamicPartitionPrunerForEventTesting();
    pruner.initialize(mockInitContext, mapWork, new JobConf());
    PruneRunnable pruneRunnable = new PruneRunnable(pruner);
    Thread t = new Thread(pruneRunnable);
    t.start();
    try {
        pruneRunnable.start();
        InputInitializerEvent event = InputInitializerEvent.create("FakeTarget", "TargetInput", ByteBuffer.allocate(0));
        event.setSourceVertexName("v1");
        pruner.processVertex("v1");
        Thread.sleep(3000L);
        // The pruner should not have completed.
        assertFalse(pruneRunnable.ended.get());
        assertNoError(pruneRunnable);
        assertEquals(0, pruner.eventsProceessed.intValue());
        assertEquals(0, pruner.filteredSources.intValue());
    } finally {
        t.interrupt();
        t.join();
    }
}
Also used : InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)
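
The test drives the pruner from a background thread through a PruneRunnable helper that is not shown in this excerpt. The sketch below is only an illustration of what such a helper could look like; the real class lives inside TestDynamicPartitionPruner, and the names used here (start(), ended, error, and the assertNoError hook) are assumptions inferred from how the test reads them.

// Hypothetical sketch of the PruneRunnable helper used above; not the actual Hive class.
// Requires java.util.concurrent.CountDownLatch and java.util.concurrent.atomic.AtomicBoolean.
private static class PruneRunnable implements Runnable {

    private final DynamicPartitionPruner pruner;
    private final CountDownLatch started = new CountDownLatch(1);
    final AtomicBoolean ended = new AtomicBoolean(false);
    volatile Throwable error;

    PruneRunnable(DynamicPartitionPruner pruner) {
        this.pruner = pruner;
    }

    // Blocks the test thread until run() has actually begun on the worker thread.
    void start() throws InterruptedException {
        started.await();
    }

    @Override
    public void run() {
        started.countDown();
        try {
            // prune() blocks until all expected events and vertex-success notifications arrive,
            // which is why testMissingEvent expects ended to stay false.
            pruner.prune();
            ended.set(true);
        } catch (Throwable t) {
            // surfaced by the test through assertNoError(pruneRunnable)
            error = t;
        }
    }
}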

Example 17 with InputInitializerContext

Use of org.apache.tez.runtime.api.InputInitializerContext in project hive by apache.

In the class HiveSplitGenerator, the method initialize:

@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
    if (getContext() != null) {
        // called from Tez AM.
        prepare(getContext());
    }
    // Setup the map work for this thread. Pruning modified the work instance to potentially remove
    // partitions. The same work instance must be used when generating splits.
    Utilities.setMapWork(jobConf, work);
    try {
        boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
        // perform dynamic partition pruning
        if (pruner != null) {
            pruner.initialize(getContext(), work, jobConf);
            pruner.prune();
        }
        InputSplitInfoMem inputSplitInfo = null;
        boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
        LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
        String realInputFormatName = conf.get("mapred.input.format.class");
        boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
        if (groupingEnabled) {
            // Need to instantiate the realInputFormat
            InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
            int totalResource = 0;
            int taskResource = 0;
            int availableSlots = 0;
            // FIXME. Do the right thing Luke.
            if (getContext() == null) {
                // for now, totalResource = taskResource for llap
                availableSlots = 1;
            }
            if (getContext() != null) {
                totalResource = getContext().getTotalAvailableResource().getMemory();
                taskResource = getContext().getVertexTaskResource().getMemory();
                availableSlots = totalResource / taskResource;
            }
            if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
                // broken configuration from mapred-default.xml
                final long blockSize = conf.getLongBytes(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
                final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
                final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
                HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
                LOG.info("The preferred split size is " + preferredSplitSize);
            }
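            // 'waves' is the desired number of task waves relative to availableSlots: the raw
            // split request below ends up roughly availableSlots * waves. It is either derived
            // from an explicitly requested split count or read from tez.grouping.split-waves
            // (TEZ_GROUPING_SPLIT_WAVES, default 1.7).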
            float waves;
            // Create the un-grouped splits
            if (numSplits.isPresent()) {
                waves = numSplits.get().floatValue() / availableSlots;
            } else {
                waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
            }
            InputSplit[] splits;
            if (generateSingleSplit && conf.get(HiveConf.ConfVars.HIVETEZINPUTFORMAT.varname).equals(HiveInputFormat.class.getName())) {
                MapWork mapWork = Utilities.getMapWork(jobConf);
                List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
                FileSystem fs = paths.get(0).getFileSystem(jobConf);
                FileStatus[] fileStatuses = fs.listStatus(paths.get(0));
                if (fileStatuses.length == 0) {
                    // generating a single split typically happens when reading the output of order by queries.
                    // if the order by query returns no rows, no files will exist in the input path
                    splits = new InputSplit[0];
                } else {
                    // if files exist in the input path, there has to be exactly one, as this code path is triggered
                    // only for order by queries, which are expected to write a single file (written by one reducer)
                    Preconditions.checkState(paths.size() == 1 && fileStatuses.length == 1 && mapWork.getAliasToPartnInfo().size() == 1, "Requested to generate single split. Paths and fileStatuses are expected to be 1. " + "Got paths: " + paths.size() + " fileStatuses: " + fileStatuses.length);
                    splits = new InputSplit[1];
                    FileStatus fileStatus = fileStatuses[0];
                    BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
                    Set<String> hostsSet = new HashSet<>();
                    for (BlockLocation location : locations) {
                        hostsSet.addAll(Lists.newArrayList(location.getHosts()));
                    }
                    String[] hosts = hostsSet.toArray(new String[0]);
                    FileSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts);
                    String alias = mapWork.getAliases().get(0);
                    PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);
                    String partIF = partDesc.getInputFileFormatClassName();
                    splits[0] = new HiveInputFormat.HiveInputSplit(fileSplit, partIF);
                }
            } else {
                // Raw splits
                splits = inputFormat.getSplits(jobConf, numSplits.orElse(Math.multiplyExact(availableSlots, (int) waves)));
            }
            // Sort the splits, so that subsequent grouping is consistent.
            Arrays.sort(splits, new InputSplitComparator());
            LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
            // increment/set input counters
            InputInitializerContext inputInitializerContext = getContext();
            TezCounters tezCounters = null;
            String counterName;
            String groupName = null;
            String vertexName = null;
            if (inputInitializerContext != null) {
                try {
                    tezCounters = new TezCounters();
                    groupName = HiveInputCounters.class.getName();
                    vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(splits.length);
                    final List<Path> paths = Utilities.getInputPathsTez(jobConf, work);
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_DIRECTORIES.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(paths.size());
                    final Set<String> files = new HashSet<>();
                    for (InputSplit inputSplit : splits) {
                        if (inputSplit instanceof FileSplit) {
                            final FileSplit fileSplit = (FileSplit) inputSplit;
                            final Path path = fileSplit.getPath();
                            // The assumption here is that the path is a file. The only case where this differs is ACID deltas.
                            // The isFile check is avoided here for performance reasons.
                            final String fileStr = path.toString();
                            if (!files.contains(fileStr)) {
                                files.add(fileStr);
                            }
                        }
                    }
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_FILES.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(files.size());
                } catch (Exception e) {
                    LOG.warn("Caught exception while trying to update Tez counters", e);
                }
            }
            if (work.getIncludedBuckets() != null) {
                splits = pruneBuckets(work, splits);
            }
            Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
            // And finally return them in a flat array
            InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
            LOG.info("Number of split groups: " + flatSplits.length);
            if (inputInitializerContext != null) {
                try {
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.GROUPED_INPUT_SPLITS.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).setValue(flatSplits.length);
                    LOG.debug("Published tez counters: {}", tezCounters);
                    inputInitializerContext.addCounters(tezCounters);
                } catch (Exception e) {
                    LOG.warn("Caught exception while trying to update Tez counters", e);
                }
            }
            List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
            inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
        } else {
            // If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled
            throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
            // inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
        }
        return createEventList(sendSerializedEvents, inputSplitInfo);
    } finally {
        Utilities.clearWork(jobConf);
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) BlockLocation(org.apache.hadoop.fs.BlockLocation) FileSplit(org.apache.hadoop.mapred.FileSplit) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) InputSplitInfoMem(org.apache.tez.mapreduce.hadoop.InputSplitInfoMem) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) TezCounters(org.apache.tez.common.counters.TezCounters) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InputFormat(org.apache.hadoop.mapred.InputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)
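
The interplay between the InputInitializerContext capacity figures, waves, and the raw split request in the example above can be condensed into a small standalone helper. This is an illustrative sketch only, not part of HiveSplitGenerator; the method name and the waves parameter are made up for the example, and it mirrors the arithmetic in initialize() when grouping is enabled and no explicit split count is requested.

import org.apache.tez.runtime.api.InputInitializerContext;

// Illustrative sketch: how the grouped-splits path sizes its raw split request.
static int requestedRawSplits(InputInitializerContext context, float waves) {
    // memory available across the cluster vs. memory needed per task of this vertex
    int totalMemory = context.getTotalAvailableResource().getMemory();
    int taskMemory = context.getVertexTaskResource().getMemory();
    // how many tasks of this vertex can run concurrently
    int availableSlots = totalMemory / taskMemory;
    // the original code casts waves to int before multiplying, so a 1.7 wave factor
    // requests availableSlots * 1 raw splits; the split grouper coalesces them afterwards
    return Math.multiplyExact(availableSlots, (int) waves);
}

For example, with 100 available slots and the default wave factor, this asks the InputFormat for 100 raw splits, which are then sorted, counted, and grouped before being turned into task location hints.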

Aggregations

InputInitializerContext (org.apache.tez.runtime.api.InputInitializerContext): 17
Test (org.junit.Test): 14
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 11
InputInitializerEvent (org.apache.tez.runtime.api.events.InputInitializerEvent): 11
JobConf (org.apache.hadoop.mapred.JobConf): 10
ByteString (com.google.protobuf.ByteString): 3
List (java.util.List): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
InputSplit (org.apache.hadoop.mapred.InputSplit): 3
InputDescriptor (org.apache.tez.dag.api.InputDescriptor): 3
InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor): 3
UserPayload (org.apache.tez.dag.api.UserPayload): 3
InputInitializer (org.apache.tez.runtime.api.InputInitializer): 3
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 2
RootInputLeafOutput (org.apache.tez.dag.api.RootInputLeafOutput): 2
AppContext (org.apache.tez.dag.app.AppContext): 2
TezDAGID (org.apache.tez.dag.records.TezDAGID): 2
TezTaskAttemptID (org.apache.tez.dag.records.TezTaskAttemptID): 2
TezTaskID (org.apache.tez.dag.records.TezTaskID): 2
TezVertexID (org.apache.tez.dag.records.TezVertexID): 2