Search in sources :

Example 81 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class HiveSplitGenerator method initialize.

@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
    if (getContext() != null) {
        // called from Tez AM.
        prepare(getContext());
    }
    // Setup the map work for this thread. Pruning modified the work instance to potentially remove
    // partitions. The same work instance must be used when generating splits.
    Utilities.setMapWork(jobConf, work);
    try {
        boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
        // perform dynamic partition pruning
        if (pruner != null) {
            pruner.initialize(getContext(), work, jobConf);
            pruner.prune();
        }
        InputSplitInfoMem inputSplitInfo = null;
        boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
        LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
        String realInputFormatName = conf.get("mapred.input.format.class");
        boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
        if (groupingEnabled) {
            // Need to instantiate the realInputFormat
            InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
            int totalResource = 0;
            int taskResource = 0;
            int availableSlots = 0;
            // FIXME. Do the right thing Luke.
            if (getContext() == null) {
                // for now, totalResource = taskResource for llap
                availableSlots = 1;
            }
            if (getContext() != null) {
                totalResource = getContext().getTotalAvailableResource().getMemory();
                taskResource = getContext().getVertexTaskResource().getMemory();
                availableSlots = totalResource / taskResource;
            }
            if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
                // broken configuration from mapred-default.xml
                final long blockSize = conf.getLongBytes(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
                final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
                final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
                HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
                LOG.info("The preferred split size is " + preferredSplitSize);
            }
            float waves;
            // Create the un-grouped splits
            if (numSplits.isPresent()) {
                waves = numSplits.get().floatValue() / availableSlots;
            } else {
                waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
            }
            InputSplit[] splits;
            if (generateSingleSplit && conf.get(HiveConf.ConfVars.HIVETEZINPUTFORMAT.varname).equals(HiveInputFormat.class.getName())) {
                MapWork mapWork = Utilities.getMapWork(jobConf);
                List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
                FileSystem fs = paths.get(0).getFileSystem(jobConf);
                FileStatus[] fileStatuses = fs.listStatus(paths.get(0));
                if (fileStatuses.length == 0) {
                    // generate single split typically happens when reading data out of order by queries.
                    // if order by query returns no rows, no files will exists in input path
                    splits = new InputSplit[0];
                } else {
                    // if files exists in input path then it has to be 1 as this code path gets triggered only
                    // of order by queries which is expected to write only one file (written by one reducer)
                    Preconditions.checkState(paths.size() == 1 && fileStatuses.length == 1 && mapWork.getAliasToPartnInfo().size() == 1, "Requested to generate single split. Paths and fileStatuses are expected to be 1. " + "Got paths: " + paths.size() + " fileStatuses: " + fileStatuses.length);
                    splits = new InputSplit[1];
                    FileStatus fileStatus = fileStatuses[0];
                    BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
                    Set<String> hostsSet = new HashSet<>();
                    for (BlockLocation location : locations) {
                        hostsSet.addAll(Lists.newArrayList(location.getHosts()));
                    }
                    String[] hosts = hostsSet.toArray(new String[0]);
                    FileSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts);
                    String alias = mapWork.getAliases().get(0);
                    PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);
                    String partIF = partDesc.getInputFileFormatClassName();
                    splits[0] = new HiveInputFormat.HiveInputSplit(fileSplit, partIF);
                }
            } else {
                // Raw splits
                splits = inputFormat.getSplits(jobConf, numSplits.orElse(Math.multiplyExact(availableSlots, (int) waves)));
            }
            // Sort the splits, so that subsequent grouping is consistent.
            Arrays.sort(splits, new InputSplitComparator());
            LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
            // increment/set input counters
            InputInitializerContext inputInitializerContext = getContext();
            TezCounters tezCounters = null;
            String counterName;
            String groupName = null;
            String vertexName = null;
            if (inputInitializerContext != null) {
                try {
                    tezCounters = new TezCounters();
                    groupName = HiveInputCounters.class.getName();
                    vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(splits.length);
                    final List<Path> paths = Utilities.getInputPathsTez(jobConf, work);
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_DIRECTORIES.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(paths.size());
                    final Set<String> files = new HashSet<>();
                    for (InputSplit inputSplit : splits) {
                        if (inputSplit instanceof FileSplit) {
                            final FileSplit fileSplit = (FileSplit) inputSplit;
                            final Path path = fileSplit.getPath();
                            // The assumption here is the path is a file. Only case this is different is ACID deltas.
                            // The isFile check is avoided here for performance reasons.
                            final String fileStr = path.toString();
                            if (!files.contains(fileStr)) {
                                files.add(fileStr);
                            }
                        }
                    }
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_FILES.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).increment(files.size());
                } catch (Exception e) {
                    LOG.warn("Caught exception while trying to update Tez counters", e);
                }
            }
            if (work.getIncludedBuckets() != null) {
                splits = pruneBuckets(work, splits);
            }
            Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
            // And finally return them in a flat array
            InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
            LOG.info("Number of split groups: " + flatSplits.length);
            if (inputInitializerContext != null) {
                try {
                    counterName = Utilities.getVertexCounterName(HiveInputCounters.GROUPED_INPUT_SPLITS.name(), vertexName);
                    tezCounters.findCounter(groupName, counterName).setValue(flatSplits.length);
                    LOG.debug("Published tez counters: {}", tezCounters);
                    inputInitializerContext.addCounters(tezCounters);
                } catch (Exception e) {
                    LOG.warn("Caught exception while trying to update Tez counters", e);
                }
            }
            List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
            inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
        } else {
            // If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled
            throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
        // inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
        }
        return createEventList(sendSerializedEvents, inputSplitInfo);
    } finally {
        Utilities.clearWork(jobConf);
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) BlockLocation(org.apache.hadoop.fs.BlockLocation) FileSplit(org.apache.hadoop.mapred.FileSplit) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) InputSplitInfoMem(org.apache.tez.mapreduce.hadoop.InputSplitInfoMem) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) TezCounters(org.apache.tez.common.counters.TezCounters) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InputFormat(org.apache.hadoop.mapred.InputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)

Example 82 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class MapRecordProcessor method init.

@Override
void init(MRTaskReporter mrReporter, Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs) throws Exception {
    perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
    super.init(mrReporter, inputs, outputs);
    checkAbortCondition();
    String key = processorContext.getTaskVertexName() + MAP_PLAN_KEY;
    cacheKeys.add(key);
    // create map and fetch operators
    if (!isInCompaction) {
        mapWork = cache.retrieve(key, () -> Utilities.getMapWork(jconf));
    } else {
        // During query-based compaction, we don't want to retrieve old MapWork from the cache, we want a new mapper
        // and new UDF validate_acid_sort_order instance for each bucket, otherwise validate_acid_sort_order will fail.
        mapWork = Utilities.getMapWork(jconf);
    }
    // TODO HIVE-14042. Cleanup may be required if exiting early.
    Utilities.setMapWork(jconf, mapWork);
    for (PartitionDesc part : mapWork.getAliasToPartnInfo().values()) {
        TableDesc tableDesc = part.getTableDesc();
        Utilities.copyJobSecretToTableProperties(tableDesc);
    }
    String prefixes = jconf.get(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES);
    if (prefixes != null) {
        mergeWorkList = new ArrayList<>();
        for (final String prefix : prefixes.split(",")) {
            if (prefix == null || prefix.isEmpty()) {
                continue;
            }
            key = processorContext.getTaskVertexName() + prefix;
            cacheKeys.add(key);
            checkAbortCondition();
            mergeWorkList.add((MapWork) cache.retrieve(key, () -> Utilities.getMergeWork(jconf, prefix)));
        }
    }
    MapredContext.init(true, new JobConf(jconf));
    ((TezContext) MapredContext.get()).setInputs(inputs);
    ((TezContext) MapredContext.get()).setTezProcessorContext(processorContext);
    // Update JobConf using MRInput, info like filename comes via this
    checkAbortCondition();
    legacyMRInput = getMRInput(inputs);
    if (legacyMRInput != null) {
        Configuration updatedConf = legacyMRInput.getConfigUpdates();
        if (updatedConf != null) {
            for (Entry<String, String> entry : updatedConf) {
                jconf.set(entry.getKey(), entry.getValue());
            }
        }
    }
    checkAbortCondition();
    createOutputMap();
    // Start all the Outputs.
    for (Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
        LOG.debug("Starting Output: " + outputEntry.getKey());
        outputEntry.getValue().start();
        ((TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
    }
    checkAbortCondition();
    try {
        CompilationOpContext runtimeCtx = new CompilationOpContext();
        if (mapWork.getVectorMode()) {
            mapOp = new VectorMapOperator(runtimeCtx);
        } else {
            mapOp = new MapOperator(runtimeCtx);
        }
        // Not synchronizing creation of mapOp with an invocation. Check immediately
        // after creation in case abort has been set.
        // Relying on the regular flow to clean up the actual operator. i.e. If an exception is
        // thrown, an attempt will be made to cleanup the op.
        // If we are here - exit out via an exception. If we're in the middle of the opeartor.initialize
        // call further down, we rely upon op.abort().
        checkAbortCondition();
        mapOp.clearConnectedOperators();
        mapOp.setExecContext(execContext);
        boolean fromCache = false;
        if (mergeWorkList != null) {
            AbstractMapOperator mergeMapOp = null;
            for (BaseWork mergeWork : mergeWorkList) {
                // TODO HIVE-14042. What is mergeWork, and why is it not part of the regular operator chain.
                // The mergeMapOp.initialize call further down can block, and will not receive information
                // about an abort request.
                MapWork mergeMapWork = (MapWork) mergeWork;
                if (mergeMapWork.getVectorMode()) {
                    mergeMapOp = new VectorMapOperator(runtimeCtx);
                } else {
                    mergeMapOp = new MapOperator(runtimeCtx);
                }
                mergeMapOpList.add(mergeMapOp);
                // initialize the merge operators first.
                if (mergeMapOp != null) {
                    mergeMapOp.setConf(mergeMapWork);
                    LOG.info("Input name is {}", mergeMapWork.getName());
                    jconf.set(Utilities.INPUT_NAME, mergeMapWork.getName());
                    mergeMapOp.initialize(jconf, null);
                    // if there are no files/partitions to read, we need to skip trying to read
                    MultiMRInput multiMRInput = multiMRInputMap.get(mergeMapWork.getName());
                    boolean skipRead = false;
                    if (multiMRInput == null) {
                        LOG.info("Multi MR Input for work {} is null. Skipping read.", mergeMapWork.getName());
                        skipRead = true;
                    } else {
                        Collection<KeyValueReader> keyValueReaders = multiMRInput.getKeyValueReaders();
                        if ((keyValueReaders == null) || (keyValueReaders.isEmpty())) {
                            LOG.info("Key value readers are null or empty and hence skipping read. " + "KeyValueReaders = {}", keyValueReaders);
                            skipRead = true;
                        }
                    }
                    if (skipRead) {
                        List<Operator<?>> children = new ArrayList<>();
                        children.addAll(mergeMapOp.getConf().getAliasToWork().values());
                        // do the same thing as setChildren when there is nothing to read.
                        // the setChildren method initializes the object inspector needed by the operators
                        // based on path and partition information which we don't have in this case.
                        mergeMapOp.initEmptyInputChildren(children, jconf);
                    } else {
                        // the setChildren method initializes the object inspector needed by the operators
                        // based on path and partition information.
                        mergeMapOp.setChildren(jconf);
                    }
                    Operator<? extends OperatorDesc> finalOp = getFinalOp(mergeMapOp);
                    if (finalOp instanceof TezDummyStoreOperator) {
                        // we ensure that we don't try to read any data in case of skip read.
                        ((TezDummyStoreOperator) finalOp).setFetchDone(skipRead);
                        mapOp.setConnectedOperators(mergeMapWork.getTag(), (DummyStoreOperator) finalOp);
                    } else {
                        // found the plan is already connected which means this is derived from the cache.
                        fromCache = true;
                    }
                    mergeMapOp.passExecContext(new ExecMapperContext(jconf));
                    mergeMapOp.initializeLocalWork(jconf);
                }
            }
        }
        if (!fromCache) {
            // if not from cache, we still need to hook up the plans.
            ((TezContext) (MapredContext.get())).setDummyOpsMap(mapOp.getConnectedOperators());
        }
        // initialize map operator
        mapOp.setConf(mapWork);
        LOG.info("Main input name is " + mapWork.getName());
        jconf.set(Utilities.INPUT_NAME, mapWork.getName());
        mapOp.initialize(jconf, null);
        checkAbortCondition();
        mapOp.setChildren(jconf);
        mapOp.passExecContext(execContext);
        LOG.info(mapOp.dump(0));
        // set memory available for operators
        long memoryAvailableToTask = processorContext.getTotalMemoryAvailableToTask();
        if (mapOp.getConf() != null) {
            mapOp.getConf().setMaxMemoryAvailable(memoryAvailableToTask);
            LOG.info("Memory available for operators set to {}", LlapUtil.humanReadableByteCount(memoryAvailableToTask));
        }
        OperatorUtils.setMemoryAvailable(mapOp.getChildOperators(), memoryAvailableToTask);
        mapOp.initializeLocalWork(jconf);
        // Setup values registry
        checkAbortCondition();
        String valueRegistryKey = DynamicValue.DYNAMIC_VALUE_REGISTRY_CACHE_KEY;
        // On LLAP dynamic value registry might already be cached.
        final DynamicValueRegistryTez registryTez = dynamicValueCache.retrieve(valueRegistryKey, () -> new DynamicValueRegistryTez());
        dynamicValueCacheKeys.add(valueRegistryKey);
        RegistryConfTez registryConf = new RegistryConfTez(jconf, mapWork, processorContext, inputs);
        registryTez.init(registryConf);
        checkAbortCondition();
        initializeMapRecordSources();
        mapOp.initializeMapOperator(jconf);
        if ((mergeMapOpList != null) && !mergeMapOpList.isEmpty()) {
            for (AbstractMapOperator mergeMapOp : mergeMapOpList) {
                jconf.set(Utilities.INPUT_NAME, mergeMapOp.getConf().getName());
                // TODO HIVE-14042. abort handling: Handling of mergeMapOp
                mergeMapOp.initializeMapOperator(jconf);
            }
        }
        // Initialization isn't finished until all parents of all operators
        // are initialized. For broadcast joins that means initializing the
        // dummy parent operators as well.
        List<HashTableDummyOperator> dummyOps = mapWork.getDummyOps();
        jconf.set(Utilities.INPUT_NAME, mapWork.getName());
        if (dummyOps != null) {
            for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
                dummyOp.setExecContext(execContext);
                // TODO HIVE-14042. Handling of dummyOps, and propagating abort information to them
                dummyOp.initialize(jconf, null);
            }
        }
        OperatorUtils.setChildrenCollector(mapOp.getChildOperators(), outMap);
        mapOp.setReporter(reporter);
        MapredContext.get().setReporter(reporter);
    } catch (Throwable e) {
        setAborted(true);
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else if (e instanceof InterruptedException) {
            LOG.info("Hit an interrupt while initializing MapRecordProcessor. Message={}", e.getMessage());
            throw (InterruptedException) e;
        } else {
            throw new RuntimeException("Map operator initialization failed", e);
        }
    }
    perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
Also used : AbstractMapOperator(org.apache.hadoop.hive.ql.exec.AbstractMapOperator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) MapOperator(org.apache.hadoop.hive.ql.exec.MapOperator) VectorMapOperator(org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) AbstractMapOperator(org.apache.hadoop.hive.ql.exec.AbstractMapOperator) Configuration(org.apache.hadoop.conf.Configuration) KeyValueReader(org.apache.tez.runtime.library.api.KeyValueReader) ArrayList(java.util.ArrayList) VectorMapOperator(org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) MapOperator(org.apache.hadoop.hive.ql.exec.MapOperator) VectorMapOperator(org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator) AbstractMapOperator(org.apache.hadoop.hive.ql.exec.AbstractMapOperator) JobConf(org.apache.hadoop.mapred.JobConf) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) MultiMRInput(org.apache.tez.mapreduce.input.MultiMRInput) ExecMapperContext(org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) LogicalOutput(org.apache.tez.runtime.api.LogicalOutput) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) TezKVOutputCollector(org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) RegistryConfTez(org.apache.hadoop.hive.ql.exec.tez.DynamicValueRegistryTez.RegistryConfTez) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)

Example 83 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class SymbolicInputFormat method rework.

public void rework(HiveConf job, MapredWork work) throws IOException {
    Map<Path, PartitionDesc> pathToParts = work.getMapWork().getPathToPartitionInfo();
    List<Path> toRemovePaths = new ArrayList<>();
    Map<Path, PartitionDesc> toAddPathToPart = new HashMap<>();
    Map<Path, List<String>> pathToAliases = work.getMapWork().getPathToAliases();
    for (Map.Entry<Path, PartitionDesc> pathPartEntry : pathToParts.entrySet()) {
        Path path = pathPartEntry.getKey();
        PartitionDesc partDesc = pathPartEntry.getValue();
        // this path points to a symlink path
        if (partDesc.getInputFileFormatClass().equals(SymlinkTextInputFormat.class)) {
            // change to TextInputFormat
            partDesc.setInputFileFormatClass(TextInputFormat.class);
            FileSystem fileSystem = path.getFileSystem(job);
            FileStatus fStatus = fileSystem.getFileStatus(path);
            FileStatus[] symlinks = null;
            if (!fStatus.isDir()) {
                symlinks = new FileStatus[] { fStatus };
            } else {
                symlinks = fileSystem.listStatus(path, FileUtils.HIDDEN_FILES_PATH_FILTER);
            }
            toRemovePaths.add(path);
            List<String> aliases = pathToAliases.remove(path);
            for (FileStatus symlink : symlinks) {
                BufferedReader reader = null;
                try {
                    reader = new BufferedReader(new InputStreamReader(fileSystem.open(symlink.getPath())));
                    partDesc.setInputFileFormatClass(TextInputFormat.class);
                    String line;
                    while ((line = reader.readLine()) != null) {
                        // no check for the line? How to check?
                        // if the line is invalid for any reason, the job will fail.
                        FileStatus[] matches = fileSystem.globStatus(new Path(line));
                        for (FileStatus fileStatus : matches) {
                            Path schemaLessPath = Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath());
                            StringInternUtils.internUriStringsInPath(schemaLessPath);
                            toAddPathToPart.put(schemaLessPath, partDesc);
                            pathToAliases.put(schemaLessPath, aliases);
                        }
                    }
                } finally {
                    org.apache.hadoop.io.IOUtils.closeStream(reader);
                }
            }
        }
    }
    for (Entry<Path, PartitionDesc> toAdd : toAddPathToPart.entrySet()) {
        work.getMapWork().addPathToPartitionInfo(toAdd.getKey(), toAdd.getValue());
    }
    for (Path toRemove : toRemovePaths) {
        work.getMapWork().removePathToPartitionInfo(toRemove);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) BufferedReader(java.io.BufferedReader) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map)

Example 84 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class AvroGenericRecordReader method getSchema.

/**
 * Attempt to retrieve the reader schema.  We have a couple opportunities
 * to provide this, depending on whether or not we're just selecting data
 * or running with a MR job.
 * @return  Reader schema for the Avro object, or null if it has not been provided.
 * @throws AvroSerdeException
 */
private Schema getSchema(JobConf job, FileSplit split) throws AvroSerdeException, IOException {
    // Inside of a MR job, we can pull out the actual properties
    if (AvroSerdeUtils.insideMRJob(job)) {
        MapWork mapWork = Utilities.getMapWork(job);
        // that matches our input split.
        for (Map.Entry<Path, PartitionDesc> pathsAndParts : mapWork.getPathToPartitionInfo().entrySet()) {
            Path partitionPath = pathsAndParts.getKey();
            if (pathIsInPartition(split.getPath(), partitionPath)) {
                LOG.info("Matching partition {} with input split {}", partitionPath, split);
                Properties props = pathsAndParts.getValue().getProperties();
                if (props.containsKey(AvroTableProperties.SCHEMA_LITERAL.getPropName()) || props.containsKey(AvroTableProperties.SCHEMA_URL.getPropName())) {
                    return AvroSerdeUtils.determineSchemaOrThrowException(job, props);
                } else {
                    // If it's not in this property, it won't be in any others
                    return null;
                }
            }
        }
        LOG.info("Unable to match filesplit {} with a partition.", split);
    }
    // In "select * from table" situations (non-MR), we can add things to the job
    // It's safe to add this to the job since it's not *actually* a mapred job.
    // Here the global state is confined to just this process.
    String s = job.get(AvroTableProperties.AVRO_SERDE_SCHEMA.getPropName());
    if (StringUtils.isNotBlank(s)) {
        LOG.info("Found the avro schema in the job");
        LOG.debug("Avro schema: {}", s);
        return AvroSerdeUtils.getSchemaFor(s);
    }
    // No more places to get the schema from. Give up.  May have to re-encode later.
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) AvroTableProperties(org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.AvroTableProperties) Properties(java.util.Properties) Map(java.util.Map)

Example 85 with PartitionDesc

use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache.

the class GenMRSkewJoinProcessor method processSkewJoin.

/**
 * Create tasks for processing skew joins. The idea is (HIVE-964) to use
 * separated jobs and map-joins to handle skew joins.
 * <p>
 * <ul>
 * <li>
 * Number of mr jobs to handle skew keys is the number of table minus 1 (we
 * can stream the last table, so big keys in the last table will not be a
 * problem).
 * <li>
 * At runtime in Join, we output big keys in one table into one corresponding
 * directories, and all same keys in other tables into different dirs(one for
 * each table). The directories will look like:
 * <ul>
 * <li>
 * dir-T1-bigkeys(containing big keys in T1), dir-T2-keys(containing keys
 * which is big in T1),dir-T3-keys(containing keys which is big in T1), ...
 * <li>
 * dir-T1-keys(containing keys which is big in T2), dir-T2-bigkeys(containing
 * big keys in T2),dir-T3-keys(containing keys which is big in T2), ...
 * <li>
 * dir-T1-keys(containing keys which is big in T3), dir-T2-keys(containing big
 * keys in T3),dir-T3-bigkeys(containing keys which is big in T3), ... .....
 * </ul>
 * </ul>
 * For each table, we launch one mapjoin job, taking the directory containing
 * big keys in this table and corresponding dirs in other tables as input.
 * (Actually one job for one row in the above.)
 *
 * <p>
 * For more discussions, please check
 * https://issues.apache.org/jira/browse/HIVE-964.
 */
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<?> currTask, ParseContext parseCtx) throws SemanticException {
    // now does not work with outer joins
    if (!GenMRSkewJoinProcessor.skewJoinEnabled(parseCtx.getConf(), joinOp)) {
        return;
    }
    List<Task<?>> children = currTask.getChildTasks();
    Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
    JoinDesc joinDescriptor = joinOp.getConf();
    Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
    int numAliases = joinValues.size();
    Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
    Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
    Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
    Byte[] tags = joinDescriptor.getTagOrder();
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        bigKeysDirMap.put(alias, getBigKeysDir(baseTmpDir, alias));
        Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
        smallKeysDirMap.put(alias, smallKeysMap);
        for (Byte src2 : tags) {
            if (!src2.equals(alias)) {
                smallKeysMap.put(src2, getSmallKeysDir(baseTmpDir, alias, src2));
            }
        }
        skewJoinJobResultsDir.put(alias, getBigKeysSkewJoinResultDir(baseTmpDir, alias));
    }
    joinDescriptor.setHandleSkewJoin(true);
    joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
    joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
    joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));
    HashMap<Path, Task<?>> bigKeysDirToTaskMap = new HashMap<Path, Task<?>>();
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<?>> listTasks = new ArrayList<Task<?>>();
    MapredWork currPlan = (MapredWork) currTask.getWork();
    TableDesc keyTblDesc = (TableDesc) currPlan.getReduceWork().getKeyDesc().clone();
    List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
    List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
    Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
    Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
    Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
    // used for create mapJoinDesc, should be in order
    List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
    for (Byte tag : tags) {
        newJoinValueTblDesc.add(null);
    }
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        List<ExprNodeDesc> valueCols = joinValues.get(alias);
        String colNames = "";
        String colTypes = "";
        int columnSize = valueCols.size();
        List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
        List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
        ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
        boolean first = true;
        for (int k = 0; k < columnSize; k++) {
            TypeInfo type = valueCols.get(k).getTypeInfo();
            // any name, it does not matter.
            String newColName = i + "_VALUE_" + k;
            ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
            columnInfos.add(columnInfo);
            newValueExpr.add(new ExprNodeColumnDesc(columnInfo));
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + newColName;
            colTypes = colTypes + valueCols.get(k).getTypeString();
        }
        // we are putting join keys at last part of the spilled table
        for (int k = 0; k < joinKeys.size(); k++) {
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + joinKeys.get(k);
            colTypes = colTypes + joinKeyTypes.get(k);
            ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k), TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
            columnInfos.add(columnInfo);
            newKeyExpr.add(new ExprNodeColumnDesc(columnInfo));
        }
        newJoinValues.put(alias, newValueExpr);
        newJoinKeys.put(alias, newKeyExpr);
        tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
        rowSchemaList.put(alias, new RowSchema(columnInfos));
        // construct value table Desc
        String valueColNames = "";
        String valueColTypes = "";
        first = true;
        for (int k = 0; k < columnSize; k++) {
            // any name, it does not matter.
            String newColName = i + "_VALUE_" + k;
            if (!first) {
                valueColNames = valueColNames + ",";
                valueColTypes = valueColTypes + ",";
            }
            valueColNames = valueColNames + newColName;
            valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
            first = false;
        }
        newJoinValueTblDesc.set(Byte.valueOf((byte) i), Utilities.getTableDesc(valueColNames, valueColTypes));
    }
    joinDescriptor.setSkewKeysValuesTables(tableDescList);
    joinDescriptor.setKeyTableDesc(keyTblDesc);
    for (int i = 0; i < numAliases - 1; i++) {
        Byte src = tags[i];
        MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();
        // This code has been only added for testing
        boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
        newPlan.setMapperCannotSpanPartns(mapperCannotSpanPartns);
        MapredWork clonePlan = SerializationUtilities.clonePlan(currPlan);
        Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
        for (int k = 0; k < tags.length; k++) {
            Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
            ((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
            parentOps[k] = ts;
        }
        Operator<? extends OperatorDesc> tblScan_op = parentOps[i];
        ArrayList<String> aliases = new ArrayList<String>();
        String alias = src.toString().intern();
        aliases.add(alias);
        Path bigKeyDirPath = bigKeysDirMap.get(src);
        newPlan.addPathToAlias(bigKeyDirPath, aliases);
        newPlan.getAliasToWork().put(alias, tblScan_op);
        PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);
        newPlan.addPathToPartitionInfo(bigKeyDirPath, part);
        newPlan.getAliasToPartnInfo().put(alias, part);
        Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
        assert reducer instanceof JoinOperator;
        JoinOperator cloneJoinOp = (JoinOperator) reducer;
        String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
        MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix, joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
        mapJoinDescriptor.setTagOrder(tags);
        mapJoinDescriptor.setHandleSkewJoin(false);
        mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
        mapJoinDescriptor.setColumnExprMap(joinDescriptor.getColumnExprMap());
        MapredLocalWork localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
        for (int j = 0; j < numAliases; j++) {
            if (j == i) {
                continue;
            }
            Byte small_alias = tags[j];
            Operator<? extends OperatorDesc> tblScan_op2 = parentOps[j];
            localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2);
            Path tblDir = smallTblDirs.get(small_alias);
            localPlan.getAliasToFetchWork().put(small_alias.toString(), new FetchWork(tblDir, tableDescList.get(small_alias)));
        }
        newPlan.setMapRedLocalWork(localPlan);
        // construct a map join and set it as the child operator of tblScan_op
        MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, (RowSchema) null, parentOps);
        // change the children of the original join operator to point to the map
        // join operator
        List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
        for (Operator<? extends OperatorDesc> childOp : childOps) {
            childOp.replaceParent(cloneJoinOp, mapJoinOp);
        }
        mapJoinOp.setChildOperators(childOps);
        HiveConf jc = new HiveConf(parseCtx.getConf(), GenMRSkewJoinProcessor.class);
        newPlan.setNumMapTasks(HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
        newPlan.setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
        newPlan.setInputformat(HiveInputFormat.class.getName());
        MapredWork w = new MapredWork();
        w.setMapWork(newPlan);
        Task<?> skewJoinMapJoinTask = TaskFactory.get(w);
        skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
        bigKeysDirToTaskMap.put(bigKeyDirPath, skewJoinMapJoinTask);
        listWorks.add(skewJoinMapJoinTask.getWork());
        listTasks.add(skewJoinMapJoinTask);
    }
    if (children != null) {
        for (Task<?> tsk : listTasks) {
            for (Task<?> oldChild : children) {
                tsk.addDependentTask(oldChild);
            }
        }
        currTask.setChildTasks(new ArrayList<Task<?>>());
        for (Task<?> oldChild : children) {
            oldChild.getParentTasks().remove(currTask);
        }
        listTasks.addAll(children);
    }
    ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
    cndTsk.setListTasks(listTasks);
    cndTsk.setResolver(new ConditionalResolverSkewJoin());
    cndTsk.setResolverCtx(context);
    currTask.setChildTasks(new ArrayList<Task<?>>());
    currTask.addDependentTask(cndTsk);
    return;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ArrayList(java.util.ArrayList) List(java.util.List) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ConditionalResolverSkewJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) Path(org.apache.hadoop.fs.Path) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) ConditionalResolverSkewJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)

Aggregations

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)90 Path (org.apache.hadoop.fs.Path)67 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)41 ArrayList (java.util.ArrayList)39 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)27 LinkedHashMap (java.util.LinkedHashMap)24 List (java.util.List)23 JobConf (org.apache.hadoop.mapred.JobConf)21 Map (java.util.Map)18 Properties (java.util.Properties)18 HashMap (java.util.HashMap)17 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)17 IOException (java.io.IOException)15 Operator (org.apache.hadoop.hive.ql.exec.Operator)15 MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)14 Configuration (org.apache.hadoop.conf.Configuration)13 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)13 FileSystem (org.apache.hadoop.fs.FileSystem)11 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)9 HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat)9